136 lines
3.8 KiB
Plaintext
136 lines
3.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import pandas as pd\n",
|
|
"import re\n",
|
|
"from bs4 import BeautifulSoup\n",
|
|
"from glob import glob\n",
|
|
"import json\n",
|
|
"from tqdm import tqdm_notebook as tqdm"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Directory whose entries are used as the input files for the parsing loop.\n",
"PARSED_DIR = r'D:\\Desktop\\diplomka\\wikidump\\parsed'\n",
"\n",
"# glob() returns matches in arbitrary order, and the output file names are\n",
"# derived from each path's position in this list, so sort to keep the\n",
"# input -> output mapping identical on every run.\n",
"files = sorted(glob(os.path.join(PARSED_DIR, '*')))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "83cff3cf94b54fb4a030c5c493c90ddc",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"HBox(children=(IntProgress(value=0, max=327), HTML(value='')))"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_140 is has a formatting issue near <a> ), vysoký tón </a>\n",
|
|
"file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_140 is has a formatting issue near <a> ).\n",
|
|
"\n",
|
|
"Podľa tónu a výslovnosti sa výrazne mení význam toho istého slova. Napríklad \"ike\" môže znamenať \"zadok\", \"silu\", \"rozdeliť\", \"spájať\". Slovo \"oke\" podobne môže znamenať \"diel\", \"hranica\", \"muž\" alebo \"potkan\".\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"</a>\n",
|
|
"file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_190 is has a formatting issue near <a>\" vyhovuje predpokladom </a>\n",
|
|
"file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_88 is has a formatting issue near <a ...=\"\" r=\"\">\n",
|
|
"\n",
|
|
"Dôležitým relačným systémom je </a>\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Convert each extracted wiki file into one JSON file of article records.\n",
"# Reads `files` (defined in an earlier cell) and writes json/file<N>.json, where\n",
"# N is the position of the input file in `files`. Each output file holds a list\n",
"# with one {title: {'id', 'url', 'paragraphs', 'references'}} dict per <doc> tag.\n",
"OUTPUT_DIR = 'json'\n",
"\n",
"# Pin the parser: without this argument BeautifulSoup uses whichever parser\n",
"# happens to be installed, so the parse tree can differ between machines.\n",
"# NOTE(review): switch to 'lxml' if earlier outputs were produced with it.\n",
"PARSER = 'html.parser'\n",
"\n",
"# Nothing else creates the output directory, so make sure it exists.\n",
"os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
"\n",
"for index, path in enumerate(tqdm(files)):\n",
"    # The context manager closes the input handle after reading.\n",
"    with open(path, 'r', encoding='utf-8') as f:\n",
"        soup = BeautifulSoup(f.read(), PARSER)\n",
"\n",
"    parsed = []\n",
"    for doc in soup.find_all('doc'):\n",
"        # A <doc> tag missing any of these attributes raises KeyError.\n",
"        doc_id = doc['id']\n",
"        title = doc['title']\n",
"        url = doc['url']\n",
"\n",
"        # Drop newlines, turn non-breaking spaces into plain spaces, then\n",
"        # split the article text on the 'Section::::' marker.\n",
"        paragraphs = doc.text.replace('\\n', '').replace('\\xa0', ' ').split('Section::::')\n",
"\n",
"        references = []\n",
"        for link in doc.find_all('a'):\n",
"            target = link.get('href')\n",
"            if target is None:\n",
"                # <a> tag without an href attribute: report it and carry on.\n",
"                print(f'file {path} is has a formatting issue near {link}')\n",
"            else:\n",
"                references.append(target)\n",
"\n",
"        parsed.append({\n",
"            title: {\n",
"                'id': doc_id,\n",
"                'url': url,\n",
"                'paragraphs': paragraphs,\n",
"                # De-duplicate while keeping first-seen order, so the written\n",
"                # JSON is the same from run to run (set order is not).\n",
"                'references': list(dict.fromkeys(references))\n",
"            }\n",
"        })\n",
"\n",
"    # One output file per input file, numbered by position in `files`.\n",
"    with open(f'{OUTPUT_DIR}/file{index}.json', 'w') as fp:\n",
"        json.dump(parsed, fp)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|