{ "cells": [
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
  "import os\n",
  "import pandas as pd\n",
  "import re\n",
  "from bs4 import BeautifulSoup\n",
  "from glob import glob\n",
  "import json\n",
  "from tqdm import tqdm_notebook as tqdm"
 ] },
 { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [
  "# Glob pattern selecting the parsed dump files; adjust it for your machine.\n",
  "WIKI_DUMP_GLOB = 'D:\\\\Desktop\\\\diplomka\\\\wikidump\\\\parsed\\\\*'\n",
  "\n",
  "# glob() yields matches in arbitrary order. Sorting pins each file to a stable\n",
  "# position, and that position is what names the file<N>.json outputs later on.\n",
  "files = sorted(glob(WIKI_DUMP_GLOB))"
 ] },
 { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "83cff3cf94b54fb4a030c5c493c90ddc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=327), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_140 is has a formatting issue near ), vysoký tón \n", "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_140 is has a formatting issue near ).\n", "\n", "Podľa tónu a výslovnosti sa výrazne mení význam toho istého slova. Napríklad \"ike\" môže znamenať \"zadok\", \"silu\", \"rozdeliť\", \"spájať\". 
Slovo \"oke\" podobne môže znamenať \"diel\", \"hranica\", \"muž\" alebo \"potkan\".\n", "\n", "\n", "\n", "\n", "\n", "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_190 is has a formatting issue near \" vyhovuje predpokladom \n", "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_88 is has a formatting issue near \n", "\n", "Dôležitým relačným systémom je \n" ] } ], "source": [
  "# Configuration for this conversion step.\n",
  "OUTPUT_DIR = 'json'  # one file<N>.json is written here per input file\n",
  "HTML_PARSER = 'html.parser'  # named explicitly so every machine parses the same way\n",
  "\n",
  "\n",
  "def parse_doc(doc, source_path):\n",
  "    \"\"\"Turn one <doc> element into a single-entry ``{title: record}`` dict.\n",
  "\n",
  "    ``record`` holds the element's ``id`` and ``url`` attributes, its text\n",
  "    split on the ``Section::::`` marker (newlines removed, non-breaking\n",
  "    spaces replaced by plain spaces) and the distinct ``href`` values of\n",
  "    its <a> tags. ``source_path`` is only used in the warning printed for\n",
  "    an <a> tag that has no ``href``.\n",
  "\n",
  "    Raises KeyError if the element has no ``id``, ``title`` or ``url``.\n",
  "    \"\"\"\n",
  "    doc_id, title, url = doc['id'], doc['title'], doc['url']\n",
  "    text = doc.text.replace('\\n', '').replace('\\xa0', ' ')\n",
  "\n",
  "    references = []\n",
  "    for anchor in doc.find_all('a'):\n",
  "        try:\n",
  "            references.append(anchor['href'])\n",
  "        except KeyError:\n",
  "            # Report the link-less anchor and carry on with the rest.\n",
  "            print(f'file {source_path} is has a formatting issue near {anchor}')\n",
  "\n",
  "    return {\n",
  "        title: {\n",
  "            'id': doc_id,\n",
  "            'url': url,\n",
  "            'paragraphs': text.split('Section::::'),\n",
  "            # dict.fromkeys de-duplicates while keeping first-seen order, so\n",
  "            # the JSON is identical from run to run (set() order is not).\n",
  "            'references': list(dict.fromkeys(references)),\n",
  "        }\n",
  "    }\n",
  "\n",
  "\n",
  "def convert_file(source_path, target_path):\n",
  "    \"\"\"Parse every <doc> in ``source_path`` and dump the records to ``target_path``.\n",
  "\n",
  "    The output is a JSON list with one ``{title: record}`` object per <doc>.\n",
  "    \"\"\"\n",
  "    # ``with`` guarantees the input handle is closed once it has been read.\n",
  "    with open(source_path, 'r', encoding='utf-8') as src:\n",
  "        soup = BeautifulSoup(src.read(), HTML_PARSER)\n",
  "\n",
  "    records = [parse_doc(doc, source_path) for doc in soup.find_all('doc')]\n",
  "\n",
  "    with open(target_path, 'w', encoding='utf-8') as fp:\n",
  "        json.dump(records, fp)\n",
  "\n",
  "\n",
  "# Create the output folder up front; open() does not create missing folders.\n",
  "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
  "\n",
  "# enumerate() numbers each output by the input's position in ``files``.\n",
  "for index, path in enumerate(tqdm(files)):\n",
  "    convert_file(path, os.path.join(OUTPUT_DIR, f'file{index}.json'))"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }