dp2021/bz2tojson.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "from bs4 import BeautifulSoup\n",
    "from glob import glob\n",
    "import json\n",
    "from tqdm import tqdm_notebook as tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect every entry of the parsed-dump directory.\n",
    "# sorted() pins the order: glob() returns matches in arbitrary order, and the\n",
    "# conversion cell numbers its output files by position in this list.\n",
    "# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.\n",
    "files = sorted(glob('D:\\\\Desktop\\\\diplomka\\\\wikidump\\\\parsed\\\\*'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "83cff3cf94b54fb4a030c5c493c90ddc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=327), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_140 is has a formatting issue near <a> ), vysoký tón </a>\n",
      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_140 is has a formatting issue near <a> ).\n",
      "\n",
      "Podľa tónu a výslovnosti sa výrazne mení význam toho istého slova. Napríklad \"ike\" môže znamenať \"zadok\", \"silu\", \"rozdeliť\", \"spájať\". Slovo \"oke\" podobne môže znamenať \"diel\", \"hranica\", \"muž\" alebo \"potkan\".\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "</a>\n",
      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_190 is has a formatting issue near <a>\" vyhovuje predpokladom </a>\n",
      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_88 is has a formatting issue near <a ...=\"\" r=\"\">\n",
      "\n",
      "Dôležitým relačným systémom je </a>\n"
     ]
    }
   ],
   "source": [
    "# Convert each dump file in `files` into json/file<N>.json, where N is the\n",
    "# file's position in `files`. Every <doc> element becomes one\n",
    "# {title: {id, url, paragraphs, references}} entry of that file's list.\n",
    "os.makedirs('json', exist_ok=True)  # the dumps below fail if the directory is missing\n",
    "\n",
    "parsed = []\n",
    "for index, path in enumerate(tqdm(files)):\n",
    "    # The context manager closes the handle even when reading raises.\n",
    "    with open(path, 'r', encoding='utf-8') as f:\n",
    "        content = f.read()\n",
    "    # NOTE(review): no parser is named, so bs4 uses whichever one is installed\n",
    "    # and the parse tree may differ between machines -- confirm and pin one.\n",
    "    soup = BeautifulSoup(content)\n",
    "\n",
    "    for doc in soup.find_all('doc'):\n",
    "        doc_id = doc['id']  # avoids shadowing the builtin id()\n",
    "        title = doc['title']\n",
    "        url = doc['url']\n",
    "        # Drop newlines, turn non-breaking spaces into plain spaces, then cut\n",
    "        # the text at every 'Section::::' marker.\n",
    "        paragraphs = doc.text.replace('\\n', '').replace('\\xa0', ' ').split('Section::::')\n",
    "\n",
    "        references = []\n",
    "        for anchor in doc.find_all('a'):\n",
    "            try:\n",
    "                references.append(anchor['href'])\n",
    "            except KeyError:\n",
    "                # <a> tag without an href attribute: report it and move on.\n",
    "                print(f'file {path} is has a formatting issue near {anchor}')\n",
    "\n",
    "        parsed.append({\n",
    "            title: {\n",
    "                'id': doc_id,\n",
    "                'url': url,\n",
    "                'paragraphs': paragraphs,\n",
    "                # De-duplicate but keep first-seen order so reruns give identical output.\n",
    "                'references': list(dict.fromkeys(references))\n",
    "            }\n",
    "        })\n",
    "\n",
    "    with open(f'json/file{index}.json', 'w+') as fp:\n",
    "        json.dump(parsed, fp)\n",
    "    parsed = []  # start the next file with an empty batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}