Pridanie parsovacieho nastroja a notebooku pre konvertovanie parsovanych clankov

2020-04-16 13:18:04 +02:00 · 2020-04-16 13:18:04 +02:00 · 06ad8f769a
commit 06ad8f769a
parent 030dd1b4a1
8 changed files with 40453 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+wikidump/
+json/
+.ipynb_checkpoints/
+parsed.json
--- a/bz2tojson.ipynb
+++ b/bz2tojson.ipynb
@ -0,0 +1,135 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "from bs4 import BeautifulSoup\n",
+    "from glob import glob\n",
+    "import json\n",
+    "from tqdm import tqdm_notebook as tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files = glob('D:\\\\Desktop\\\\diplomka\\\\wikidump\\\\parsed\\\\*')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "83cff3cf94b54fb4a030c5c493c90ddc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=327), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_140 is has a formatting issue near <a> ), vysoký tón </a>\n",
+      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_140 is has a formatting issue near <a> ).\n",
+      "\n",
+      "Podľa tónu a výslovnosti sa výrazne mení význam toho istého slova. Napríklad \"ike\" môže znamenať \"zadok\", \"silu\", \"rozdeliť\", \"spájať\". Slovo \"oke\" podobne môže znamenať \"diel\", \"hranica\", \"muž\" alebo \"potkan\".\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "</a>\n",
+      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_190 is has a formatting issue near <a>\" vyhovuje predpokladom </a>\n",
+      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_88 is has a formatting issue near <a ...=\"\" r=\"\">\n",
+      "\n",
+      "Dôležitým relačným systémom je </a>\n"
+     ]
+    }
+   ],
+   "source": [
+    "parsed = []\n",
+    "\n",
+    "index = 0\n",
+    "for i in tqdm(range(len(files))):\n",
+    "    f = open(files[i], \"r\", encoding=\"utf-8\")\n",
+    "    content = f.read()\n",
+    "    soup = BeautifulSoup(content)\n",
+    "    \n",
+    "    docs = soup.find_all('doc')\n",
+    "    \n",
+    "    for doc in docs:\n",
+    "        id = doc['id']\n",
+    "        title = doc['title']\n",
+    "        url = doc['url']\n",
+    "        paragraphs = doc.text.replace('\\n', '').replace('\\xa0', ' ').split('Section::::')\n",
+    "        hrefs = doc.find_all('a')\n",
+    "        \n",
+    "        references = []\n",
+    "        for href in hrefs:\n",
+    "            try:\n",
+    "                references.append(href['href'])\n",
+    "            except KeyError:\n",
+    "                print(f'file {files[i]} is has a formatting issue near {href}')\n",
+    "            \n",
+    "        parsed.append({\n",
+    "            title:{\n",
+    "                'id': id,\n",
+    "                'url': url,\n",
+    "                'paragraphs': paragraphs, \n",
+    "                'references': list(set(references))\n",
+    "            }\n",
+    "        })\n",
+    "        \n",
+    "    with open(f'json/file{index}.json', 'w+') as fp:\n",
+    "        json.dump(parsed, fp)\n",
+    "        parsed = []\n",
+    "        index += 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/wikiextractor/.gitignore
+++ b/wikiextractor/.gitignore
@ -0,0 +1,93 @@
+local/
+tmp/
+
+### https://raw.github.com/github/gitignore/c699a4f4684e9e294c9c550f820ca330f019b6f9/python.gitignore
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask instance folder
+instance/
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Editor files
+*.idea
--- a/wikiextractor/README.md
+++ b/wikiextractor/README.md
@ -0,0 +1,135 @@
+# WikiExtractor
+[WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).
+
+The tool is written in Python and requires Python 2.7 or Python 3.3+ but no additional library.
+
+For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
+
+# Wikipedia Cirrus Extractor
+
+`cirrus-extractor.py` is a version of the script that performs extraction from a Wikipedia Cirrus dump.
+Cirrus dumps contain text with already expanded templates.
+
+Cirrus dumps are available at:
+[cirrussearch](http://dumps.wikimedia.org/other/cirrussearch/).
+
+# Details
+
+WikiExtractor performs template expansion by preprocessing the whole dump and extracting template definitions.
+
+In order to speed up processing:
+
+- multiprocessing is used for dealing with articles in parallel
+- a cache is kept of parsed templates (only useful for repeated extractions).
+
+## Installation
+
+The script may be invoked directly; alternatively, it can be installed by running:
+
+    (sudo) python setup.py install
+
+## Usage
+The script is invoked with a Wikipedia dump file as an argument.
+The output is stored in several files of similar size in a given directory.
+Each file will contain several documents in this [document format](http://medialab.di.unipi.it/wiki/Document_Format).
+
+    usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html]
+                            [-l] [-s] [--lists] [-ns ns1,ns2]
+                            [--templates TEMPLATES] [--no-templates] [-r]
+                            [--min_text_length MIN_TEXT_LENGTH]
+                            [--filter_category path_of_categories_file]
+                            [--filter_disambig_pages] [-it abbr,b,big]
+                            [-de gallery,timeline,noinclude] [--keep_tables]
+                            [--processes PROCESSES] [-q] [--debug] [-a] [-v]
+                            [--log_file]
+                            input
+
+    Wikipedia Extractor:
+    Extracts and cleans text from a Wikipedia database dump and stores output in a
+    number of files of similar size in a given directory.
+    Each file will contain several documents in the format:
+
+        <doc id="" revid="" url="" title="">
+            ...
+            </doc>
+
+    If the program is invoked with the --json flag, then each file will
+    contain several documents formatted as JSON objects, one per line, with
+    the following structure
+
+        {"id": "", "revid": "", "url":"", "title": "", "text": "..."}
+
+    Template expansion requires preprocessing the whole dump first and
+    collecting template definitions.
+
+    positional arguments:
+      input                 XML wiki dump file
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      --processes PROCESSES
+                            Number of processes to use (default 1)
+
+    Output:
+      -o OUTPUT, --output OUTPUT
+                            directory for extracted files (or '-' for dumping to
+                            stdout)
+      -b n[KMG], --bytes n[KMG]
+                            maximum bytes per output file (default 1M)
+      -c, --compress        compress output files using bzip
+      --json                write output in json format instead of the default one
+
+    Processing:
+      --html                produce HTML output, subsumes --links
+      -l, --links           preserve links
+      -s, --sections        preserve sections
+      --lists               preserve lists
+      -ns ns1,ns2, --namespaces ns1,ns2
+                            accepted namespaces in links
+      --templates TEMPLATES
+                            use or create file containing templates
+      --no-templates        Do not expand templates
+      -r, --revision        Include the document revision id (default=False)
+      --min_text_length MIN_TEXT_LENGTH
+                            Minimum expanded text length required to write
+                            document (default=0)
+      --filter_category path_of_categories_file
+                            Include or exclude specific categories from the dataset. Specify the categories in
+                            file 'path_of_categories_file'. Format:
+                            One category one line, and if the line starts with:
+                                1) #: Comments, ignored;
+                                2) ^: the categories will be in excluding-categories
+                                3) others: the categories will be in including-categories.
+                            Priority:
+                                1) If excluding-categories is not empty, and any category of a page exists in excluding-categories, the page will be excluded; else
+                                2) If including-categories is not empty, and no category of a page exists in including-categories, the page will be excluded; else
+                                3) the page will be included
+
+      --filter_disambig_pages
+                            Remove pages from output that contain disambiguation
+                            markup (default=False)
+      -it abbr,b,big, --ignored_tags abbr,b,big
+                            comma separated list of tags that will be dropped,
+                            keeping their content
+      -de gallery,timeline,noinclude, --discard_elements gallery,timeline,noinclude
+                            comma separated list of elements that will be removed
+                            from the article text
+      --keep_tables         Preserve tables in the output article text
+                            (default=False)
+
+    Special:
+      -q, --quiet           suppress reporting progress info
+      --debug               print debug info
+      -a, --article         analyze a file containing a single article (debug
+                            option)
+      -v, --version         print program version
+      --log_file            specify a file to save the log information.
+
+
+Saving templates to a file will speed up performing extraction the next time,
+assuming template definitions have not changed.
+
+Option --no-templates significantly speeds up the extractor, avoiding the cost
+of expanding [MediaWiki templates](https://www.mediawiki.org/wiki/Help:Templates).
+
+For further information, visit [the documentation](http://attardi.github.io/wikiextractor).
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
--- a/wikiextractor/categories.filter
+++ b/wikiextractor/categories.filter
--- a/wikiextractor/cirrus-extract.py
+++ b/wikiextractor/cirrus-extract.py
@ -0,0 +1,248 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# =============================================================================
+#  Version: 1.00 (December 15, 2015)
+#  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
+#
+# =============================================================================
+#  Copyright (c) 2015. Giuseppe Attardi (attardi@di.unipi.it).
+# =============================================================================
+#  This file is part of Tanl.
+#
+#  Tanl is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU General Public License, version 3,
+#  as published by the Free Software Foundation.
+#
+#  Tanl is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# =============================================================================
+
+"""Wikipedia Cirrus Extractor:
+Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
+number of files of similar size in a given directory.
+Each file will contain several documents in the format:
+
+	<doc id="" url="" title="" language="" revision="">
+        ...
+        </doc>
+
+"""
+
+import sys, os.path, time
+import re
+import json
+import argparse
+import bz2
+import gzip
+import logging
+
+# Program version
+version = '1.00'
+
+urlbase = 'http://it.wikipedia.org/'
+
+# ----------------------------------------------------------------------
+
+class NextFile(object):
+    """
+    Synchronous generation of next available file name.
+    """
+
+    filesPerDir = 100
+
+    def __init__(self, path_name):
+        self.path_name = path_name
+        self.dir_index = -1
+        self.file_index = -1
+
+    def next(self):
+        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
+        if self.file_index == 0:
+            self.dir_index += 1
+        dirname = self._dirname()
+        if not os.path.isdir(dirname):
+            os.makedirs(dirname)
+        return self._filepath()
+
+    def _dirname(self):
+        char1 = self.dir_index % 26
+        char2 = self.dir_index / 26 % 26
+        return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))
+
+    def _filepath(self):
+        return '%s/wiki_%02d' % (self._dirname(), self.file_index)
+
+class OutputSplitter(object):
+    """
+    File-like object, that splits output to multiple files of a given max size.
+    """
+
+    def __init__(self, nextFile, max_file_size=0, compress=True):
+        """
+        :param nextfile: a NextFile object from which to obtain filenames
+            to use.
+        :param max_file_size: the maximum size of each file.
+        :para compress: whether to write data with bzip compression.
+        """
+        self.nextFile = nextFile
+        self.compress = compress
+        self.max_file_size = max_file_size
+        self.file = self.open(self.nextFile.next())
+
+    def reserve(self, size):
+        if self.file.tell() + size > self.max_file_size:
+            self.close()
+            self.file = self.open(self.nextFile.next())
+
+    def write(self, data):
+        self.reserve(len(data))
+        self.file.write(data)
+
+    def close(self):
+        self.file.close()
+
+    def open(self, filename):
+        if self.compress:
+            return bz2.BZ2File(filename + '.bz2', 'w')
+        else:
+            return open(filename, 'w')
+
+# ----------------------------------------------------------------------
+
+class Extractor(object):
+
+    def extract(self, out):
+        """
+        :param out: output file.
+        """
+        logging.debug("%s\t%s", self.id, self.title)
+        text = ''.join(self.page)
+        url = get_url(self.id)
+        header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (self.id, url, self.title, self.language, self.revision)
+        # Separate header from text with a newline.
+        header += self.title + '\n\n'
+        header = header.encode('utf-8')
+        footer = "\n</doc>\n"
+        out.write(header)
+        text = clean(self, text)
+        for line in compact(text):
+            out.write(line.encode('utf-8'))
+            out.write('\n')
+        out.write(footer)
+
+def process_dump(input_file, out_file, file_size, file_compress):
+    """
+    :param input_file: name of the wikipedia dump file; '-' to read from stdin
+    :param out_file: directory where to store extracted data, or '-' for stdout
+    :param file_size: max size of each extracted file, or None for no max (one file)
+    :param file_compress: whether to compress files with bzip.
+    """
+
+    if input_file == '-':
+        input = sys.stdin
+    else:
+        input = gzip.open(input_file)
+
+    if out_file == '-':
+        output = sys.stdout
+        if file_compress:
+            logging.warn("writing to stdout, so no output compression (use external tool)")
+    else:
+        nextFile = NextFile(out_file)
+        output = OutputSplitter(nextFile, file_size, file_compress)
+
+    # process dump
+    # format
+    # {"index":{"_type":"page","_id":"3825914"}}
+    # {"namespace":0,"title":TITLE,"timestamp":"2014-06-29T15:51:09Z","text":TEXT,...}
+    while True:
+        line = input.readline()
+        if not line:
+            break
+        index = json.loads(line)
+        content = json.loads(input.readline())
+        type = index['index']['_type']
+        id = index['index']['_id']
+        language = content['language']
+        revision = content['version']
+        if type == 'page' and content['namespace'] == 0:
+            title = content['title']
+            text = content['text']
+            # drop references:
+            # ^ The Penguin Dictionary
+            text = re.sub(r'  \^ .*', '', text)
+            url = urlbase + 'wiki?curid=' + id
+            header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (id, url, title, language, revision)
+            page = header + title + '\n\n' + text + '\n</doc>\n'
+            output.write(page.encode('utf-8'))
+
+# ----------------------------------------------------------------------
+
+# Minimum size of output files
+minFileSize = 200 * 1024
+
+def main():
+    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+                                     description=__doc__)
+    parser.add_argument("input",
+                        help="Cirrus Json wiki dump file")
+    groupO = parser.add_argument_group('Output')
+    groupO.add_argument("-o", "--output", default="text",
+                        help="directory for extracted files (or '-' for dumping to stdin)")
+    groupO.add_argument("-b", "--bytes", default="1M",
+                        help="maximum bytes per output file (default %(default)s)",
+                        metavar="n[KMG]")
+    groupO.add_argument("-c", "--compress", action="store_true",
+                        help="compress output files using bzip")
+
+    groupP = parser.add_argument_group('Processing')
+    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
+                        help="accepted namespaces")
+
+    groupS = parser.add_argument_group('Special')
+    groupS.add_argument("-q", "--quiet", action="store_true",
+                        help="suppress reporting progress info")
+    groupS.add_argument("-v", "--version", action="version",
+                        version='%(prog)s ' + version,
+                        help="print program version")
+
+    args = parser.parse_args()
+
+    try:
+        power = 'kmg'.find(args.bytes[-1].lower()) + 1
+        file_size = int(args.bytes[:-1]) * 1024 ** power
+        if file_size < minFileSize:
+            raise ValueError()
+    except ValueError:
+        logging.error('Insufficient or invalid size: %s', args.bytes)
+        return
+
+    FORMAT = '%(levelname)s: %(message)s'
+    logging.basicConfig(format=FORMAT)
+
+    logger = logging.getLogger()
+    if not args.quiet:
+        logger.setLevel(logging.INFO)
+
+    input_file = args.input
+
+    output_path = args.output
+    if output_path != '-' and not os.path.isdir(output_path):
+        try:
+            os.makedirs(output_path)
+        except:
+            logging.error('Could not create: %s', output_path)
+            return
+
+    process_dump(input_file, output_path, file_size, args.compress)
+
+
+if __name__ == '__main__':
+    main()
--- a/wikiextractor/extract.sh
+++ b/wikiextractor/extract.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Usage: extract.sh INPUT PROCESSES TEMPLATES OUTPUT
+#
+#   INPUT      dump file passed to WikiExtractor.py
+#   PROCESSES  value for --processes
+#   TEMPLATES  value for --templates
+#   OUTPUT     value for --output
+#
+# NOTES
+#
+# - Must expand templates to avoid a large loss of content.
+# - Text will not (redundantly) contain the title string.
+# - Keep sections. Section title will be marked by "Section::::".
+# - Keep lists. List bullets will be marked by "BULLET::::".
+# - Keep tables. They're mostly garbage but can be removed later (remove "^!*").
+# - Remove disambiguation pages. Right now there is no use for them.
+
+INPUT=$1
+PROCESSES=$2
+TEMPLATES=$3
+OUTPUT=$4
+
+# Parameters are quoted so that paths containing spaces or glob characters
+# reach WikiExtractor.py as single, unmodified arguments.
+python WikiExtractor.py "$INPUT" \
+       --json \
+       --processes "$PROCESSES" \
+       --templates "$TEMPLATES" \
+       --output "$OUTPUT" \
+       --bytes 1M \
+       --compress \
+       --links \
+       --sections \
+       --lists \
+       --keep_tables \
+       --min_text_length 0 \
+       --filter_disambig_pages