Add a parsing tool and a notebook for converting parsed articles
This commit is contained in:
parent 030dd1b4a1
commit 06ad8f769a
4 .gitignore vendored Normal file
@@ -0,0 +1,4 @@
wikidump/
json/
.ipynb_checkpoints/
parsed.json
135 bz2tojson.ipynb Normal file
@@ -0,0 +1,135 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "from bs4 import BeautifulSoup\n",
    "from glob import glob\n",
    "import json\n",
    "from tqdm import tqdm_notebook as tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = glob('D:\\\\Desktop\\\\diplomka\\\\wikidump\\\\parsed\\\\*')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "83cff3cf94b54fb4a030c5c493c90ddc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=327), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_140 has a formatting issue near <a> ), vysoký tón </a>\n",
      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_140 has a formatting issue near <a> ).\n",
      "\n",
      "Podľa tónu a výslovnosti sa výrazne mení význam toho istého slova. Napríklad \"ike\" môže znamenať \"zadok\", \"silu\", \"rozdeliť\", \"spájať\". Slovo \"oke\" podobne môže znamenať \"diel\", \"hranica\", \"muž\" alebo \"potkan\".\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "</a>\n",
      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_190 has a formatting issue near <a>\" vyhovuje predpokladom </a>\n",
      "file D:\\Desktop\\diplomka\\wikidump\\parsed\\wiki_88 has a formatting issue near <a ...=\"\" r=\"\">\n",
      "\n",
      "Dôležitým relačným systémom je </a>\n"
     ]
    }
   ],
   "source": [
    "parsed = []\n",
    "\n",
    "index = 0\n",
    "for i in tqdm(range(len(files))):\n",
    "    f = open(files[i], \"r\", encoding=\"utf-8\")\n",
    "    content = f.read()\n",
    "    soup = BeautifulSoup(content)\n",
    "\n",
    "    docs = soup.find_all('doc')\n",
    "\n",
    "    for doc in docs:\n",
    "        id = doc['id']\n",
    "        title = doc['title']\n",
    "        url = doc['url']\n",
    "        paragraphs = doc.text.replace('\\n', '').replace('\\xa0', ' ').split('Section::::')\n",
    "        hrefs = doc.find_all('a')\n",
    "\n",
    "        references = []\n",
    "        for href in hrefs:\n",
    "            try:\n",
    "                references.append(href['href'])\n",
    "            except KeyError:\n",
    "                print(f'file {files[i]} has a formatting issue near {href}')\n",
    "\n",
    "        parsed.append({\n",
    "            title: {\n",
    "                'id': id,\n",
    "                'url': url,\n",
    "                'paragraphs': paragraphs,\n",
    "                'references': list(set(references))\n",
    "            }\n",
    "        })\n",
    "\n",
    "    with open(f'json/file{index}.json', 'w+') as fp:\n",
    "        json.dump(parsed, fp)\n",
    "    parsed = []\n",
    "    index += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
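For orientation, the files that this notebook writes to `json/` can be read back with the standard `json` module. The sketch below is not part of the commit; it assumes at least one output file already exists and uses `json/file0.json` purely as an example name.

    import json

    # Minimal sketch (not part of the commit): inspect one file written by the
    # notebook above. 'json/file0.json' is an example name; any json/file<N>.json
    # produced by the loop works the same way.
    with open('json/file0.json', encoding='utf-8') as fp:
        articles = json.load(fp)

    for article in articles:
        # Each record has the shape {title: {'id', 'url', 'paragraphs', 'references'}}.
        for title, data in article.items():
            print(title, data['id'], data['url'],
                  len(data['paragraphs']), len(data['references']))

Keying each record by its article title mirrors the structure built in the notebook's `parsed.append` call.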
93 wikiextractor/.gitignore vendored Normal file
@@ -0,0 +1,93 @@
local/
tmp/

### https://raw.github.com/github/gitignore/c699a4f4684e9e294c9c550f820ca330f019b6f9/python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask instance folder
instance/

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Editor files
*.idea
135 wikiextractor/README.md Normal file
@@ -0,0 +1,135 @@
# WikiExtractor
[WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).

The tool is written in Python and requires Python 2.7 or Python 3.3+, but no additional libraries.

For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).

# Wikipedia Cirrus Extractor

`cirrus-extractor.py` is a version of the script that performs extraction from a Wikipedia Cirrus dump.
Cirrus dumps contain text with already expanded templates.

Cirrus dumps are available at:
[cirrussearch](http://dumps.wikimedia.org/other/cirrussearch/).

# Details

WikiExtractor performs template expansion by preprocessing the whole dump and extracting template definitions.

In order to speed up processing:

- multiprocessing is used for dealing with articles in parallel
- a cache is kept of parsed templates (only useful for repeated extractions).

## Installation

The script may be invoked directly; however, it can also be installed by running:

    (sudo) python setup.py install

## Usage
The script is invoked with a Wikipedia dump file as an argument.
The output is stored in several files of similar size in a given directory.
Each file will contain several documents in this [document format](http://medialab.di.unipi.it/wiki/Document_Format).

    usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html]
                            [-l] [-s] [--lists] [-ns ns1,ns2]
                            [--templates TEMPLATES] [--no-templates] [-r]
                            [--min_text_length MIN_TEXT_LENGTH]
                            [--filter_category path_of_categories_file]
                            [--filter_disambig_pages] [-it abbr,b,big]
                            [-de gallery,timeline,noinclude] [--keep_tables]
                            [--processes PROCESSES] [-q] [--debug] [-a] [-v]
                            [--log_file]
                            input

    Wikipedia Extractor:
    Extracts and cleans text from a Wikipedia database dump and stores output in a
    number of files of similar size in a given directory.
    Each file will contain several documents in the format:

        <doc id="" revid="" url="" title="">
            ...
        </doc>

    If the program is invoked with the --json flag, then each file will
    contain several documents formatted as JSON objects, one per line, with
    the following structure:

        {"id": "", "revid": "", "url": "", "title": "", "text": "..."}

    Template expansion requires first preprocessing the whole dump and
    collecting template definitions.

    positional arguments:
      input                 XML wiki dump file

    optional arguments:
      -h, --help            show this help message and exit
      --processes PROCESSES
                            Number of processes to use (default 1)

    Output:
      -o OUTPUT, --output OUTPUT
                            directory for extracted files (or '-' for dumping to
                            stdout)
      -b n[KMG], --bytes n[KMG]
                            maximum bytes per output file (default 1M)
      -c, --compress        compress output files using bzip
      --json                write output in json format instead of the default one

    Processing:
      --html                produce HTML output, subsumes --links
      -l, --links           preserve links
      -s, --sections        preserve sections
      --lists               preserve lists
      -ns ns1,ns2, --namespaces ns1,ns2
                            accepted namespaces in links
      --templates TEMPLATES
                            use or create file containing templates
      --no-templates        Do not expand templates
      -r, --revision        Include the document revision id (default=False)
      --min_text_length MIN_TEXT_LENGTH
                            Minimum expanded text length required to write
                            document (default=0)
      --filter_category path_of_categories_file
                            Include or exclude specific categories from the dataset.
                            Specify the categories in file 'path_of_categories_file'.
                            Format: one category per line, and if the line starts with:
                                1) #: Comments, ignored;
                                2) ^: the categories will be in excluding-categories
                                3) others: the categories will be in including-categories.
                            Priority:
                                1) If excluding-categories is not empty, and any category of a page exists in excluding-categories, the page will be excluded; else
                                2) If including-categories is not empty, and no category of a page exists in including-categories, the page will be excluded; else
                                3) the page will be included

      --filter_disambig_pages
                            Remove pages from output that contain disambiguation
                            markup (default=False)
      -it abbr,b,big, --ignored_tags abbr,b,big
                            comma separated list of tags that will be dropped,
                            keeping their content
      -de gallery,timeline,noinclude, --discard_elements gallery,timeline,noinclude
                            comma separated list of elements that will be removed
                            from the article text
      --keep_tables         Preserve tables in the output article text
                            (default=False)

    Special:
      -q, --quiet           suppress reporting progress info
      --debug               print debug info
      -a, --article         analyze a file containing a single article (debug
                            option)
      -v, --version         print program version
      --log_file            specify a file to save the log information.

Saving templates to a file will speed up performing extraction the next time,
assuming template definitions have not changed.

Option --no-templates significantly speeds up the extractor, avoiding the cost
of expanding [MediaWiki templates](https://www.mediawiki.org/wiki/Help:Templates).

For further information, visit [the documentation](http://attardi.github.io/wikiextractor).
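As a sketch of how the `--json` output described above can be consumed: each output file holds one JSON object per line, and with `--compress` the files are bzip2-compressed. The path below follows WikiExtractor's default `text/AA/wiki_00`-style layout but is only a placeholder, not a file from this repository.

    import bz2
    import json

    # Minimal sketch (placeholder path): read one compressed file produced with
    # --json --compress and print a short summary of every document in it.
    with bz2.open('text/AA/wiki_00.bz2', 'rt', encoding='utf-8') as fh:
        for line in fh:
            doc = json.loads(line)  # {"id": "", "revid": "", "url": "", "title": "", "text": "..."}
            print(doc['id'], doc['title'], len(doc['text']))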
3297 wikiextractor/WikiExtractor.py Normal file
File diff suppressed because it is too large
36512 wikiextractor/categories.filter Normal file
File diff suppressed because it is too large
248 wikiextractor/cirrus-extract.py Normal file
@@ -0,0 +1,248 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# =============================================================================
#  Version: 1.00 (December 15, 2015)
#  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# =============================================================================
#  Copyright (c) 2015. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
#  This file is part of Tanl.
#
#  Tanl is free software; you can redistribute it and/or modify it
#  under the terms of the GNU General Public License, version 3,
#  as published by the Free Software Foundation.
#
#  Tanl is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================

"""Wikipedia Cirrus Extractor:
Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:

    <doc id="" url="" title="" language="" revision="">
        ...
    </doc>

"""

import sys, os.path, time
import re
import json
import argparse
import bz2
import gzip
import logging

# Program version
version = '1.00'

urlbase = 'http://it.wikipedia.org/'

# ----------------------------------------------------------------------


class NextFile(object):
    """
    Synchronous generation of next available file name.
    """

    filesPerDir = 100

    def __init__(self, path_name):
        self.path_name = path_name
        self.dir_index = -1
        self.file_index = -1

    def next(self):
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            self.dir_index += 1
        dirname = self._dirname()
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        return self._filepath()

    def _dirname(self):
        char1 = self.dir_index % 26
        char2 = self.dir_index / 26 % 26
        return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

    def _filepath(self):
        return '%s/wiki_%02d' % (self._dirname(), self.file_index)


class OutputSplitter(object):
    """
    File-like object, that splits output to multiple files of a given max size.
    """

    def __init__(self, nextFile, max_file_size=0, compress=True):
        """
        :param nextFile: a NextFile object from which to obtain filenames
            to use.
        :param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip compression.
        """
        self.nextFile = nextFile
        self.compress = compress
        self.max_file_size = max_file_size
        self.file = self.open(self.nextFile.next())

    def reserve(self, size):
        if self.file.tell() + size > self.max_file_size:
            self.close()
            self.file = self.open(self.nextFile.next())

    def write(self, data):
        self.reserve(len(data))
        self.file.write(data)

    def close(self):
        self.file.close()

    def open(self, filename):
        if self.compress:
            return bz2.BZ2File(filename + '.bz2', 'w')
        else:
            return open(filename, 'w')


# ----------------------------------------------------------------------


class Extractor(object):

    def extract(self, out):
        """
        :param out: output file.
        """
        logging.debug("%s\t%s", self.id, self.title)
        text = ''.join(self.page)
        url = get_url(self.id)
        header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (self.id, url, self.title, self.language, self.revision)
        # Separate header from text with a newline.
        header += self.title + '\n\n'
        header = header.encode('utf-8')
        footer = "\n</doc>\n"
        out.write(header)
        text = clean(self, text)
        for line in compact(text):
            out.write(line.encode('utf-8'))
            out.write('\n')
        out.write(footer)


def process_dump(input_file, out_file, file_size, file_compress):
    """
    :param input_file: name of the wikipedia dump file; '-' to read from stdin
    :param out_file: directory where to store extracted data, or '-' for stdout
    :param file_size: max size of each extracted file, or None for no max (one file)
    :param file_compress: whether to compress files with bzip.
    """

    if input_file == '-':
        input = sys.stdin
    else:
        input = gzip.open(input_file)

    if out_file == '-':
        output = sys.stdout
        if file_compress:
            logging.warn("writing to stdout, so no output compression (use external tool)")
    else:
        nextFile = NextFile(out_file)
        output = OutputSplitter(nextFile, file_size, file_compress)

    # process dump
    # format
    # {"index":{"_type":"page","_id":"3825914"}}
    # {"namespace":0,"title":TITLE,"timestamp":"2014-06-29T15:51:09Z","text":TEXT,...}
    while True:
        line = input.readline()
        if not line:
            break
        index = json.loads(line)
        content = json.loads(input.readline())
        type = index['index']['_type']
        id = index['index']['_id']
        language = content['language']
        revision = content['version']
        if type == 'page' and content['namespace'] == 0:
            title = content['title']
            text = content['text']
            # drop references:
            # ^ The Penguin Dictionary
            text = re.sub(r' \^ .*', '', text)
            url = urlbase + 'wiki?curid=' + id
            header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (id, url, title, language, revision)
            page = header + title + '\n\n' + text + '\n</doc>\n'
            output.write(page.encode('utf-8'))


# ----------------------------------------------------------------------

# Minimum size of output files
minFileSize = 200 * 1024


def main():
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="Cirrus Json wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")

    args = parser.parse_args()

    try:
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        file_size = int(args.bytes[:-1]) * 1024 ** power
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)

    input_file = args.input

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, output_path, file_size, args.compress)


if __name__ == '__main__':
    main()
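To make the Cirrus dump format handled by `process_dump` concrete: each page is represented by an index line followed by a content line. The sketch below parses one such pair in isolation; the field values are invented for illustration and are not taken from a real dump.

    import json

    # Illustrative index/content pair (made-up values, same fields process_dump reads).
    index_line = '{"index": {"_type": "page", "_id": "3825914"}}'
    content_line = ('{"namespace": 0, "title": "Example", "language": "it", '
                    '"version": 42, "timestamp": "2014-06-29T15:51:09Z", '
                    '"text": "Example article text."}')

    index = json.loads(index_line)
    content = json.loads(content_line)

    # Same filter as process_dump: only real pages in the main namespace.
    if index['index']['_type'] == 'page' and content['namespace'] == 0:
        print(index['index']['_id'], content['title'],
              content['language'], content['version'])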
29 wikiextractor/extract.sh Normal file
@@ -0,0 +1,29 @@
#!/bin/bash
#
# NOTES
#
# - Must expand templates to avoid a large loss of content.
# - Text will not (redundantly) contain the title string.
# - Keep sections. Section title will be marked by "Section::::".
# - Keep lists. List bullets will be marked by "BULLET::::".
# - Keep tables. They're mostly garbage but can be removed later (remove "^!*").
# - Remove disambiguation pages. Right now there is no use for them.

INPUT=$1
PROCESSES=$2
TEMPLATES=$3
OUTPUT=$4

python WikiExtractor.py $INPUT \
       --json \
       --processes $PROCESSES \
       --templates $TEMPLATES \
       --output $OUTPUT \
       --bytes 1M \
       --compress \
       --links \
       --sections \
       --lists \
       --keep_tables \
       --min_text_length 0 \
       --filter_disambig_pages