dp2021/wikiextractor/cirrus-extract.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# =============================================================================
# Version: 1.00 (December 15, 2015)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# =============================================================================
# Copyright (c) 2015. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
"""Wikipedia Cirrus Extractor:
Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:
<doc id="" url="" title="" language="" revision="">
...
</doc>
"""
import sys, os.path, time
import re
import json
import argparse
import bz2
import gzip
import logging
# Program version
version = '1.00'
urlbase = 'http://it.wikipedia.org/'  # base URL for generated page links (hard-coded)
# ----------------------------------------------------------------------
class NextFile(object):
    """
    Synchronous generation of next available file name.
    """

    filesPerDir = 100

    def __init__(self, path_name):
        self.path_name = path_name
        self.dir_index = -1
        self.file_index = -1

    def next(self):
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            self.dir_index += 1
        dirname = self._dirname()
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        return self._filepath()

    def _dirname(self):
        char1 = self.dir_index % 26
        char2 = self.dir_index // 26 % 26  # floor division; a plain '/' breaks under Python 3
        return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

    def _filepath(self):
        return '%s/wiki_%02d' % (self._dirname(), self.file_index)
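
# A minimal sketch of the naming scheme (hypothetical path): each directory
# holds filesPerDir files, so names advance AA/wiki_00 .. AA/wiki_99, then
# AB/wiki_00, and so on.
#
#   nf = NextFile('extracted')
#   nf.next()   # -> 'extracted/AA/wiki_00'
#   nf.next()   # -> 'extracted/AA/wiki_01'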
class OutputSplitter(object):
    """
    File-like object that splits output across multiple files of a given max size.
    """

    def __init__(self, nextFile, max_file_size=0, compress=True):
        """
        :param nextFile: a NextFile object from which to obtain filenames
            to use.
        :param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip2 compression.
        """
        self.nextFile = nextFile
        self.compress = compress
        self.max_file_size = max_file_size
        self.file = self.open(self.nextFile.next())

    def reserve(self, size):
        # Roll over to a fresh file if this write would exceed the size cap.
        if self.file.tell() + size > self.max_file_size:
            self.close()
            self.file = self.open(self.nextFile.next())

    def write(self, data):
        self.reserve(len(data))
        self.file.write(data)

    def close(self):
        self.file.close()

    def open(self, filename):
        if self.compress:
            return bz2.BZ2File(filename + '.bz2', 'w')
        else:
            return open(filename, 'w')
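
# Hypothetical wiring of the two classes above: roughly 1 MB per output file,
# bzip2-compressed (note that BZ2File expects bytes, so documents are encoded
# before writing).
#
#   output = OutputSplitter(NextFile('extracted'), max_file_size=1024 * 1024,
#                           compress=True)
#   output.write(doc.encode('utf-8'))   # doc is a hypothetical str document
#   output.close()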
# ----------------------------------------------------------------------
class Extractor(object):
    # NB: get_url(), clean() and compact() are not defined in this file; they
    # presumably come from the companion WikiExtractor.py module. This class
    # is not used by process_dump() below.

    def extract(self, out):
        """
        :param out: output file.
        """
        logging.debug("%s\t%s", self.id, self.title)
        text = ''.join(self.page)
        url = get_url(self.id)
        header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (
            self.id, url, self.title, self.language, self.revision)
        # Separate header from text with a newline.
        header += self.title + '\n\n'
        header = header.encode('utf-8')
        footer = "\n</doc>\n"
        out.write(header)
        text = clean(self, text)
        for line in compact(text):
            out.write(line.encode('utf-8'))
            out.write('\n')
        out.write(footer)

def process_dump(input_file, out_file, file_size, file_compress):
    """
    :param input_file: name of the Wikipedia dump file; '-' to read from stdin
    :param out_file: directory where to store extracted data, or '-' for stdout
    :param file_size: max size of each extracted file, or None for no max (one file)
    :param file_compress: whether to compress files with bzip2.
    """
    if input_file == '-':
        input = sys.stdin
    else:
        input = gzip.open(input_file)

    if out_file == '-':
        output = sys.stdout
        if file_compress:
            logging.warning("writing to stdout, so no output compression (use an external tool)")
    else:
        nextFile = NextFile(out_file)
        output = OutputSplitter(nextFile, file_size, file_compress)

    # Process the dump. Records come in pairs of JSON lines:
    # {"index":{"_type":"page","_id":"3825914"}}
    # {"namespace":0,"title":TITLE,"timestamp":"2014-06-29T15:51:09Z","text":TEXT,...}
    while True:
        line = input.readline()
        if not line:
            break
        index = json.loads(line)
        content = json.loads(input.readline())
        type = index['index']['_type']
        id = index['index']['_id']
        language = content['language']
        revision = content['version']
        if type == 'page' and content['namespace'] == 0:
            title = content['title']
            text = content['text']
            # drop references:
            # ^ The Penguin Dictionary
            text = re.sub(r' \^ .*', '', text)
            url = urlbase + 'wiki?curid=' + id
            header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (
                id, url, title, language, revision)
            page = header + title + '\n\n' + text + '\n</doc>\n'
            output.write(page.encode('utf-8'))
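
# A minimal sketch of the record pairing above, with made-up values (dumps of
# this shape are published at https://dumps.wikimedia.org/other/cirrussearch/):
#
#   index   = json.loads('{"index":{"_type":"page","_id":"12"}}')
#   content = json.loads('{"namespace":0,"title":"T","language":"it",'
#                        '"version":1,"text":"..."}')
#   index['index']['_id']    # -> '12'
#   content['version']       # -> 1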
# ----------------------------------------------------------------------
# Minimum size of output files
minFileSize = 200 * 1024

def main():
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="Cirrus JSON wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip2")
    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")
    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")
    args = parser.parse_args()

    try:
        # Parse sizes like '500K' or '1M'; a bare number means bytes.
        suffix = args.bytes[-1].lower()
        if suffix in 'kmg':
            power = 'kmg'.find(suffix) + 1
            file_size = int(args.bytes[:-1]) * 1024 ** power
        else:
            file_size = int(args.bytes)
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)
    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)

    input_file = args.input
    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except OSError:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, output_path, file_size, args.compress)


if __name__ == '__main__':
    main()