#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# =============================================================================
#  Version: 1.00 (December 15, 2015)
#  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# =============================================================================
#  Copyright (c) 2015. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
#  This file is part of Tanl.
#
#  Tanl is free software; you can redistribute it and/or modify it
#  under the terms of the GNU General Public License, version 3,
#  as published by the Free Software Foundation.
#
#  Tanl is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
# =============================================================================

"""Wikipedia Cirrus Extractor:
Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:

    <doc id="" url="" title="" language="" revision="">
        ...
    </doc>
"""

import sys, os.path, time
import re
import json
import argparse
import bz2
import gzip
import logging

# Program version
version = '1.00'

# Base URL used to build per-page URLs (hard-wired to the Italian Wikipedia
# in the original release).
urlbase = 'http://it.wikipedia.org/'

# ----------------------------------------------------------------------

class NextFile(object):
    """
    Synchronous generation of next available file name.

    Output files are grouped into two-letter subdirectories (AA, AB, ...),
    each holding at most :attr:`filesPerDir` files named wiki_00, wiki_01, ...
    """

    # Maximum number of output files placed in each subdirectory.
    filesPerDir = 100

    def __init__(self, path_name):
        """
        :param path_name: base directory under which output files are created.
        """
        self.path_name = path_name
        self.dir_index = -1
        self.file_index = -1

    def next(self):
        """
        Advance to the next file name and return its path, creating the
        containing subdirectory if it does not yet exist.
        """
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            self.dir_index += 1
        dirname = self._dirname()
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        return self._filepath()

    def _dirname(self):
        """Two-letter subdirectory (AA, AB, ...) for the current dir_index."""
        char1 = self.dir_index % 26
        # Floor division: a plain '/' here yields a float on Python 3,
        # which would make the '%c' conversion below raise TypeError.
        char2 = self.dir_index // 26 % 26
        return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

    def _filepath(self):
        """Full path of the current output file inside its subdirectory."""
        return '%s/wiki_%02d' % (self._dirname(), self.file_index)


class OutputSplitter(object):
    """
    File-like object, that splits output to multiple files of a given max size.
    """

    def __init__(self, nextFile, max_file_size=0, compress=True):
        """
        :param nextFile: a NextFile object from which to obtain filenames
            to use.
        :param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip compression.
        """
        self.nextFile = nextFile
        self.compress = compress
        self.max_file_size = max_file_size
        self.file = self.open(self.nextFile.next())

    def reserve(self, size):
        """Roll over to a new output file if writing size bytes would exceed the cap."""
        if self.file.tell() + size > self.max_file_size:
            self.close()
            self.file = self.open(self.nextFile.next())

    def write(self, data):
        """Write data, rolling over to a new file first if necessary."""
        self.reserve(len(data))
        self.file.write(data)

    def close(self):
        self.file.close()

    def open(self, filename):
        """Open filename for writing, bzip-compressed if so configured."""
        if self.compress:
            return bz2.BZ2File(filename + '.bz2', 'w')
        else:
            return open(filename, 'w')

# ----------------------------------------------------------------------

class Extractor(object):
    """
    Wraps a single page (id, title, language, revision, page text) and writes
    it out as a <doc> element.

    NOTE(review): relies on helpers get_url(), clean() and compact() that are
    not defined in this file — presumably provided by the companion
    WikiExtractor module; confirm before use.
    """

    def extract(self, out):
        """
        Clean the page text and write it, wrapped in <doc>...</doc>, to out.

        :param out: output file.
        """
        logging.debug("%s\t%s", self.id, self.title)
        text = ''.join(self.page)
        url = get_url(self.id)
        header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % \
                 (self.id, url, self.title, self.language, self.revision)
        # Separate header from text with a newline.
        header += self.title + '\n\n'
        header = header.encode('utf-8')
        footer = "\n</doc>\n"
        out.write(header)
        text = clean(self, text)
        for line in compact(text):
            out.write(line.encode('utf-8'))
            out.write('\n')
        out.write(footer)


def process_dump(input_file, out_file, file_size, file_compress):
    """
    Read a Cirrus dump and write extracted pages as <doc> elements.

    :param input_file: name of the wikipedia dump file; '-' to read from stdin
    :param out_file: directory where to store extracted data, or '-' for stdout
    :param file_size: max size of each extracted file, or None for no max (one file)
    :param file_compress: whether to compress files with bzip.
    """
    if input_file == '-':
        source = sys.stdin
    else:
        source = gzip.open(input_file)

    if out_file == '-':
        output = sys.stdout
        if file_compress:
            logging.warning("writing to stdout, so no output compression (use external tool)")
    else:
        nextFile = NextFile(out_file)
        output = OutputSplitter(nextFile, file_size, file_compress)

    # process dump
    # format
    # {"index":{"_type":"page","_id":"3825914"}}
    # {"namespace":0,"title":TITLE,"timestamp":"2014-06-29T15:51:09Z","text":TEXT,...}
    # Records come in pairs: an index line followed by a content line.
    while True:
        line = source.readline()
        if not line:
            break
        index = json.loads(line)
        content = json.loads(source.readline())
        page_type = index['index']['_type']     # renamed: 'type' shadows builtin
        page_id = index['index']['_id']         # renamed: 'id' shadows builtin
        language = content['language']
        revision = content['version']
        if page_type == 'page' and content['namespace'] == 0:
            title = content['title']
            text = content['text']
            # drop references:
            # ^ The Penguin Dictionary
            text = re.sub(r' \^ .*', '', text)
            url = urlbase + 'wiki?curid=' + page_id
            header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % \
                     (page_id, url, title, language, revision)
            page = header + title + '\n\n' + text + '\n</doc>\n'
            output.write(page.encode('utf-8'))

    # Release resources we opened ourselves (never close stdin/stdout).
    if source is not sys.stdin:
        source.close()
    if output is not sys.stdout:
        output.close()

# ----------------------------------------------------------------------

# Minimum size of output files
minFileSize = 200 * 1024


def main():
    """Parse command-line arguments and run the extraction."""
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="Cirrus Json wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")

    args = parser.parse_args()

    try:
        suffix = args.bytes[-1].lower()
        if suffix in 'kmg':
            file_size = int(args.bytes[:-1]) * 1024 ** ('kmg'.index(suffix) + 1)
        else:
            # No K/M/G suffix: a plain byte count. (The original code dropped
            # the last digit of an unsuffixed size.)
            file_size = int(args.bytes)
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)

    input_file = args.input

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except OSError:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, output_path, file_size, args.compress)


if __name__ == '__main__':
    main()