# forked from KEMT/zpwiki
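"""Prepare raw text as training data for punctuation restoration.

Reads a plain-text file (sys.argv[1]) line by line, normalizes each line, and
writes the result to sys.argv[2]: punctuation marks become tags such as .PER,
number-like tokens become <NUM>, and all other tokens are lowercased. Lines
that are empty, contain forbidden symbols, or do not end in a punctuation mark
are skipped.
"""
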
from __future__ import division, print_function

import re
import sys
from io import open

import nltk
from nltk.tokenize import word_tokenize

# Fetch the Punkt models that word_tokenize relies on.
nltk.download('punkt')

NUM = '<NUM>'  # replacement token for number-like tokens
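
# Target tags for each punctuation mark; "!" and ";" fold into .PER,
# ":" and "-" fold into ,COM.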
PUNCTS = {".": ".PER", ",": ",COM", "?": "?QUE", "!": ".PER", ":": ",COM", ";": ".PER", "-": ",COM"}

# Lines containing any of these symbols are dropped rather than cleaned.
forbidden_symbols = re.compile(r"[\[\]\(\)\/\\\>\<\=\+\_\*]")
numbers = re.compile(r"\d")
# Collapses a run of punctuation marks (e.g. "?!..") down to its first mark.
multiple_punct = re.compile(r'([\.\?\!\,\:\;\-])(?:[\.\?\!\,\:\;\-]){1,}')


def is_number(token):
    # A token counts as a number when digits make up more than 40% of it.
    return len(numbers.sub("", token)) / len(token) < 0.6


def untokenize(line):
    # Undo tokenizer splits so contractions come back as single words,
    # e.g. untokenize("it ca n't be") -> "it can't be".
    return line.replace(" '", "'").replace(" n't", "n't").replace("can not", "cannot")


def skip(line):
    """Return True when a line should be excluded from the corpus."""
    if line.strip() == '':
        return True

    # Keep only lines that end in a punctuation mark we can tag.
    last_symbol = line[-1]
    if last_symbol not in PUNCTS:
        return True

    # Drop lines containing brackets, slashes, comparison signs, etc.
    if forbidden_symbols.search(line) is not None:
        return True

    return False
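
# For instance (hypothetical inputs):
#   skip('hello world')  -> True   (no sentence-final punctuation)
#   skip('hello world.') -> False

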
def process_line(line):
    """Tokenize a line and map it onto the training representation."""
    tokens = word_tokenize(line)
    output_tokens = []

    for token in tokens:
        if token in PUNCTS:
            # Punctuation becomes its tag, e.g. "." -> ".PER".
            output_tokens.append(PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())

    # The trailing space lets output lines concatenate into one token stream.
    return untokenize(" ".join(output_tokens) + " ")
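
# For instance (hypothetical input, using the PUNCTS mapping above):
#   process_line('It costs 25 dollars, right?')
#   -> 'it costs <NUM> dollars ,COM right ?QUE '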


# Number of input lines rejected by skip().
skipped = 0

with open(sys.argv[2], 'w', encoding='utf-8') as out_txt:
    with open(sys.argv[1], 'r', encoding='utf-8') as text:
        for line in text:
            # Strip double quotes and collapse repeated punctuation first.
            line = line.replace("\"", "").strip()
            line = multiple_punct.sub(r"\g<1>", line)

            if skip(line):
                skipped += 1
                continue

            line = process_line(line)
            out_txt.write(line)

print("Skipped %d lines" % skipped)