zpwiki/pages/students/2016/darius_lindvai/dp2021/prepare_text.py
2020-06-05 14:49:42 +02:00

74 lines
1.7 KiB
Python

from __future__ import division, print_function
from nltk.tokenize import word_tokenize
import nltk
import os
from io import open
import re
import sys
# Download the NLTK 'punkt' resource; as a module-level statement this runs
# on every execution of the script.
# NOTE(review): presumably required by word_tokenize -- confirm.
nltk.download('punkt')
# Placeholder written in place of every token classified as numeric.
NUM = '<NUM>'

# Maps a punctuation token to the label emitted in the output text.
# NOTE(review): "," maps to ".COM" while ":" and "-" map to ",COM" -- confirm
# whether ".COM" is intentional or a typo. Left unchanged here because the
# consumers of the generated file are not visible from this script.
PUNCTS = {".": ".PER", ",": ".COM", "?": "?QUE", "!": ".PER", ":": ",COM", ";": ".PER", "-": ",COM"}

# A line containing any of these characters is rejected by skip().
forbidden_symbols = re.compile(r"[\[\]\(\)\/\\\>\<\=\+\_\*]")

# Matches a single digit character.
numbers = re.compile(r"\d")

# A run of two or more punctuation marks; group 1 captures the first mark so
# the whole run can be collapsed to it.
multiple_punct = re.compile(r'([\.\?\!\,\:\;\-])(?:[\.\?\!\,\:\;\-]){1,}')


def is_number(x):
    """Return True when the token *x* is mostly made of digits.

    A token counts as a number when its non-digit characters account for
    less than 60% of its length (i.e. digits are more than 40% of it).

    An empty token is never a number; this guard avoids the division by
    zero the length ratio would otherwise hit.
    """
    if not x:
        return False
    non_digit_count = len(numbers.sub("", x))
    return non_digit_count / len(x) < 0.6
def untokenize(line):
    """Re-attach contraction pieces that tokenization split apart.

    Applies three plain substring rewrites, in this order:
    a space before an apostrophe is dropped, " n't" is glued to the
    preceding word, and "can not" becomes "cannot". The rewrites are not
    restricted to word boundaries.
    """
    rewrites = (
        (" '", "'"),
        (" n't", "n't"),
        ("can not", "cannot"),
    )
    for old, new in rewrites:
        line = line.replace(old, new)
    return line
def skip(line):
    """Return True when *line* must be left out of the output.

    A line is skipped when it is empty or whitespace-only, when its last
    character is not a key of PUNCTS, or when it contains any character
    matched by forbidden_symbols.
    """
    # Blank lines carry no text to process.
    if not line.strip():
        return True
    # Only lines whose final character is a known punctuation mark are kept.
    if line[-1] not in PUNCTS:
        return True
    return forbidden_symbols.search(line) is not None
def process_line(line):
    """Convert one text line into the space-separated output format.

    The line is split with word_tokenize; each token is then replaced by
    its PUNCTS label if it is a punctuation key, by NUM if is_number()
    accepts it, or by its lower-cased form otherwise. The converted tokens
    are joined with single spaces, a trailing space is appended, and the
    result is passed through untokenize().
    """
    def convert(token):
        # Punctuation takes precedence over the numeric check.
        if token in PUNCTS:
            return PUNCTS[token]
        if is_number(token):
            return NUM
        return token.lower()

    converted = [convert(token) for token in word_tokenize(line)]
    return untokenize(" ".join(converted) + " ")
# Script body: read the input file (argv[1]) line by line, write the processed
# text to the output file (argv[2]) and report how many lines were rejected.
skipped = 0
# The output file is opened first, then the input file.
with open(sys.argv[2], 'w', encoding='utf-8') as out_txt, \
        open(sys.argv[1], 'r', encoding='utf-8') as text:
    for line in text:
        # Drop double quotes and surrounding whitespace, then collapse any
        # run of punctuation marks to its first mark.
        cleaned = multiple_punct.sub(r"\g<1>", line.replace("\"", "").strip())
        if not skip(cleaned):
            # Processed lines are written back-to-back; process_line already
            # ends each one with a space and no newline is added.
            out_txt.write(process_line(cleaned))
        else:
            skipped += 1
print("Skipped %d lines" % skipped)