from __future__ import division, print_function

import re
import sys
from io import open

import nltk
from nltk.tokenize import word_tokenize
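
# word_tokenize relies on the NLTK "punkt" tokenizer models; fetch them once.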
nltk.download('punkt')
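
# Placeholder token substituted for numeric tokens.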
NUM = '<NUM>'
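
# Map raw punctuation onto label tokens; the rarer marks (! : ; -) are folded
# into period or comma. The original mapped "," to ".COM", which looks like a
# typo for ",COM" (every other label begins with the mark it stands for), so
# that is assumed and fixed below.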
PUNCTS = {".": ".PER", ",": ",COM", "?": "?QUE", "!": ".PER",
          ":": ",COM", ";": ".PER", "-": ",COM"}
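
# Lines containing any of these symbols are discarded wholesale.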
forbidden_symbols = re.compile(r"[\[\]\(\)\/\\\>\<\=\+\_\*]")
numbers = re.compile(r"\d")
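
# A run of two or more punctuation marks collapses to the first one.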
multiple_punct = re.compile(r'([\.\?\!\,\:\;\-])(?:[\.\?\!\,\:\;\-])+')


def is_number(token):
    # A token counts as "numeric" when digits make up more than 40% of it.
    return len(numbers.sub("", token)) / len(token) < 0.6
def untokenize(line):
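    """Undo common word_tokenize artifacts so the output reads naturally."""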
    return line.replace(" '", "'").replace(" n't", "n't").replace("can not", "cannot")
def skip(line):
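    """Return True for lines that should be dropped from the corpus."""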
    if line.strip() == '':
        return True

    # Keep only lines that end in a punctuation mark we know how to label.
    if line[-1] not in PUNCTS:
        return True

    if forbidden_symbols.search(line) is not None:
        return True

    return False
def process_line(line):
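    """Lowercase a line, mapping punctuation to labels and numbers to NUM."""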
    tokens = word_tokenize(line)
    output_tokens = []

    for token in tokens:
        if token in PUNCTS:
            # Punctuation becomes its label, e.g. "." -> ".PER".
            output_tokens.append(PUNCTS[token])
        elif is_number(token):
            # Mostly-digit tokens collapse to the <NUM> placeholder.
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())

    return untokenize(" ".join(output_tokens) + " ")
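
# Read raw text from sys.argv[1] and write the cleaned corpus to sys.argv[2].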
skipped = 0

with open(sys.argv[2], 'w', encoding='utf-8') as out_txt:
    with open(sys.argv[1], 'r', encoding='utf-8') as text:

        for line in text:

            # Drop double quotes, trim whitespace, and collapse runs of
            # punctuation (e.g. "?!" or "--") down to the first mark.
            line = line.replace("\"", "").strip()
            line = multiple_punct.sub(r"\g<1>", line)

            if skip(line):
                skipped += 1
                continue

            # process_line leaves a trailing space and no newline, so the
            # output file is one continuous token stream.
            line = process_line(line)
            out_txt.write(line)

print("Skipped %d lines" % skipped)