# forked from KEMT/zpwiki
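"""Prepare raw text as training data for punctuation restoration.

Reads a plain-text file (sys.argv[1]) line by line, normalizes each line, and
writes the result to sys.argv[2]: punctuation marks become tags such as .PER,
number-like tokens become <NUM>, and all other tokens are lowercased. Lines
that are empty, contain forbidden symbols, or do not end in a punctuation mark
are skipped.
"""
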
from __future__ import division, print_function

import re
import sys
from io import open

import nltk
from nltk.tokenize import word_tokenize

# Fetch the Punkt models that word_tokenize relies on.
nltk.download('punkt')

NUM = '<NUM>'  # replacement token for number-like tokens
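
# Target tags for each punctuation mark; "!" and ";" fold into .PER,
# ":" and "-" fold into ,COM.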
PUNCTS = {".": ".PER", ",": ",COM", "?": "?QUE", "!": ".PER", ":": ",COM", ";": ".PER", "-": ",COM"}

# Lines containing any of these symbols are dropped rather than cleaned.
forbidden_symbols = re.compile(r"[\[\]\(\)\/\\\>\<\=\+\_\*]")
numbers = re.compile(r"\d")
# Collapses a run of punctuation marks (e.g. "?!..") down to its first mark.
multiple_punct = re.compile(r'([\.\?\!\,\:\;\-])(?:[\.\?\!\,\:\;\-]){1,}')


def is_number(token):
    # A token counts as a number when digits make up more than 40% of it.
    return len(numbers.sub("", token)) / len(token) < 0.6


def untokenize(line):
    # Undo tokenizer splits so contractions come back as single words,
    # e.g. untokenize("it ca n't be") -> "it can't be".
    return line.replace(" '", "'").replace(" n't", "n't").replace("can not", "cannot")


def skip(line):
    """Return True when a line should be excluded from the corpus."""
    if line.strip() == '':
        return True

    # Keep only lines that end in a punctuation mark we can tag.
    last_symbol = line[-1]
    if last_symbol not in PUNCTS:
        return True

    # Drop lines containing brackets, slashes, comparison signs, etc.
    if forbidden_symbols.search(line) is not None:
        return True

    return False
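
# For instance (hypothetical inputs):
#   skip('hello world')  -> True   (no sentence-final punctuation)
#   skip('hello world.') -> False

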
def process_line(line):
    """Tokenize a line and map it onto the training representation."""
    tokens = word_tokenize(line)
    output_tokens = []

    for token in tokens:
        if token in PUNCTS:
            # Punctuation becomes its tag, e.g. "." -> ".PER".
            output_tokens.append(PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())

    # The trailing space lets output lines concatenate into one token stream.
    return untokenize(" ".join(output_tokens) + " ")
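
# For instance (hypothetical input, using the PUNCTS mapping above):
#   process_line('It costs 25 dollars, right?')
#   -> 'it costs <NUM> dollars ,COM right ?QUE '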


# Number of input lines rejected by skip().
skipped = 0

with open(sys.argv[2], 'w', encoding='utf-8') as out_txt:
    with open(sys.argv[1], 'r', encoding='utf-8') as text:
        for line in text:
            # Strip double quotes and collapse repeated punctuation first.
            line = line.replace("\"", "").strip()
            line = multiple_punct.sub(r"\g<1>", line)

            if skip(line):
                skipped += 1
                continue

            line = process_line(line)
            out_txt.write(line)

print("Skipped %d lines" % skipped)