forked from KEMT/zpwiki
		
	
		
			
				
	
	
		
			74 lines
		
	
	
		
			1.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			74 lines
		
	
	
		
			1.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from __future__ import division, print_function
 | 
						|
from nltk.tokenize import word_tokenize
 | 
						|
 | 
						|
import nltk
 | 
						|
import os
 | 
						|
from io import open
 | 
						|
import re
 | 
						|
import sys
 | 
						|
 | 
						|
# Runs at import time: asks NLTK to download the 'punkt' resource.
# NOTE(review): presumably this is the tokenizer data that word_tokenize()
# (used in process_line) needs -- confirm against the installed NLTK version.
nltk.download('punkt')
 | 
						|
 | 
						|
# Placeholder written to the output in place of any token that is_number()
# classifies as numeric.
NUM = '<NUM>'

# Maps a punctuation token to the label written to the output in its place.
# Several symbols share a label (e.g. "!" and ";" are both emitted as ".PER").
# NOTE(review): "," maps to ".COM" while ":" and "-" map to ",COM". These look
# like they were meant to be one label; confirm against whatever consumes this
# output before changing either string.
PUNCTS = {".": ".PER", ",": ".COM", "?": "?QUE", "!": ".PER", ":": ",COM", ";": ".PER", "-": ",COM"}

# Any line containing one of these characters is rejected by skip().
forbidden_symbols = re.compile(r"[\[\]\(\)\/\\\>\<\=\+\_\*]")
# Matches a single digit; used by is_number() to strip digits from a token.
numbers = re.compile(r"\d")
# A punctuation character followed by one or more further punctuation
# characters; group 1 captures the first character of the run.
multiple_punct = re.compile(r'([\.\?\!\,\:\;\-])(?:[\.\?\!\,\:\;\-]){1,}')


def is_number(x):
    """Return True when digits make up more than 40% of the token *x*.

    The token is treated as a number when the share of non-digit characters
    is below 0.6, so mixed tokens such as "1,000" or "a1" also qualify.
    An empty token is never a number (previously this divided by zero).
    """
    if not x:
        return False
    # Relies on true division (the file imports it from __future__).
    return len(numbers.sub("", x)) / len(x) < 0.6
 | 
						|
 | 
						|
# "can not" as a standalone word pair. The word boundaries stop the pattern
# from firing inside longer words (e.g. "american nothing").
_CAN_NOT = re.compile(r"\bcan not\b", re.UNICODE)


def untokenize(line):
    """Re-attach pieces that tokenization split off from their word.

    Three rewrites are applied, in this order:
      * a space before an apostrophe is removed ("it 's" -> "it's"),
      * a space before "n't" is removed ("do n't" -> "don't"),
      * the separate words "can not" are merged into "cannot".

    Returns the rewritten string; the input is not modified.
    """
    rejoined = line.replace(" '", "'").replace(" n't", "n't")
    return _CAN_NOT.sub("cannot", rejoined)
 | 
						|
 | 
						|
def skip(line):
    """Return True when *line* should be left out of the output.

    A line is skipped when any of the following holds:
      * it is empty or contains only whitespace,
      * its last non-whitespace character is not a key of PUNCTS,
      * it contains a character matched by forbidden_symbols.

    Surrounding whitespace is ignored for the last-character test, so a line
    that still carries its trailing newline is judged by its real final
    character rather than being rejected because of the newline.
    """
    stripped = line.strip()
    if not stripped:
        return True

    if stripped[-1] not in PUNCTS:
        return True

    return forbidden_symbols.search(line) is not None
 | 
						|
 | 
						|
def process_line(line):
    """Convert one line of text into its space-separated output form.

    The line is split with word_tokenize(); each token is then rewritten:
    a token that is a key of PUNCTS becomes its label, a token accepted by
    is_number() becomes NUM, and every other token is lower-cased. The
    rewritten tokens are joined with single spaces, one trailing space is
    appended, and the result is passed through untokenize().
    """
    def _relabel(tok):
        # Punctuation takes priority over the numeric test.
        if tok in PUNCTS:
            return PUNCTS[tok]
        if is_number(tok):
            return NUM
        return tok.lower()

    relabelled = [_relabel(tok) for tok in word_tokenize(line)]
    return untokenize(" ".join(relabelled) + " ")
 | 
						|
 | 
						|
# Number of input lines rejected by skip(); reported once the run finishes.
skipped = 0

# Command line: sys.argv[1] is the input text file, sys.argv[2] the output file.
# The input is opened first so that an unreadable or mistyped input path fails
# before the output file is created or truncated.
with open(sys.argv[1], 'r', encoding='utf-8') as text, \
        open(sys.argv[2], 'w', encoding='utf-8') as out_txt:

    for line in text:

        # Drop double quotes and surrounding whitespace, then collapse each
        # run of punctuation characters down to its first character.
        line = line.replace("\"", "").strip()
        line = multiple_punct.sub(r"\g<1>", line)

        if skip(line):
            skipped += 1
            continue

        # process_line() ends its result with a space and adds no newline, so
        # the output file is one continuous space-separated token stream.
        out_txt.write(process_line(line))

print("Skipped %d lines" % skipped)
 |