BC_Matsunych_2020_Final/Create_data_2.py

65 lines
1.3 KiB
Python
Raw Normal View History

2020-08-28 10:39:32 +00:00
def load_doc(filename):
    """Load a whole text file into memory and return its contents.

    Args:
        filename: Path of the file to read. It is opened in text mode
            with the platform default encoding.

    Returns:
        The full contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
def save_doc(lines, filename):
    """Write an iterable of strings to a file, one item per line.

    The items are joined with a newline character; no trailing newline is
    written after the last item. An existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the output file. It is opened in text mode with
            the platform default encoding.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)
# Read the two input corpora: 'gen.txt' is loaded first, then 'test.txt'.
raw_gen, raw_test = load_doc('gen.txt'), load_doc('test.txt')
# Alternative input kept from the original script, currently disabled:
# raw_text = load_doc('rhyme.txt')
def preparation(raw_text, length=10):
    """Clean *raw_text* and cut it into fixed-length character sequences.

    Cleaning steps:
      1. Lower-case the text.
      2. Keep only the ASCII letters 'a'-'z', spaces and newlines; every
         other character (digits, punctuation, tabs, non-ASCII) is dropped.
      3. Collapse every run of whitespace into a single space and strip
         leading/trailing whitespace.

    The cleaned text is then split into consecutive, non-overlapping
    chunks of exactly ``length`` characters. A trailing remainder shorter
    than ``length`` is discarded, so every returned sequence has the same
    size.

    Args:
        raw_text: The text to process.
        length: Number of characters per sequence (default 10).

    Returns:
        A list of strings, each exactly ``length`` characters long. The
        list is empty when the cleaned text is shorter than ``length``.

    Raises:
        ValueError: If ``length`` is not positive.
    """
    if length <= 0:
        raise ValueError("length must be a positive integer")

    allowed = set('abcdefghijklmnopqrstuvwxyz \n')
    # Lower-case the whole string before filtering: str.lower() can map one
    # character to several (e.g. U+0130), so calling ord() on a per-character
    # lower() result is not safe. Filtering afterwards handles that case.
    kept = ''.join(ch for ch in raw_text.lower() if ch in allowed)

    # split() with no arguments treats spaces and newlines alike.
    cleaned = ' '.join(kept.split())

    # Only start positions that leave room for a full chunk are used, which
    # drops the short tail.
    last_start = len(cleaned) - length
    return [cleaned[i:i + length] for i in range(0, last_start + 1, length)]
# Write the sequences built from the 'gen' corpus, one per line.
out_filename = 'gen_seq.txt'
save_doc(preparation(raw_gen), out_filename)

# Write the sequences built from the 'test' corpus, one per line.
out_filename2 = 'test_seq.txt'
save_doc(preparation(raw_test), out_filename2)