54 lines
1.1 KiB
Python
54 lines
1.1 KiB
Python
# load doc into memory
def load_doc(filename):
    """Read the whole text file at *filename* and return its contents.

    Args:
        filename: Path of the file to open in text read mode.

    Returns:
        The complete file contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as handle:
        return handle.read()
|
|
|
|
|
|
# save lines to file, one entry per line
def save_doc(lines, filename):
    """Write *lines* to *filename*, separated by newline characters.

    The entries are joined with ``'\\n'``; no trailing newline is added
    after the last entry. An existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to open in text write mode.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even if write() raises.
    with open(filename, 'w') as handle:
        handle.write(data)
|
|
|
|
|
|
# load text
raw_text = load_doc('input_data.txt')
print(raw_text)

# Clear: lowercase the text and keep only a-z, spaces and newlines.
# Lowercasing the whole string first (instead of calling ord() on each
# character's lowercase form) avoids a TypeError on characters such as
# U+0130, whose lowercase form is more than one code point. Building the
# result with a single join also avoids repeated string concatenation.
allowed = frozenset('abcdefghijklmnopqrstuvwxyz \n')
raw_text = ''.join(ch for ch in raw_text.lower() if ch in allowed)

# clean: collapse every run of whitespace (including newlines) to one space
tokens = raw_text.split()
raw_text = ' '.join(tokens)

# organize into sequences of characters
length = 10
# Each sequence is `length` characters followed by the next character,
# i.e. length + 1 characters in total; the window advances by one
# character per sequence.
sequences = [raw_text[i - length:i + 1] for i in range(length, len(raw_text))]

# Dump the sequences once, after they have all been collected.
print(sequences)
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'char_sequences.txt'
save_doc(sequences, out_filename)
|
|
|
|
|