"""Split raw text corpora into fixed-length, lowercase character sequences.

Run as a script, this reads ``gen.txt`` and ``test.txt`` and writes the
resulting sequences, one per line, to ``gen_seq.txt`` and ``test_seq.txt``.
"""

# Output files written when the module is run as a script.
out_filename = 'gen_seq.txt'
out_filename2 = 'test_seq.txt'


def load_doc(filename):
    """Return the entire content of the text file *filename* as one string."""
    # The context manager closes the file even if read() raises.
    with open(filename, 'r') as file:
        return file.read()


def save_doc(lines, filename):
    """Write *lines* to *filename*, separated by newlines.

    No trailing newline is written after the last line.
    """
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))


def preparation(raw_text, length=10):
    """Clean *raw_text* and cut it into consecutive chunks of *length* chars.

    Cleaning lowercases the text, keeps only the letters ``a``-``z``, spaces
    and newlines, and then collapses every run of whitespace into a single
    space (leading and trailing whitespace is removed).

    Args:
        raw_text: The text to process.
        length: Number of characters per sequence (default 10).

    Returns:
        A list of non-overlapping strings, each exactly *length* characters
        long. A trailing remainder shorter than *length* is dropped.

    Raises:
        ValueError: If *length* is smaller than 1.
    """
    if length < 1:
        raise ValueError("length must be a positive integer")

    # Lowercase the whole text once instead of per character: some characters
    # (e.g. U+0130) lowercase to more than one code point, which would make
    # ord() on the per-character result raise TypeError.
    kept = [
        ch for ch in raw_text.lower()
        if 'a' <= ch <= 'z' or ch == ' ' or ch == '\n'
    ]

    # split()/join normalizes all whitespace runs to single spaces.
    cleaned = ' '.join(''.join(kept).split())

    # Only start positions that leave room for a full chunk are visited, so
    # an incomplete tail is never emitted.
    return [
        cleaned[start:start + length]
        for start in range(0, len(cleaned) - length + 1, length)
    ]


def main():
    """Convert ``gen.txt`` and ``test.txt`` into their sequence files."""
    raw_gen = load_doc('gen.txt')
    raw_test = load_doc('test.txt')
    save_doc(preparation(raw_gen), out_filename)
    save_doc(preparation(raw_test), out_filename2)


if __name__ == '__main__':
    main()