"""Prepare fixed-length character sequences from a plain-text corpus.

Reads ``input_data.txt``, keeps only the letters a-z (lowercased) and single
spaces, slices the result into overlapping character windows and writes them
to ``char_sequences.txt``, one window per line.
"""

# Characters that survive cleaning. Space and newline are kept at the filter
# stage so that word boundaries are still visible to ``str.split`` afterwards.
ALLOWED_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz \n")

# Number of context characters per sequence; each stored sequence holds
# SEQUENCE_LENGTH characters plus the one character that follows them.
SEQUENCE_LENGTH = 10


def load_doc(filename):
    """Return the entire contents of the text file *filename* as a string."""
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()


def save_doc(lines, filename):
    """Write *lines* to *filename*, one item per line.

    Items are joined with a newline; no trailing newline is written.
    """
    data = '\n'.join(lines)
    with open(filename, 'w') as file:
        file.write(data)


def clean_text(text):
    """Lowercase *text* and reduce it to a-z words separated by single spaces.

    Every character other than a-z, space and newline is dropped (not
    replaced), then runs of whitespace are collapsed to one space and
    leading/trailing whitespace is removed.
    """
    # Lowercase the whole string first: some characters (e.g. U+0130) lowercase
    # to more than one code point, so per-character ord() checks would fail.
    lowered = text.lower()
    kept = ''.join(ch for ch in lowered if ch in ALLOWED_CHARS)
    return ' '.join(kept.split())


def build_sequences(text, length=SEQUENCE_LENGTH):
    """Return every window of ``length + 1`` consecutive characters of *text*.

    Windows overlap and advance one character at a time. An empty list is
    returned when *text* has ``length`` characters or fewer.
    """
    return [text[i - length:i + 1] for i in range(length, len(text))]


def main():
    """Run the full pipeline: load, clean, slice into sequences and save."""
    raw_text = load_doc('input_data.txt')
    print(raw_text)

    cleaned = clean_text(raw_text)
    sequences = build_sequences(cleaned, SEQUENCE_LENGTH)
    print(sequences)
    print('Total Sequences: %d' % len(sequences))

    out_filename = 'char_sequences.txt'
    save_doc(sequences, out_filename)


if __name__ == '__main__':
    main()