Загрузить файлы ''
This commit is contained in:
parent
760852f176
commit
bedafe06de
64
Create_data_2.py
Normal file
64
Create_data_2.py
Normal file
@ -0,0 +1,64 @@
|
||||
# load doc into memory
|
||||
def load_doc(filename):
    """Return the entire contents of a text file as one string.

    The file is opened in text read mode with the platform default encoding.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()
|
||||
|
||||
|
||||
# save tokens to file, one dialog per line
|
||||
def save_doc(lines, filename):
    """Write the given strings to a file, one per line.

    The items are joined with a newline separator, so no trailing newline
    is written after the last item. An existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even when write() raises.
    with open(filename, 'w') as file:
        file.write(data)
|
||||
|
||||
|
||||
# Read both raw corpora (generated text first, then the test text).
raw_gen, raw_test = (load_doc(name) for name in ('gen.txt', 'test.txt'))
|
||||
|
||||
|
||||
# raw_text = load_doc('rhyme.txt')
|
||||
|
||||
def preparation(raw_text, length=10):
    """Clean raw text and cut it into fixed-length character sequences.

    The text is lower-cased, every character other than ``a``-``z``, space
    and newline is dropped, and runs of whitespace are collapsed into a
    single space. The result is sliced into consecutive, non-overlapping
    chunks of ``length`` characters; a trailing chunk shorter than
    ``length`` is discarded so every returned sequence has the same size.

    Args:
        raw_text: Text to clean and split.
        length: Number of characters per sequence (default 10).

    Returns:
        A list of strings, each exactly ``length`` characters long.

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError("length must be a positive integer")

    allowed = set("abcdefghijklmnopqrstuvwxyz \n")
    # Test the lower-cased form for membership instead of calling ord() on
    # it: some characters lower-case to more than one code point (e.g.
    # U+0130), which would make ord() raise TypeError.
    lowered = (sim.lower() for sim in raw_text)
    kept = "".join(ch for ch in lowered if ch in allowed)

    # Collapse every run of spaces/newlines into a single space.
    cleaned = " ".join(kept.split())

    # Advance one full window at a time; the range bound drops an
    # incomplete final chunk.
    return [
        cleaned[start:start + length]
        for start in range(0, len(cleaned) - length + 1, length)
    ]
|
||||
|
||||
|
||||
|
||||
# Destination files for the cleaned sequences.
out_filename = 'gen_seq.txt'
out_filename2 = 'test_seq.txt'

# Clean each corpus and write its sequences to the matching file.
for corpus, destination in ((raw_gen, out_filename), (raw_test, out_filename2)):
    save_doc(preparation(corpus), destination)
|
55
Perplexity.py
Normal file
55
Perplexity.py
Normal file
@ -0,0 +1,55 @@
|
||||
from numpy import array
|
||||
from keras.utils import to_categorical
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
# load doc into memory
|
||||
def load_doc(filename):
    """Return the entire contents of a text file as one string.

    The file is opened in text read mode with the platform default encoding.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()
|
||||
|
||||
|
||||
def input_tensor(in_filename):
    """Load a sequence file and return its lines as one-hot encoded inputs.

    Every distinct character of the file (the vocabulary is built from the
    whole text, so the newline separator is counted too) is mapped to its
    index in sorted order. Each line is integer-encoded with that mapping,
    the last character of every line is dropped, and the remaining indices
    are expanded with ``to_categorical``. The vocabulary size is printed as
    a side effect.

    Args:
        in_filename: Path of the text file holding one sequence per line.

    Returns:
        A numpy array built from the one-hot encoded lines.
        NOTE(review): assumes all lines have the same length so the encoded
        lines form a 2-D array before slicing; confirm against the data.
    """
    text = load_doc(in_filename)

    # Character -> integer index, in sorted character order.
    alphabet = sorted(set(text))
    mapping = {symbol: index for index, symbol in enumerate(alphabet)}

    # Integer-encode every line of the file.
    encoded = [[mapping[symbol] for symbol in row] for row in text.split('\n')]

    vocab_size = len(mapping)
    print('Vocabulary Size: %d' % vocab_size)

    # Keep everything except the final character of each sequence.
    inputs = array(encoded)[:, :-1]

    return array([to_categorical(row, num_classes=vocab_size) for row in inputs])
|
||||
|
||||
|
||||
# One-hot tensors for the generated and the test sequences.
Y = input_tensor('gen_seq.txt')
X = input_tensor('test_seq.txt')

# Trim both tensors to the same leading 22 sequences and first 24 classes.
# NOTE(review): 22 and 24 are hard-coded, presumably sized to this data set; confirm.
Y = Y[:22, :, :24]
X = X[:22, :, :24]

# NOTE(review): per the Keras signature the two positional arguments are
# from_logits and label_smoothing; confirm that tensors are intended here.
cce = tf.keras.losses.CategoricalCrossentropy(tf.constant([1.]), tf.constant([0.001]))
loss = cce(Y, X)
a = loss.numpy()
print("Cross entropy: ", a)
# Keras computes cross-entropy with the natural logarithm (nats), so the
# matching perplexity is e ** loss rather than 2 ** loss.
print("Perplexity: ", tf.exp(loss).numpy())
|
Loading…
Reference in New Issue
Block a user