Загрузить файлы ''

2020-08-28 10:39:32 +00:00 · 2020-08-28 10:39:32 +00:00 · bedafe06de
commit bedafe06de
parent 760852f176
2 changed files with 119 additions and 0 deletions
--- a/Create_data_2.py
+++ b/Create_data_2.py
@ -0,0 +1,64 @@
+# load doc into memory
+def load_doc(filename):
+    """Return the whole content of the text file *filename* as one string.
+
+    Args:
+        filename: path of the file to read (opened in text mode with the
+            platform default encoding).
+
+    Returns:
+        The complete file content.
+
+    Raises:
+        OSError: if the file cannot be opened or read.
+    """
+    # The context manager closes the handle even when read() raises.
+    with open(filename, 'r') as handle:
+        return handle.read()
+
+
+# save sequences to file, one per line
+def save_doc(lines, filename):
+    """Write *lines* to *filename*, one entry per line.
+
+    Args:
+        lines: iterable of strings; the entries are joined with a newline
+            character, so no trailing newline is written.
+        filename: path of the file to create or overwrite.
+
+    Raises:
+        OSError: if the file cannot be opened or written.
+    """
+    data = '\n'.join(lines)
+    # The context manager flushes and closes the file even when write() raises.
+    with open(filename, 'w') as handle:
+        handle.write(data)
+
+
+# read the generated text and the test text into memory
+raw_gen, raw_test = load_doc('gen.txt'), load_doc('test.txt')
+
+
+# raw_text = load_doc('rhyme.txt')
+
+def preparation(raw_text, length=10):
+    """Normalize *raw_text* and cut it into fixed-size character sequences.
+
+    The text is lower-cased, every character other than ``a``-``z``, space
+    and newline is removed, and runs of whitespace are collapsed to a single
+    space. The cleaned text is then cut into consecutive, non-overlapping
+    chunks of ``length`` characters; a shorter trailing remainder is dropped.
+
+    Args:
+        raw_text: the text to process.
+        length: number of characters per sequence (default 10, the value
+            that used to be hard-coded).
+
+    Returns:
+        A list of strings, each exactly ``length`` characters long.
+
+    Raises:
+        ValueError: if ``length`` is not positive.
+    """
+    if length <= 0:
+        raise ValueError('length must be a positive integer')
+
+    # Lower-case the whole text up front: lower() may expand one character
+    # into several (e.g. U+0130), which made ord() on a per-character
+    # lower() result raise TypeError.
+    kept = [ch for ch in raw_text.lower() if 'a' <= ch <= 'z' or ch in ' \n']
+
+    # clean: collapse whitespace runs (newlines included) to single spaces
+    cleaned = ' '.join(''.join(kept).split())
+
+    # organize into non-overlapping sequences of characters; the range ends
+    # early enough that an incomplete final chunk is never produced
+    return [cleaned[start:start + length]
+            for start in range(0, len(cleaned) - length + 1, length)]
+
+
+
+# output files for the generated and the test sequences
+out_filename, out_filename2 = 'gen_seq.txt', 'test_seq.txt'
+
+# cut each text into sequences and write them out, generated text first
+gen_sequences = preparation(raw_gen)
+save_doc(gen_sequences, out_filename)
+test_sequences = preparation(raw_test)
+save_doc(test_sequences, out_filename2)
--- a/Perplexity.py
+++ b/Perplexity.py
@ -0,0 +1,55 @@
+from numpy import array
+from keras.utils import to_categorical
+import tensorflow as tf
+
+
+# load doc into memory
+def load_doc(filename):
+    """Return the whole content of the text file *filename* as one string.
+
+    Args:
+        filename: path of the file to read (opened in text mode with the
+            platform default encoding).
+
+    Returns:
+        The complete file content.
+
+    Raises:
+        OSError: if the file cannot be opened or read.
+    """
+    # The context manager closes the handle even when read() raises.
+    with open(filename, 'r') as handle:
+        return handle.read()
+
+
+def input_tensor(in_filename):
+    """Load a sequence file and return its one-hot encoded characters.
+
+    The file is split on newline characters into sequences. Every distinct
+    character of the file (the newline separator included) gets an integer
+    index in sorted order, the last character of each sequence is dropped,
+    and the remaining characters are one-hot encoded.
+
+    Args:
+        in_filename: path of a text file with one sequence per line.
+
+    Returns:
+        The array built from the per-sequence one-hot encodings.
+    """
+    text = load_doc(in_filename)
+
+    # vocabulary: distinct characters of the whole file, numbered in sorted order
+    char_to_index = {}
+    for index, char in enumerate(sorted(set(text))):
+        char_to_index[char] = index
+
+    # integer encode every line
+    # NOTE(review): a blank line (e.g. after a trailing newline) would give a
+    # row of a different length - confirm the input files never contain one.
+    encoded = [[char_to_index[char] for char in line]
+               for line in text.split('\n')]
+
+    vocab_size = len(char_to_index)
+    print('Vocabulary Size: %d' % vocab_size)
+
+    # drop the last character of every sequence
+    inputs = array(encoded)[:, :-1]
+
+    # one-hot encode sequence by sequence, then stack the results
+    return array([to_categorical(row, num_classes=vocab_size) for row in inputs])
+
+
+# one-hot tensors for the generated and the test sequences
+Y = input_tensor('gen_seq.txt')
+X = input_tensor('test_seq.txt')
+
+# Keep only the first 22 sequences and the first 24 classes of each tensor.
+# NOTE(review): presumably this crop is what makes the two shapes match; each
+# file builds its own character mapping, so confirm the class indices agree.
+Y = Y[:22, :, :24]
+X = X[:22, :, :24]
+# Same settings as the former positional one-element tensors, written with
+# the documented keyword arguments.
+# NOTE(review): from_logits=True treats the one-hot X as logits - confirm
+# this is intended.
+cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True,
+                                              label_smoothing=0.001)
+a = cce(Y, X).numpy()
+print("Cross entropy: ", a)
+# The Keras cross-entropy uses the natural logarithm (nats), so the matching
+# perplexity is e ** a; 2 ** a is only right for an entropy measured in bits.
+print("Perplexity: ", tf.exp(a).numpy())