Загрузить файлы ''

2020-08-28 10:39:32 +00:00 · 2020-08-28 10:39:32 +00:00 · bedafe06de
commit bedafe06de
parent 760852f176
2 changed files with 119 additions and 0 deletions
--- a/Create_data_2.py
+++ b/Create_data_2.py
@ -0,0 +1,64 @@
 # load doc into memory
 def load_doc(filename):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform default encoding.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as handle:
        return handle.read()
 # save tokens to file, one dialog per line
 def save_doc(lines, filename):
    """Write the given strings to a file, one per line.

    Args:
        lines: Iterable of strings. They are joined with newline
            characters; no trailing newline is appended.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``lines`` contains a non-string item.
        OSError: If the file cannot be opened or written.
    """
    # join first so a bad item fails before the target file is truncated
    data = '\n'.join(lines)
    # the context manager closes the handle even when write() raises
    with open(filename, 'w') as handle:
        handle.write(data)
 # read the two input files into memory ('gen.txt' first, then 'test.txt')
 raw_gen, raw_test = load_doc('gen.txt'), load_doc('test.txt')
 # alternative input, currently disabled:
 # raw_text = load_doc('rhyme.txt')
 def preparation(raw_text, length=10):
    """Normalise text and cut it into fixed-length character sequences.

    Every character is lower-cased; anything other than ``a``-``z``, space
    and newline is discarded, and each run of whitespace is collapsed to a
    single space. The cleaned text is then split into consecutive,
    non-overlapping chunks of ``length`` characters. A trailing remainder
    shorter than ``length`` is dropped, so every returned sequence has the
    same size.

    Args:
        raw_text: Text to process.
        length: Number of characters per sequence (default 10).

    Returns:
        List of strings, each exactly ``length`` characters long. The list
        is empty when the cleaned text is shorter than ``length``.

    Raises:
        ValueError: If ``length`` is not positive.
    """
    if length <= 0:
        raise ValueError('length must be positive')
    allowed = set('abcdefghijklmnopqrstuvwxyz \n')
    # Lower-case each character on its own and keep it only when the result
    # is a single allowed character. Some characters lower-case to more
    # than one code point (e.g. U+0130), which made ord() raise TypeError.
    lowered = (sim.lower() for sim in raw_text)
    cleaned = ''.join(low for low in lowered if low in allowed)
    # collapse every whitespace run (including newlines) to one space
    cleaned = ' '.join(cleaned.split())
    # step through the text in whole chunks; a short tail is left out
    last_start = len(cleaned) - length
    return [cleaned[start:start + length]
            for start in range(0, last_start + 1, length)]
 # write the prepared sequences of each input text to its own output file
 out_filename, out_filename2 = 'gen_seq.txt', 'test_seq.txt'
 for raw, target in ((raw_gen, out_filename), (raw_test, out_filename2)):
    save_doc(preparation(raw), target)
--- a/Perplexity.py
+++ b/Perplexity.py
@ -0,0 +1,55 @@
 from numpy import array
 from keras.utils import to_categorical
 import tensorflow as tf
 # load doc into memory
 def load_doc(filename):
    """Read a text file and return its whole content as one string.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform default encoding.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as handle:
        return handle.read()
 def input_tensor(in_filename):
    """Load a sequence file and return its sequences one-hot encoded.

    The file is read with ``load_doc`` and split on newlines into
    sequences. Every distinct character of the file text gets an integer
    index in sorted order. The vocabulary is taken from the whole text, so
    the newline separator itself counts as a symbol whenever the file
    contains one. The vocabulary size is printed. The last character of
    each sequence is dropped and the remaining indices are one-hot encoded
    with ``to_categorical``.

    The character-to-index table is derived from this file alone, so the
    indices of two different files only line up when both contain the same
    set of characters.

    Args:
        in_filename: Path of the file holding one sequence per line.

    Returns:
        Array built from the per-sequence ``to_categorical`` results.
        Assumes all lines have equal length, which would give the shape
        (lines, line length - 1, vocabulary size) -- TODO confirm.
    """
    text = load_doc(in_filename)
    # character -> index table; indices follow sorted character order
    char_to_index = {symbol: index
                     for index, symbol in enumerate(sorted(set(text)))}
    # integer encode every line, one list of indices per line
    encoded = [[char_to_index[symbol] for symbol in row]
               for row in text.split('\n')]
    vocab_size = len(char_to_index)
    print('Vocabulary Size: %d' % vocab_size)
    # keep everything except the last position of each sequence
    inputs = array(encoded)[:, :-1]
    one_hot = [to_categorical(row, num_classes=vocab_size) for row in inputs]
    return array(one_hot)
 # one-hot tensors for the two sequence files
 Y = input_tensor('gen_seq.txt')
 X = input_tensor('test_seq.txt')
 # restrict both tensors to the first 22 sequences and first 24 classes
 # NOTE(review): 22 and 24 are hard-coded -- presumably sized for the
 # current data files; confirm before using other inputs
 Y = Y[:22, :, :24]
 X = X[:22, :, :24]
 # Same effective settings as the former positional tensor arguments
 # (a truthy from_logits and a label_smoothing of 0.001), spelled out as
 # keyword scalars.
 # NOTE(review): from_logits=True means the one-hot X is interpreted as
 # logits -- confirm this is intended
 cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True,
                                               label_smoothing=0.001)
 a = cce(Y, X).numpy()
 print("Cross entropy: ", a)
 # The loss uses the natural logarithm, so it is measured in nats and the
 # matching perplexity is exp(H); 2 ** H is only right for H in bits.
 print("Perplexity: ", tf.exp(a).numpy())