"""Create_data_2.py -- build fixed-length character sequences from raw text.

Recovered from the ``Create_data_2.py`` hunk of git patch bedafe0
(Stanislav Matsunych, 2020-08-28).  When run as a script it reads
``gen.txt`` and ``test.txt``, keeps only the letters a-z (lower-cased),
spaces and newlines, collapses whitespace runs to a single space, and
writes non-overlapping 10-character sequences to ``gen_seq.txt`` and
``test_seq.txt``, one sequence per line.
"""

# Characters that survive cleaning, compared against the lower-cased input.
_ALLOWED_CHARS = frozenset('abcdefghijklmnopqrstuvwxyz \n')

# Output files for the generated sequences.
out_filename = 'gen_seq.txt'
out_filename2 = 'test_seq.txt'


def load_doc(filename):
    """Return the entire contents of the text file *filename*.

    The file is opened read-only in text mode and is closed even if
    reading raises.
    """
    with open(filename, 'r') as handle:
        return handle.read()


def save_doc(lines, filename):
    """Write *lines* to *filename*, one item per line.

    Items are joined with ``'\\n'``; no trailing newline is written.
    The file is closed even if writing raises.
    """
    data = '\n'.join(lines)
    with open(filename, 'w') as handle:
        handle.write(data)


def preparation(raw_text, length=10):
    """Clean *raw_text* and cut it into non-overlapping character sequences.

    Cleaning lower-cases every character and keeps it only if the result
    is one of ``a``-``z``, a space or a newline; the remaining text is then
    whitespace-normalised (runs of spaces/newlines become one space, and
    leading/trailing whitespace is removed).

    Args:
        raw_text: Text to process.
        length: Number of characters per sequence (default 10).

    Returns:
        A list of strings, each exactly *length* characters long.  A
        trailing remainder shorter than *length* is dropped.

    Raises:
        ValueError: If *length* is smaller than 1.
    """
    if length < 1:
        raise ValueError('length must be a positive integer')

    kept = []
    for char in raw_text:
        lowered = char.lower()
        # Membership in a set of single characters (instead of ord()) also
        # copes with characters whose lower-case form is several code
        # points long, for which ord() would raise TypeError.
        if lowered in _ALLOWED_CHARS:
            kept.append(lowered)

    # Collapse every whitespace run into one space.
    cleaned = ' '.join(''.join(kept).split())

    # Step by `length` so that windows do not overlap; stopping at
    # len - length + 1 drops an incomplete final window.
    return [
        cleaned[start:start + length]
        for start in range(0, len(cleaned) - length + 1, length)
    ]


def main():
    """Load both input files, then write their sequence files."""
    # Both inputs are read before anything is written, so a missing input
    # file aborts the run without producing partial output.
    raw_gen = load_doc('gen.txt')
    raw_test = load_doc('test.txt')

    save_doc(preparation(raw_gen), out_filename)
    save_doc(preparation(raw_test), out_filename2)


if __name__ == '__main__':
    main()
"""Perplexity.py -- cross entropy / perplexity between two sequence files.

Recovered from the ``Perplexity.py`` hunk of git patch bedafe0.  When run
as a script it one-hot encodes the character sequences stored in
``gen_seq.txt`` and ``test_seq.txt`` and prints the categorical cross
entropy between the two tensors together with the derived perplexity.
"""

import math

from numpy import array
from keras.utils import to_categorical
import tensorflow as tf


def load_doc(filename):
    """Return the entire contents of the text file *filename*.

    The file is opened read-only in text mode and is closed even if
    reading raises.
    """
    with open(filename, 'r') as handle:
        return handle.read()


def input_tensor(in_filename):
    """Load a sequence file and return its one-hot encoded input tensor.

    The file is split into lines on ``'\\n'``.  Every distinct character of
    the file (including the newline separator, when present) receives an
    integer id in sorted order, each line is integer-encoded with that
    mapping, the last character of every line is dropped, and the rest is
    one-hot encoded.  The vocabulary size is printed.

    Args:
        in_filename: Path of the sequence file, one sequence per line.

    Returns:
        An array built by stacking one ``to_categorical`` matrix per line.
        NOTE(review): presumably (lines, line_length - 1, vocab_size);
        this relies on all lines having equal length -- confirm.
    """
    raw_text = load_doc(in_filename)
    lines = raw_text.split('\n')

    # Integer id per character, assigned in sorted character order.
    # NOTE(review): the mapping is derived from this file alone, so two
    # different files may assign different ids to the same character.
    chars = sorted(set(raw_text))
    mapping = {char: index for index, char in enumerate(chars)}

    sequences = [[mapping[char] for char in line] for line in lines]

    vocab_size = len(mapping)
    print('Vocabulary Size: %d' % vocab_size)

    # Drop the final character of every sequence.
    sequences = array(sequences)
    X = sequences[:, :-1]

    # Encode row by row, then stack the per-row matrices.
    return array([to_categorical(x, num_classes=vocab_size) for x in X])


def main():
    """Print cross entropy and perplexity for the two sequence files."""
    Y = input_tensor('gen_seq.txt')
    X = input_tensor('test_seq.txt')

    # Trim both tensors to a common shape.
    # NOTE(review): 22 sequences and 24 classes are hard-coded -- confirm
    # they match the data these files actually contain.
    Y = Y[:22, :, :24]
    X = X[:22, :, :24]

    # Explicit keyword values equivalent to the one-element tensors that
    # were previously passed positionally for these two parameters.
    # NOTE(review): from_logits=True treats the one-hot test tensor as
    # logits -- confirm that this is intended.
    cce = tf.keras.losses.CategoricalCrossentropy(
        from_logits=True, label_smoothing=0.001)
    a = cce(Y, X).numpy()
    print("Cross entropy: ", a)
    # The loss is computed with the natural logarithm (nats), so the
    # matching perplexity is e ** loss rather than 2 ** loss.
    print("Perplexity: ", math.exp(a))


if __name__ == '__main__':
    main()