diff --git a/Create_Model.py b/Create_Model.py
new file mode 100644
index 0000000..399d7bd
--- /dev/null
+++ b/Create_Model.py
@@ -0,0 +1,64 @@
+from numpy import array
+from pickle import dump
+from keras.utils import to_categorical
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.layers import LSTM
+from keras.callbacks import CSVLogger
+
+
+# load doc into memory
+def load_doc(filename):
+    # open the file as read only
+    file = open(filename, 'r')
+    # read all text
+    text = file.read()
+    # close the file
+    file.close()
+    return text
+
+
+# load
+in_filename = 'char_sequences.txt'
+raw_text = load_doc(in_filename)
+lines = raw_text.split('\n')
+
+# integer encode sequences of characters
+chars = sorted(list(set(raw_text)))
+mapping = dict((c, i) for i, c in enumerate(chars))
+sequences = list()
+for line in lines:
+    # integer encode line
+    encoded_seq = [mapping[char] for char in line]
+    # store
+    sequences.append(encoded_seq)
+
+# vocabulary size
+vocab_size = len(mapping)
+print('Vocabulary Size: %d' % vocab_size)
+
+# separate into input and output
+sequences = array(sequences)
+X, y = sequences[:, :-1], sequences[:, -1]
+sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
+X = array(sequences)
+y = to_categorical(y, num_classes=vocab_size)
+
+# define model
+model = Sequential()
+model.add(LSTM(250, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
+model.add(LSTM(250, return_sequences=True))
+model.add(LSTM(250))
+model.add(Dense(vocab_size, activation='softmax'))
+# compile model
+model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+# print(model.summary())
+# fit model
+csv_logger = CSVLogger('log.csv', append=True, separator=';')
+
+model.fit(X, y, epochs=30, verbose=2, callbacks=[csv_logger])
+
+# save the model to file
+model.save('model.h5')
+# save the mapping
+dump(mapping, open('mapping.pkl', 'wb'))
diff --git a/Create_data.py b/Create_data.py
new file mode 100644
index 0000000..ec90f7d
--- /dev/null
+++ b/Create_data.py
@@ -0,0 +1,48 @@
+# load doc into memory
+def load_doc(filename):
+    # open the file as read only
+    file = open(filename, 'r')
+    # read all text
+    text = file.read()
+    # close the file
+    file.close()
+    return text
+
+
+# save sequences to file, one sequence per line
+def save_doc(lines, filename):
+    data = '\n'.join(lines)
+    file = open(filename, 'w')
+    file.write(data)
+    file.close()
+
+
+# load text
+raw_text = load_doc('input_data.txt')
+print(raw_text)
+
+# keep only ASCII letters, spaces and newlines, lower-cased
+out = ""
+for sim in raw_text:
+    if 97 <= ord(sim.lower()) <= 122 or sim.lower() == ' ' or sim.lower() == '\n':
+        out = out + sim.lower()
+raw_text = out
+
+# collapse runs of whitespace into single spaces
+tokens = raw_text.split()
+raw_text = ' '.join(tokens)
+
+# organize into sequences of characters
+length = 10
+sequences = list()
+for i in range(length, len(raw_text)):
+    # select a window of length + 1 characters (input + next char)
+    seq = raw_text[i - length:i + 1]
+    # store
+    sequences.append(seq)
+print(sequences)
+print('Total Sequences: %d' % len(sequences))
+
+# save sequences to file
+out_filename = 'char_sequences.txt'
+save_doc(sequences, out_filename)
diff --git a/Generate.py b/Generate.py
new file mode 100644
index 0000000..780b503
--- /dev/null
+++ b/Generate.py
@@ -0,0 +1,42 @@
+from pickle import load
+from keras.models import load_model
+from keras.utils import to_categorical
+from keras.preprocessing.sequence import pad_sequences
+
+
+# generate a sequence of characters with a language model
+def generate_seq(model, mapping, seq_length, seed_text, n_chars):
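+    """Generate n_chars characters of text from the trained model.
+
+    mapping is the char -> int dict saved by Create_Model.py, and
+    seq_length must match the training sequence length (10 here).
+    """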
+    in_text = seed_text
+    # generate a fixed number of characters
+    for _ in range(n_chars):
+        # encode the characters as integers
+        encoded = [mapping[char] for char in in_text]
+        # truncate sequences to a fixed length
+        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
+        # one hot encode
+        encoded = to_categorical(encoded, num_classes=len(mapping))
+        # predict character
+        yhat = model.predict_classes(encoded, verbose=0)
+        # reverse map integer to character
+        out_char = ''
+        for char, index in mapping.items():
+            if index == yhat:
+                out_char = char
+                break
+        # append to input
+        in_text += out_char
+    return in_text
+
+
+# load the model
+model = load_model('model.h5')
+
+# load the mapping
+mapping = load(open('mapping.pkl', 'rb'))
+
+print(generate_seq(model, mapping, 10, 'the ', 1000))
diff --git a/README.md b/README.md
index db5be37..7d57dc4 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,204 @@
-# BC_Matsunych_2020_Final
+# System Guide
+This project is organised so that the individual scripts do not depend on one another and can be used independently. The source code is written in Python 3.6.10, with Keras as the main library.
+
+
+Source files:
+
+ • Create_data.py
+
+ • Create_Model.py
+
+ • Generate.py
+
+ • Create_data_2.py
+
+ • Perplexity.py
+
+ • requirements.txt
+
+
+## Create_data.py
+This script takes the file "input_data.txt", containing the text data for training, as input.
+When it finishes, it creates the file "char_sequences.txt", containing character sequences built from the input file.
+
+### Processes
+
+• Opening and reading the file. This is done by the load_doc() function.
+
+• The next phase is data preparation. During preparation all special characters except spaces are removed from the data, and all remaining characters are converted to lower case.
+
+    out = ""
+    for sim in raw_text:
+        if 97 <= ord(sim.lower()) <= 122 or sim.lower() == ' ' or sim.lower() == '\n':
+            out = out + sim.lower()
+
+    raw_text = out
+
+    tokens = raw_text.split()
+    raw_text = ' '.join(tokens)
+
+• Character sequences are built from the cleaned data; length is the sequence length.
+
+    length = 10
+    sequences = list()
+    for i in range(length, len(raw_text)):
+        # select a window of length + 1 characters
+        seq = raw_text[i - length:i + 1]
+        # store
+        sequences.append(seq)
+
+• The finished sequences are saved to the file "char_sequences.txt".
+
+## Create_Model.py
+This script builds the model and starts training the neural network. It takes the sequence file "char_sequences.txt" as input. When it finishes, two files are created, "model.h5" and "mapping.pkl". A "log.csv" file is also created, storing the metric values recorded during training.
+
+### Processes
+
+• Opening and reading the file. A list of sequences is created.
+
+    in_filename = 'char_sequences.txt'
+    raw_text = load_doc(in_filename)
+    lines = raw_text.split('\n')
+
+• Encoding: each distinct character is assigned an integer value.
+
+    chars = sorted(list(set(raw_text)))
+    mapping = dict((c, i) for i, c in enumerate(chars))
+    sequences = list()
+    for line in lines:
+        # integer encode line
+        encoded_seq = [mapping[char] for char in line]
+        # store
+        sequences.append(encoded_seq)
+
+• One-hot encoding (a sketch of the resulting shapes follows at the end of this section).
+
+    sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
+
+• Model definition: building the layers, choosing the activation function and the number of neurons.
+
+    model = Sequential()
+    model.add(LSTM(250, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
+    model.add(LSTM(250, return_sequences=True))
+    model.add(LSTM(250))
+    model.add(Dense(vocab_size, activation='softmax'))
+
+• Model compilation: choosing the loss function and the optimizer.
+
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+
+• Model training.
+
+    model.fit(X, y, epochs=30, verbose=2, callbacks=[csv_logger])
+
+• The mapping and the model are saved.
+
+    model.save('model.h5')
+    dump(mapping, open('mapping.pkl', 'wb'))
+
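+The following is a minimal sketch of the shapes this pipeline produces, using a hypothetical three-line sequence file (the real script reads them from "char_sequences.txt"):
+
+    from numpy import array
+    from keras.utils import to_categorical
+
+    # hypothetical sequence file contents: three lines of 11 characters each
+    lines = ['hello world', 'ello worldx', 'llo worldxy']
+    chars = sorted(list(set('\n'.join(lines))))
+    mapping = dict((c, i) for i, c in enumerate(chars))
+    sequences = array([[mapping[c] for c in line] for line in lines])
+
+    # the first 10 integers are the input, the 11th is the target character
+    X, y = sequences[:, :-1], sequences[:, -1]
+    X = array([to_categorical(x, num_classes=len(mapping)) for x in X])
+    y = to_categorical(y, num_classes=len(mapping))
+    print(X.shape, y.shape)  # (3, 10, 11) and (3, 11) for a vocabulary of 11
+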
+## Generate.py
+This script generates a sequence of characters. Its inputs are the two files "model.h5" and "mapping.pkl", which contain the state of the trained network and the mapping (the encoding of characters as integers). The output of this script is the generated character sequence.
+
+    def generate_seq(model, mapping, seq_length, seed_text, n_chars):
+        in_text = seed_text
+
+        for _ in range(n_chars):
+            encoded = [mapping[char] for char in in_text]
+            encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
+            encoded = to_categorical(encoded, num_classes=len(mapping))
+            yhat = model.predict_classes(encoded, verbose=0)
+            out_char = ''
+            for char, index in mapping.items():
+                if index == yhat:
+                    out_char = char
+                    break
+            in_text += out_char
+        return in_text
+
+This script contains the main function generate_seq, which performs the character-encoding processes described above as well as the inverse decoding of the predicted characters. The Keras function model.predict_classes is used to generate the characters.
+
+## Create_data_2.py
+This script is similar to Create_data.py, with one difference: the result is regular sequences rather than n-gram sequences.
+
+## Perplexity.py
+The purpose of this script is to compute perplexity. Its inputs are the two files "gen_seq.txt" and "test_seq.txt", containing sequences generated with the Create_data_2.py script.
+
+### Processes
+
+• Loading a file and one-hot encoding it.
+
+    def input_tensor(in_filename):
+        # in_filename = 'char_sequences_1.txt'
+        raw_text = load_doc(in_filename)
+        lines = raw_text.split('\n')
+
+        # integer encode sequences of characters
+        chars = sorted(list(set(raw_text)))
+        mapping = dict((c, i) for i, c in enumerate(chars))
+        sequences = list()
+
+        for line in lines:
+            # integer encode line
+            encoded_seq = [mapping[char] for char in line]
+            # store
+            sequences.append(encoded_seq)
+
+        vocab_size = len(mapping)
+        print('Vocabulary Size: %d' % vocab_size)
+
+        # separate into input and output
+        sequences = array(sequences)
+
+        X = sequences[:, :-1]
+
+        sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
+
+        X = array(sequences)
+
+        return X
+
+• Computing the perplexity (a worked toy example follows below).
+
+    cce = tf.keras.losses.CategoricalCrossentropy()
+    a = cce(Y, X).numpy()
+    # the cross-entropy is measured in nats, so perplexity = e ** a
+    print("Perplexity: ", tf.exp(a).numpy())
+
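+For intuition, here is a self-contained sketch of the relationship between cross-entropy and perplexity; the y_true/y_pred values are purely illustrative:
+
+    import numpy as np
+    import tensorflow as tf
+
+    # hypothetical one-hot targets and predicted distributions for 4 characters
+    y_true = np.array([[0., 1.], [1., 0.], [0., 1.], [1., 0.]])
+    y_pred = np.array([[.1, .9], [.8, .2], [.3, .7], [.6, .4]])
+
+    cce = tf.keras.losses.CategoricalCrossentropy()
+    nats = cce(y_true, y_pred).numpy()  # mean cross-entropy in nats
+    print("Perplexity:", np.exp(nats))  # e ** H, roughly 1.35 here
+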
+## requirements.txt
+This file was created with the command:
+
+**> pip freeze > requirements.txt**
+
+It contains all packages installed for the project, which allows all of them to be installed with the command:
+
+**> pip install -r requirements.txt**
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..60a89ec
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,62 @@
+absl-py==0.9.0
+astor==0.8.0
+blinker==1.4
+brotlipy==0.7.0
+cachetools==4.1.0
+certifi==2020.6.20
+cffi==1.14.0
+chardet==3.0.4
+click==7.1.2
+cryptography==2.9.2
+cycler==0.10.0
+gast==0.2.2
+google-auth @ file:///tmp/build/80754af9/google-auth_1594357566944/work
+google-auth-oauthlib==0.4.1
+google-pasta==0.2.0
+grpcio==1.27.2
+h5py==2.10.0
+idna @ file:///tmp/build/80754af9/idna_1593446292537/work
+itsdangerous==1.1.0
+Jinja2==2.11.2
+Keras==2.3.1
+Keras-Applications @ file:///tmp/build/80754af9/keras-applications_1594366238411/work
+Keras-Preprocessing==1.1.0
+kiwisolver==1.2.0
+Markdown==3.1.1
+MarkupSafe==1.1.1
+matplotlib @ file:///C:/ci/matplotlib-base_1592846129657/work
+mkl-fft==1.1.0
+mkl-random==1.1.1
+mkl-service==2.3.0
+numpy==1.18.5
+oauthlib==3.1.0
+opt-einsum==3.1.0
+pandas @ file:///C:/ci/pandas_1592833608684/work
+protobuf==3.12.3
+pyasn1==0.4.8
+pyasn1-modules==0.2.7
+pycparser @ file:///tmp/build/80754af9/pycparser_1594388511720/work
+PyJWT==1.7.1
+pyOpenSSL @ file:///tmp/build/80754af9/pyopenssl_1594392929924/work
+pyparsing==2.4.7
+pyreadline==2.1
+PySocks==1.7.1
+python-dateutil==2.8.1
+pytz==2020.1
+PyYAML==5.3.1
+requests @ file:///tmp/build/80754af9/requests_1592841827918/work
+requests-oauthlib==1.3.0
+rsa==4.0
+scipy @ file:///C:/ci/scipy_1592930618155/work
+six==1.15.0
+tensorboard==2.2.1
+tensorboard-plugin-wit==1.6.0
+tensorflow==2.1.0
+tensorflow-estimator==2.1.0
+termcolor==1.1.0
+tornado==6.0.4
+urllib3==1.25.9
+Werkzeug==0.16.1
+win-inet-pton==1.1.0
+wincertstore==0.2
+wrapt==1.12.1