Загрузить файлы ''

2020-08-28 10:38:38 +00:00 · 2020-08-28 10:38:38 +00:00 · 760852f176
commit 760852f176
parent 895a1ac303
5 changed files with 394 additions and 1 deletions
--- a/Create_Model.py
+++ b/Create_Model.py
@ -0,0 +1,64 @@
+from numpy import array
+from pickle import dump
+from keras.utils import to_categorical
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.layers import LSTM
+from keras.callbacks import CSVLogger
+
+
+# load doc into memory
+def load_doc(filename):
+    # open the file as read only
+    file = open(filename, 'r')
+    # read all text
+    text = file.read()
+    # close the file
+    file.close()
+    return text
+
+
+# load
+in_filename = 'char_sequences.txt'
+raw_text = load_doc(in_filename)
+lines = raw_text.split('\n')
+
+# integer encode sequences of characters
+chars = sorted(list(set(raw_text)))
+mapping = dict((c, i) for i, c in enumerate(chars))
+sequences = list()
+for line in lines:
+    # integer encode line
+    encoded_seq = [mapping[char] for char in line]
+    # store
+    sequences.append(encoded_seq)
+
+# vocabulary size
+vocab_size = len(mapping)
+print('Vocabulary Size: %d' % vocab_size)
+
+# separate into input and output
+sequences = array(sequences)
+X, y = sequences[:, :-1], sequences[:, -1]
+sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
+X = array(sequences)
+y = to_categorical(y, num_classes=vocab_size)
+
+# define model
+model = Sequential()
+model.add(LSTM(250, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
+model.add(LSTM(250, return_sequences=True))
+model.add((LSTM(250)))
+model.add(Dense(vocab_size, activation='softmax'))
+# compile model
+model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+# print(model.summary())
+# fit model
+csv_logger = CSVLogger('log.csv', append=True, separator=';')
+
+model.fit(X, y, epochs=30, verbose=2, callbacks=[csv_logger])
+
+# save the model to file
+model.save('model.h5')
+# save the mapping
+dump(mapping, open('mapping.pkl', 'wb'))
--- a/Create_data.py
+++ b/Create_data.py
@ -0,0 +1,53 @@
+# load doc into memory
+def load_doc(filename):
+    # open the file as read only
+    file = open(filename, 'r')
+    # read all text
+    text = file.read()
+    # close the file
+    file.close()
+    return text
+
+
+# save tokens to file, one dialog per line
+def save_doc(lines, filename):
+    data = '\n'.join(lines)
+    file = open(filename, 'w')
+    file.write(data)
+    file.close()
+
+
+# load text
+raw_text = load_doc('input_data.txt')
+print(raw_text)
+# Clear
+out = ""
+for sim in raw_text:
+
+    if 97 <= ord(sim.lower()) <= 122 or sim.lower() == ' ' or sim.lower() == '\n':
+        out = out + sim.lower()
+
+raw_text = out
+# clean
+tokens = raw_text.split()
+raw_text = ' '.join(tokens)
+
+# organize into sequences of characters
+length = 10
+sequences = list()
+for i in range(length, len(raw_text)):
+    # select sequence of tokens
+    seq = raw_text[i - length:i + 1]
+    # store
+    sequences.append(seq)
+print(sequences)
+
+print(sequences)
+print('Total Sequences: %d' % len(sequences))
+
+# save sequences to file
+out_filename = 'char_sequences.txt'
+
+save_doc(sequences, out_filename)
+
+
--- a/Generate.py
+++ b/Generate.py
@ -0,0 +1,38 @@
+from pickle import load
+from keras.models import load_model
+from keras.utils import to_categorical
+from keras.preprocessing.sequence import pad_sequences
+import keras as K
+
+
+# generate a sequence of characters with a language model
+def generate_seq(model, mapping, seq_length, seed_text, n_chars):
+    in_text = seed_text
+    # generate a fixed number of characters
+    for _ in range(n_chars):
+        # encode the characters as integers
+        encoded = [mapping[char] for char in in_text]
+        # truncate sequences to a fixed length
+        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
+        # one hot encode
+        encoded = to_categorical(encoded, num_classes=len(mapping))
+        # predict character
+        yhat = model.predict_classes(encoded, verbose=0)
+        # reverse map integer to character
+        out_char = ''
+        for char, index in mapping.items():
+            if index == yhat:
+                out_char = char
+                break
+        # append to input
+        in_text += char
+    return in_text
+
+
+# load the model
+model = load_model('model.h5')
+
+# load the mapping
+mapping = load(open('mapping.pkl', 'rb'))
+
+print(generate_seq(model, mapping, 10, 'the ', 1000))
--- a/README.md
+++ b/README.md
@ -1,2 +1,178 @@
-# BC_Matsunych_2020_Final
+# Systémová príručka

+Tento projekt je implementovaný tak, že samostatné skripty nezávisia jeden od druhého a môžu sa používať samostatne. Zdrojové kódy boli implementované v jazyku Python 3.6.10. Keras bol použitý na implementáciu ako hlavná knižnica. 
+
+
+Zdrojové súbory:
+
+  •	Create_data.py
+  
+  •	Create_Model.py
+  
+  •	Generate.py
+  
+  •	Create_data_2.py
+  
+  •	Perplexity.py
+  
+  •	requirements.txt
+  
+
+## Create_data.py
+Tento skript prijíma ako vstup súbor „input_data.txt“ obsahujúci textové údaje na trénovanie.
+Po dokončení sa vytvorí súbor „char_sequences.txt“ obsahujúci postupnosti znakov zo vstupného súboru. 
+
+### Procesy
+
+•	Otvorenie a čítanie súboru. To sa vykonáva pomocou funkcie load_doc().
+
+•	Ďalšou fázou je príprava údajov. Počas prípravy sa z údajov odstránia všetky špeciálne znaky okrem medzier. Ďalej sa všetky ostatné znaky prevedú na malé písmená.
+
+    out = ""
+    for sim in raw_text:
+
+        if 97 <= ord(sim.lower()) <= 122 or sim.lower() == ' ' or sim.lower() == '\n':
+            out = out + sim.lower()
+
+    raw_text = out
+
+    tokens = raw_text.split()
+    raw_text = ' '.join(tokens)
+
+
+
+•	Z vyčistených údajov sa vytvoria postupnosti znakov. length - dĺžka sekvencie.
+
+    length = 10
+    sequences = list()
+    for i in range(length, len(raw_text)):
+        # select sequence of tokens
+        seq = raw_text[i - length:i + 1]
+        # store
+        sequences.append(seq)
+
+•	Hotové sekvencie sa uložia do súboru „char_sequences.txt“.
+
+## Create_Model.py
+V tomto skripte sa vytvorí model a začína sa učenie neurónovej siete. Na vstupe skript dostane sekvenčný súbor „char_sequences.txt“. Na konci práce sa vytvoria dva súbory "model.h5" a "mapping.pkl". Vytvorí sa tiež súbor “log.csv”, v ktorom sa uložia výsledky metrík pre trénovanie.
+
+### Procesy
+
+•	Otvorenie a čítanie súboru. Vytvorí sa zoznam sekvencií. 
+
+    in_filename = 'char_sequences.txt'
+    raw_text = load_doc(in_filename)
+    lines = raw_text.split('\n')
+
+•	Kódovanie, priradenie celočíselnej hodnoty každému pôvodnému znaku.
+
+    chars = sorted(list(set(raw_text)))
+    mapping = dict((c, i) for i, c in enumerate(chars))
+    sequences = list()
+    for line in lines:
+        # integer encode line
+        encoded_seq = [mapping[char] for char in line]
+        # store
+        sequences.append(encoded_seq)
+
+•	One-hot kódovanie.
+
+    sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
+
+•	Tvorba modelu. Implementácia vrstiev, výber aktivačnej funkcie a počtu neurónov.
+
+    model = Sequential()
+    model.add(LSTM(250, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
+    model.add(LSTM(250, return_sequences=True))
+    model.add((LSTM(250)))
+    model.add(Dense(vocab_size, activation='softmax'))
+
+•	Zostavenie modelu. Výber chybovej funkcie a optimalizátora.
+
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+
+
+•	Trénovanie modelu. 
+
+    model.fit(X, y, epochs=30, verbose=2, callbacks=[csv_logger])
+
+
+•	Mapovanie a model sa uložia.
+
+    model.save('model.h5')
+    dump(mapping, open('mapping.pkl', 'wb'))
+
+## Generate.py 
+Tento skript generuje postupnosť znakov. Na vstupe sú dva súbory „model.h5“ a „mapping.pkl“, ktoré obsahujú stav natrénovanej siete a mapovanie (kódovanie znakov celočíselnými údajmi). Výstupom tohto skriptu sú vygenerované postupnosti znakov.
+  
+    def generate_seq(model, mapping, seq_length, seed_text, n_chars):
+        in_text = seed_text
+
+        for _ in range(n_chars):
+            encoded = [mapping[char] for char in in_text]
+            encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
+            encoded = to_categorical(encoded, num_classes=len(mapping))
+            yhat = model.predict_classes(encoded, verbose=0)
+            out_char = ''
+            for char, index in mapping.items():
+                if index == yhat:
+                    out_char = char
+                    break
+            in_text += char
+        return in_text
+
+Tento skript obsahuje hlavnú funkciu „generate_seq“, v ktorej sa uskutočňujú vyššie opísané procesy na kódovanie znakov a inverzné procesy na dekódovanie znakov. Na generovanie symbolov sa používa funkcia knižnice Keras „model.predict_classes“.
+
+## Create_data_2.py
+Tento skript sa podobá na Create_data.py s jedným rozdielom, výsledkom budú pravidelné sekvencie, nie n-gramové sekvencie. 
+
+## Perplexity.py
+Účelom tohto skriptu je počítať perplexitu. Na vstupe tohto skriptu sú dva súbory "gen_seq.txt" a "test_seq.txt", obsahujúce sekvencie generované pomocou skriptu Create_data_2.py.
+
+### Procesy
+
+•	Odovzdanie súboru a jeho One-hot kódovanie.
+
+    def input_tensor(in_filename):
+        # in_filename = 'char_sequences_1.txt'
+        raw_text = load_doc(in_filename)
+        lines = raw_text.split('\n')
+
+        # integer encode sequences of characters
+        chars = sorted(list(set(raw_text)))
+        mapping = dict((c, i) for i, c in enumerate(chars))
+        sequences = list()
+
+        for line in lines:
+            # integer encode line
+            encoded_seq = [mapping[char] for char in line]
+            # store
+            sequences.append(encoded_seq)
+
+        vocab_size = len(mapping)
+        print('Vocabulary Size: %d' % vocab_size)
+
+        # separate into input and output
+        sequences = array(sequences)
+
+        X = sequences[:, :-1]
+
+        sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
+
+        X = array(sequences)
+
+        return X
+
+•	Počítanie perplexity.
+
+    cce = tf.keras.losses.CategoricalCrossentropy(tf.constant([1.]), tf.constant([0.001]))
+    a = cce(Y, X).numpy()
+    print("Perplexity: ", 2 ** a)
+
+## requirements.txt
+Tento súbor bol vytvorený pomocou príkazu: 
+
+**> pip freeze > requirements.txt**
+
+Tento súbor obsahuje všetky nainštalované balíčky pre projekt. To umožňuje inštaláciu všetkých balíkov pomocou príkazu: 
+
+**> pip install -r requirements.txt**
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,62 @@
+absl-py==0.9.0
+astor==0.8.0
+blinker==1.4
+brotlipy==0.7.0
+cachetools==4.1.0
+certifi==2020.6.20
+cffi==1.14.0
+chardet==3.0.4
+click==7.1.2
+cryptography==2.9.2
+cycler==0.10.0
+gast==0.2.2
+google-auth @ file:///tmp/build/80754af9/google-auth_1594357566944/work
+google-auth-oauthlib==0.4.1
+google-pasta==0.2.0
+grpcio==1.27.2
+h5py==2.10.0
+idna @ file:///tmp/build/80754af9/idna_1593446292537/work
+itsdangerous==1.1.0
+Jinja2==2.11.2
+Keras==2.3.1
+Keras-Applications @ file:///tmp/build/80754af9/keras-applications_1594366238411/work
+Keras-Preprocessing==1.1.0
+kiwisolver==1.2.0
+Markdown==3.1.1
+MarkupSafe==1.1.1
+matplotlib @ file:///C:/ci/matplotlib-base_1592846129657/work
+mkl-fft==1.1.0
+mkl-random==1.1.1
+mkl-service==2.3.0
+numpy==1.18.5
+oauthlib==3.1.0
+opt-einsum==3.1.0
+pandas @ file:///C:/ci/pandas_1592833608684/work
+protobuf==3.12.3
+pyasn1==0.4.8
+pyasn1-modules==0.2.7
+pycparser @ file:///tmp/build/80754af9/pycparser_1594388511720/work
+PyJWT==1.7.1
+pyOpenSSL @ file:///tmp/build/80754af9/pyopenssl_1594392929924/work
+pyparsing==2.4.7
+pyreadline==2.1
+PySocks==1.7.1
+python-dateutil==2.8.1
+pytz==2020.1
+PyYAML==5.3.1
+requests @ file:///tmp/build/80754af9/requests_1592841827918/work
+requests-oauthlib==1.3.0
+rsa==4.0
+scipy @ file:///C:/ci/scipy_1592930618155/work
+six==1.15.0
+tensorboard==2.2.1
+tensorboard-plugin-wit==1.6.0
+tensorflow==2.1.0
+tensorflow-estimator==2.1.0
+termcolor==1.1.0
+tornado==6.0.4
+urllib3==1.25.9
+Werkzeug==0.16.1
+win-inet-pton==1.1.0
+wincertstore==0.2
+wrapt==1.12.1