Upload files ''
This commit is contained in:
parent
895a1ac303
commit
760852f176
64
Create_Model.py
Normal file
@ -0,0 +1,64 @@
from numpy import array
from pickle import dump
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.callbacks import CSVLogger


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# load
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
lines = raw_text.split('\n')

# integer encode sequences of characters
chars = sorted(list(set(raw_text)))
mapping = dict((c, i) for i, c in enumerate(chars))
sequences = list()
for line in lines:
    # integer encode line
    encoded_seq = [mapping[char] for char in line]
    # store
    sequences.append(encoded_seq)

# vocabulary size
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

# separate into input and output
sequences = array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = array(sequences)
y = to_categorical(y, num_classes=vocab_size)

# define model
model = Sequential()
model.add(LSTM(250, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(LSTM(250, return_sequences=True))
model.add(LSTM(250))
model.add(Dense(vocab_size, activation='softmax'))
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())
# fit model
csv_logger = CSVLogger('log.csv', append=True, separator=';')
model.fit(X, y, epochs=30, verbose=2, callbacks=[csv_logger])

# save the model to file
model.save('model.h5')
# save the mapping
dump(mapping, open('mapping.pkl', 'wb'))
53
Create_data.py
Normal file
@ -0,0 +1,53 @@
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()


# load text
raw_text = load_doc('input_data.txt')
print(raw_text)

# clear: keep only ASCII letters, spaces and newlines, lower-cased
out = ""
for sim in raw_text:
    if 97 <= ord(sim.lower()) <= 122 or sim.lower() == ' ' or sim.lower() == '\n':
        out = out + sim.lower()

raw_text = out

# clean: collapse whitespace
tokens = raw_text.split()
raw_text = ' '.join(tokens)

# organize into sequences of characters
length = 10
sequences = list()
for i in range(length, len(raw_text)):
    # select sequence of characters
    seq = raw_text[i - length:i + 1]
    # store
    sequences.append(seq)

print(sequences)
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'char_sequences.txt'
save_doc(sequences, out_filename)
38
Generate.py
Normal file
@ -0,0 +1,38 @@
from pickle import load
from keras.models import load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences


# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=len(mapping))
        # predict character (take the scalar class index from the returned array)
        yhat = model.predict_classes(encoded, verbose=0)[0]
        # reverse map integer to character
        out_char = ''
        for char, index in mapping.items():
            if index == yhat:
                out_char = char
                break
        # append to input
        in_text += out_char
    return in_text


# load the model
model = load_model('model.h5')

# load the mapping
mapping = load(open('mapping.pkl', 'rb'))

print(generate_seq(model, mapping, 10, 'the ', 1000))
178
README.md
@ -1,2 +1,178 @@
# BC_Matsunych_2020_Final
# System Manual

This project is implemented so that the individual scripts do not depend on one another and can be used separately. The source code was written in Python 3.6.10. Keras was used as the main library for the implementation.

Source files:

• Create_data.py

• Create_Model.py

• Generate.py

• Create_data_2.py

• Perplexity.py

• requirements.txt

## Create_data.py

This script takes as input the file "input_data.txt", which contains the text data for training.
When it finishes, it produces the file "char_sequences.txt", containing the character sequences built from the input file.

### Processes

• Opening and reading the file. This is done with the load_doc() function.

• The next stage is data preparation: all special characters except spaces are removed from the data, and all remaining characters are converted to lower case:

    out = ""
    for sim in raw_text:
        if 97 <= ord(sim.lower()) <= 122 or sim.lower() == ' ' or sim.lower() == '\n':
            out = out + sim.lower()

    raw_text = out

    tokens = raw_text.split()
    raw_text = ' '.join(tokens)

• Character sequences are created from the cleaned data; length is the sequence length:

    length = 10
    sequences = list()
    for i in range(length, len(raw_text)):
        # select sequence of characters
        seq = raw_text[i - length:i + 1]
        # store
        sequences.append(seq)

• The finished sequences are saved to the file "char_sequences.txt" (an illustration follows below).
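
For illustration, with a hypothetical input rather than a file from the repository: since seq = raw_text[i - length:i + 1] takes length + 1 = 11 characters, each line of "char_sequences.txt" holds 10 input characters plus 1 target character. For the cleaned text 'hello world from keras', the first saved line is 'hello world' and the second is 'ello world ' (a one-character sliding window).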

## Create_Model.py

In this script the model is created and the training of the neural network starts. The script takes the sequence file "char_sequences.txt" as input. At the end of the run two files are created, "model.h5" and "mapping.pkl". A file "log.csv" is also created, in which the training metrics are stored.

### Processes

• Opening and reading the file; a list of sequences is created:

    in_filename = 'char_sequences.txt'
    raw_text = load_doc(in_filename)
    lines = raw_text.split('\n')

• Encoding: an integer value is assigned to each distinct character:

    chars = sorted(list(set(raw_text)))
    mapping = dict((c, i) for i, c in enumerate(chars))
    sequences = list()
    for line in lines:
        # integer encode line
        encoded_seq = [mapping[char] for char in line]
        # store
        sequences.append(encoded_seq)

• One-hot encoding (after the sequences are split into input X and output y, as in Create_Model.py):

    X, y = sequences[:, :-1], sequences[:, -1]
    sequences = [to_categorical(x, num_classes=vocab_size) for x in X]

• Building the model: defining the layers and choosing the activation function and the number of neurons:

    model = Sequential()
    model.add(LSTM(250, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
    model.add(LSTM(250, return_sequences=True))
    model.add(LSTM(250))
    model.add(Dense(vocab_size, activation='softmax'))

• Compiling the model: choosing the loss function and the optimizer:

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

• Training the model:

    csv_logger = CSVLogger('log.csv', append=True, separator=';')
    model.fit(X, y, epochs=30, verbose=2, callbacks=[csv_logger])
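
Since CSVLogger writes "log.csv" with ';' as the separator, reading the log back needs the matching argument. A small sketch, assuming pandas is available (it is not used by the repository's scripts):

    import pandas as pd
    log = pd.read_csv('log.csv', sep=';')  # columns: epoch plus the logged metrics
    print(log.tail())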

• The mapping and the model are saved:

    model.save('model.h5')
    dump(mapping, open('mapping.pkl', 'wb'))

## Generate.py

This script generates character sequences. Its inputs are the two files "model.h5" and "mapping.pkl", which contain the state of the trained network and the mapping (the encoding of characters as integers). The output of this script is the generated character sequences.

    def generate_seq(model, mapping, seq_length, seed_text, n_chars):
        in_text = seed_text
        for _ in range(n_chars):
            encoded = [mapping[char] for char in in_text]
            encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
            encoded = to_categorical(encoded, num_classes=len(mapping))
            yhat = model.predict_classes(encoded, verbose=0)[0]
            out_char = ''
            for char, index in mapping.items():
                if index == yhat:
                    out_char = char
                    break
            in_text += out_char
        return in_text

This script contains the main function generate_seq, in which the processes described above are applied to encode the characters and the inverse process is applied to decode them. The Keras function model.predict_classes is used to generate the characters.
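
A note on compatibility: Sequential.predict_classes exists in the Keras version pinned by requirements.txt (2.3.1) but was removed from later TensorFlow/Keras releases. On newer versions an equivalent call would be (a sketch, not part of the repository):

    from numpy import argmax
    # model.predict returns the softmax distribution; take the most likely class index
    yhat = argmax(model.predict(encoded, verbose=0), axis=-1)[0]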

## Create_data_2.py

This script is similar to Create_data.py, with one difference: the result is regular sequences rather than n-gram sequences.
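
Create_data_2.py is not included in this commit, so only as a hedged sketch: if "regular" means consecutive, non-overlapping windows (in contrast to the one-character sliding window in Create_data.py), the core loop might look like this, with length and raw_text as in Create_data.py:

    length = 10
    sequences = list()
    # hypothetical: step by a whole window instead of one character
    for i in range(length, len(raw_text), length + 1):
        sequences.append(raw_text[i - length:i + 1])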

## Perplexity.py

The purpose of this script is to compute perplexity. Its inputs are the two files "gen_seq.txt" and "test_seq.txt", which contain sequences produced with the Create_data_2.py script.

### Processes

• Loading a file and one-hot encoding it:

    def input_tensor(in_filename):
        # in_filename = 'char_sequences_1.txt'
        raw_text = load_doc(in_filename)
        lines = raw_text.split('\n')

        # integer encode sequences of characters
        chars = sorted(list(set(raw_text)))
        mapping = dict((c, i) for i, c in enumerate(chars))
        sequences = list()

        for line in lines:
            # integer encode line
            encoded_seq = [mapping[char] for char in line]
            # store
            sequences.append(encoded_seq)

        vocab_size = len(mapping)
        print('Vocabulary Size: %d' % vocab_size)

        # separate into input and output
        sequences = array(sequences)
        X = sequences[:, :-1]
        sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
        X = array(sequences)
        return X

• Computing the perplexity:

    cce = tf.keras.losses.CategoricalCrossentropy()
    a = cce(Y, X).numpy()
    print("Perplexity: ", 2 ** a)

## requirements.txt

This file was created with the command:

**> pip freeze > requirements.txt**

It lists all packages installed for the project, which makes it possible to install all of them with the command:

**> pip install -r requirements.txt**
62
requirements.txt
Normal file
@ -0,0 +1,62 @@
absl-py==0.9.0
astor==0.8.0
blinker==1.4
brotlipy==0.7.0
cachetools==4.1.0
certifi==2020.6.20
cffi==1.14.0
chardet==3.0.4
click==7.1.2
cryptography==2.9.2
cycler==0.10.0
gast==0.2.2
google-auth @ file:///tmp/build/80754af9/google-auth_1594357566944/work
google-auth-oauthlib==0.4.1
google-pasta==0.2.0
grpcio==1.27.2
h5py==2.10.0
idna @ file:///tmp/build/80754af9/idna_1593446292537/work
itsdangerous==1.1.0
Jinja2==2.11.2
Keras==2.3.1
Keras-Applications @ file:///tmp/build/80754af9/keras-applications_1594366238411/work
Keras-Preprocessing==1.1.0
kiwisolver==1.2.0
Markdown==3.1.1
MarkupSafe==1.1.1
matplotlib @ file:///C:/ci/matplotlib-base_1592846129657/work
mkl-fft==1.1.0
mkl-random==1.1.1
mkl-service==2.3.0
numpy==1.18.5
oauthlib==3.1.0
opt-einsum==3.1.0
pandas @ file:///C:/ci/pandas_1592833608684/work
protobuf==3.12.3
pyasn1==0.4.8
pyasn1-modules==0.2.7
pycparser @ file:///tmp/build/80754af9/pycparser_1594388511720/work
PyJWT==1.7.1
pyOpenSSL @ file:///tmp/build/80754af9/pyopenssl_1594392929924/work
pyparsing==2.4.7
pyreadline==2.1
PySocks==1.7.1
python-dateutil==2.8.1
pytz==2020.1
PyYAML==5.3.1
requests @ file:///tmp/build/80754af9/requests_1592841827918/work
requests-oauthlib==1.3.0
rsa==4.0
scipy @ file:///C:/ci/scipy_1592930618155/work
six==1.15.0
tensorboard==2.2.1
tensorboard-plugin-wit==1.6.0
tensorflow==2.1.0
tensorflow-estimator==2.1.0
termcolor==1.1.0
tornado==6.0.4
urllib3==1.25.9
Werkzeug==0.16.1
win-inet-pton==1.1.0
wincertstore==0.2
wrapt==1.12.1