Přidat „pages/students/2016/lukas_pokryvka/dp2021/scripts/gensim_w2v.py“
This commit is contained in:
parent
4100268f4b
commit
23146fa9c1
@ -0,0 +1,83 @@
|
|||||||
|
# mozeme pouzit pri nacitavani priamo zo subora *.gz
|
||||||
|
# import gzip
|
||||||
|
import gensim
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Event-logging setup: every record is rendered as
# "<timestamp> : <level name> : <message>" and INFO is the minimum level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
|
||||||
|
|
||||||
|
|
||||||
|
def show_file_contents(input_file):
    """Print the first line of *input_file* as raw bytes.

    The file is opened in binary mode, so the line is printed as a bytes
    literal (including its trailing newline byte, if present). Nothing is
    printed when the file is empty.

    Args:
        input_file: Path of the file to peek into.
    """
    with open(input_file, 'rb') as f:
        # Only the first line is wanted; next() with a default replaces the
        # loop-and-break and the unused enumerate() counter.
        first_line = next(f, None)
    if first_line is not None:
        print(first_line)
|
||||||
|
|
||||||
|
# nacitanie vstupu v binarnom formate
|
||||||
|
|
||||||
|
|
||||||
|
def read_input(input_file):
    """Lazily yield the preprocessed lines of *input_file*.

    The file is read in binary mode, one line at a time. Each raw line is
    passed to ``gensim.utils.simple_preprocess`` and the result is yielded.
    A message is logged before reading starts and again for every 1000th
    line index (0, 1000, 2000, ...).

    Args:
        input_file: Path of the text file, one document per line.

    Yields:
        The value returned by ``gensim.utils.simple_preprocess`` for each
        line (a list of words, per the original author's note).
    """
    logging.info(
        "nacitavam subor {0}...moze to chvilku trvat".format(input_file))
    with open(input_file, 'rb') as source:
        for index, raw_line in enumerate(source):
            # Progress report every 1000 lines, starting with index 0.
            if not index % 1000:
                logging.info("nacitane {0} riadkov".format(index))
            # Simple normalisation of the raw line into words.
            tokens = gensim.utils.simple_preprocess(raw_line)
            yield tokens
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # Materialise the whole corpus in memory: it is iterated by both the
    # constructor and train(), and its length is needed for total_examples.
    documents = list(read_input('files.txt'))
    logging.info("Vsetky data boli nacitane")

    # Build the vocabulary and train the model.
    # NOTE(review): `size` is the gensim 3.x keyword; gensim 4.0 renamed it
    # to `vector_size` -- confirm which gensim version this script targets.
    model = gensim.models.Word2Vec(
        documents,
        size=150,
        window=10,
        min_count=2,
        workers=10)
    # NOTE(review): per the gensim docs, passing the corpus to the
    # constructor already trains the model, so this call runs 10 further
    # epochs on top of that -- confirm this is intended.
    model.train(documents, total_examples=len(documents), epochs=10)

    # Save the word vectors. The target directory is created first so that
    # a missing ./vectors does not make save() fail after training is done.
    vectors_path = "./vectors/default"
    os.makedirs(os.path.dirname(vectors_path), exist_ok=True)
    model.wv.save(vectors_path)

    # Look up the words most similar to a single query word.
    w1 = "kostol"
    print("Najpodobnejsie slovo slovu {0}".format(
        w1), model.wv.most_similar(positive=w1))

    # Print the 6 most similar words for each of several query words.
    # Each query is wrapped in a one-element list, which is also what gets
    # formatted into the printed label.
    for query in ("trh", "letisko", "škola", "súradnice"):
        w1 = [query]
        print(
            "Najpodobnejsie slovu {0}".format(w1),
            model.wv.most_similar(
                positive=w1,
                topn=6))
|
Loading…
Reference in New Issue
Block a user