zpwiki/pages/students/2016/lukas_pokryvka/dp2021/scripts/gensim_w2v.py

84 lines
2.1 KiB
Python

# gzip can be used to read the input directly from a *.gz file
# import gzip
import gensim
import logging
import os
# Event-logging setup: records are emitted at INFO level and above,
# formatted as "<time> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
def show_file_contents(input_file):
    """Print the first line of ``input_file``.

    The file is opened in binary mode, so the line is printed as a ``bytes``
    object (e.g. ``b'text\\n'``), including its line terminator. Nothing is
    printed when the file is empty.

    Args:
        input_file: Path of the file to peek into.
    """
    with open(input_file, 'rb') as f:
        # A single readline() replaces the former enumerate/for/break loop.
        first_line = f.readline()
    # readline() returns b'' at end of file; stay silent in that case, as the
    # original loop did.
    if first_line:
        print(first_line)
# Read the input file in binary mode, one document per line.
def read_input(input_file):
    """Lazily yield one pre-processed document per line of ``input_file``.

    Each raw line (read as bytes) is passed through
    ``gensim.utils.simple_preprocess`` and the result is yielded. A progress
    message is logged at line 0 and every 1000 lines thereafter.

    Args:
        input_file: Path of the corpus file.

    Yields:
        The value returned by ``gensim.utils.simple_preprocess`` for each
        line (a list of words, per the original author's note).
    """
    logging.info(
        "nacitavam subor {0}...moze to chvilku trvat".format(input_file))
    with open(input_file, 'rb') as corpus:
        for line_no, raw_line in enumerate(corpus):
            # Progress report on every thousandth line, starting with line 0.
            if not line_no % 1000:
                logging.info("nacitane {0} riadkov".format(line_no))
            # Simple normalisation of the raw line into a list of words.
            yield gensim.utils.simple_preprocess(raw_line)
if __name__ == '__main__':
    # Materialise the whole corpus in memory; read_input yields one
    # pre-processed document per input line.
    documents = list(read_input('files.txt'))
    logging.info("Vsetky data boli nacitane")

    # Build the vocabulary and train the model.
    # NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it
    # to `vector_size` -- confirm against the installed gensim version.
    model = gensim.models.Word2Vec(
        documents,
        size=150,
        window=10,
        min_count=2,
        workers=10)
    # NOTE(review): the constructor above was already given `documents`, so
    # this presumably adds further training passes -- confirm it is intended.
    model.train(documents, total_examples=len(documents), epochs=10)

    # Save the word vectors. Create the target directory first so that a
    # missing "./vectors" folder cannot make the save fail after training.
    vectors_path = "./vectors/default"
    os.makedirs(os.path.dirname(vectors_path), exist_ok=True)
    model.wv.save(vectors_path)

    # Look up the words most similar to a single query word.
    w1 = "kostol"
    print("Najpodobnejsie slovo slovu {0}".format(
        w1), model.wv.most_similar(positive=w1))

    # Print the 6 most similar words for several query words. Each query is
    # wrapped in a one-element list so the printed label keeps its original
    # form (e.g. "['trh']").
    for word in ("trh", "letisko", "škola", "súradnice"):
        w1 = [word]
        print(
            "Najpodobnejsie slovu {0}".format(w1),
            model.wv.most_similar(
                positive=w1,
                topn=6))