diff --git a/pages/students/2016/lukas_pokryvka/dp2021/scripts/gensim_w2v.py b/pages/students/2016/lukas_pokryvka/dp2021/scripts/gensim_w2v.py
new file mode 100644
index 00000000..e5213e7a
--- /dev/null
+++ b/pages/students/2016/lukas_pokryvka/dp2021/scripts/gensim_w2v.py
@@ -0,0 +1,83 @@
+# could be used to read the input directly from a *.gz file
+# import gzip
+import gensim
+import logging
+import os
+
+# event logging configuration
+logging.basicConfig(
+    format='%(asctime)s : %(levelname)s : %(message)s',
+    level=logging.INFO)
+
+
+def show_file_contents(input_file):  # helper: print the first line of the input file
+    with open(input_file, 'rb') as f:
+        for i, line in enumerate(f):
+            print(line)
+            break
+
+# read the input file in binary format
+
+
+def read_input(input_file):
+    logging.info(
+        "reading file {0}...this may take a while".format(input_file))
+    with open(input_file, 'rb') as f:
+        for i, line in enumerate(f):
+
+            if i % 1000 == 0:
+                logging.info("read {0} lines".format(i))
+            # simple preprocessing of the input line, returns a list of words
+            yield gensim.utils.simple_preprocess(line)
+
+
+if __name__ == '__main__':
+
+    documents = list(read_input('files.txt'))
+    logging.info("All data has been read")
+
+    # build the vocabulary and train the model; the constructor already trains once, train() runs additional epochs
+    model = gensim.models.Word2Vec(
+        documents,
+        size=150,  # vector dimensionality ("vector_size" in gensim >= 4.0)
+        window=10,
+        min_count=2,
+        workers=10)
+    model.train(documents, total_examples=len(documents), epochs=10)
+
+    # save the word vectors (the ./vectors directory must already exist)
+    model.wv.save(os.path.join("./vectors/default"))
+
+    # look up words similar to a given word
+    w1 = "kostol"
+    print("Most similar word to {0}".format(
+        w1), model.wv.most_similar(positive=w1))
+
+    # find the n most similar words for several query words
+    w1 = ["trh"]
+    print(
+        "Most similar to {0}".format(w1),
+        model.wv.most_similar(
+            positive=w1,
+            topn=6))
+
+    w1 = ["letisko"]
+    print(
+        "Most similar to {0}".format(w1),
+        model.wv.most_similar(
+            positive=w1,
+            topn=6))
+
+    w1 = ["škola"]
+    print(
+        "Most similar to {0}".format(w1),
+        model.wv.most_similar(
+            positive=w1,
+            topn=6))
+
+    w1 = ["súradnice"]
+    print(
+        "Most similar to {0}".format(w1),
+        model.wv.most_similar(
+            positive=w1,
+            topn=6))
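
A minimal sketch of how the vectors saved by the script above could later be reloaded and queried without retraining, assuming gensim 3.x and the "./vectors/default" path used in the commit; the variable names here are illustrative and not part of the committed script:

    from gensim.models import KeyedVectors

    # load the KeyedVectors written by model.wv.save() in gensim_w2v.py
    word_vectors = KeyedVectors.load("./vectors/default")

    # query the reloaded vectors the same way as in the training script
    print(word_vectors.most_similar(positive=["kostol"], topn=6))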