"""Train a gensim Word2Vec model on a text corpus and print similar words.

Recovered from a ``git format-patch`` mail (commit
23146fa9c1f9a8a967caba3f828207c4410e92fe, Lukáš Pokrývka,
Mon, 30 Mar 2020 15:23:53 +0000) that added
pages/students/2016/lukas_pokryvka/dp2021/scripts/gensim_w2v.py.

The script reads ``files.txt`` (one document per line), trains the model,
stores the word vectors under ``./vectors/default`` and prints the nearest
neighbours of a few sample words.
"""
# NOTE: to read the corpus straight from a *.gz archive, open it with
# gzip.open() instead of open().
import logging
import os

import gensim

# Event-logging setup.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

# Corpus to train on: one document per line.
CORPUS_PATH = 'files.txt'
# Where the trained word vectors are stored.
VECTORS_PATH = "./vectors/default"
# A progress message is logged once per this many input lines.
PROGRESS_EVERY = 1000


def show_file_contents(input_file):
    """Print the first line of *input_file* as raw bytes.

    Nothing is printed when the file is empty.

    Args:
        input_file: Path of the file to peek into.
    """
    with open(input_file, 'rb') as f:
        first_line = next(f, None)
    if first_line is not None:
        print(first_line)


def read_input(input_file):
    """Lazily yield one list of tokens per line of *input_file*.

    The file is opened in binary mode and every line is tokenised with
    ``gensim.utils.simple_preprocess``. Progress is logged every
    ``PROGRESS_EVERY`` lines (including line 0).

    Args:
        input_file: Path of the corpus file, one document per line.

    Yields:
        The list of tokens produced for each line.
    """
    logging.info("nacitavam subor %s...moze to chvilku trvat", input_file)
    with open(input_file, 'rb') as f:
        for line_no, line in enumerate(f):
            if line_no % PROGRESS_EVERY == 0:
                logging.info("nacitane %s riadkov", line_no)
            # Simple clean-up of the input; returns a list of words.
            yield gensim.utils.simple_preprocess(line)


def _print_most_similar(word_vectors, message, query, **kwargs):
    """Print the nearest neighbours of *query*, or warn if it is unknown.

    Args:
        word_vectors: Trained vectors exposing ``most_similar``.
        message: Format string; ``{0}`` is replaced with *query*.
        query: A word, or a list of words, passed as ``positive``.
        **kwargs: Forwarded to ``most_similar`` (e.g. ``topn``).
    """
    try:
        neighbours = word_vectors.most_similar(positive=query, **kwargs)
    except KeyError:
        # Words seen fewer than `min_count` times never enter the
        # vocabulary; skip them instead of aborting the remaining queries.
        logging.warning(
            "slovo %s nie je v slovniku modelu, preskakujem", query)
        return
    print(message.format(query), neighbours)


def main():
    """Load the corpus, train and save the model, then run sample queries."""
    # The corpus is materialised because it is iterated several times
    # (vocabulary scan plus every training epoch).
    documents = list(read_input(CORPUS_PATH))
    logging.info("Vsetky data boli nacitane")

    # Build the vocabulary and train the model.
    # The corpus is deliberately NOT passed to the constructor: doing so
    # already trains the model, and the explicit train() call below would
    # then train it a second time.
    # NOTE(review): `size` is the gensim 3.x parameter name; gensim >= 4
    # calls it `vector_size` - confirm against the installed version.
    model = gensim.models.Word2Vec(
        size=150,
        window=10,
        min_count=2,
        workers=10)
    model.build_vocab(documents)
    model.train(documents, total_examples=len(documents), epochs=10)

    # Save the word vectors; create the target directory first so a long
    # training run is not lost to a missing folder.
    os.makedirs(os.path.dirname(VECTORS_PATH), exist_ok=True)
    model.wv.save(VECTORS_PATH)

    # Look up the words most similar to a single word.
    _print_most_similar(
        model.wv, "Najpodobnejsie slovo slovu {0}", "kostol")

    # Find the n most similar words for several different words.
    for word in ("trh", "letisko", "škola", "súradnice"):
        _print_most_similar(
            model.wv, "Najpodobnejsie slovu {0}", [word], topn=6)


if __name__ == '__main__':
    main()