"""Train a gensim Word2Vec model on a text file and print similar words.

The script reads ``files.txt`` line by line, tokenizes every line with
``gensim.utils.simple_preprocess``, trains a Word2Vec model on the result,
stores the word vectors under ``./vectors/default`` and prints the most
similar words for a handful of query words.
"""

# can be used when reading directly from a *.gz file
# import gzip
import logging
import os

import gensim

# event logging setup
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def show_file_contents(input_file):
    """Print the first line of ``input_file`` as raw bytes.

    The file is opened in binary mode, so the printed value is a ``bytes``
    object. Nothing is printed when the file is empty.

    Args:
        input_file: Path of the file to peek into.
    """
    with open(input_file, 'rb') as f:
        for line in f:
            print(line)
            break


def read_input(input_file):
    """Lazily yield one preprocessed token list per line of ``input_file``.

    The file is read in binary mode and each raw line is passed to
    ``gensim.utils.simple_preprocess``. Progress is logged at INFO level
    before reading starts and on every 1000th line (including line 0).

    Args:
        input_file: Path of the text file to read.

    Yields:
        The result of ``gensim.utils.simple_preprocess`` for each line.
    """
    logging.info(
        "nacitavam subor {0}...moze to chvilku trvat".format(input_file))
    with open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            if i % 1000 == 0:
                logging.info("nacitane {0} riadkov".format(i))
            # simple preprocessing of the input; returns a list of words
            yield gensim.utils.simple_preprocess(line)


def main():
    """Train the model on ``files.txt``, save its vectors and run queries."""
    # The corpus is materialized because it is iterated more than once
    # below and its length is needed for ``total_examples``.
    documents = list(read_input('files.txt'))
    logging.info("Vsetky data boli nacitane")

    # build the vocabulary and train the model
    # NOTE(review): gensim >= 4.0 renamed ``size`` to ``vector_size`` --
    # confirm which gensim version this script is meant to run against.
    model = gensim.models.Word2Vec(
        documents, size=150, window=10, min_count=2, workers=10)
    # NOTE(review): the constructor above is already given ``documents``;
    # per the gensim documentation that trains the model, so this call
    # trains for 10 further epochs -- confirm this is intended.
    model.train(documents, total_examples=len(documents), epochs=10)

    # save the word vectors; create the target directory first so that the
    # save does not fail after the (long) training run
    vectors_path = "./vectors/default"
    os.makedirs(os.path.dirname(vectors_path), exist_ok=True)
    model.wv.save(vectors_path)

    # look up similar words
    w1 = "kostol"
    print("Najpodobnejsie slovo slovu {0}".format(w1),
          model.wv.most_similar(positive=w1))

    # find the n most similar words for various words
    for word in ("trh", "letisko", "škola", "súradnice"):
        # The query is passed (and printed) as a one-element list.
        w1 = [word]
        print("Najpodobnejsie slovu {0}".format(w1),
              model.wv.most_similar(positive=w1, topn=6))


if __name__ == '__main__':
    main()