# can be used when reading the input directly from a *.gz file
# import gzip
import gensim
import logging
import os

# event logging setup
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)


# print the first line of the input file (quick check of the contents)
def show_file_contents(input_file):
    with open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            print(line)
            break

# reading the input in binary format


def read_input(input_file):
    logging.info(
        "reading file {0}...this may take a while".format(input_file))
    with open(input_file, 'rb') as f:
        for i, line in enumerate(f):

            if (i % 1000 == 0):
                logging.info("read {0} lines".format(i))
            # simple preprocessing of the input, returns a list of words
            yield gensim.utils.simple_preprocess(line)

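# Illustrative sketch (not part of the original script): the gzip variant
# hinted at by the commented-out "import gzip" above. The name read_input_gz
# is hypothetical; it assumes the corpus is a gzip-compressed text file with
# one document per line.
def read_input_gz(input_file):
    import gzip  # local import, since gzip is only needed for this variant
    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            if (i % 1000 == 0):
                logging.info("read {0} lines".format(i))
            yield gensim.utils.simple_preprocess(line)
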

if __name__ == '__main__':

    documents = list(read_input('files.txt'))
    logging.info("All data has been read")

    # building the vocabulary and training the model
    # (note: in gensim >= 4.0 the 'size' parameter is named 'vector_size';
    # passing the corpus to the constructor already trains the model, and the
    # explicit train() call below runs additional epochs)
    model = gensim.models.Word2Vec(
        documents,
        size=150,
        window=10,
        min_count=2,
        workers=10)
    model.train(documents, total_examples=len(documents), epochs=10)

    # saving the word vectors
    model.wv.save(os.path.join("./vectors/default"))
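    # Illustrative sketch (not in the original script): the saved vectors can
    # later be loaded back without retraining, e.g.:
    # word_vectors = gensim.models.KeyedVectors.load("./vectors/default")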

    # finding similar words
    w1 = "kostol"
    print("Most similar word to {0}".format(
        w1), model.wv.most_similar(positive=w1))

    # finding the n most similar words for various words
    w1 = ["trh"]
    print(
        "Most similar to {0}".format(w1),
        model.wv.most_similar(
            positive=w1,
            topn=6))

    w1 = ["letisko"]
    print(
        "Most similar to {0}".format(w1),
        model.wv.most_similar(
            positive=w1,
            topn=6))

    w1 = ["škola"]
    print(
        "Most similar to {0}".format(w1),
        model.wv.most_similar(
            positive=w1,
            topn=6))

    w1 = ["súradnice"]
    print(
        "Most similar to {0}".format(w1),
        model.wv.most_similar(
            positive=w1,
            topn=6))
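
    # Illustrative sketch (not part of the original script): KeyedVectors also
    # supports pairwise similarity queries; the word pair below is only an
    # example and assumes both words occur in the trained vocabulary.
    # print("Similarity:", model.wv.similarity("kostol", "trh"))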