Add "pages/students/2016/lukas_pokryvka/dp2021/scripts/gensim_w2v.py"
This commit is contained in:
parent 4100268f4b
commit 23146fa9c1
@@ -0,0 +1,83 @@
# could also be used to read the input directly from a *.gz file
# import gzip
import gensim
import logging
import os

# event logging configuration
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)


def show_file_contents(input_file):
    # sanity check: print only the first line of the file
    with open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            print(line)
            break


# read the input in binary format
def read_input(input_file):
    logging.info(
        "reading file {0}... this may take a while".format(input_file))
    with open(input_file, 'rb') as f:
        for i, line in enumerate(f):

            if (i % 1000 == 0):
                logging.info("read {0} lines".format(i))
            # simple cleanup: simple_preprocess lowercases and tokenizes
            # the line, returning a list of words
            yield gensim.utils.simple_preprocess(line)


if __name__ == '__main__':

    documents = list(read_input('files.txt'))
    logging.info("All data loaded")

    # build the vocabulary and train the model
    model = gensim.models.Word2Vec(
        documents,
        size=150,
        window=10,
        min_count=2,
        workers=10)
    model.train(documents, total_examples=len(documents), epochs=10)
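    # note: passing 'documents' to the constructor already builds the
    # vocabulary and trains the model, so the explicit train() call above
    # runs 10 additional epochs; in gensim 4.x the 'size' argument is
    # named 'vector_size'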

    # save the word vectors
    model.wv.save(os.path.join("./vectors/default"))

    # find the words most similar to a given word
    w1 = "kostol"
    print("Most similar to the word {0}".format(
        w1), model.wv.most_similar(positive=w1))

    # find the n most similar words for several other words
    w1 = ["trh"]
    print(
        "Most similar to {0}".format(w1),
        model.wv.most_similar(
            positive=w1,
            topn=6))

    w1 = ["letisko"]
    print(
        "Most similar to {0}".format(w1),
        model.wv.most_similar(
            positive=w1,
            topn=6))

    w1 = ["škola"]
    print(
        "Most similar to {0}".format(w1),
        model.wv.most_similar(
            positive=w1,
            topn=6))

    w1 = ["súradnice"]
    print(
        "Most similar to {0}".format(w1),
        model.wv.most_similar(
            positive=w1,
            topn=6))
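Note: the vectors saved by model.wv.save above can later be reloaded without retraining. A minimal sketch using gensim's KeyedVectors.load, assuming the same relative path ./vectors/default that the script writes to:

from gensim.models import KeyedVectors

# load the previously saved word vectors and query them
word_vectors = KeyedVectors.load("./vectors/default")
print(word_vectors.most_similar(positive="kostol", topn=6))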