forked from KEMT/zpwiki
		
	Přidat „pages/students/2016/lukas_pokryvka/dp2021/scripts/gensim_w2v.py“
This commit is contained in:
		
							parent
							
								
									4100268f4b
								
							
						
					
					
						commit
						23146fa9c1
					
				| @ -0,0 +1,83 @@ | |||||||
|  | # mozeme pouzit pri nacitavani priamo zo subora *.gz | ||||||
|  | # import gzip | ||||||
|  | import gensim | ||||||
|  | import logging | ||||||
|  | import os | ||||||
|  | 
 | ||||||
|  | # nastavenie pre event logging | ||||||
|  | logging.basicConfig( | ||||||
|  |     format='%(asctime)s : %(levelname)s : %(message)s', | ||||||
|  |     level=logging.INFO) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def show_file_contents(input_file):
    """Print the first line of *input_file* (opened in binary mode).

    Prints nothing when the file is empty; always returns None.
    """
    with open(input_file, 'rb') as f:
        first_line = next(iter(f), None)
        if first_line is not None:
            print(first_line)
 | ||||||
|  | def read_input(input_file): | ||||||
|  |     logging.info( | ||||||
|  |         "nacitavam subor {0}...moze to chvilku trvat".format(input_file)) | ||||||
|  |     with open(input_file, 'rb') as f: | ||||||
|  |         for i, line in enumerate(f): | ||||||
|  | 
 | ||||||
|  |             if (i % 1000 == 0): | ||||||
|  |                 logging.info("nacitane {0} riadkov".format(i)) | ||||||
|  |             # jednoducha uprava vstupu, vracia list of words | ||||||
|  |             yield gensim.utils.simple_preprocess(line) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
if __name__ == '__main__':

    # Materialise the generator: Word2Vec needs multiple passes over the corpus.
    documents = list(read_input('files.txt'))
    logging.info("Vsetky data boli nacitane")

    # Build the vocabulary and train the model.
    # FIX: the original passed `documents` to the constructor (which already
    # trains for the default number of epochs) and then called train() again,
    # so the corpus was effectively trained twice. Using an empty constructor
    # plus explicit build_vocab()/train() runs exactly the 10 epochs intended.
    model = gensim.models.Word2Vec(
        size=150,
        window=10,
        min_count=2,
        workers=10)
    model.build_vocab(documents)
    model.train(documents, total_examples=len(documents), epochs=10)

    # Save the word vectors.
    # FIX: wv.save() raises FileNotFoundError when the target directory does
    # not exist, so create it first.
    os.makedirs("./vectors", exist_ok=True)
    model.wv.save(os.path.join("./vectors", "default"))

    # Most similar word for a single query word.
    w1 = "kostol"
    print("Najpodobnejsie slovo slovu {0}".format(
        w1), model.wv.most_similar(positive=w1))

    # Top-6 neighbours for several query words (the original repeated this
    # stanza four times verbatim; a loop emits identical output).
    for w1 in (["trh"], ["letisko"], ["škola"], ["súradnice"]):
        print(
            "Najpodobnejsie slovu {0}".format(w1),
            model.wv.most_similar(
                positive=w1,
                topn=6))
		Loading…
	
		Reference in New Issue
	
	Block a user