Přidat „pages/students/2016/lukas_pokryvka/dp2021/scripts/gensim_w2v.py“
This commit is contained in:
		
							parent
							
								
									4100268f4b
								
							
						
					
					
						commit
						23146fa9c1
					
				@ -0,0 +1,83 @@
 | 
			
		||||
# mozeme pouzit pri nacitavani priamo zo subora *.gz
 | 
			
		||||
# import gzip
 | 
			
		||||
import gensim
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
# Event-logging setup: INFO level, "timestamp : severity : message" records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def show_file_contents(input_file, num_lines=1):
    """Print the first lines of a file as raw bytes.

    The file is opened in binary mode, so each line is printed in its
    ``bytes`` representation (including the trailing newline byte).

    Args:
        input_file: Path of the file to peek into.
        num_lines: Maximum number of leading lines to print. Defaults to 1,
            which prints only the first line.  Nothing is printed for an
            empty file.
    """
    with open(input_file, 'rb') as f:
        # range() comes first in zip() so that no extra line is consumed from
        # the file once the requested number of lines has been printed.
        for _, line in zip(range(num_lines), f):
            print(line)
 | 
			
		||||
 | 
			
		||||
# nacitanie vstupu v binarnom formate
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def read_input(input_file):
    """Lazily yield one preprocessed document per line of ``input_file``.

    The file is opened in binary mode and each line is passed to
    ``gensim.utils.simple_preprocess``, whose result (a list of words) is
    yielded.  A progress message is logged every 1000 lines, starting with
    line 0.

    Args:
        input_file: Path of the input file; every line is treated as one
            document.

    Yields:
        The result of ``gensim.utils.simple_preprocess`` for each line, in
        file order.
    """
    # Lazy %-style arguments let the logging module do the formatting only
    # when the record is actually emitted; the resulting text is unchanged.
    logging.info("nacitavam subor %s...moze to chvilku trvat", input_file)
    with open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            if i % 1000 == 0:
                logging.info("nacitane %s riadkov", i)
            # Simple preprocessing of the input, returns a list of words.
            yield gensim.utils.simple_preprocess(line)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':

    # Materialize the whole corpus, because it is iterated more than once
    # below (model construction and the explicit train() call).
    documents = list(read_input('files.txt'))
    logging.info("Vsetky data boli nacitane")

    # Build the vocabulary and train the model.
    # NOTE(review): gensim 4.x renamed `size` to `vector_size`; this call
    # presumably targets gensim 3.x - confirm the installed version.
    model = gensim.models.Word2Vec(
        documents,
        size=150,
        window=10,
        min_count=2,
        workers=10)
    # NOTE(review): per the gensim docs, passing the corpus to the
    # constructor already trains the model, so this call adds 10 further
    # epochs on top of that - confirm this is intended.
    model.train(documents, total_examples=len(documents), epochs=10)

    # Save the word vectors.  Create the target directory first so that the
    # save does not fail on a missing directory after training has finished.
    vectors_path = "./vectors/default"
    os.makedirs(os.path.dirname(vectors_path), exist_ok=True)
    model.wv.save(vectors_path)

    # Look up the words most similar to a single word.
    w1 = "kostol"
    print("Najpodobnejsie slovo slovu {0}".format(
        w1), model.wv.most_similar(positive=w1))

    # Find the 6 most similar words for several different words.  Each query
    # word is wrapped in a list so the printed label keeps its list form.
    for word in ("trh", "letisko", "škola", "súradnice"):
        w1 = [word]
        print(
            "Najpodobnejsie slovu {0}".format(w1),
            model.wv.most_similar(
                positive=w1,
                topn=6))
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user