forked from KEMT/zpwiki

update

parent 179efb972e
commit d919ca93c0
@@ -1,3 +1,9 @@
+## Update 05.06.2020
+- added the training start time and finish time, so that it is possible to determine how long training took
+- revised the text-normalization script (I combined my own script with one freely available on the internet, so that the text normalization is more accurate)
+- added a tag for identifying numbers in the text ("N"), which could in theory improve the model's accuracy
+- fixed the computation of precision, recall and f-score (I solved the problem by first making a tensor from the true values and then converting it to a numpy array)
+
 ## Update 05.05.2020
 - modified the "punc.py" script so that the model loads data from file(s)
 - created the "text.py" script, which normalizes the data into a suitable form (5 steps)
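A minimal sketch of the precision/recall/f-score fix described in the last bullet; the names `targets` and the `(score, tag_seq)` return value come from the training script further down, and the sample values here are made up:

```python
# Sketch of the metrics fix: gold tags are held in a tensor first, then
# converted to a numpy array so sklearn can consume them. Values are
# illustrative; in the real script y_pred comes from the Viterbi decode.
import numpy as np
import torch
from sklearn import metrics

targets = torch.tensor([0, 2, 0, 1, 0], dtype=torch.long)  # gold tag indices as a tensor
y_true = np.array(targets)                                 # tensor -> numpy array
y_pred = np.array([0, 2, 0, 0, 0])                         # decoded tag sequence

print(metrics.confusion_matrix(y_true, y_pred))
print(metrics.classification_report(y_true, y_pred, digits=3))
```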
@@ -1,5 +1,4 @@
 import os
-import re

 if os.path.exists('tags.txt'):
 	os.remove('tags.txt')
@@ -11,15 +10,15 @@ with open('text.txt', 'r') as input_file:
 				if (word == '.PER'):
 					word = word.replace(word, 'P')
 					output_file.write(word + ' ')
-
 				elif (word == ',COM'):
 					word = word.replace(word, 'C')
 					output_file.write(word + ' ')
-
 				elif(word == '?QUE'):
 					word = word.replace(word, 'Q')
 					output_file.write(word + ' ')
-
+				elif(word == '<NUM>'):
+					word = word.replace(word, 'N')
+					output_file.write(word + ' ')
 				else:
 					word = word.replace(word, 'S')
 					output_file.write(word + ' ')
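The elif chain above is effectively a lookup table; an equivalent sketch (not the committed code):

```python
# Lookup-table version of the branches above; anything that is not a
# punctuation placeholder falls through to "S" (ordinary word).
TAG_MAP = {'.PER': 'P', ',COM': 'C', '?QUE': 'Q', '<NUM>': 'N'}

def tag_for(word):
    return TAG_MAP.get(word, 'S')
```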
pages/students/2016/darius_lindvai/dp2021/prepare_text.py (new file, 73 lines)
@@ -0,0 +1,73 @@
+from __future__ import division, print_function
+from nltk.tokenize import word_tokenize
+
+import nltk
+import os
+from io import open
+import re
+import sys
+
+nltk.download('punkt')
+
+NUM = '<NUM>'
+
+PUNCTS = {".": ".PER", ",": ",COM", "?": "?QUE", "!": ".PER", ":": ",COM", ";": ".PER", "-": ",COM"}
+
+forbidden_symbols = re.compile(r"[\[\]\(\)\/\\\>\<\=\+\_\*]")
+numbers = re.compile(r"\d")
+multiple_punct = re.compile(r'([\.\?\!\,\:\;\-])(?:[\.\?\!\,\:\;\-]){1,}')
+
+is_number = lambda x: len(numbers.sub("", x)) / len(x) < 0.6
+
+def untokenize(line):
+    return line.replace(" '", "'").replace(" n't", "n't").replace("can not", "cannot")
+
+def skip(line):
+
+    if line.strip() == '':
+        return True
+
+    last_symbol = line[-1]
+    if not last_symbol in PUNCTS:
+        return True
+
+    if forbidden_symbols.search(line) is not None:
+        return True
+
+    return False
+
+def process_line(line):
+
+    tokens = word_tokenize(line)
+    output_tokens = []
+
+    for token in tokens:
+
+        if token in PUNCTS:
+            output_tokens.append(PUNCTS[token])
+        elif is_number(token):
+            output_tokens.append(NUM)
+        else:
+            output_tokens.append(token.lower())
+
+    return untokenize(" ".join(output_tokens) + " ")
+
+skipped = 0
+
+with open(sys.argv[2], 'w', encoding='utf-8') as out_txt:
+    with open(sys.argv[1], 'r', encoding='utf-8') as text:
+
+        for line in text:
+
+            line = line.replace("\"", "").strip()
+            line = multiple_punct.sub(r"\g<1>", line)
+
+            if skip(line):
+                skipped += 1
+                continue
+
+            line = process_line(line)
+
+            out_txt.write(line)
+
+print("Skipped %d lines" % skipped)
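As a usage sketch of the new file (file names are made up): the script takes an input path and an output path, tokenizes each line with NLTK, maps punctuation to the placeholder tags from PUNCTS, and replaces digit-heavy tokens with `<NUM>`:

```python
# Hypothetical invocation and effect (file names are made up):
#   $ python prepare_text.py raw.txt clean.txt
#
# A line such as:
#   Hello, world. It's 2020.
# is written out as:
#   hello ,COM world .PER it's <NUM> .PER
```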
@@ -1,14 +1,13 @@
+import numpy as np
 import torch
 import torch.autograd as autograd
 import torch.nn as nn
 import torch.optim as optim
+from sklearn import metrics
+from datetime import datetime

 torch.manual_seed(1)

-
-
-
-
 def argmax(vec):
     # return the argmax as a python int
     _, idx = torch.max(vec, 1)
@@ -27,10 +26,6 @@ def log_sum_exp(vec):
     return max_score + \
         torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

-
-
-
-
 class BiLSTM_CRF(nn.Module):

     def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
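The `log_sum_exp` helper in the context above uses the standard max-shift identity for numerical stability; a quick check with illustrative values:

```python
# log(sum(exp(v))) == max(v) + log(sum(exp(v - max(v)))): shifting by the
# max keeps exp() from overflowing for large scores.
import torch

v = torch.tensor([[1000.0, 1001.0, 999.0]])
naive = torch.log(torch.sum(torch.exp(v)))           # inf: exp(1000.) overflows
max_score = torch.max(v)
stable = max_score + torch.log(torch.sum(torch.exp(v - max_score)))
print(naive, stable)                                 # tensor(inf) tensor(1001.4076)
```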
| @ -65,7 +60,7 @@ class BiLSTM_CRF(nn.Module): | |||||||
|                 torch.randn(2, 1, self.hidden_dim // 2)) |                 torch.randn(2, 1, self.hidden_dim // 2)) | ||||||
| 
 | 
 | ||||||
|     def _forward_alg(self, feats): |     def _forward_alg(self, feats): | ||||||
|         # Forward algorithm to compute the partition function |         # Do the forward algorithm to compute the partition function | ||||||
|         init_alphas = torch.full((1, self.tagset_size), -10000.) |         init_alphas = torch.full((1, self.tagset_size), -10000.) | ||||||
|         # START_TAG has all of the score. |         # START_TAG has all of the score. | ||||||
|         init_alphas[0][self.tag_to_ix[START_TAG]] = 0. |         init_alphas[0][self.tag_to_ix[START_TAG]] = 0. | ||||||
| @ -77,13 +72,18 @@ class BiLSTM_CRF(nn.Module): | |||||||
|         for feat in feats: |         for feat in feats: | ||||||
|             alphas_t = []  # The forward tensors at this timestep |             alphas_t = []  # The forward tensors at this timestep | ||||||
|             for next_tag in range(self.tagset_size): |             for next_tag in range(self.tagset_size): | ||||||
|                 # broadcast the emission score: it is the same regardless of the previous tag |                 # broadcast the emission score: it is the same regardless of | ||||||
|                 emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size) |                 # the previous tag | ||||||
|                 # the ith entry of trans_score is the score of transitioning to next_tag from i |                 emit_score = feat[next_tag].view( | ||||||
|  |                     1, -1).expand(1, self.tagset_size) | ||||||
|  |                 # the ith entry of trans_score is the score of transitioning to | ||||||
|  |                 # next_tag from i | ||||||
|                 trans_score = self.transitions[next_tag].view(1, -1) |                 trans_score = self.transitions[next_tag].view(1, -1) | ||||||
|                 # The ith entry of next_tag_var is the value for the edge (i -> next_tag) before we do log-sum-exp |                 # The ith entry of next_tag_var is the value for the | ||||||
|  |                 # edge (i -> next_tag) before we do log-sum-exp | ||||||
|                 next_tag_var = forward_var + trans_score + emit_score |                 next_tag_var = forward_var + trans_score + emit_score | ||||||
|                 # The forward variable for this tag is log-sum-exp of all the scores. |                 # The forward variable for this tag is log-sum-exp of all the | ||||||
|  |                 # scores. | ||||||
|                 alphas_t.append(log_sum_exp(next_tag_var).view(1)) |                 alphas_t.append(log_sum_exp(next_tag_var).view(1)) | ||||||
|             forward_var = torch.cat(alphas_t).view(1, -1) |             forward_var = torch.cat(alphas_t).view(1, -1) | ||||||
|         terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]] |         terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]] | ||||||
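In the notation of the comments above, the loop is the log-space forward recurrence: for each timestep t and tag j it accumulates

```latex
\alpha_t(j) = \operatorname{logsumexp}_i\big(\alpha_{t-1}(i) + \mathrm{trans}(i \to j) + \mathrm{emit}_t(j)\big),
\qquad
\log Z = \operatorname{logsumexp}_j\big(\alpha_T(j) + \mathrm{trans}(j \to \mathrm{STOP})\big)
```

so `terminal_var` holds the final forward scores and the hunk's return value is the log-partition function log Z.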
@@ -158,7 +158,7 @@ class BiLSTM_CRF(nn.Module):
         gold_score = self._score_sentence(feats, tags)
         return forward_score - gold_score

-    def forward(self, sentence):
+    def forward(self, sentence):  # dont confuse this with _forward_alg above.
         # Get the emission scores from the BiLSTM
         lstm_feats = self._get_lstm_features(sentence)
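The `forward_score - gold_score` return value above is the usual CRF negative log-likelihood,

```latex
-\log p(y \mid x) = \log Z(x) - \mathrm{score}(x, y)
```

so minimizing the loss raises the score of the gold tag path relative to all other paths.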
@@ -166,25 +166,12 @@ class BiLSTM_CRF(nn.Module):
         score, tag_seq = self._viterbi_decode(lstm_feats)
         return score, tag_seq

-
-
-
-
 START_TAG = "<START>"
 STOP_TAG = "<STOP>"
 EMBEDDING_DIM = 5
 HIDDEN_DIM = 4

-'''
-training_data = [(
-    "hovorí sa ,COM že ľudstvo postihuje nová epidémia ,COM šíriaca sa závratnou rýchlosťou .PER preto je dôležité vedieť čo to je ,COM ako jej predísť alebo ako ju odstrániť .PER".split(),
-    "S S C S S S S S C S S S S P S S S S S S S C S S S S S S S P".split()
-), (
-    "nárast obezity je spôsobený najmä spôsobom života .PER tuky zlepšujú chuť do jedla a dávajú lepší pocit sýtosti ,COM uvedomte si však ,COM že všetky tuky sa Vám ukladajú ,COM pokiaľ ich nespálite .PER".split(),
-    "S S S S S S S P S S S S S S S S S S C S S S C S S S S S S C S S S P".split()
-)]
-'''
-
+# Make up some training data
 with open('/home/dlindvai/work/text.txt', 'r') as text2:
 	with open('/home/dlindvai/work/tags.txt', 'r') as tags2:
 		text1 = text2.read().splitlines()
@@ -200,38 +187,60 @@ training_data = [( text.split() , tags.split() )]

 word_to_ix = {}
 for sentence, tags in training_data:
-    for word in sentence:
-        if word not in word_to_ix:
-            word_to_ix[word] = len(word_to_ix)
+	for word in sentence:
+		if word not in word_to_ix:
+			word_to_ix[word] = len(word_to_ix)

-tag_to_ix = {"S": 0, "C": 1, "P": 2, "Q": 3, START_TAG: 4, STOP_TAG: 5}
+tag_to_ix = {"S": 0, "P": 1, "C": 2, "Q": 3, "N": 4, START_TAG: 5, STOP_TAG: 6}

 model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
 optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

+
+# Check predictions before training
 with torch.no_grad():
-    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
-    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
-    print("Predicted output before training: ", model(precheck_sent))
+	precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
+	precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
+	#print(model(precheck_sent))

-for epoch in range(30):  # normally you would NOT do 300 epochs, but this is small dataset
-    for sentence, tags in training_data:
-        # Step 1. Remember that Pytorch accumulates gradients.
-        # We need to clear them out before each instance
-        model.zero_grad()

-        # Step 2. Get our inputs ready for the network, that is, turn them into Tensors of word indices.
-        sentence_in = prepare_sequence(sentence, word_to_ix)
-        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
+# Print start time
+start = datetime.now()
+start_time = start.strftime("%H:%M:%S")
+print("Start time = ", start_time)

-        # Step 3. Run our forward pass.
-        loss = model.neg_log_likelihood(sentence_in, targets)
+for epoch in range(50):
+	for sentence, tags in training_data:
+		# Step 1. Remember that Pytorch accumulates gradients. We need to clear them out before each instance
+		model.zero_grad()

-        # Step 4. Compute the loss, gradients, and update the parameters by calling optimizer.step()
-        loss.backward()
-        optimizer.step()
+		# Step 2. Get our inputs ready for the network, that is, turn them into Tensors of word indices.
+		sentence_in = prepare_sequence(sentence, word_to_ix)
+		targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

+		# Step 3. Run our forward pass.
+		loss = model.neg_log_likelihood(sentence_in, targets)
+
+		# Step 4. Compute the loss, gradients, and update the parameters by calling optimizer.step()
+		loss.backward()
+		optimizer.step()
+
+
+# Check predictions after training
 with torch.no_grad():
-    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
-    print("Predicted output after training: ", model(precheck_sent))
+	precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
+	#print(model(precheck_sent))
+
+
+# Error calculator
+var = model(precheck_sent)
+y_true = np.array(targets)
+y_pred = np.array(var[1])
+
+print(metrics.confusion_matrix(y_true, y_pred))
+print(metrics.classification_report(y_true, y_pred, digits=3))
+
+# Print finish time
+finish = datetime.now()
+finish_time = finish.strftime("%H:%M:%S")
+print("Finish time = ", finish_time)
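`prepare_sequence` is called above but not touched by this diff; in the PyTorch BiLSTM-CRF tutorial this script appears to be based on, it is defined roughly as follows (a sketch, not part of this commit):

```python
# Likely shape of prepare_sequence: it maps words to a tensor of vocabulary
# indices for the embedding layer.
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)
```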
(deleted file)
@@ -1,14 +0,0 @@
-import re
-import os
-
-if os.path.exists('text.txt'):
-	os.remove('text.txt')
-
-with open('/home/dlindvai/work/train.txt', 'r') as input_file:
-	with open('/home/dlindvai/work/text.txt', 'a') as output_file:
-		for line in input_file:
-			line = line.replace('\n', '')
-			line = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", line)
-			line = line.lower()
-			line = line.replace('.','.PER').replace(',',',COM').replace('?','?QUE')
-			output_file.write(line)
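For reference, the regex one-liner in the deleted script split runs of word characters from runs of punctuation and padded them with spaces before tagging; this is the step the new prepare_text.py replaces. Roughly, on a made-up line:

```python
# What the deleted script's regex did (illustrative input):
import re

line = "Hello, world."
line = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", line)  # "Hello , world . "
line = line.lower().replace('.', '.PER').replace(',', ',COM')
print(line)  # "hello ,COM world .PER "
```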