Upload files to ''
This commit is contained in:
parent
6159a93471
commit
9efdf8a61f
163
new_train.py
Normal file
@@ -0,0 +1,163 @@
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import string
from torch.utils.data import Dataset, DataLoader, RandomSampler, SubsetRandomSampler
import pandas as pd
import numpy as np
import transformers
#from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

print("Imports successfully done")

DEVICE = 'cuda:0'
TOKENIZER = AutoTokenizer.from_pretrained('google/umt5-small')
TOKENIZER.add_tokens('<sep>')
# The original loaded "google/mt5-small" here, which does not match the umT5
# tokenizer above (the two checkpoints use different vocabularies); load the
# matching umT5 checkpoint instead.
MODEL = AutoModelForSeq2SeqLM.from_pretrained("google/umt5-small").to(DEVICE)

# register the added <sep> token in the embedding matrix
MODEL.resize_token_embeddings(len(TOKENIZER))
# lr = learning rate = 1e-5
OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
Q_LEN = 256  # question length
T_LEN = 32  # target length
BATCH_SIZE = 4  # batch size
print("Model successfully loaded")
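# Sanity check (illustrative, not in the original script): after add_tokens +
# resize_token_embeddings, '<sep>' should be a single token and the embedding
# matrix should match the tokenizer size.
assert len(TOKENIZER.tokenize('<sep>')) == 1
assert MODEL.get_input_embeddings().weight.shape[0] == len(TOKENIZER)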
from datasets import load_dataset

dataset_english = load_dataset("squad_v2")
dataset_slovak = load_dataset("TUKE-DeutscheTelekom/skquad")
dataset_polish = load_dataset("clarin-pl/poquad")

def prepare_data_english(data):
    articles = []
    for item in tqdm(data["train"], desc="Preparing training data"):
        context = item["context"]
        question = item["question"]
        try:
            start_position = item['answers']['answer_start'][0]
        except IndexError:
            # unanswerable question: no gold answer, skip it
            continue
        text_length = len(item['answers']['text'][0])
        target_text = context[start_position : start_position + text_length]
        inputs = {"input": context + '<sep>' + question, "answer": target_text}
        articles.append(inputs)
    return articles
data_english = prepare_data_english(dataset_english)
data_polish = prepare_data_english(dataset_polish)
data_slovak = prepare_data_english(dataset_slovak)

train_data = data_slovak + data_english + data_polish
print("Training Samples :", len(train_data))

# Dataframe
data = pd.DataFrame(train_data)

class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.input = self.data['input']
        self.answer = self.data['answer']

    def __len__(self):
        # the original returned len(self.questions), which is never defined
        return len(self.data)

    def __getitem__(self, idx):
        input = self.input[idx]
        answer = self.answer[idx]

        input_tokenized = self.tokenizer(input, max_length=self.q_len, padding="max_length",
                                         truncation=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                          truncation=True, add_special_tokens=True)

        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        # pad id is 0 for this tokenizer; mask padding out of the loss
        labels[labels == 0] = -100

        return {
            "input_ids": torch.tensor(input_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(input_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }
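# Quick shape check (illustrative, not in the original script): one item should
# yield Q_LEN-long encoder tensors and T_LEN-long decoder tensors.
_sample = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)[0]
print(_sample["input_ids"].shape, _sample["labels"].shape)  # torch.Size([256]) torch.Size([32])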

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
# RandomSampler(train_data.index) would sample positions 0..len-1 rather than
# the actual split row labels, so both loaders would draw from the start of the
# dataset; SubsetRandomSampler samples from the split indices themselves.
train_sampler = SubsetRandomSampler(train_data.index.tolist())
val_sampler = SubsetRandomSampler(val_data.index.tolist())
qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)

train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)
print("Loaders working fine")
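# Sanity check (illustrative): the two splits index disjoint rows of `data`,
# so no validation example leaks into training.
assert set(train_data.index).isdisjoint(set(val_data.index))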

### TRAINING (~46 min on the v1 data)
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0

# TODO
# Pick a suitable number of epochs
# Evaluate the results and work out how to compute a proper ROUGE metric
EPOCHS = 2
for epoch in range(EPOCHS):
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1

    # Evaluation: the original ran zero_grad/backward/step here too, i.e. it
    # kept training on the validation split; evaluation should only accumulate
    # the loss under no_grad.
    MODEL.eval()
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation batches"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)
            decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

            outputs = MODEL(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                decoder_attention_mask=decoder_attention_mask
            )

            val_loss += outputs.loss.item()
            val_batch_count += 1
    print(f"{epoch+1}/{EPOCHS} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss / val_batch_count}")

print("Training done successfully")

## SAVE FINE-TUNED MODEL
MODEL.save_pretrained("qa_model_umT5_small_3LANG")
TOKENIZER.save_pretrained('qa_tokenizer_umT5_small_3LANG')
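# Minimal reload-and-generate sketch (illustrative; the context and question
# strings are placeholders, not from the original script):
# model = AutoModelForSeq2SeqLM.from_pretrained("qa_model_umT5_small_3LANG").to(DEVICE)
# tok = AutoTokenizer.from_pretrained("qa_tokenizer_umT5_small_3LANG")
# enc = tok("Some context.<sep>Some question?", return_tensors="pt").to(DEVICE)
# print(tok.decode(model.generate(**enc)[0], skip_special_tokens=True))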
164
new_usecase.py
Normal file
@@ -0,0 +1,164 @@
## IMPORT NECESSARY DEPENDENCIES
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer
import torch
import json
import random
import statistics
import warnings
from tqdm import tqdm
from datasets import load_dataset
import evaluate  # BLEU / ROUGE metrics
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer

rouge = evaluate.load('rouge')
warnings.filterwarnings("ignore")

DEVICE = 'cuda:0'

# Prepare the data first
def prepare_data_english(data):
    articles = []
    for item in tqdm(data["validation"], desc="Preparing validation data"):
        context = item["context"]
        question = item["question"]
        try:
            start_position = item['answers']['answer_start'][0]
        except IndexError:
            # unanswerable question: no gold answer, skip it
            continue
        text_length = len(item['answers']['text'][0])
        target_text = context[start_position : start_position + text_length]
        inputs = {"input": context + '<sep>' + question, "answer": target_text}
        articles.append(inputs)
    return articles

# Load the fine-tuned model
model_name = 'qa_model_T5-slovak'
model_dir = '/home/omasta/T5_JUPYTER/qa_model'
tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer'
MODEL = T5ForConditionalGeneration.from_pretrained(model_dir, from_tf=False, return_dict=True).to(DEVICE)
print("Model successfully loaded!")
TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)
print("Tokenizer successfully loaded!")
Q_LEN = 512
TOKENIZER.add_tokens('<sep>')
MODEL.resize_token_embeddings(len(TOKENIZER))

# Load datasets
#dataset_english = load_dataset("squad_v2")
dataset_slovak = load_dataset("TUKE-DeutscheTelekom/skquad")
#dataset_polish = load_dataset("clarin-pl/poquad")

# Prepare the data
#data_english = prepare_data_english(dataset_english)
#data_polish = prepare_data_english(dataset_polish)
data_slovak = prepare_data_english(dataset_slovak)
# Merge datasets
#val_data = data_slovak + data_english + data_polish
print("Val Samples :", len(data_slovak))


def prediction_rouge(predictions, references):
    return rouge.compute(predictions=[predictions], references=[[references]])

def compute_bleu(reference, prediction):
    smoothie = SmoothingFunction().method4
    return sentence_bleu([reference.split()], prediction.split(), smoothing_function=smoothie)
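# Quick sanity check (illustrative, not part of the original evaluation):
# identical strings score 1.0 on ROUGE-1 and, with four or more tokens so that
# every n-gram order matches, exactly 1.0 on smoothed BLEU.
print(prediction_rouge("the castle is very blue", "the castle is very blue")["rouge1"])  # 1.0
print(compute_bleu("the castle is very blue", "the castle is very blue"))  # 1.0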

def classic_metrics(sentence1, sentence2):
    if sentence1 == "" and sentence2 == "":
        return 0, 0, 0
    else:
        # Build the bag-of-words representation
        vectorizer = CountVectorizer()
        try:
            bag_of_words = vectorizer.fit_transform([sentence1, sentence2])
        except ValueError:
            return 0, 0, 0
        # Get the count vector for each sentence
        vector1 = bag_of_words.toarray()[0]
        vector2 = bag_of_words.toarray()[1]

        # Compute the metrics over the token-count vectors
        precision = precision_score(vector1, vector2, average='weighted')
        recall = recall_score(vector1, vector2, average='weighted')
        f1 = f1_score(vector1, vector2, average='weighted')
        return float(precision), float(recall), float(f1)
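# Illustrative check (not in the original script): identical sentences produce
# identical count vectors, so precision, recall and F1 all come out as 1.0.
print(classic_metrics("modrý hrad", "modrý hrad"))  # (1.0, 1.0, 1.0)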

def predict_answer(input, ref_answer, language):
    inputs = TOKENIZER(input, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    # generate() runs with its default settings here, so long answers are cut off
    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    return {"pred": predicted_answer.lower(), "ref": ref_answer.lower(), "language": language}

def predict_and_save(val_data, lang):
    predictions = list()
    for i in tqdm(range(len(val_data)), desc="predicting"):
        pred = predict_answer(val_data[i]["input"], val_data[i]["answer"], lang)
        predictions.append(pred)
    return predictions
# Predict
pred_slovak = predict_and_save(data_slovak, "sk")
#pred_english = predict_and_save(data_english, "en")
#pred_polish = predict_and_save(data_polish, "pl")

#predictions = pred_slovak + pred_english + pred_polish
# only the Slovak split is evaluated in this run, so `predictions` (used below)
# must be bound explicitly; the original left it undefined
predictions = pred_slovak

# Save the results for later
with open('predictions-t5.json', 'w') as json_file:
    json.dump(predictions, json_file)
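# For reference (illustrative values), each saved record is a dict like
# {"pred": "bratislava", "ref": "bratislava", "language": "sk"}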


# Compute metrics
with open("predictions-t5.json", "r") as json_file:
    data = json.load(json_file)

new_data = list()
language = "sk"
for item in data:
    if item["language"] == language:
        new_data.append(item)

bleu = list()
rouges = list()
precisions = list()
recalls = list()
f1s = list()

for item in tqdm(new_data, desc="Evaluating"):
    # compute_bleu takes (reference, prediction), so the gold answer goes
    # first; the original passed the arguments the other way round
    bleu.append(compute_bleu(item["ref"], item["pred"]))
    rouges.append(prediction_rouge(item["pred"], item["ref"]))
    precision, recall, f1 = classic_metrics(item["pred"], item["ref"])
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
# COMPUTATION OF METRICS
# (the loop variable is renamed so it does not shadow the `rouge` metric object)
rouge1_values = [r['rouge1'] for r in rouges]
rouge2_values = [r['rouge2'] for r in rouges]
rougeL_values = [r['rougeL'] for r in rouges]

average_rouge1 = sum(rouge1_values) / len(rouges)
average_rouge2 = sum(rouge2_values) / len(rouges)
average_rougeL = sum(rougeL_values) / len(rouges)
print("Model name :", model_name)
print("Language :", language)
print("BLEU :", sum(bleu) / len(bleu))
print("Recall :", sum(recalls) / len(recalls))
print("F1 :", sum(f1s) / len(f1s))
print("Precision :", sum(precisions) / len(precisions))
print("Rouge-1 :", average_rouge1)
print("Rouge-2 :", average_rouge2)
print("Rouge-L :", average_rougeL)