Upload files to ''

This commit is contained in:
Dávid Omasta 2024-02-23 12:40:37 +00:00
parent 6159a93471
commit 9efdf8a61f
2 changed files with 327 additions and 0 deletions

new_train.py Normal file

@@ -0,0 +1,163 @@
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import string
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
#from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast
from transformers import AutoTokenizer, T5ForConditionalGeneration
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")
print("Imports succesfully done")
DEVICE ='cuda:0'
TOKENIZER=AutoTokenizer.from_pretrained('google/umt5-small')
TOKENIZER.add_tokens('<sep>')
MODEL = T5ForConditionalGeneration.from_pretrained("google/mt5-small").to(DEVICE)
#pridam token
MODEL.resize_token_embeddings(len(TOKENIZER))
#lr = learning rate = 10-5
OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
Q_LEN = 256 # Question Length
T_LEN = 32 # Target Length
BATCH_SIZE = 4 #dávka dát
print("Model succesfully loaded")
from datasets import load_dataset
dataset_english = load_dataset("squad_v2")
dataset_slovak = load_dataset("TUKE-DeutscheTelekom/skquad")
dataset_polish = load_dataset("clarin-pl/poquad")
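# All three datasets are assumed here to share the SQuAD-style schema
# (context, question, answers with answer_start/text), so one preparation function is reused for each.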
def prepare_data_english(data):
    articles = []
    for item in tqdm(data["train"], desc="Preparing training data"):
        context = item["context"]
        question = item["question"]
        try:
            start_position = item['answers']['answer_start'][0]
        except IndexError:
            # unanswerable question (no answer span) - skip it
            continue
        text_length = len(item['answers']['text'][0])
        target_text = context[start_position : start_position + text_length]
        inputs = {"input": context + '<sep>' + question, "answer": target_text}
        articles.append(inputs)
    return articles
data_english = prepare_data_english(dataset_english)
data_polish = prepare_data_english(dataset_polish)
data_slovak = prepare_data_english(dataset_slovak)
train_data = data_slovak + data_english + data_polish
print("Training Samples : ",len(train_data))
#Dataframe
data = pd.DataFrame(train_data)
class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.input = self.data['input']
        #self.context = self.data["context"]
        self.answer = self.data['answer']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input = self.input[idx]
        answer = self.answer[idx]
        input_tokenized = self.tokenizer(input, max_length=self.q_len, padding="max_length",
                                         truncation=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                          truncation=True, add_special_tokens=True)
        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        # ignore padding positions when computing the loss
        labels[labels == 0] = -100
        return {
            "input_ids": torch.tensor(input_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(input_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
train_sampler = RandomSampler(train_data.index)
val_sampler = RandomSampler(val_data.index)
qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)
train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)
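# both loaders wrap the same qa_dataset; each sampler only draws the dataframe indices
# that fell into its split, so the 80/20 separation is preserved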
print("Loaders working fine")
### TRAINING (~46 MIN ON THE V1 DATA)
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0
# TODO
# Tune the number of epochs
# Evaluate the results and compute a proper ROUGE metric
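# (new_usecase.py computes ROUGE on the validation predictions with evaluate.load('rouge');
#  the same call could be hooked in here after each epoch)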
for epoch in range(2):
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)
        outputs = MODEL(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1
    # Evaluation
    MODEL.eval()
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation batches"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)
            decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)
            outputs = MODEL(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                decoder_attention_mask=decoder_attention_mask
            )
            # no backward pass or optimizer step during validation
            val_loss += outputs.loss.item()
            val_batch_count += 1
    print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss / val_batch_count}")
print("Training done succesfully")
## SAVE FINE_TUNED MODEL
MODEL.save_pretrained("qa_model_umT5_small_3LANG")
TOKENIZER.save_pretrained('qa_tokenizer_umT5_small_3LANG')

new_usecase.py Normal file

@@ -0,0 +1,164 @@
## IMPORT NECESSARY PACKAGES
from transformers import T5ForConditionalGeneration, T5Tokenizer,AutoTokenizer
import torch
#import evaluate # Bleu
import json
import random
import statistics
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
from tqdm import tqdm
from datasets import load_dataset
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.feature_extraction.text import CountVectorizer
rouge = evaluate.load('rouge')
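# note: evaluate's 'rouge' metric typically relies on the separate rouge_score package; install it if loading fails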
warnings.filterwarnings("ignore")
DEVICE ='cuda:0'
#Prepare data first
def prepare_data_english(data):
    articles = []
    for item in tqdm(data["validation"], desc="Preparing validation data"):
        context = item["context"]
        question = item["question"]
        try:
            start_position = item['answers']['answer_start'][0]
        except IndexError:
            # unanswerable question (no answer span) - skip it
            continue
        text_length = len(item['answers']['text'][0])
        target_text = context[start_position : start_position + text_length]
        inputs = {"input": context + '<sep>' + question, "answer": target_text}
        articles.append(inputs)
    return articles
#Load the pretrained model
model_name = 'qa_model_T5-slovak'
model_dir = '/home/omasta/T5_JUPYTER/qa_model'
tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer'
MODEL = T5ForConditionalGeneration.from_pretrained(model_dir, from_tf=False, return_dict=True).to(DEVICE)
print("Model succesfully loaded!")
TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)
print("Tokenizer succesfully loaded!")
Q_LEN = 512
TOKENIZER.add_tokens('<sep>')
MODEL.resize_token_embeddings(len(TOKENIZER))
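# adding <sep> is a no-op if the saved tokenizer already contains it, and the resize
# only changes the embedding matrix when the vocabulary size actually changed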
#Load datasets
#dataset_english = load_dataset("squad_v2")
dataset_slovak = load_dataset("TUKE-DeutscheTelekom/skquad")
#dataset_polish = load_dataset("clarin-pl/poquad")
#Prepare data
#data_english = prepare_data_english(dataset_english)
#data_polish = prepare_data_english(dataset_polish)
data_slovak = prepare_data_english(dataset_slovak)
#Merge datasets
#val_data = data_slovak + data_english + data_polish
print("Val Samples : ",len(data_slovak))
def prediction_rouge(predictions, references):
    return rouge.compute(predictions=[predictions], references=[[references]])

def compute_bleu(reference, prediction):
    smoothie = SmoothingFunction().method4
    return sentence_bleu([reference.split()], prediction.split(), smoothing_function=smoothie)
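# method4 smoothing keeps sentence-level BLEU from collapsing to zero on the short
# answers typical of extractive QA, where higher-order n-gram matches are rare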
def classic_metrics(sentence1, sentence2):
    if sentence1 == "" and sentence2 == "":
        return 0, 0, 0
    else:
        # Build a bag-of-words representation
        vectorizer = CountVectorizer()
        try:
            bag_of_words = vectorizer.fit_transform([sentence1, sentence2])
        except ValueError:
            return 0, 0, 0
        # Get the vector for each sentence
        vector1 = bag_of_words.toarray()[0]
        vector2 = bag_of_words.toarray()[1]
        # Compute the metrics
        precision = precision_score(vector1, vector2, average='weighted')
        recall = recall_score(vector1, vector2, average='weighted')
        f1 = f1_score(vector1, vector2, average='weighted')
        return float(precision), float(recall), float(f1)
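# Note: these scores compare the per-word counts of the two strings (prediction treated as
# "true", reference as "predicted"), so they measure bag-of-words overlap rather than exact span match.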
def predict_answer(input, ref_answer, language):
    inputs = TOKENIZER(input, max_length=512, padding="max_length", truncation=True, add_special_tokens=True)
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    return {"pred": predicted_answer.lower(), "ref": ref_answer.lower(), "language": language}

def predict_and_save(val_data, lang):
    predictions = list()
    for i in tqdm(range(len(val_data)), desc="predicting"):
        pred = predict_answer(val_data[i]["input"], val_data[i]["answer"], lang)
        predictions.append(pred)
    return predictions
#Predict
pred_slovak = predict_and_save(data_slovak, "sk")
#pred_english = predict_and_save(data_english,"en")
#pred_polish = predict_and_save(data_polish,"pl")
#predictions = pred_slovak + pred_english + pred_polish
predictions = pred_slovak
#Save the results for later
with open('predictions-t5.json', 'w') as json_file:
    json.dump(predictions, json_file)
#Compute metrics
with open("predictions-t5.json", "r") as json_file:
    data = json.load(json_file)
new_data = list()
language = "sk"
for item in data:
    if item["language"] == language:
        new_data.append(item)
bleu = list()
rouges = list()
precisions=list()
recalls=list()
f1s=list()
for item in tqdm(new_data,desc="Evaluating"):
bleu.append(compute_bleu(item["pred"],item["ref"]))
rouges.append(prediction_rouge(item["pred"],item["ref"]))
precision, recall, f1 =classic_metrics(item["pred"],item["ref"])
precisions.append(precision)
recalls.append(recall)
f1s.append(f1)
#COMPUTATION OF METRICS
rouge1_values = [r['rouge1'] for r in rouges]
rouge2_values = [r['rouge2'] for r in rouges]
rougeL_values = [r['rougeL'] for r in rouges]
average_rouge1 = sum(rouge1_values) / len(rouges)
average_rouge2 = sum(rouge2_values) / len(rouges)
average_rougeL = sum(rougeL_values) / len(rouges)
print("Model name :",model_name)
print("Language :",language)
print("BLEU: ",sum(bleu)/len(bleu))
print("Recall :",sum(recalls)/len(recalls))
print("F1 : ",sum(f1s)/len(f1s))
print("Precision :",sum(precisions)/len(precisions))
print("Rouge-1 :",average_rouge1)
print("Rouge-2 :",average_rouge2)
print("Rouge-L :",average_rougeL)