diff --git a/new_train.py b/new_train.py
new file mode 100644
index 0000000..3103ef1
--- /dev/null
+++ b/new_train.py
@@ -0,0 +1,166 @@
+import torch
+import json
+from tqdm import tqdm
+import torch.nn as nn
+from torch.optim import Adam
+import nltk
+import string
+from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
+import pandas as pd
+import numpy as np
+import transformers
+#from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast
+from transformers import AutoTokenizer, T5ForConditionalGeneration
+import warnings
+from sklearn.model_selection import train_test_split
+warnings.filterwarnings("ignore")
+
+print("Imports successfully done")
+
+DEVICE = 'cuda:0'
+TOKENIZER = AutoTokenizer.from_pretrained('google/umt5-small')
+# add a separator token used to join context and question
+TOKENIZER.add_tokens('<sep>')
+# the model checkpoint has to match the tokenizer (umt5-small)
+MODEL = T5ForConditionalGeneration.from_pretrained("google/umt5-small").to(DEVICE)
+
+# resize the embeddings to account for the added token
+MODEL.resize_token_embeddings(len(TOKENIZER))
+# lr = learning rate = 1e-5
+OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
+Q_LEN = 256    # Question Length
+T_LEN = 32     # Target Length
+BATCH_SIZE = 4 # batch size
+print("Model successfully loaded")
+from datasets import load_dataset
+
+dataset_english = load_dataset("squad_v2")
+dataset_slovak = load_dataset("TUKE-DeutscheTelekom/skquad")
+dataset_polish = load_dataset("clarin-pl/poquad")
+
+def prepare_data_english(data):
+    articles = []
+    for item in tqdm(data["train"], desc="Preparing training data"):
+        context = item["context"]
+        question = item["question"]
+        try:
+            start_position = item['answers']['answer_start'][0]
+        except IndexError:
+            # skip unanswerable questions (no answer span)
+            continue
+        text_length = len(item['answers']['text'][0])
+        target_text = context[start_position : start_position + text_length]
+        inputs = {"input": context + '<sep>' + question, "answer": target_text}
+        articles.append(inputs)
+    return articles
+data_english = prepare_data_english(dataset_english)
+data_polish = prepare_data_english(dataset_polish)
+data_slovak = prepare_data_english(dataset_slovak)
+
+train_data = data_slovak + data_english + data_polish
+print("Training Samples : ", len(train_data))
+
+
+# Dataframe
+data = pd.DataFrame(train_data)
+
+class QA_Dataset(Dataset):
+    def __init__(self, tokenizer, dataframe, q_len, t_len):
+        self.tokenizer = tokenizer
+        self.q_len = q_len
+        self.t_len = t_len
+        self.data = dataframe
+        self.input = self.data['input']
+        #self.context = self.data["context"]
+        self.answer = self.data['answer']
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        input = self.input[idx]
+        answer = self.answer[idx]
+
+        input_tokenized = self.tokenizer(input, max_length=self.q_len, padding="max_length",
+                                         truncation=True, add_special_tokens=True)
+        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
+                                          truncation=True, add_special_tokens=True)
+
+        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
+        # ignore padding positions when computing the loss
+        labels[labels == 0] = -100
+
+        return {
+            "input_ids": torch.tensor(input_tokenized["input_ids"], dtype=torch.long),
+            "attention_mask": torch.tensor(input_tokenized["attention_mask"], dtype=torch.long),
+            "labels": labels,
+            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
+        }
+
+train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
+# sample only the rows that belong to each split
+train_sampler = SubsetRandomSampler(train_data.index.tolist())
+val_sampler = SubsetRandomSampler(val_data.index.tolist())
+qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)
+
+train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
+val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)
+print("Loaders working fine")
+
+### TRAINING (~46 min on the v1 data)
+train_loss = 0
+val_loss = 0
+train_batch_count = 0
+val_batch_count = 0
+
+
+# TODO:
+# - pick a suitable number of epochs
+# - evaluate the results and compute a proper ROUGE metric
+for epoch in range(2):
+    MODEL.train()
+    for batch in tqdm(train_loader, desc="Training batches"):
+        input_ids = batch["input_ids"].to(DEVICE)
+        attention_mask = batch["attention_mask"].to(DEVICE)
+        labels = batch["labels"].to(DEVICE)
+        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)
+
+        outputs = MODEL(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            labels=labels,
+            decoder_attention_mask=decoder_attention_mask
+        )
+
+        OPTIMIZER.zero_grad()
+        outputs.loss.backward()
+        OPTIMIZER.step()
+        train_loss += outputs.loss.item()
+        train_batch_count += 1
+
+    # Evaluation (no gradient updates on the validation data)
+    MODEL.eval()
+    with torch.no_grad():
+        for batch in tqdm(val_loader, desc="Validation batches"):
+            input_ids = batch["input_ids"].to(DEVICE)
+            attention_mask = batch["attention_mask"].to(DEVICE)
+            labels = batch["labels"].to(DEVICE)
+            decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)
+
+            outputs = MODEL(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                labels=labels,
+                decoder_attention_mask=decoder_attention_mask
+            )
+
+            val_loss += outputs.loss.item()
+            val_batch_count += 1
+    print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss / val_batch_count}")
+
+
+print("Training done successfully")
+
+## SAVE THE FINE-TUNED MODEL
+MODEL.save_pretrained("qa_model_umT5_small_3LANG")
+TOKENIZER.save_pretrained('qa_tokenizer_umT5_small_3LANG')
+
diff --git a/new_usecase.py b/new_usecase.py
new file mode 100644
index 0000000..ee811a9
--- /dev/null
+++ b/new_usecase.py
@@ -0,0 +1,162 @@
+
+
+
+## IMPORT THE NECESSARY PACKAGES
+from transformers import T5ForConditionalGeneration, AutoTokenizer
+import torch
+#import evaluate # Bleu
+import json
+import random
+import statistics
+from sklearn.metrics import precision_score, recall_score, f1_score
+import warnings
+from tqdm import tqdm
+from datasets import load_dataset
+import evaluate
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+from sklearn.feature_extraction.text import CountVectorizer
+rouge = evaluate.load('rouge')
+warnings.filterwarnings("ignore")
+
+DEVICE = 'cuda:0'
+
+# Prepare the data first
+def prepare_data_english(data):
+    articles = []
+    for item in tqdm(data["validation"], desc="Preparing validation data"):
+        context = item["context"]
+        question = item["question"]
+        try:
+            start_position = item['answers']['answer_start'][0]
+        except IndexError:
+            continue
+        text_length = len(item['answers']['text'][0])
+        target_text = context[start_position : start_position + text_length]
+        inputs = {"input": context + '<sep>' + question, "answer": target_text}
+        articles.append(inputs)
+    return articles
+
+# Load the fine-tuned model
+
+model_name = 'qa_model_T5-slovak'
+model_dir = '/home/omasta/T5_JUPYTER/qa_model'
+tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer'
+MODEL = T5ForConditionalGeneration.from_pretrained(model_dir, from_tf=False, return_dict=True).to(DEVICE)
+print("Model successfully loaded!")
+TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)
+print("Tokenizer successfully loaded!")
+Q_LEN = 512
+# make sure the separator token used during training is present
+TOKENIZER.add_tokens('<sep>')
+MODEL.resize_token_embeddings(len(TOKENIZER))
+
+# Load the datasets
+#dataset_english = load_dataset("squad_v2")
+dataset_slovak = load_dataset("TUKE-DeutscheTelekom/skquad")
+#dataset_polish = load_dataset("clarin-pl/poquad")
+
+# Prepare the data
+#data_english = prepare_data_english(dataset_english)
+#data_polish = prepare_data_english(dataset_polish)
+data_slovak = prepare_data_english(dataset_slovak)
+# Merge the datasets
+#val_data = data_slovak + data_english + data_polish
+print("Val Samples : ", len(data_slovak))
+
+
+def prediction_rouge(predictions, references):
+    return rouge.compute(predictions=[predictions], references=[[references]])
+
+def compute_bleu(reference, prediction):
+    smoothie = SmoothingFunction().method4
+    return sentence_bleu([reference.split()], prediction.split(), smoothing_function=smoothie)
+
+def classic_metrics(sentence1, sentence2):
+    if sentence1 == "" and sentence2 == "":
+        return 0, 0, 0
+    else:
+        # Build a bag-of-words representation of both sentences
+        vectorizer = CountVectorizer()
+        try:
+            bag_of_words = vectorizer.fit_transform([sentence1, sentence2])
+        except ValueError:
+            return 0, 0, 0
+        # Get the count vector of each sentence
+        vector1 = bag_of_words.toarray()[0]
+        vector2 = bag_of_words.toarray()[1]
+
+        # Compute the metrics
+        precision = precision_score(vector1, vector2, average='weighted')
+        recall = recall_score(vector1, vector2, average='weighted')
+        f1 = f1_score(vector1, vector2, average='weighted')
+        return float(precision), float(recall), float(f1)
+
+def predict_answer(input, ref_answer, language):
+    inputs = TOKENIZER(input, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
+    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
+    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
+    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
+    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
+    return {"pred": predicted_answer.lower(), "ref": ref_answer.lower(), "language": language}
+
+def predict_and_save(val_data, lang):
+    predictions = list()
+    for i in tqdm(range(len(val_data)), desc="Predicting"):
+        pred = predict_answer(val_data[i]["input"], val_data[i]["answer"], lang)
+        predictions.append(pred)
+    return predictions
+# Predict
+pred_slovak = predict_and_save(data_slovak, "sk")
+#pred_english = predict_and_save(data_english, "en")
+#pred_polish = predict_and_save(data_polish, "pl")
+
+predictions = pred_slovak  # + pred_english + pred_polish once all languages are evaluated
+
+
+# Save the results for later
+with open('predictions-t5.json', 'w') as json_file:
+    json.dump(predictions, json_file)
+
+
+# Compute metrics
+with open("predictions-t5.json", "r") as json_file:
+    data = json.load(json_file)
+
+new_data = list()
+language = "sk"
+for item in data:
+    if item["language"] == language:
+        new_data.append(item)
+
+bleu = list()
+rouges = list()
+precisions = list()
+recalls = list()
+f1s = list()
+
+for item in tqdm(new_data, desc="Evaluating"):
+    # compute_bleu expects (reference, prediction)
+    bleu.append(compute_bleu(item["ref"], item["pred"]))
+    rouges.append(prediction_rouge(item["pred"], item["ref"]))
+    precision, recall, f1 = classic_metrics(item["pred"], item["ref"])
+    precisions.append(precision)
+    recalls.append(recall)
+    f1s.append(f1)
+# AGGREGATE THE METRICS
+rouge1_values = [r['rouge1'] for r in rouges]
+rouge2_values = [r['rouge2'] for r in rouges]
+rougeL_values = [r['rougeL'] for r in rouges]
+
+average_rouge1 = sum(rouge1_values) / len(rouges)
+average_rouge2 = sum(rouge2_values) / len(rouges)
+average_rougeL = sum(rougeL_values) / len(rouges)
+print("Model name :", model_name)
+print("Language :", language)
+print("BLEU :", sum(bleu) / len(bleu))
+print("Recall :", sum(recalls) / len(recalls))
+print("F1 :", sum(f1s) / len(f1s))
+print("Precision :", sum(precisions) / len(precisions))
+print("Rouge-1 :", average_rouge1)
+print("Rouge-2 :", average_rouge2)
+print("Rouge-L :", average_rougeL)
+
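For quick manual checks of the checkpoint that new_train.py writes out, a minimal inference snippet might look like the sketch below. It assumes the qa_model_umT5_small_3LANG and qa_tokenizer_umT5_small_3LANG directories saved above and the same context + '<sep>' + question input format; the Slovak context and question are only illustrative.

import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration

# load the artifacts written by new_train.py (directory names assumed from that script)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("qa_tokenizer_umT5_small_3LANG")
model = T5ForConditionalGeneration.from_pretrained("qa_model_umT5_small_3LANG").to(device)

# build the input the same way as during training: context, separator token, question
context = "Bratislava je hlavné mesto Slovenska."   # illustrative example
question = "Čo je hlavné mesto Slovenska?"
inputs = tokenizer(context + "<sep>" + question, return_tensors="pt",
                   max_length=256, truncation=True).to(device)

# generate and decode the predicted answer span
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))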