## IMPORT NECESSARY LIBRARIES
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch
import evaluate  # BLEU
import statistics
from sklearn.metrics import precision_score, recall_score, f1_score

## TURN WARNINGS OFF
import warnings
warnings.filterwarnings("ignore")

## 13/03/23 added
from rouge import Rouge
from tqdm import tqdm
from datasets import load_dataset

## CUSTOM ROUGE METRIC - NEW TODO:

# Model name
DEVICE = 'cuda:0'

# T5 MODEL
#model_name = 'T5_SK_model'
#model_dir = "/home/omasta/T5_JUPYTER/qa_model"
#tokenizer_dir = "/home/omasta/T5_JUPYTER/qa_tokenizer"

# mT5 SMALL MODEL
model_name = 'qa_model'
model_dir = '/home/omasta/T5_JUPYTER/qa_model_mT5_polish'
tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer_mT5_polish'

# Load the model from its directory
MODEL = T5ForConditionalGeneration.from_pretrained(model_dir, from_tf=False, return_dict=True).to(DEVICE)
print("Model successfully loaded!")
TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)
print("Tokenizer successfully loaded!")
Q_LEN = 512
TOKENIZER.add_tokens('')  # NOTE: adds an empty token; a special separator token may have been intended here
MODEL.resize_token_embeddings(len(TOKENIZER))

def nahradit_znaky(retezec):
    """Replace '[' and ']' with spaces."""
    return retezec.replace('[', ' ').replace(']', ' ')

def predict_answer(data):
    predictions = []
    for i in tqdm(data, desc="predicting"):
        inputs = TOKENIZER(i['input'], max_length=Q_LEN, padding="max_length",
                           truncation=True, add_special_tokens=True)
        input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
        attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
        outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
        predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
        ref_answer = i['answer'].lower()
        if ref_answer:
            # BLEU scoring was disabled here:
            #bleu = evaluate.load("google_bleu")
            #score = bleu.compute(predictions=[predicted_answer],
            #                     references=[ref_answer])
            predictions.append({'prediction': predicted_answer, 'ref_answer': ref_answer})
    return predictions

def prepare_data(data):
    articles = []
    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                question = qa["question"]
                answer = qa["answers"][0]["text"]
                # NOTE: context and question are joined with an empty string;
                # a separator token may have been intended here
                inputs = {"input": paragraph["context"] + "" + question, "answer": answer}
                articles.append(inputs)
    return articles

def prepare_polish_data(data):
    arcs = []
    for sample in data:
        question = sample["question"]
        try:
            answer = nahradit_znaky(', '.join(sample["answers"]["text"]))
        except KeyError:
            continue
        context = sample["context"]
        inputs = {"input": context + "" + question, "answer": answer}
        arcs.append(inputs)
    return arcs
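## The disabled BLEU code inside predict_answer can be restored with the
## `evaluate` library. The helper below is a minimal sketch of that idea,
## assuming Google-BLEU as in the commented-out lines; the name `compute_bleu`
## is hypothetical and not part of the original pipeline.
def compute_bleu(prediction, reference):
    """Google-BLEU between one predicted and one reference answer (sketch)."""
    bleu = evaluate.load("google_bleu")
    # `references` is a list (one entry per prediction) of reference lists
    score = bleu.compute(predictions=[prediction], references=[[reference]])
    return score["google_bleu"]

# Example usage (hypothetical strings):
# compute_bleu("the cat is on the mat", "the cat sat on the mat")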
#dataset = load_dataset("clarin-pl/poquad")
dataset = load_dataset("squad_v2")
dev_data = prepare_polish_data(dataset["validation"])
#print('data prepared')
print(f'Number of dev samples {len(dev_data)}')
#print(dev_data[0])

bleu_score = []
precisions = []
f1_scores = []
recall_scores = []
#X = 150

# generate predictions for the dev set
eval_predictions = predict_answer(dev_data)
rouge = Rouge()
for item in tqdm(eval_predictions, desc="evaluating"):
    try:
        # raises ValueError on empty strings, handled below
        scores = rouge.get_scores(item['prediction'], item['ref_answer'])
        # character-level scores; sklearn raises ValueError when the two
        # strings differ in length, in which case the item scores 0
        precision = precision_score(list(item['ref_answer']), list(item['prediction']), average='macro')
        recall = recall_score(list(item['ref_answer']), list(item['prediction']), average='macro')
        f1 = f1_score(list(item['ref_answer']), list(item['prediction']), average='macro')
    except ValueError:
        precision = 0
        recall = 0
        f1 = 0
    precisions.append(precision)
    f1_scores.append(f1)
    recall_scores.append(recall)

def rouge_eval(dict_x):
    """Compute per-item ROUGE scores, skipping empty predictions/references."""
    rouge = Rouge()
    rouge_scores = []
    for item in dict_x:
        if item['prediction'] and item['ref_answer']:
            rouge_score = rouge.get_scores(item['prediction'], item['ref_answer'])
            rouge_scores.append(rouge_score)
    return rouge_scores

print('EVALUATION RESULTS: ------------------------')
#bleu_score_total = statistics.mean(bleu_score)
recall_score_total = statistics.mean(recall_scores)
f1_score_total = statistics.mean(f1_scores)
precision_total = statistics.mean(precisions)
#print(f'Bleu_score of model {model_name}: ', bleu_score_total)
print(f'Recall of model {model_name}: ', recall_score_total)
print(f'F1 of model {model_name}: ', f1_score_total)
print(f'Precision of model {model_name}: ', precision_total)
print(model_dir)
print(f'{model_name} results')
rouge_scores = rouge_eval(eval_predictions)
#print(rouge_scores)  # per-item ROUGE scores (verbose)
rouge1_values = [score[0]['rouge-1']['f'] for score in rouge_scores]
mean_rouge1_score = statistics.mean(rouge1_values)
print(f'Rouge-1 mean score: {mean_rouge1_score}')
rouge2_values = [score[0]['rouge-2']['f'] for score in rouge_scores]
mean_rouge2_score = statistics.mean(rouge2_values)
print(f'Rouge-2 mean score: {mean_rouge2_score}')
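## NOTE: the precision/recall/F1 loop above compares answers character by
## character, and any length mismatch falls into the ValueError branch and
## scores 0. A common alternative is SQuAD-style token-level F1. The sketch
## below is an illustrative assumption, not part of the original pipeline;
## `token_f1` is a hypothetical helper name.
from collections import Counter

def token_f1(prediction, reference):
    """Token-level F1 between a predicted and a reference answer (SQuAD-style)."""
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    if not pred_tokens or not ref_tokens:
        # both empty -> perfect match; one empty -> 0
        return float(pred_tokens == ref_tokens)
    common = Counter(pred_tokens) & Counter(ref_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    p = num_same / len(pred_tokens)
    r = num_same / len(ref_tokens)
    return 2 * p * r / (p + r)

# Example: token_f1("king of poland", "the king of poland") -> ~0.857
# Mean token-level F1 over the evaluated items:
# print(statistics.mean(token_f1(x['prediction'], x['ref_answer']) for x in eval_predictions))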