Upload files to ''

This commit is contained in:
Dávid Omasta 2024-02-17 19:00:49 +00:00
parent 24dc8fd808
commit 6159a93471
4 changed files with 114 additions and 40 deletions

.env Normal file

@@ -0,0 +1,5 @@
URL="http://backend_inference:8000/predict"
PORT="8090"
HOST="localhost"
QA_MODEL="qa_model"
QA_TOKENIZER="qa_tokenizer"
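
How the backend might consume these settings — a minimal sketch assuming the python-dotenv package (the actual loading code is not part of this commit):

from dotenv import load_dotenv
import os

load_dotenv("/app/.env")                  # docker-compose mounts .env at this path
URL = os.getenv("URL")                    # inference endpoint of the backend container
PORT = int(os.getenv("PORT", "8090"))     # port the API server listens on
QA_MODEL = os.getenv("QA_MODEL")          # directory with the fine-tuned model
QA_TOKENIZER = os.getenv("QA_TOKENIZER")  # directory with the matching tokenizer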

docker-compose.yml Normal file

@@ -0,0 +1,34 @@
version: '3.3'
services:
  backend:
    #build: ./backend
    image: backend:test
    container_name: backend_inference
    ports:
      - 8090:8090
    networks:
      - semantic # write your own task name here
    volumes:
      - ./.env:/app/.env
      - ./qa_model:/app/qa_model
      - ./qa_tokenizer:/app/qa_tokenizer
    restart: always
  frontend:
    #build: ./frontend
    image: streamlit:dev
    container_name: streamlit
    ports:
      - 8501:8501
    depends_on:
      - backend
    links:
      - backend
    networks:
      - semantic
    restart: always
    volumes:
      - ./.env:/app/.env
networks:
  semantic:
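
Both services share the user-defined "semantic" network, so the Streamlit frontend can reach the backend by container name. A hedged sketch of such a call from the frontend side (the request payload shape is an assumption, not shown in this commit):

import requests

# URL comes from the mounted .env: http://backend_inference:8000/predict
resp = requests.post("http://backend_inference:8000/predict",
                     json={"context": "...", "question": "..."})  # hypothetical payload
print(resp.json())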

(model training script; filename not shown)

@@ -37,13 +37,20 @@ Q_LEN = 256 # Question Length
T_LEN = 32 # Target Length
BATCH_SIZE = 4 # data batch
print("Model successfully loaded")
-path_train = '/home/omasta/T5_JUPYTER/skquad-221017/train-v1.json'
+from datasets import load_dataset
+dataset = load_dataset("squad_v2")
+print(dataset["train"][0])
+#path_train = '/home/omasta/T5_JUPYTER/skquad-221017/train-v1.json'
+path_train = "poquad-train.json"
with open(path_train) as f:
    data = json.load(f)
+def nahradit_znaky(retezec):  # "replace characters": blanks out '[' and ']'
+    novy_retezec = retezec.replace('[', ' ').replace(']', ' ')
+    return novy_retezec
def prepare_data(data):
    articles = []
    for article in data["data"]:
@@ -60,15 +67,28 @@ def prepare_data(data):
            articles.append(inputs)
    return articles
-prepared_data = prepare_data(data)
+def prep_data(data):
+    arcs = list()
+    for i in range(len(data)):
+        questions = data[i]["question"]
+        try:
+            answer = nahradit_znaky(', '.join(data[i]["answers"]["text"]))
+        except KeyError:
+            continue  # skip records without a usable answer field
+        context = data[i]["context"]
+        inputs = {"input": context + "<sep>" + questions, "answer": answer}
+        arcs.append(inputs)
+    return arcs
+#print(dataset["train"][0]["answers"]["text"])
+prepared_data = prep_data(dataset["train"])
+#prepared_data = prepare_data(data)
print(prepared_data[0])
#Dataframe
data = pd.DataFrame(prepared_data)
class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
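
For one squad_v2 record, the training pair produced by prep_data looks roughly like this (a schematic illustration with abridged field values, not actual script output):

# one element of prepared_data, schematically:
{"input": "Beyoncé Giselle Knowles-Carter (...) <sep> When did Beyonce start becoming popular?",
 "answer": "in the late 1990s"}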
@@ -113,18 +133,13 @@ train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampl
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)
print("Loaders working fine")
### TRAINING (46 MINS ACCORDING TO THE V1_DATA)
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0
-for epoch in range(4):
+for epoch in range(2):
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
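
The rest of the training step is outside this hunk; with a T5-style model it presumably continues along these lines (the batch key names and the optimizer are assumptions — a sketch, not the author's exact code):

        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss          # T5 computes the cross-entropy itself when labels are passed
        OPTIMIZER.zero_grad()        # assuming an optimizer such as AdamW defined earlier
        loss.backward()
        OPTIMIZER.step()
        train_loss += loss.item()
        train_batch_count += 1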
@@ -171,5 +186,5 @@ for epoch in range(4):
print("Training done successfully")
## SAVE FINE_TUNED MODEL
-MODEL.save_pretrained("qa_model_mT5_small")
-TOKENIZER.save_pretrained('qa_tokenizer_mT5_small')
+MODEL.save_pretrained("qa_model_mT5_english")
+TOKENIZER.save_pretrained('qa_tokenizer_mT5_english')
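
The saved directories can later be reloaded the same way the evaluation script below loads its model; a minimal sketch (the tokenizer class is an assumption, since the scripts' imports are not shown in this diff):

from transformers import T5ForConditionalGeneration, T5Tokenizer

MODEL = T5ForConditionalGeneration.from_pretrained("qa_model_mT5_english")
TOKENIZER = T5Tokenizer.from_pretrained("qa_tokenizer_mT5_english")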

(evaluation script; filename not shown)

@@ -11,6 +11,11 @@ import warnings
warnings.filterwarnings("ignore")
##13/03/23 added
from rouge import Rouge
+from tqdm import tqdm
+from datasets import load_dataset
+import re
+##CUSTOM ROUGE METRIC - NEW TODO:
# Model name
DEVICE = 'cuda:0'
@@ -22,9 +27,9 @@ DEVICE ='cuda:0'
#tokenizer_dir = "/home/omasta/T5_JUPYTER/qa_tokenizer"
#mT5 SMALL MODEL
-model_name = 'mT5_SMALL'
-model_dir = '/home/omasta/T5_JUPYTER/qa_model_mT5_small'
-tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer_mT5_small'
+model_name = 'qa_model'
+model_dir = '/home/omasta/T5_JUPYTER/qa_model_mT5_polish'
+tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer_mT5_polish'
# Load the model from its directory
MODEL = T5ForConditionalGeneration.from_pretrained(model_dir, from_tf=False, return_dict=True).to(DEVICE)
@@ -35,9 +40,14 @@ Q_LEN = 512
TOKENIZER.add_tokens('<sep>')
MODEL.resize_token_embeddings(len(TOKENIZER))
+def nahradit_znaky(retezec):  # "replace characters": blanks out '[' and ']'
+    novy_retezec = retezec.replace('[', ' ').replace(']', ' ')
+    return novy_retezec
def predict_answer(data, ref_answer=None, random=None):
    predictions = []
-    for i in data:
+    for i in tqdm(data, desc="predicting"):
        inputs = TOKENIZER(i['input'], max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
        input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
        attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
@@ -47,14 +57,14 @@ def predict_answer(data, ref_answer=None,random=None):
        #print(ref_answer)
        if ref_answer:
            # Load the Bleu metric
-            bleu = evaluate.load("google_bleu")
+            #bleu = evaluate.load("google_bleu")
            #print('debug')
            #precision = list(precision_score(ref_answer, predicted_answer))
            #recall = list(recall_score(ref_answer, predicted_answer))
            #f1 = list(f1_score(ref_answer, predicted_answer))
-            score = bleu.compute(predictions=[predicted_answer],
-                references=[ref_answer])
-            predictions.append({'prediction':predicted_answer,'ref_answer':ref_answer,'score':score['google_bleu']})
+            #score = bleu.compute(predictions=[predicted_answer],
+            #    references=[ref_answer])
+            predictions.append({'prediction':predicted_answer,'ref_answer':ref_answer})
    return predictions
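
The generation step between the tensor setup above and this append is outside the hunk; with this model it presumably looks like the following hedged reconstruction:

        # elided middle of predict_answer (a sketch, not the author's exact code):
        outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
        predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)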
def prepare_data(data):
@@ -66,19 +76,29 @@ def prepare_data(data):
            answer = qa["answers"][0]["text"]
            inputs = {"input": paragraph["context"] + "<sep>" + question, "answer": answer}
            articles.append(inputs)
    return articles
-dev_data_path = '/home/omasta/T5_JUPYTER/skquad-221017/dev-v1.json'
-with open(dev_data_path,'r') as f:
-    data=json.load(f)
-#print('data imported')
-dev_data = prepare_data(data)
+def prepare_polish_data(data):
+    arcs = list()
+    for i in range(len(data)):
+        questions = data[i]["question"]
+        try:
+            answer = nahradit_znaky(', '.join(data[i]["answers"]["text"]))
+        except KeyError:
+            continue
+        context = data[i]["context"]
+        inputs = {"input": context + "<sep>" + questions, "answer": answer}
+        arcs.append(inputs)
+    return arcs
+#dataset = load_dataset("clarin-pl/poquad")
+dataset = load_dataset("squad_v2")
+dev_data = prepare_polish_data(dataset["validation"])
#print('data prepared')
print(f'Number of dev samples {len(dev_data)}')
-print(dev_data[0])
+#print(dev_data[0])
bleu_score = []
precisions = []
f1_scores = []
@@ -88,10 +108,9 @@ rouge_2 = []
#X = 150
evaluate = predict_answer(dev_data)
rouge = Rouge()
-for item in evaluate:
-    bleu_score.append(item['score'])
+for item in tqdm(evaluate, desc="evaluating"):
    try:
-        #scores = rouge.get_scores(item['prediction'], item['ref_answer'], avg=True)
+        scores = rouge.get_scores(item['prediction'], item['ref_answer'])
        precision = precision_score(list(item['ref_answer']), list(item['prediction']), average='macro')
        recall = recall_score(list(item['ref_answer']), list(item['prediction']), average='macro')
        f1 = f1_score(list(item['ref_answer']), list(item['prediction']), average='macro')
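
Note that these sklearn calls receive list(...) of the two strings, so precision, recall and F1 are computed over character labels (and require both strings to be equally long, which is presumably why the loop body sits in a try block). A small illustration of that behaviour:

from sklearn.metrics import precision_score

ref, pred = "cat", "car"
# each character is treated as one label; characters never predicted score zero
print(precision_score(list(ref), list(pred), average='macro'))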
@@ -119,21 +138,22 @@ def rouge_eval(dict_x):
print(f'VYHODNOTENIE VYSLEDKOV : ------------------------')
#print(evaluate)
#bleu_score_total = statistics.mean(bleu_score)
-#recall_score_total= statistics.mean(recall_scores)
-#f1_score_total = statistics.mean(f1_scores)
-#precision_total = statistics.mean(precisions)
+recall_score_total = statistics.mean(recall_scores)
+f1_score_total = statistics.mean(f1_scores)
+precision_total = statistics.mean(precisions)
#print(f'Bleu_score of model {model_name} : ',bleu_score_total)
-#print(f'Recall of model {model_name}: ',recall_score_total)
-#print(f'F1 of model {model_name} : ', f1_score_total)
-#print(f'Precision of model {model_name}: :',precision_total)
-#print(rouge_eval(evaluate))
+print(f'Recall of model {model_name}: ',recall_score_total)
+print(f'F1 of model {model_name} : ', f1_score_total)
+print(f'Precision of model {model_name}: :',precision_total)
+print(model_dir)
+print(rouge_eval(evaluate))
print(f'{model_name} results')
rouge_scores = rouge_eval(evaluate)
rouge_values = [score[0]['rouge-1']['f'] for score in rouge_scores]
mean_rouge_score = statistics.mean(rouge_values)
-print(f'Rouge:{mean_rouge_score}')
+print(f'Rouge mean score:{mean_rouge_score}')
rouge2_values = [score[0]['rouge-2']['f'] for score in rouge_scores]
mean_rouge_score = statistics.mean(rouge2_values)
-print(f'Rouge-2:{mean_rouge_score}')
+print(f'Rouge-2 mean score:{mean_rouge_score}')
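
rouge_eval itself is not part of this diff; judging by the indexing above it returns a list of rouge.get_scores outputs. For reference, the shape of one such entry from the rouge package:

from rouge import Rouge

rouge = Rouge()
print(rouge.get_scores("a fine tuned qa model", "a fine qa model"))
# -> [{'rouge-1': {'r': ..., 'p': ..., 'f': ...}, 'rouge-2': {...}, 'rouge-l': {...}}]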