From 6159a93471f5cbf49fb9d75b6c1dc1b314b99245 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=C3=A1vid=20Omasta?=
Date: Sat, 17 Feb 2024 19:00:49 +0000
Subject: [PATCH] Upload files to ''

---
 .env               |  5 ++++
 docker-compose.yml | 34 +++++++++++++++++++++
 train.py           | 41 +++++++++++++++++--------
 usecase.py         | 74 +++++++++++++++++++++++++++++-----------------
 4 files changed, 114 insertions(+), 40 deletions(-)
 create mode 100644 .env
 create mode 100644 docker-compose.yml

diff --git a/.env b/.env
new file mode 100644
index 0000000..421c141
--- /dev/null
+++ b/.env
@@ -0,0 +1,5 @@
+URL="http://backend_inference:8000/predict"
+PORT="8090"
+HOST="localhost"
+QA_MODEL="qa_model"
+QA_TOKENIZER="qa_tokenizer"
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..8a00ef6
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,34 @@
+version: '3.3'
+services:
+  backend:
+    #build: ./backend
+    image: backend:test
+    container_name: backend_inference
+    ports:
+      - 8090:8090
+    networks:
+      - semantic # fill in your own task name
+    volumes:
+      - ./.env:/app/.env
+      - ./qa_model:/app/qa_model
+      - ./qa_tokenizer:/app/qa_tokenizer
+
+    restart: always
+
+  frontend:
+    #build: ./frontend
+    image: streamlit:dev
+    container_name: streamlit
+    ports:
+      - 8501:8501
+    depends_on:
+      - backend
+    links:
+      - backend
+    networks:
+      - semantic
+    restart: always
+    volumes:
+      - ./.env:/app/.env
+networks:
+  semantic:
\ No newline at end of file
diff --git a/train.py b/train.py
index 2429684..301075c 100644
--- a/train.py
+++ b/train.py
@@ -37,13 +37,20 @@ Q_LEN = 256 # Question Length
 T_LEN = 32 # Target Length
 BATCH_SIZE = 4 # data batch
 print("Model succesfully loaded")
+from datasets import load_dataset

-path_train = '/home/omasta/T5_JUPYTER/skquad-221017/train-v1.json'
-
+dataset = load_dataset("squad_v2")
+print(dataset["train"][0])
+#path_train = '/home/omasta/T5_JUPYTER/skquad-221017/train-v1.json'
+path_train = "poquad-train.json"
 with open(path_train) as f:
     data = json.load(f)

+def nahradit_znaky(retezec):
+    novy_retezec = retezec.replace('[', ' ').replace(']', ' ')
+    return novy_retezec
+
 def prepare_data(data):
     articles = []
     for article in data["data"]:
@@ -60,15 +67,28 @@ def prepare_data(data):
             articles.append(inputs)
     return articles

+def prep_data(data):
+    arcs = list()
+    for i in range(len(data)):
+        questions=data[i]["question"]
+        try:
+            answer = nahradit_znaky(', '.join(data[i]["answers"]["text"]))
+        except KeyError:
+            continue
+        context = data[i]["context"]
+        inputs = {"input":context+""+questions,"answer":answer}
+        arcs.append(inputs)
+    return arcs

-prepared_data = prepare_data(data)
+#print(dataset["train"][0]["answers"]["text"])
+
+prepared_data=prep_data(dataset["train"])
+#prepared_data = prepare_data(data)
 print(prepared_data[0])

 #Dataframe
 data = pd.DataFrame(prepared_data)
-
-
 class QA_Dataset(Dataset):
     def __init__(self, tokenizer, dataframe, q_len, t_len):
         self.tokenizer = tokenizer
@@ -113,18 +133,13 @@ train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampl
 val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

 print("Loaders working fine")
-
-
-
 ### TRAINING (46MINS ACCORDING THE V1_DATA)
-
-
 train_loss = 0
 val_loss = 0
 train_batch_count = 0
 val_batch_count = 0

-for epoch in range(4):
+for epoch in range(2):
     MODEL.train()
     for batch in tqdm(train_loader, desc="Training batches"):
         input_ids = batch["input_ids"].to(DEVICE)
@@ -171,5 +186,5 @@ for epoch in range(4):
 print("Training done succesfully")

 ## SAVE FINE_TUNED MODEL
-MODEL.save_pretrained("qa_model_mT5_small")
-TOKENIZER.save_pretrained('qa_tokenizer_mT5_small')
+MODEL.save_pretrained("qa_model_mT5_english")
+TOKENIZER.save_pretrained('qa_tokenizer_mT5_english')
diff --git a/usecase.py b/usecase.py
index 1a32a39..16f2f43 100644
--- a/usecase.py
+++ b/usecase.py
@@ -11,6 +11,11 @@ import warnings
 warnings.filterwarnings("ignore")
 ##13/03/23 added
 from rouge import Rouge
+from tqdm import tqdm
+from datasets import load_dataset
+import re
+##CUSTOM ROUGE METRIC - NEW TODO:
+

 # Model name
 DEVICE ='cuda:0'
@@ -22,9 +27,9 @@ DEVICE ='cuda:0'
 #tokenizer_dir = "/home/omasta/T5_JUPYTER/qa_tokenizer"

 #mT5 SMALL MODEL
-model_name = 'mT5_SMALL'
-model_dir = '/home/omasta/T5_JUPYTER/qa_model_mT5_small'
-tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer_mT5_small'
+model_name = 'qa_model'
+model_dir = '/home/omasta/T5_JUPYTER/qa_model_mT5_polish'
+tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer_mT5_polish'

 # Load the model from the directory
 MODEL = T5ForConditionalGeneration.from_pretrained(model_dir, from_tf=False, return_dict=True).to(DEVICE)
@@ -35,9 +40,14 @@ Q_LEN = 512
 TOKENIZER.add_tokens('')
 MODEL.resize_token_embeddings(len(TOKENIZER))

+def nahradit_znaky(retezec):
+    novy_retezec = retezec.replace('[', ' ').replace(']', ' ')
+    return novy_retezec
+
+
 def predict_answer(data, ref_answer=None,random=None):
     predictions=[]
-    for i in data:
+    for i in tqdm(data,desc="predicting"):
         inputs = TOKENIZER(i['input'], max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
         input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
         attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
@@ -47,14 +57,14 @@ def predict_answer(data, ref_answer=None,random=None):
         #print(ref_answer)
         if ref_answer:
             # Load the Bleu metric
-            bleu = evaluate.load("google_bleu")
+            #bleu = evaluate.load("google_bleu")
             #print('debug')
             #precision = list(precision_score(ref_answer, predicted_answer))
             #recall = list(recall_score(ref_answer, predicted_answer))
             #f1 = list(f1_score(ref_answer, predicted_answer))
-            score = bleu.compute(predictions=[predicted_answer],
-                references=[ref_answer])
-            predictions.append({'prediction':predicted_answer,'ref_answer':ref_answer,'score':score['google_bleu']})
+            #score = bleu.compute(predictions=[predicted_answer],
+            #    references=[ref_answer])
+            predictions.append({'prediction':predicted_answer,'ref_answer':ref_answer})
     return predictions

 def prepare_data(data):
@@ -66,19 +76,29 @@ def prepare_data(data):
             answer = qa["answers"][0]["text"]
             inputs = {"input": paragraph["context"]+ "" + question, "answer": answer}
             articles.append(inputs)
-
     return articles

-dev_data_path = '/home/omasta/T5_JUPYTER/skquad-221017/dev-v1.json'
-with open(dev_data_path,'r') as f:
-    data=json.load(f)
-#print('data imported')
+def prepare_polish_data(data):
+    arcs = list()
+    for i in range(len(data)):
+        questions=data[i]["question"]
+        try:
+            answer = nahradit_znaky(', '.join(data[i]["answers"]["text"]))
+        except KeyError:
+            continue
+        context = data[i]["context"]
+        inputs = {"input":context+""+questions,"answer":answer}
+        arcs.append(inputs)
+    return arcs

-dev_data = prepare_data(data)
+
+#dataset = load_dataset("clarin-pl/poquad")
+dataset = load_dataset("squad_v2")
+dev_data = prepare_polish_data(dataset["validation"])
 #print('data prepared')

 print(f'Number of dev samples {len(dev_data)}')
-print(dev_data[0])
+#print(dev_data[0])

 bleu_score = []
 precisions=[]
 f1_scores=[]
@@ -88,10 +108,9 @@ rouge_2 = []
 #X = 150
 evaluate = predict_answer(dev_data)
 rouge = Rouge()
-for item in evaluate:
-    bleu_score.append(item['score'])
+for item in tqdm(evaluate,desc="evaluating"):
     try:
-        #scores = rouge.get_scores(item['prediction'], item['ref_answer'], avg=True)
+        scores = rouge.get_scores(item['prediction'], item['ref_answer'])
         precision=precision_score(list(item['ref_answer']), list(item['prediction']),average='macro')
         recall=recall_score(list(item['ref_answer']), list(item['prediction']),average='macro')
         f1=f1_score(list(item['ref_answer']), list(item['prediction']),average='macro')
@@ -119,21 +138,22 @@ def rouge_eval(dict_x):
 print(f'VYHODNOTENIE VYSLEDKOV : ------------------------')
 #print(evaluate)
 #bleu_score_total = statistics.mean(bleu_score)
-#recall_score_total= statistics.mean(recall_scores)
-#f1_score_total = statistics.mean(f1_scores)
-#precision_total = statistics.mean(precisions)
+recall_score_total= statistics.mean(recall_scores)
+f1_score_total = statistics.mean(f1_scores)
+precision_total = statistics.mean(precisions)
 #print(f'Bleu_score of model {model_name} : ',bleu_score_total)
-#print(f'Recall of model {model_name}: ',recall_score_total)
-#print(f'F1 of model {model_name} : ', f1_score_total)
-#print(f'Precision of model {model_name}: :',precision_total)
-#print(rouge_eval(evaluate))
+print(f'Recall of model {model_name}: ',recall_score_total)
+print(f'F1 of model {model_name} : ', f1_score_total)
+print(f'Precision of model {model_name}: :',precision_total)
+print(model_dir)
+print(rouge_eval(evaluate))

 print(f'{model_name} results')
 rouge_scores = rouge_eval(evaluate)
 rouge_values = [score[0]['rouge-1']['f'] for score in rouge_scores]
 mean_rouge_score = statistics.mean(rouge_values)
-print(f'Rouge:{mean_rouge_score}')
+print(f'Rouge mean score:{mean_rouge_score}')

 rouge2_values = [score[0]['rouge-2']['f'] for score in rouge_scores]
 mean_rouge_score =statistics.mean(rouge2_values)
-print(f'Rouge-2:{mean_rouge_score}')
+print(f'Rouge-2 mean score:{mean_rouge_score}')
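For reference, a minimal inference sketch showing how the checkpoint that train.py saves above ("qa_model_mT5_english" / "qa_tokenizer_mT5_english") could be loaded and queried. It mirrors the context + "" + question input format used in usecase.py; the T5Tokenizer class, the example context and question, and the generation settings are assumptions, not code shipped in this patch.

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Directories written by train.py's save_pretrained calls above
MODEL = T5ForConditionalGeneration.from_pretrained("qa_model_mT5_english").to(DEVICE)
TOKENIZER = T5Tokenizer.from_pretrained("qa_tokenizer_mT5_english")

# usecase.py builds model inputs as context + "" + question (example text is hypothetical)
context = "SQuAD v2 combines answerable questions with unanswerable ones."
question = "What does SQuAD v2 combine?"
inputs = TOKENIZER(context + "" + question, max_length=512, padding="max_length",
                   truncation=True, return_tensors="pt").to(DEVICE)

with torch.no_grad():
    output_ids = MODEL.generate(input_ids=inputs["input_ids"],
                                attention_mask=inputs["attention_mask"],
                                max_length=32)
print(TOKENIZER.decode(output_ids[0], skip_special_tokens=True))

With the backend and frontend images built, docker compose up starts the backend_inference container on port 8090 and the Streamlit frontend on port 8501, mounting ./.env, ./qa_model and ./qa_tokenizer into the backend as configured in docker-compose.yml.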