Upload files to ''
This commit is contained in:
parent
24dc8fd808
commit
6159a93471
5
.env
Normal file
5
.env
Normal file
@ -0,0 +1,5 @@
|
||||
URL="http://backend_inference:8000/predict"
|
||||
PORT="8090"
|
||||
HOST="localhost"
|
||||
QA_MODEL="qa_model"
|
||||
QA_TOKENIZER="qa_tokenizer"
|
34
docker-compose.yml
Normal file
34
docker-compose.yml
Normal file
@ -0,0 +1,34 @@
|
||||
version: '3.3'
|
||||
services:
|
||||
backend:
|
||||
#build: ./backend
|
||||
image: backend:test
|
||||
container_name: backend_inference
|
||||
ports:
|
||||
- 8090:8090
|
||||
networks:
|
||||
- semantic #dopis svoj nazov taskov
|
||||
volumes:
|
||||
- ./.env:/app/.env
|
||||
- ./qa_model:/app/qa_model
|
||||
- ./qa_tokenizer:/app/qa_tokenizer
|
||||
|
||||
restart: always
|
||||
|
||||
frontend:
|
||||
#build: ./frontend
|
||||
image: streamlit:dev
|
||||
container_name: streamlit
|
||||
ports:
|
||||
- 8501:8501
|
||||
depends_on:
|
||||
- backend
|
||||
links:
|
||||
- backend
|
||||
networks:
|
||||
- semantic
|
||||
restart: always
|
||||
volumes:
|
||||
- ./.env:/app/.env
|
||||
networks:
|
||||
semantic:
|
41
train.py
41
train.py
@ -37,13 +37,20 @@ Q_LEN = 256 # Question Length
|
||||
T_LEN = 32 # Target Length
|
||||
BATCH_SIZE = 4 #dávka dát
|
||||
print("Model succesfully loaded")
|
||||
from datasets import load_dataset
|
||||
|
||||
path_train = '/home/omasta/T5_JUPYTER/skquad-221017/train-v1.json'
|
||||
|
||||
dataset = load_dataset("squad_v2")
|
||||
print(dataset["train"][0])
|
||||
#path_train = '/home/omasta/T5_JUPYTER/skquad-221017/train-v1.json'
|
||||
path_train = "poquad-train.json"
|
||||
with open(path_train) as f:
|
||||
data = json.load(f)
|
||||
|
||||
|
||||
def nahradit_znaky(retezec):
    """Return ``retezec`` with every square bracket replaced by a space.

    Each '[' and ']' is swapped one-for-one for a single space, so the
    result has the same length as the input.

    Args:
        retezec: The string to clean.

    Returns:
        A new string without '[' or ']' characters.
    """
    # One translation pass handles both bracket characters.
    bracket_to_space = str.maketrans({'[': ' ', ']': ' '})
    return retezec.translate(bracket_to_space)
|
||||
|
||||
def prepare_data(data):
|
||||
articles = []
|
||||
for article in data["data"]:
|
||||
@ -60,15 +67,28 @@ def prepare_data(data):
|
||||
articles.append(inputs)
|
||||
|
||||
return articles
|
||||
def prep_data(data):
    """Flatten question-answering rows into input/answer pairs.

    Args:
        data: An iterable of rows. Each row is read through the keys
            "question", "context" and "answers" -> "text" (an iterable
            of strings).

    Returns:
        A list of dicts with two keys: "input" is the context followed
        by "<sep>" and the question; "answer" is all answer texts joined
        with ", " and passed through ``nahradit_znaky``.

    Raises:
        KeyError: If a row has no "question" key, or has answers but no
            "context" key.
    """
    arcs = []
    # Iterate the rows directly so each row is fetched only once.
    for row in data:
        question = row["question"]
        try:
            answer = nahradit_znaky(', '.join(row["answers"]["text"]))
        except KeyError:
            # Rows without an "answers"/"text" entry are skipped.
            continue
        context = row["context"]
        arcs.append({"input": context + "<sep>" + question, "answer": answer})
    return arcs
|
||||
|
||||
prepared_data = prepare_data(data)
|
||||
#print(dataset["train"][0]["answers"]["text"])
|
||||
|
||||
prepared_data=prep_data(dataset["train"])
|
||||
#prepared_data = prepare_data(data)
|
||||
print(prepared_data[0])
|
||||
|
||||
#Dataframe
|
||||
data = pd.DataFrame(prepared_data)
|
||||
|
||||
|
||||
|
||||
class QA_Dataset(Dataset):
|
||||
def __init__(self, tokenizer, dataframe, q_len, t_len):
|
||||
self.tokenizer = tokenizer
|
||||
@ -113,18 +133,13 @@ train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampl
|
||||
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)
|
||||
print("Loaders working fine")
|
||||
|
||||
|
||||
|
||||
|
||||
### TRAINING (46MINS ACCORDING THE V1_DATA)
|
||||
|
||||
|
||||
train_loss = 0
|
||||
val_loss = 0
|
||||
train_batch_count = 0
|
||||
val_batch_count = 0
|
||||
|
||||
for epoch in range(4):
|
||||
for epoch in range(2):
|
||||
MODEL.train()
|
||||
for batch in tqdm(train_loader, desc="Training batches"):
|
||||
input_ids = batch["input_ids"].to(DEVICE)
|
||||
@ -171,5 +186,5 @@ for epoch in range(4):
|
||||
print("Training done succesfully")
|
||||
|
||||
## SAVE FINE_TUNED MODEL
|
||||
MODEL.save_pretrained("qa_model_mT5_small")
|
||||
TOKENIZER.save_pretrained('qa_tokenizer_mT5_small')
|
||||
MODEL.save_pretrained("qa_model_mT5_english")
|
||||
TOKENIZER.save_pretrained('qa_tokenizer_mT5_english')
|
||||
|
74
usecase.py
74
usecase.py
@ -11,6 +11,11 @@ import warnings
|
||||
warnings.filterwarnings("ignore")
|
||||
##13/03/23 added
|
||||
from rouge import Rouge
|
||||
from tqdm import tqdm
|
||||
from datasets import load_dataset
|
||||
import re
|
||||
##CUSTOM ROUGE METRIC - NEW TODO:
|
||||
|
||||
|
||||
# Názov modelu
|
||||
DEVICE ='cuda:0'
|
||||
@ -22,9 +27,9 @@ DEVICE ='cuda:0'
|
||||
#tokenizer_dir = "/home/omasta/T5_JUPYTER/qa_tokenizer"
|
||||
|
||||
#mT5 SMALL MODEL
|
||||
model_name = 'mT5_SMALL'
|
||||
model_dir = '/home/omasta/T5_JUPYTER/qa_model_mT5_small'
|
||||
tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer_mT5_small'
|
||||
model_name = 'qa_model'
|
||||
model_dir = '/home/omasta/T5_JUPYTER/qa_model_mT5_polish'
|
||||
tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer_mT5_polish'
|
||||
|
||||
#Načítanie modelu z adresára
|
||||
MODEL = T5ForConditionalGeneration.from_pretrained(model_dir, from_tf=False, return_dict=True).to(DEVICE)
|
||||
@ -35,9 +40,14 @@ Q_LEN = 512
|
||||
TOKENIZER.add_tokens('<sep>')
|
||||
MODEL.resize_token_embeddings(len(TOKENIZER))
|
||||
|
||||
def nahradit_znaky(retezec):
    """Return ``retezec`` with every square bracket replaced by a space.

    Each '[' and ']' is swapped one-for-one for a single space, so the
    result has the same length as the input.

    Args:
        retezec: The string to clean.

    Returns:
        A new string without '[' or ']' characters.
    """
    # One translation pass handles both bracket characters.
    bracket_to_space = str.maketrans({'[': ' ', ']': ' '})
    return retezec.translate(bracket_to_space)
|
||||
|
||||
|
||||
def predict_answer(data, ref_answer=None,random=None):
|
||||
predictions=[]
|
||||
for i in data:
|
||||
for i in tqdm(data,desc="predicting"):
|
||||
inputs = TOKENIZER(i['input'], max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
|
||||
input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
|
||||
attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
|
||||
@ -47,14 +57,14 @@ def predict_answer(data, ref_answer=None,random=None):
|
||||
#print(ref_answer)
|
||||
if ref_answer:
|
||||
# Load the Bleu metric
|
||||
bleu = evaluate.load("google_bleu")
|
||||
#bleu = evaluate.load("google_bleu")
|
||||
#print('debug')
|
||||
#precision = list(precision_score(ref_answer, predicted_answer))
|
||||
#recall = list(recall_score(ref_answer, predicted_answer))
|
||||
#f1 = list(f1_score(ref_answer, predicted_answer))
|
||||
score = bleu.compute(predictions=[predicted_answer],
|
||||
references=[ref_answer])
|
||||
predictions.append({'prediction':predicted_answer,'ref_answer':ref_answer,'score':score['google_bleu']})
|
||||
#score = bleu.compute(predictions=[predicted_answer],
|
||||
# references=[ref_answer])
|
||||
predictions.append({'prediction':predicted_answer,'ref_answer':ref_answer})
|
||||
return predictions
|
||||
|
||||
def prepare_data(data):
|
||||
@ -66,19 +76,29 @@ def prepare_data(data):
|
||||
answer = qa["answers"][0]["text"]
|
||||
inputs = {"input": paragraph["context"]+ "<sep>" + question, "answer": answer}
|
||||
articles.append(inputs)
|
||||
|
||||
return articles
|
||||
|
||||
dev_data_path = '/home/omasta/T5_JUPYTER/skquad-221017/dev-v1.json'
|
||||
with open(dev_data_path,'r') as f:
|
||||
data=json.load(f)
|
||||
#print('data imported')
|
||||
def prepare_polish_data(data):
    """Flatten question-answering rows into input/answer pairs.

    Args:
        data: An iterable of rows. Each row is read through the keys
            "question", "context" and "answers" -> "text" (an iterable
            of strings).

    Returns:
        A list of dicts with two keys: "input" is the context followed
        by "<sep>" and the question; "answer" is all answer texts joined
        with ", " and passed through ``nahradit_znaky``.

    Raises:
        KeyError: If a row has no "question" key, or has answers but no
            "context" key.
    """
    arcs = []
    # Iterate the rows directly so each row is fetched only once.
    for row in data:
        question = row["question"]
        try:
            answer = nahradit_znaky(', '.join(row["answers"]["text"]))
        except KeyError:
            # Rows without an "answers"/"text" entry are skipped.
            continue
        context = row["context"]
        arcs.append({"input": context + "<sep>" + question, "answer": answer})
    return arcs
|
||||
|
||||
dev_data = prepare_data(data)
|
||||
|
||||
#dataset = load_dataset("clarin-pl/poquad")
|
||||
dataset = load_dataset("squad_v2")
|
||||
dev_data = prepare_polish_data(dataset["validation"])
|
||||
|
||||
#print('data prepared')
|
||||
print(f'Number of dev samples {len(dev_data)}')
|
||||
print(dev_data[0])
|
||||
#print(dev_data[0])
|
||||
bleu_score = []
|
||||
precisions=[]
|
||||
f1_scores=[]
|
||||
@ -88,10 +108,9 @@ rouge_2 = []
|
||||
#X = 150
|
||||
evaluate = predict_answer(dev_data)
|
||||
rouge = Rouge()
|
||||
for item in evaluate:
|
||||
bleu_score.append(item['score'])
|
||||
for item in tqdm(evaluate,desc="evaluating"):
|
||||
try:
|
||||
#scores = rouge.get_scores(item['prediction'], item['ref_answer'], avg=True)
|
||||
scores = rouge.get_scores(item['prediction'], item['ref_answer'])
|
||||
precision=precision_score(list(item['ref_answer']), list(item['prediction']),average='macro')
|
||||
recall=recall_score(list(item['ref_answer']), list(item['prediction']),average='macro')
|
||||
f1=f1_score(list(item['ref_answer']), list(item['prediction']),average='macro')
|
||||
@ -119,21 +138,22 @@ def rouge_eval(dict_x):
|
||||
print(f'VYHODNOTENIE VYSLEDKOV : ------------------------')
|
||||
#print(evaluate)
|
||||
#bleu_score_total = statistics.mean(bleu_score)
|
||||
#recall_score_total= statistics.mean(recall_scores)
|
||||
#f1_score_total = statistics.mean(f1_scores)
|
||||
#precision_total = statistics.mean(precisions)
|
||||
recall_score_total= statistics.mean(recall_scores)
|
||||
f1_score_total = statistics.mean(f1_scores)
|
||||
precision_total = statistics.mean(precisions)
|
||||
#print(f'Bleu_score of model {model_name} : ',bleu_score_total)
|
||||
#print(f'Recall of model {model_name}: ',recall_score_total)
|
||||
#print(f'F1 of model {model_name} : ', f1_score_total)
|
||||
#print(f'Precision of model {model_name}: :',precision_total)
|
||||
#print(rouge_eval(evaluate))
|
||||
print(f'Recall of model {model_name}: ',recall_score_total)
|
||||
print(f'F1 of model {model_name} : ', f1_score_total)
|
||||
print(f'Precision of model {model_name}: :',precision_total)
|
||||
print(model_dir)
|
||||
print(rouge_eval(evaluate))
|
||||
print(f'{model_name} results')
|
||||
rouge_scores = rouge_eval(evaluate)
|
||||
rouge_values = [score[0]['rouge-1']['f'] for score in rouge_scores]
|
||||
mean_rouge_score = statistics.mean(rouge_values)
|
||||
print(f'Rouge:{mean_rouge_score}')
|
||||
print(f'Rouge mean score:{mean_rouge_score}')
|
||||
|
||||
rouge2_values = [score[0]['rouge-2']['f'] for score in rouge_scores]
|
||||
mean_rouge_score =statistics.mean(rouge2_values)
|
||||
print(f'Rouge-2:{mean_rouge_score}')
|
||||
print(f'Rouge-2 mean score:{mean_rouge_score}')
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user