Compare commits

4 commits: master ... UPDATE_202

Commits (SHA1):
9efdf8a61f
6159a93471
24dc8fd808
6e57cab13f
.env (new file, 5 lines)
@@ -0,0 +1,5 @@
URL="http://backend_inference:8000/predict"
PORT="8090"
HOST="localhost"
QA_MODEL="qa_model"
QA_TOKENIZER="qa_tokenizer"
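For orientation, a minimal sketch of how these values are consumed at runtime (python-dotenv loads the file into the process environment, as Backend/api.py and Streamlit/aplication.py do below; the variable names are exactly the ones defined above):

import os
from dotenv import load_dotenv

load_dotenv()                      # read .env from the working directory
url = os.getenv("URL")             # backend prediction endpoint
port = int(os.getenv("PORT"))      # port for direct uvicorn runs
host = os.getenv("HOST")
model_dir = os.getenv("QA_MODEL")  # local model/tokenizer directories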
Backend/Dockerfile (new file, 55 lines)
@@ -0,0 +1,55 @@
FROM python:3.10-slim-bullseye AS base

WORKDIR /app

# Set utf-8 encoding for Python et al
ENV LANG=C.UTF-8 \
    # Turn off writing .pyc files
    PYTHONDONTWRITEBYTECODE=1 \
    # Reduce OS system calls; for this tool it makes a difference
    PYTHONUNBUFFERED=1 \
    # Disable the cache dir in pip
    PIP_NO_CACHE_DIR=1 \
    # Virtual environment
    VENV="/opt/venv" \
    # Name of the unprivileged user added below
    APPUSER=appuser \
    # Ensure that the python and pip executables used in the image come from the venv
    PATH="${VENV}/bin:$PATH"


FROM base AS builder

COPY requirements.txt .
RUN apt-get update \
    && apt-get install -y git build-essential

RUN python -m venv ${VENV} \
    && . ${VENV}/bin/activate \
    && pip install --upgrade pip \
    && pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu \
    && pip install -r requirements.txt


FROM base AS runner

COPY api.py .

COPY --from=builder ${VENV} ${VENV}
ENV PATH="${VENV}/bin:$PATH"

# Update permissions & change user so the container does not run as root
RUN chgrp -R 0 /app \
    && chmod -R g=u /app \
    && groupadd -r ${APPUSER} \
    && useradd -r -g ${APPUSER} ${APPUSER} \
    && chown -R ${APPUSER}:${APPUSER} /app \
    && usermod -d /app ${APPUSER}
# Switch to the unprivileged user set up above
USER ${APPUSER}

CMD ["/opt/venv/bin/uvicorn", "api:app", "--host", "0.0.0.0"]
Backend/api.py (new file, 57 lines)
@@ -0,0 +1,57 @@
import torch
import uvicorn
import warnings
import os
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, T5ForConditionalGeneration
from dotenv import load_dotenv
#from ece import compute_ECE

warnings.filterwarnings("ignore")
DEVICE = 'cpu'

load_dotenv()
host = os.getenv("HOST")
port = os.getenv("PORT")

model_dir = os.getenv("QA_MODEL")
#model_dir = "C:/Users/david/Desktop/T5_JUPYTER/qa_model"
tokenizer_dir = os.getenv("QA_TOKENIZER")
#tokenizer_dir = "C:/Users/david/Desktop/T5_JUPYTER/qa_tokenizer"

MODEL = T5ForConditionalGeneration.from_pretrained(model_dir, from_tf=False, return_dict=True).to(DEVICE)
print("Model successfully loaded!")
TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)
print("Tokenizer successfully loaded!")
Q_LEN = 512
TOKENIZER.add_tokens('<sep>')

app = FastAPI()

# Request schema
class InputData(BaseModel):
    context: str
    question: str

@app.post("/predict")
async def predict(input_data: InputData):
    inputs = TOKENIZER(input_data.question, input_data.context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask, return_dict_in_generate=True, output_scores=True, max_length=512)
    predicted_ids = outputs.sequences.numpy()
    predicted_text = TOKENIZER.decode(predicted_ids[0], skip_special_tokens=True)
    return {'prediction': predicted_text}

if __name__ == "__main__":
    # PORT comes from .env as a string; uvicorn expects an int
    uvicorn.run(app, host=host, port=int(port))
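A quick client-side check of the endpoint above; a sketch assuming the backend is reachable on localhost:8090 as published in docker-compose.yml (the payload fields follow the InputData schema):

import requests

payload = {"context": "Python is a programming language.",
           "question": "What is Python?"}
resp = requests.post("http://localhost:8090/predict", json=payload)
resp.raise_for_status()
print(resp.json())   # -> {'prediction': '...'}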
Backend/requirements.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
uvicorn==0.23.2
fastapi==0.103.2
transformers==4.34.0
rank_bm25==0.2.2
python-dotenv
Streamlit/Dockerfile (new file, 51 lines)
@@ -0,0 +1,51 @@
FROM python:3.10-slim-bullseye AS base

WORKDIR /app

# Set utf-8 encoding for Python et al
ENV LANG=C.UTF-8 \
    # Turn off writing .pyc files
    PYTHONDONTWRITEBYTECODE=1 \
    # Reduce OS system calls; for this tool it makes a difference
    PYTHONUNBUFFERED=1 \
    # Disable the cache dir in pip
    PIP_NO_CACHE_DIR=1 \
    # Virtual environment
    VENV="/opt/venv" \
    # Name of the unprivileged user added below
    APPUSER=appuser \
    # Ensure that the python and pip executables used in the image come from the venv
    PATH="${VENV}/bin:$PATH"


FROM base AS builder

COPY requirements.txt .

RUN apt-get update \
    && python -m venv ${VENV} \
    && . ${VENV}/bin/activate \
    && pip install --upgrade pip \
    && pip install -r requirements.txt


FROM base AS runner

COPY aplication.py .

COPY --from=builder ${VENV} ${VENV}
ENV PATH="${VENV}/bin:$PATH"

# Update permissions & change user so the container does not run as root
RUN chgrp -R 0 /app \
    && chmod -R g=u /app \
    && groupadd -r ${APPUSER} \
    && useradd -r -g ${APPUSER} ${APPUSER} \
    && chown -R ${APPUSER}:${APPUSER} /app \
    && usermod -d /app ${APPUSER}
# Switch to the unprivileged user set up above
USER ${APPUSER}

# HEALTHCHECK CMD curl --fail http://localhost/_stcore/health
CMD ["streamlit", "run", "aplication.py", "--server.address=0.0.0.0"]
Streamlit/aplication.py (new file, 36 lines)
@@ -0,0 +1,36 @@
import requests
import json
import streamlit as st
import os
from dotenv import load_dotenv

load_dotenv()

def predict(context, question):
    url = os.getenv("URL")
    #url = 'http://localhost:8090/predict'
    data = {'context': context, 'question': question}
    json_data = json.dumps(data)
    headers = {'Content-type': 'application/json'}
    response = requests.post(url, data=json_data, headers=headers)
    result = response.json()
    return result

def main():
    st.title("T5 model inference")

    # Fields for entering the two values
    context = st.text_input("context:")
    question = st.text_input("question:")

    # Button that triggers the request; only call the backend on click
    if st.button("Execute"):
        prediction = predict(context, question)
        st.json({
            'context': context,
            'question': question,
            'prediction': prediction
        })

if __name__ == "__main__":
    main()
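The predict() helper can also be exercised without the UI; a sketch assuming the backend is already running and URL is set in .env (the module import mirrors the file above and is otherwise hypothetical):

from aplication import predict  # main() stays behind the __name__ guard

print(predict("Python is a programming language.", "What is Python?"))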
Streamlit/requirements.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
requests
streamlit
python-dotenv
docker-compose.yml (new file, 34 lines)
@@ -0,0 +1,34 @@
version: '3.3'
services:
  backend:
    #build: ./backend
    image: backend:test
    container_name: backend_inference
    ports:
      - 8090:8090
    networks:
      - semantic # fill in your own task name
    volumes:
      - ./.env:/app/.env
      - ./qa_model:/app/qa_model
      - ./qa_tokenizer:/app/qa_tokenizer
    restart: always

  frontend:
    #build: ./frontend
    image: streamlit:dev
    container_name: streamlit
    ports:
      - 8501:8501
    depends_on:
      - backend
    links:
      - backend
    networks:
      - semantic
    restart: always
    volumes:
      - ./.env:/app/.env

networks:
  semantic:
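A small readiness probe for the two published ports above; a sketch assuming both containers run on the local host (the URLs use FastAPI's default /docs page and Streamlit's root, which are assumptions about the deployed services):

import requests

services = {"backend": "http://localhost:8090/docs",
            "frontend": "http://localhost:8501"}
for name, url in services.items():
    try:
        requests.get(url, timeout=2).raise_for_status()
        print(name, "is up")
    except requests.RequestException as exc:
        print(name, "is down:", exc)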
new_train.py (new file, 163 lines)
@@ -0,0 +1,163 @@
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import string
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
#from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast
from transformers import AutoTokenizer, T5ForConditionalGeneration
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

print("Imports successfully done")

DEVICE = 'cuda:0'
TOKENIZER = AutoTokenizer.from_pretrained('google/umt5-small')
TOKENIZER.add_tokens('<sep>')
# use the same checkpoint family as the tokenizer (the saved artifacts below are named umT5)
MODEL = T5ForConditionalGeneration.from_pretrained("google/umt5-small").to(DEVICE)

# add the new token to the embedding matrix
MODEL.resize_token_embeddings(len(TOKENIZER))
# lr = learning rate = 1e-5
OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
Q_LEN = 256     # Question length
T_LEN = 32      # Target length
BATCH_SIZE = 4  # batch of data
print("Model successfully loaded")

from datasets import load_dataset

dataset_english = load_dataset("squad_v2")
dataset_slovak = load_dataset("TUKE-DeutscheTelekom/skquad")
dataset_polish = load_dataset("clarin-pl/poquad")

def prepare_data_english(data):
    articles = []
    for item in tqdm(data["train"], desc="Preparing training data"):
        context = item["context"]
        question = item["question"]
        try:
            start_position = item['answers']['answer_start'][0]
        except IndexError:
            continue
        text_length = len(item['answers']['text'][0])
        target_text = context[start_position : start_position + text_length]
        inputs = {"input": context + '<sep>' + question, "answer": target_text}
        articles.append(inputs)
    return articles

data_english = prepare_data_english(dataset_english)
data_polish = prepare_data_english(dataset_polish)
data_slovak = prepare_data_english(dataset_slovak)

train_data = data_slovak + data_english + data_polish
print("Training Samples : ", len(train_data))

#Dataframe
data = pd.DataFrame(train_data)

class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.input = self.data['input']
        #self.context = self.data["context"]
        self.answer = self.data['answer']

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        input = self.input[idx]
        answer = self.answer[idx]

        input_tokenized = self.tokenizer(input, max_length=self.q_len, padding="max_length",
                                         truncation=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                          truncation=True, add_special_tokens=True)

        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        labels[labels == 0] = -100  # ignore padding in the loss

        return {
            "input_ids": torch.tensor(input_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(input_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
train_sampler = RandomSampler(train_data.index)
val_sampler = RandomSampler(val_data.index)
qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)

train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)
print("Loaders working fine")

### TRAINING (46 MINS ACCORDING TO THE V1_DATA)
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0

#TODO
# Pick a sensible number of epochs
# Evaluate results and find out how to calculate a real ROUGE metric
for epoch in range(2):
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1

    # Evaluation: no gradient updates during validation
    MODEL.eval()
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation batches"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)
            decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

            outputs = MODEL(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                decoder_attention_mask=decoder_attention_mask
            )

            val_loss += outputs.loss.item()
            val_batch_count += 1

    print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss / val_batch_count}")

print("Training done successfully")

## SAVE FINE-TUNED MODEL
MODEL.save_pretrained("qa_model_umT5_small_3LANG")
TOKENIZER.save_pretrained('qa_tokenizer_umT5_small_3LANG')
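Once training finishes, the saved artifacts can be reloaded for a quick generation check; a sketch with an assumed context/question pair (the directory names are the ones saved above):

import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration

tok = AutoTokenizer.from_pretrained("qa_tokenizer_umT5_small_3LANG")
mod = T5ForConditionalGeneration.from_pretrained("qa_model_umT5_small_3LANG")
enc = tok("Python is a programming language.<sep>What is Python?", return_tensors="pt")
out = mod.generate(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])
print(tok.decode(out[0], skip_special_tokens=True))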
new_usecase.py (new file, 164 lines)
@@ -0,0 +1,164 @@
## IMPORT NECESSARY EQUIPMENT
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer
import torch
import json
import random
import statistics
import warnings
from tqdm import tqdm
from datasets import load_dataset
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer

rouge = evaluate.load('rouge')
warnings.filterwarnings("ignore")

DEVICE = 'cuda:0'

#Prepare data first
def prepare_data_english(data):
    articles = []
    for item in tqdm(data["validation"], desc="Preparing validation data"):
        context = item["context"]
        question = item["question"]
        try:
            start_position = item['answers']['answer_start'][0]
        except IndexError:
            continue
        text_length = len(item['answers']['text'][0])
        target_text = context[start_position : start_position + text_length]
        inputs = {"input": context + '<sep>' + question, "answer": target_text}
        articles.append(inputs)
    return articles

#Load the pretrained model
model_name = 'qa_model_T5-slovak'
model_dir = '/home/omasta/T5_JUPYTER/qa_model'
tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer'
MODEL = T5ForConditionalGeneration.from_pretrained(model_dir, from_tf=False, return_dict=True).to(DEVICE)
print("Model successfully loaded!")
TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)
print("Tokenizer successfully loaded!")
Q_LEN = 512
TOKENIZER.add_tokens('<sep>')
MODEL.resize_token_embeddings(len(TOKENIZER))

#Load datasets
#dataset_english = load_dataset("squad_v2")
dataset_slovak = load_dataset("TUKE-DeutscheTelekom/skquad")
#dataset_polish = load_dataset("clarin-pl/poquad")

#Prepare the data
#data_english = prepare_data_english(dataset_english)
#data_polish = prepare_data_english(dataset_polish)
data_slovak = prepare_data_english(dataset_slovak)
#Merge datasets
#val_data = data_slovak + data_english + data_polish
print("Val Samples : ", len(data_slovak))

def prediction_rouge(predictions, references):
    return rouge.compute(predictions=[predictions], references=[[references]])

def compute_bleu(reference, prediction):
    smoothie = SmoothingFunction().method4
    return sentence_bleu([reference.split()], prediction.split(), smoothing_function=smoothie)

def classic_metrics(sentence1, sentence2):
    if sentence1 == "" and sentence2 == "":
        return 0, 0, 0
    else:
        # Build a bag of words over both sentences
        vectorizer = CountVectorizer()
        try:
            bag_of_words = vectorizer.fit_transform([sentence1, sentence2])
        except ValueError:
            return 0, 0, 0
        # Get the count vectors for the two sentences
        vector1 = bag_of_words.toarray()[0]
        vector2 = bag_of_words.toarray()[1]

        # Compute the metrics
        precision = precision_score(vector1, vector2, average='weighted')
        recall = recall_score(vector1, vector2, average='weighted')
        f1 = f1_score(vector1, vector2, average='weighted')
        return float(precision), float(recall), float(f1)

def predict_answer(input, ref_answer, language):
    inputs = TOKENIZER(input, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    return {"pred": predicted_answer.lower(), "ref": ref_answer.lower(), "language": language}

def predict_and_save(val_data, lang):
    predictions = list()
    for i in tqdm(range(len(val_data)), desc="Predicting"):
        pred = predict_answer(val_data[i]["input"], val_data[i]["answer"], lang)
        predictions.append(pred)
    return predictions

#Predict
pred_slovak = predict_and_save(data_slovak, "sk")
#pred_english = predict_and_save(data_english, "en")
#pred_polish = predict_and_save(data_polish, "pl")

predictions = pred_slovak  # only the Slovak split is predicted above
#predictions = pred_slovak + pred_english + pred_polish

#Save the results for later
with open('predictions-t5.json', 'w') as json_file:
    json.dump(predictions, json_file)

#Compute metrics
with open("predictions-t5.json", "r") as json_file:
    data = json.load(json_file)

new_data = list()
language = "sk"
for item in data:
    if item["language"] == language:
        new_data.append(item)

bleu = list()
rouges = list()
precisions = list()
recalls = list()
f1s = list()

for item in tqdm(new_data, desc="Evaluating"):
    bleu.append(compute_bleu(item["pred"], item["ref"]))
    rouges.append(prediction_rouge(item["pred"], item["ref"]))
    precision, recall, f1 = classic_metrics(item["pred"], item["ref"])
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

#COMPUTATION OF METRICS
rouge1_values = [r['rouge1'] for r in rouges]
rouge2_values = [r['rouge2'] for r in rouges]
rougeL_values = [r['rougeL'] for r in rouges]

average_rouge1 = sum(rouge1_values) / len(rouges)
average_rouge2 = sum(rouge2_values) / len(rouges)
average_rougeL = sum(rougeL_values) / len(rouges)
print("Model name :", model_name)
print("Language :", language)
print("BLEU: ", sum(bleu) / len(bleu))
print("Recall :", sum(recalls) / len(recalls))
print("F1 : ", sum(f1s) / len(f1s))
print("Precision :", sum(precisions) / len(precisions))
print("Rouge-1 :", average_rouge1)
print("Rouge-2 :", average_rouge2)
print("Rouge-L :", average_rougeL)
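To make the bag-of-words metric in classic_metrics concrete, a tiny worked example with assumed sentences (not part of the script):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score

# classic_metrics treats the two count vectors as label sequences
bow = CountVectorizer().fit_transform(["the cat sat", "the cat ran"]).toarray()
# vocabulary: ['cat', 'ran', 'sat', 'the'] -> vectors [1,0,1,1] and [1,1,0,1]
print(precision_score(bow[0], bow[1], average="weighted"))  # 0.5: partial overlap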
train.py (modified)
@@ -37,13 +37,20 @@ Q_LEN = 256 # Question Length
 T_LEN = 32 # Target Length
 BATCH_SIZE = 4 # batch of data
 print("Model successfully loaded")
+from datasets import load_dataset
+
-path_train = '/home/omasta/T5_JUPYTER/skquad-221017/train-v1.json'
+dataset = load_dataset("squad_v2")
+print(dataset["train"][0])
+#path_train = '/home/omasta/T5_JUPYTER/skquad-221017/train-v1.json'
+path_train = "poquad-train.json"
 with open(path_train) as f:
     data = json.load(f)
+
+
+# replace '[' and ']' with spaces
+def nahradit_znaky(retezec):
+    novy_retezec = retezec.replace('[', ' ').replace(']', ' ')
+    return novy_retezec
+
 def prepare_data(data):
     articles = []
     for article in data["data"]:
@@ -60,15 +67,28 @@ def prepare_data(data):
         articles.append(inputs)

     return articles
+
+def prep_data(data):
+    arcs = list()
+    for i in range(len(data)):
+        questions = data[i]["question"]
+        try:
+            answer = nahradit_znaky(', '.join(data[i]["answers"]["text"]))
+        except KeyError:
+            continue
+        context = data[i]["context"]
+        inputs = {"input": context + "<sep>" + questions, "answer": answer}
+        arcs.append(inputs)
+    return arcs
+
-prepared_data = prepare_data(data)
+#print(dataset["train"][0]["answers"]["text"])
+
+prepared_data = prep_data(dataset["train"])
+#prepared_data = prepare_data(data)
 print(prepared_data[0])
+
 #Dataframe
 data = pd.DataFrame(prepared_data)
+
+
 class QA_Dataset(Dataset):
     def __init__(self, tokenizer, dataframe, q_len, t_len):
         self.tokenizer = tokenizer
@@ -113,18 +133,13 @@ train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampl
 val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)
 print("Loaders working fine")
-
-
-
-
 ### TRAINING (46 MINS ACCORDING TO THE V1_DATA)
-
-
 train_loss = 0
 val_loss = 0
 train_batch_count = 0
 val_batch_count = 0

-for epoch in range(4):
+for epoch in range(2):
     MODEL.train()
     for batch in tqdm(train_loader, desc="Training batches"):
         input_ids = batch["input_ids"].to(DEVICE)
@@ -171,5 +186,5 @@ for epoch in range(4):
 print("Training done successfully")

 ## SAVE FINE-TUNED MODEL
-MODEL.save_pretrained("qa_model_mT5_small")
-TOKENIZER.save_pretrained('qa_tokenizer_mT5_small')
+MODEL.save_pretrained("qa_model_mT5_english")
+TOKENIZER.save_pretrained('qa_tokenizer_mT5_english')
usecase.py (modified)
@@ -11,6 +11,11 @@ import warnings
 warnings.filterwarnings("ignore")
 ##13/03/23 added
 from rouge import Rouge
+from tqdm import tqdm
+from datasets import load_dataset
+import re
+##CUSTOM ROUGE METRIC - NEW TODO:
+
 # Model name
 DEVICE = 'cuda:0'
@@ -22,9 +27,9 @@ DEVICE ='cuda:0'
 #tokenizer_dir = "/home/omasta/T5_JUPYTER/qa_tokenizer"

 #mT5 SMALL MODEL
-model_name = 'mT5_SMALL'
-model_dir = '/home/omasta/T5_JUPYTER/qa_model_mT5_small'
-tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer_mT5_small'
+model_name = 'qa_model'
+model_dir = '/home/omasta/T5_JUPYTER/qa_model_mT5_polish'
+tokenizer_dir = '/home/omasta/T5_JUPYTER/qa_tokenizer_mT5_polish'

 # Load the model from the directory
 MODEL = T5ForConditionalGeneration.from_pretrained(model_dir, from_tf=False, return_dict=True).to(DEVICE)
@@ -35,9 +40,14 @@ Q_LEN = 512
 TOKENIZER.add_tokens('<sep>')
 MODEL.resize_token_embeddings(len(TOKENIZER))
+
+# replace '[' and ']' with spaces
+def nahradit_znaky(retezec):
+    novy_retezec = retezec.replace('[', ' ').replace(']', ' ')
+    return novy_retezec
+
 def predict_answer(data, ref_answer=None, random=None):
     predictions = []
-    for i in data:
+    for i in tqdm(data, desc="predicting"):
         inputs = TOKENIZER(i['input'], max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
         input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
         attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)
@@ -47,14 +57,14 @@ def predict_answer(data, ref_answer=None,random=None):
         #print(ref_answer)
         if ref_answer:
             # Load the Bleu metric
-            bleu = evaluate.load("google_bleu")
+            #bleu = evaluate.load("google_bleu")
             #print('debug')
             #precision = list(precision_score(ref_answer, predicted_answer))
             #recall = list(recall_score(ref_answer, predicted_answer))
             #f1 = list(f1_score(ref_answer, predicted_answer))
-            score = bleu.compute(predictions=[predicted_answer],
-                                 references=[ref_answer])
-            predictions.append({'prediction':predicted_answer,'ref_answer':ref_answer,'score':score['google_bleu']})
+            #score = bleu.compute(predictions=[predicted_answer],
+            #                     references=[ref_answer])
+            predictions.append({'prediction':predicted_answer,'ref_answer':ref_answer})
     return predictions

 def prepare_data(data):
@@ -66,19 +76,29 @@ def prepare_data(data):
             answer = qa["answers"][0]["text"]
             inputs = {"input": paragraph["context"] + "<sep>" + question, "answer": answer}
             articles.append(inputs)

     return articles

-dev_data_path = '/home/omasta/T5_JUPYTER/skquad-221017/dev-v1.json'
-with open(dev_data_path,'r') as f:
-    data=json.load(f)
-#print('data imported')
-dev_data = prepare_data(data)
+def prepare_polish_data(data):
+    arcs = list()
+    for i in range(len(data)):
+        questions = data[i]["question"]
+        try:
+            answer = nahradit_znaky(', '.join(data[i]["answers"]["text"]))
+        except KeyError:
+            continue
+        context = data[i]["context"]
+        inputs = {"input": context + "<sep>" + questions, "answer": answer}
+        arcs.append(inputs)
+    return arcs
+
+#dataset = load_dataset("clarin-pl/poquad")
+dataset = load_dataset("squad_v2")
+dev_data = prepare_polish_data(dataset["validation"])
+
 #print('data prepared')
 print(f'Number of dev samples {len(dev_data)}')
-print(dev_data[0])
+#print(dev_data[0])
 bleu_score = []
 precisions = []
 f1_scores = []
@@ -88,10 +108,9 @@ rouge_2 = []
 #X = 150
 evaluate = predict_answer(dev_data)
 rouge = Rouge()
-for item in evaluate:
-    bleu_score.append(item['score'])
+for item in tqdm(evaluate, desc="evaluating"):
     try:
-        #scores = rouge.get_scores(item['prediction'], item['ref_answer'], avg=True)
+        scores = rouge.get_scores(item['prediction'], item['ref_answer'])
         precision = precision_score(list(item['ref_answer']), list(item['prediction']), average='macro')
         recall = recall_score(list(item['ref_answer']), list(item['prediction']), average='macro')
         f1 = f1_score(list(item['ref_answer']), list(item['prediction']), average='macro')
@@ -119,21 +138,22 @@ def rouge_eval(dict_x):
 print(f'EVALUATION OF RESULTS : ------------------------')
 #print(evaluate)
 #bleu_score_total = statistics.mean(bleu_score)
-#recall_score_total= statistics.mean(recall_scores)
-#f1_score_total = statistics.mean(f1_scores)
-#precision_total = statistics.mean(precisions)
+recall_score_total = statistics.mean(recall_scores)
+f1_score_total = statistics.mean(f1_scores)
+precision_total = statistics.mean(precisions)
 #print(f'Bleu_score of model {model_name} : ',bleu_score_total)
-#print(f'Recall of model {model_name}: ',recall_score_total)
-#print(f'F1 of model {model_name} : ', f1_score_total)
-#print(f'Precision of model {model_name}: :',precision_total)
-#print(rouge_eval(evaluate))
+print(f'Recall of model {model_name}: ', recall_score_total)
+print(f'F1 of model {model_name} : ', f1_score_total)
+print(f'Precision of model {model_name}: :', precision_total)
+print(model_dir)
+print(rouge_eval(evaluate))
 print(f'{model_name} results')
 rouge_scores = rouge_eval(evaluate)
 rouge_values = [score[0]['rouge-1']['f'] for score in rouge_scores]
 mean_rouge_score = statistics.mean(rouge_values)
-print(f'Rouge:{mean_rouge_score}')
+print(f'Rouge mean score:{mean_rouge_score}')

 rouge2_values = [score[0]['rouge-2']['f'] for score in rouge_scores]
 mean_rouge_score = statistics.mean(rouge2_values)
-print(f'Rouge-2:{mean_rouge_score}')
+print(f'Rouge-2 mean score:{mean_rouge_score}')