# Source listing header (left by the code viewer): 317 lines, 8.3 KiB, Python
import json
import math
import os
import re
from pathlib import Path

# Set before torch / transformers are imported so the libraries see these
# values from the start.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from rouge_score import rouge_scorer
import sacrebleu
|
|
|
|
|
|
# Base checkpoint and the instruction dataset used for evaluation.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"

# LoRA adapter directory (described by the author as the
# Unsloth + TRL + QLoRA adapter).
ADAPTER_DIR = "/home/schwarc/diplomovka/mistral_sk_alpaca/mistral-sk-7b-alpaca-slovak-unsloth-lora-full"

# Output directory for evaluation artefacts; created as an import side effect.
PROJECT_DIR = Path.home().joinpath("diplomovka", "evaluation_results")
PROJECT_DIR.mkdir(parents=True, exist_ok=True)

# JSON file that receives the final metric dictionary.
METRICS_FILE = PROJECT_DIR.joinpath("unsloth_lora_metrics.json")

# Upper bound on evaluated examples; None means "use the whole split".
NUM_EVAL_SAMPLES = 1000
# Token limit applied when tokenizing prompts and full texts.
MAX_LENGTH = 1024
# Maximum number of tokens generated per answer.
MAX_NEW_TOKENS = 300

# Deterministic generation is preferable for evaluation.
DO_SAMPLE = False
|
|
|
|
|
|
def is_empty(value):
    """Return True when *value* carries no usable text.

    A value counts as empty when it is None, or when its string form,
    stripped of surrounding whitespace, is either the empty string or the
    literal "nan" in any letter case.
    """
    return value is None or str(value).strip().lower() in ("", "nan")
|
|
|
|
|
|
def make_prompt(example):
    """Build the instruction prompt for one dataset record.

    The prompt always contains the instruction section and ends with the
    answer header; the input section is inserted between them only when
    the record's "input" field is not empty according to ``is_empty``.

    Args:
        example: Mapping with an "instruction" key and an optional "input" key.

    Returns:
        The prompt string, ending right after the answer header line.
    """
    instruction = str(example["instruction"]).strip()
    input_text = example.get("input")

    sections = ["### Inštrukcia:\n", f"{instruction}\n\n"]
    if not is_empty(input_text):
        sections.append("### Vstup:\n")
        sections.append(f"{str(input_text).strip()}\n\n")
    sections.append("### Odpoveď:\n")

    return "".join(sections)
|
|
|
|
|
|
def make_full_text(example, tokenizer):
    """Return prompt + reference answer + EOS token as a single string.

    Args:
        example: Mapping with the prompt fields and an "output" key.
        tokenizer: Object exposing an ``eos_token`` string attribute.
    """
    prompt_part = make_prompt(example)
    answer_part = str(example["output"]).strip()
    return "".join((prompt_part, answer_part, tokenizer.eos_token))
|
|
|
|
|
|
def load_model():
    """Load the tokenizer and the 4-bit base model with the LoRA adapter attached.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is already in eval mode.
    """
    has_cuda = torch.cuda.is_available()
    print("CUDA available:", has_cuda)
    if has_cuda:
        print("GPU:", torch.cuda.get_device_name(0))

    print("Načítavam tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False)
    # Fall back to EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Načítavam základný model v 4-bit režime...")
    # NF4 4-bit weights with double quantization, computing in float16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    # NOTE(review): older transformers releases spell the dtype kwarg
    # `torch_dtype` — confirm against the installed version.
    backbone = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization,
        device_map={"": 0},  # put the whole model on device index 0
        dtype=torch.float16,
    )

    print("Pripájam LoRA adaptér...")
    model = PeftModel.from_pretrained(backbone, ADAPTER_DIR)
    model.eval()

    print("Model pripravený.")
    print("-" * 80)

    return model, tokenizer
|
|
|
|
|
|
def load_eval_dataset():
    """Load the "test" split, shuffle it with seed 42, and cap its size.

    When ``NUM_EVAL_SAMPLES`` is not None, at most that many examples are
    kept from the shuffled split.

    Returns:
        The (possibly truncated) shuffled evaluation dataset.
    """
    shuffled = load_dataset(DATASET_NAME)["test"].shuffle(seed=42)

    if NUM_EVAL_SAMPLES is None:
        subset = shuffled
    else:
        limit = min(NUM_EVAL_SAMPLES, len(shuffled))
        subset = shuffled.select(range(limit))

    print("Eval vzoriek:", len(subset))
    return subset
|
|
|
|
|
|
def generate_predictions(model, tokenizer, eval_dataset):
    """Generate one answer per evaluation example.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        eval_dataset: Iterable of records with prompt fields and "output".

    Returns:
        ``(predictions, references)`` — two parallel lists of stripped strings.
    """
    # Sampling-specific options; greedy decoding only needs do_sample=False.
    if DO_SAMPLE:
        sampling_options = {"do_sample": True, "temperature": 0.7, "top_p": 0.9}
    else:
        sampling_options = {"do_sample": False}

    generated_texts = []
    gold_texts = []

    for record in tqdm(eval_dataset, desc="Generujem odpovede"):
        prompt = make_prompt(record)
        gold = str(record["output"]).strip()

        encoded_prompt = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(model.device)
        prompt_token_count = encoded_prompt["input_ids"].shape[-1]

        with torch.no_grad():
            generated = model.generate(
                **encoded_prompt,
                max_new_tokens=MAX_NEW_TOKENS,
                repetition_penalty=1.1,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                **sampling_options,
            )

        # Decode only the continuation, i.e. the tokens after the prompt.
        continuation = generated[0][prompt_token_count:]
        answer = tokenizer.decode(continuation, skip_special_tokens=True).strip()

        generated_texts.append(answer)
        gold_texts.append(gold)

    return generated_texts, gold_texts
|
|
|
|
|
|
_ROUGE_TYPES = ("rouge1", "rouge2", "rougeL")

# Runs of Unicode letters/digits (underscore excluded).
_WORD_RE = re.compile(r"[^\W_]+")


class _UnicodeWordTokenizer:
    """Lower-casing word tokenizer that keeps non-ASCII letters intact.

    rouge_score's built-in tokenizer keeps only ``[a-z0-9]`` characters, which
    splits or drops Slovak words containing diacritics. RougeScorer only
    needs an object with a ``tokenize(text)`` method.
    """

    def tokenize(self, text):
        """Return the lower-cased alphanumeric tokens of *text*."""
        return _WORD_RE.findall(text.lower())


def compute_rouge(predictions, references):
    """Compute mean ROUGE-1 / ROUGE-2 / ROUGE-L F-measures.

    Args:
        predictions: Generated answers.
        references: Gold answers, parallel to ``predictions``.

    Returns:
        Dict with keys "rouge1", "rouge2" and "rougeL" holding the average
        F-measure over all pairs; all zeros when there are no pairs.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    predictions = list(predictions)
    references = list(references)

    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references "
            f"({len(references)}) must have the same length"
        )

    # Nothing to score: avoid dividing by zero below.
    if not predictions:
        return {name: 0.0 for name in _ROUGE_TYPES}

    scorer = rouge_scorer.RougeScorer(
        list(_ROUGE_TYPES),
        use_stemmer=False,
        # Unicode-aware tokenization so diacritics do not break words apart.
        tokenizer=_UnicodeWordTokenizer(),
    )

    totals = dict.fromkeys(_ROUGE_TYPES, 0.0)
    for prediction, reference in zip(predictions, references):
        scores = scorer.score(reference, prediction)
        for name in _ROUGE_TYPES:
            totals[name] += scores[name].fmeasure

    count = len(predictions)
    return {name: total / count for name, total in totals.items()}
|
|
|
|
|
|
def compute_bleu(predictions, references):
    """Return the corpus-level sacreBLEU score for the predictions.

    Args:
        predictions: Generated answers.
        references: Gold answers, parallel to ``predictions``; passed to
            sacreBLEU as a single reference stream.

    Returns:
        The ``score`` attribute of the sacreBLEU result.
    """
    reference_streams = [references]
    result = sacrebleu.corpus_bleu(predictions, reference_streams)
    return result.score
|
|
|
|
|
|
def compute_perplexity(model, tokenizer, eval_dataset):
    """Compute token-weighted loss and perplexity over the reference answers.

    Only answer tokens contribute to the loss: prompt positions are masked
    with -100. Each example is scored on its own, without padding.

    Args:
        model: Causal LM returning ``.loss`` when called with ``labels``.
        tokenizer: Tokenizer used to encode the prompt and the full text.
        eval_dataset: Iterable of records with prompt fields and "output".

    Returns:
        ``(avg_loss, perplexity)``; both are NaN when no answer token could
        be scored (empty dataset, or every answer truncated away).
    """
    total_loss = 0.0
    total_tokens = 0

    for example in tqdm(eval_dataset, desc="Počítam perplexitu"):
        prompt = make_prompt(example)
        full_text = make_full_text(example, tokenizer)

        # Tokenized separately only to learn how many leading tokens to mask.
        # NOTE(review): assumes the prompt tokenizes to the same prefix inside
        # the full text — confirm for this tokenizer.
        prompt_ids = tokenizer(
            prompt,
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
        )["input_ids"]

        # No padding: the batch holds one sequence, so padding to MAX_LENGTH
        # only wastes compute, and with a left-padding tokenizer it would shift
        # the prompt away from the prefix that gets masked below.
        encoded = tokenizer(
            full_text,
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )

        input_ids = encoded["input_ids"].to(model.device)
        attention_mask = encoded["attention_mask"].to(model.device)

        labels = input_ids.clone()
        labels[:, : len(prompt_ids)] = -100
        # Defensive: never score positions the tokenizer marked as masked.
        labels[attention_mask == 0] = -100

        # The causal-LM loss shifts labels by one position, so the label at
        # index 0 is never scored; count only the positions that are.
        valid_tokens = (labels[:, 1:] != -100).sum().item()
        if valid_tokens == 0:
            continue

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

        # outputs.loss is a mean over scored tokens; re-weight it so the
        # final average is per token rather than per example.
        total_loss += outputs.loss.item() * valid_tokens
        total_tokens += valid_tokens

    if total_tokens == 0:
        return float("nan"), float("nan")

    avg_loss = total_loss / total_tokens
    return avg_loss, math.exp(avg_loss)
|
|
|
|
|
|
def save_metrics(metrics):
    """Write *metrics* as indented UTF-8 JSON to ``METRICS_FILE`` and report the path."""
    with METRICS_FILE.open("w", encoding="utf-8") as handle:
        json.dump(metrics, handle, ensure_ascii=False, indent=2)

    print("Výsledné metriky uložené do:", METRICS_FILE, sep="\n")
|
|
|
|
|
|
def main():
    """Run the full evaluation: generate answers, score them, print and save metrics."""
    model, tokenizer = load_model()
    eval_dataset = load_eval_dataset()

    predictions, references = generate_predictions(model, tokenizer, eval_dataset)

    print("Počítam ROUGE...")
    rouge_scores = compute_rouge(predictions, references)

    print("Počítam BLEU...")
    bleu_score = compute_bleu(predictions, references)

    print("Počítam perplexitu...")
    eval_loss, perplexity = compute_perplexity(model, tokenizer, eval_dataset)

    metrics = {
        "num_eval_samples": len(eval_dataset),
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"],
        "bleu": bleu_score,
        "eval_loss": eval_loss,
        "perplexity": perplexity,
        "adapter_dir": ADAPTER_DIR,
        "model_name": MODEL_NAME,
        "dataset_name": DATASET_NAME,
    }

    # (label, metrics key) pairs in the order they are reported.
    report_rows = (
        ("ROUGE-1", "rouge1"),
        ("ROUGE-2", "rouge2"),
        ("ROUGE-L", "rougeL"),
        ("BLEU", "bleu"),
        ("Eval loss", "eval_loss"),
        ("Perplexity", "perplexity"),
    )
    rule = "=" * 80

    print(rule)
    print("FINÁLNE METRIKY")
    print(rule)
    for label, key in report_rows:
        print(f"{label}: {metrics[key]:.4f}")
    print(rule)

    save_metrics(metrics)
|
|
|
|
|
|
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()