# Diplomovka/kody/evaluate_mistral_lora_metrics.py
# 2026-06-11 21:42:51 +02:00
#
# 317 lines
# 8.3 KiB
# Python

import os
import json
import math
from pathlib import Path
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from rouge_score import rouge_scorer
import sacrebleu
# Base causal-LM checkpoint that load_model() quantizes and loads.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
# Dataset identifier passed to load_dataset(); its "test" split is evaluated.
DATASET_NAME = "saillab/alpaca-slovak-cleaned"
# LoRA adapter trained with Unsloth + TRL + QLoRA; the tokenizer is loaded
# from this directory as well.
ADAPTER_DIR = "/home/schwarc/diplomovka/mistral_sk_alpaca/mistral-sk-7b-alpaca-slovak-unsloth-lora-full"
# Output directory for evaluation results; created at import time.
PROJECT_DIR = Path.home() / "diplomovka" / "evaluation_results"
PROJECT_DIR.mkdir(parents=True, exist_ok=True)
# JSON file that save_metrics() writes the final metrics to.
METRICS_FILE = PROJECT_DIR / "unsloth_lora_metrics.json"
# Upper bound on the number of evaluated examples; None evaluates the whole split.
NUM_EVAL_SAMPLES = 1000
# Tokenizer truncation length used for prompts and for prompt + reference texts.
MAX_LENGTH = 1024
# Maximum number of tokens generated per answer.
MAX_NEW_TOKENS = 300
# Deterministic generation is preferable for evaluation.
DO_SAMPLE = False
def is_empty(value):
    """Return True when *value* carries no usable text.

    ``None``, values whose string form is blank after stripping whitespace,
    and values whose string form is "nan" in any letter case (for example a
    float NaN) all count as empty.
    """
    if value is None:
        return True
    text = str(value).strip()
    return not text or text.lower() == "nan"
def make_prompt(example):
    """Build the Slovak Alpaca-style prompt for one dataset example.

    The prompt always contains the instruction section and ends with the
    answer header. The input section is inserted between them only when the
    example's "input" field is not empty according to ``is_empty``.

    Args:
        example: Mapping with an "instruction" key and an optional "input" key.

    Returns:
        The prompt string, ending right after the answer header line.
    """
    instruction = str(example["instruction"]).strip()
    context = example.get("input")

    prompt = (
        "### Inštrukcia:\n"
        f"{instruction}\n\n"
    )
    if not is_empty(context):
        prompt += (
            "### Vstup:\n"
            f"{str(context).strip()}\n\n"
        )
    return prompt + "### Odpoveď:\n"
def make_full_text(example, tokenizer):
    """Return prompt + stripped reference answer + the tokenizer's EOS token.

    Args:
        example: Mapping with "instruction", optional "input" and "output".
        tokenizer: Object exposing an ``eos_token`` string attribute.
    """
    pieces = [
        make_prompt(example),
        str(example["output"]).strip(),
        tokenizer.eos_token,
    ]
    return pieces[0] + pieces[1] + pieces[2]
def load_model():
    """Load the tokenizer and the 4-bit base model with the LoRA adapter attached.

    The tokenizer comes from ``ADAPTER_DIR``; the base weights come from
    ``MODEL_NAME`` and are quantized to NF4 with double quantization.

    Returns:
        tuple: ``(model, tokenizer)``, with the model switched to eval mode.
    """
    has_cuda = torch.cuda.is_available()
    print("CUDA available:", has_cuda)
    if has_cuda:
        print("GPU:", torch.cuda.get_device_name(0))

    print("Načítavam tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False)
    # Reuse EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Načítavam základný model v 4-bit režime...")
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    backbone = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization,
        # Place the whole model on device index 0.
        device_map={"": 0},
        dtype=torch.float16,
    )

    print("Pripájam LoRA adaptér...")
    model = PeftModel.from_pretrained(backbone, ADAPTER_DIR)
    model.eval()

    print("Model pripravený.")
    print("-" * 80)
    return model, tokenizer
def load_eval_dataset():
    """Load the "test" split of ``DATASET_NAME``, shuffled with a fixed seed.

    When ``NUM_EVAL_SAMPLES`` is not None, only the first
    ``min(NUM_EVAL_SAMPLES, len(split))`` shuffled examples are kept.

    Returns:
        The (possibly truncated) shuffled evaluation dataset.
    """
    shuffled = load_dataset(DATASET_NAME)["test"].shuffle(seed=42)

    if NUM_EVAL_SAMPLES is None:
        subset = shuffled
    else:
        keep = min(NUM_EVAL_SAMPLES, len(shuffled))
        subset = shuffled.select(range(keep))

    print("Eval vzoriek:", len(subset))
    return subset
def generate_predictions(model, tokenizer, eval_dataset):
    """Generate one answer per example and collect it with its reference.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode answers.
        eval_dataset: Iterable of examples with "instruction", optional
            "input" and "output" fields.

    Returns:
        tuple: ``(predictions, references)`` — two lists of stripped strings
        in dataset order.
    """
    # Arguments shared by both decoding modes.
    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": bool(DO_SAMPLE),
        "repetition_penalty": 1.1,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if DO_SAMPLE:
        # Sampling-only settings; not passed for deterministic decoding.
        generation_kwargs["temperature"] = 0.7
        generation_kwargs["top_p"] = 0.9

    predictions, references = [], []
    for example in tqdm(eval_dataset, desc="Generujem odpovede"):
        prompt = make_prompt(example)
        reference = str(example["output"]).strip()

        encoded = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(model.device)
        prompt_tokens = encoded["input_ids"].shape[-1]

        with torch.no_grad():
            generated = model.generate(**encoded, **generation_kwargs)

        # Drop the echoed prompt tokens and decode only the continuation.
        answer = tokenizer.decode(
            generated[0][prompt_tokens:],
            skip_special_tokens=True,
        ).strip()

        predictions.append(answer)
        references.append(reference)

    return predictions, references
class _UnicodeWordTokenizer:
    """Tokenizer for ``RougeScorer`` that keeps non-ASCII letters intact."""

    def tokenize(self, text):
        """Lower-case *text* and split it into runs of alphanumeric characters."""
        normalized = "".join(
            char if char.isalnum() else " " for char in text.lower()
        )
        return normalized.split()


def compute_rouge(predictions, references):
    """Return mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures over all pairs.

    The default ``rouge_score`` tokenizer keeps only ``[a-z0-9]`` characters,
    which breaks Slovak words apart at every letter with a diacritic and
    distorts the scores. A Unicode-aware tokenizer is therefore passed to the
    scorer explicitly.

    Args:
        predictions: Sequence of generated answers.
        references: Sequence of reference answers, aligned with *predictions*.

    Returns:
        dict: Keys "rouge1", "rouge2" and "rougeL" mapped to the average
        F-measure. All three are 0.0 when there is nothing to score.
    """
    rouge_types = ("rouge1", "rouge2", "rougeL")

    pairs = list(zip(predictions, references))
    if not pairs:
        # Nothing to average; avoid a division by zero.
        return dict.fromkeys(rouge_types, 0.0)

    scorer = rouge_scorer.RougeScorer(
        list(rouge_types),
        use_stemmer=False,
        tokenizer=_UnicodeWordTokenizer(),
    )

    totals = dict.fromkeys(rouge_types, 0.0)
    for prediction, reference in pairs:
        # RougeScorer.score expects (target, prediction).
        scores = scorer.score(reference, prediction)
        for rouge_type in rouge_types:
            totals[rouge_type] += scores[rouge_type].fmeasure

    return {
        rouge_type: total / len(pairs)
        for rouge_type, total in totals.items()
    }
def compute_bleu(predictions, references):
    """Return the corpus-level sacreBLEU score for the given answers.

    Args:
        predictions: List of generated answers.
        references: List of reference answers, aligned with *predictions*.

    Returns:
        The ``score`` attribute of the sacreBLEU result.
    """
    # sacreBLEU takes a list of reference streams; there is exactly one here.
    reference_streams = [references]
    result = sacrebleu.corpus_bleu(predictions, reference_streams)
    return result.score
def compute_perplexity(model, tokenizer, eval_dataset):
    """Compute token-weighted loss and perplexity on the reference answers.

    Each example is scored on its own. Prompt tokens are masked out with
    ``-100`` so only the reference answer (plus EOS) contributes to the loss.

    NOTE(review): texts are tokenized with ``add_special_tokens=False`` here,
    whereas ``generate_predictions`` relies on the tokenizer defaults —
    confirm which variant matches the training setup.

    Args:
        model: Causal LM returning ``.loss`` when called with ``labels``.
        tokenizer: Tokenizer matching the model.
        eval_dataset: Iterable of examples with "instruction", optional
            "input" and "output" fields.

    Returns:
        tuple: ``(avg_loss, perplexity)``. Both are NaN when no example
        contributes any answer token (for instance an empty dataset).
    """
    total_loss = 0.0
    total_tokens = 0

    for example in tqdm(eval_dataset, desc="Počítam perplexitu"):
        prompt = make_prompt(example)
        full_text = make_full_text(example, tokenizer)

        prompt_ids = tokenizer(
            prompt,
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
        )["input_ids"]

        # A single sequence per forward pass needs no padding. Leaving it out
        # avoids running MAX_LENGTH tokens for every example and keeps the
        # index-based prompt mask below correct regardless of the tokenizer's
        # padding side.
        encoded = tokenizer(
            full_text,
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        input_ids = encoded["input_ids"].to(model.device)
        attention_mask = encoded["attention_mask"].to(model.device)

        labels = input_ids.clone()
        # Assumes the prompt tokenizes to a prefix of the full text.
        labels[:, :len(prompt_ids)] = -100
        labels[attention_mask == 0] = -100

        # The causal-LM loss is taken over labels shifted by one position, so
        # the first position never contributes; count only what is averaged.
        valid_tokens = (labels[:, 1:] != -100).sum().item()
        if valid_tokens == 0:
            # Prompt alone filled the truncated window; nothing to score.
            continue

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

        # outputs.loss is a per-token mean; re-weight by the token count so
        # the final average is over tokens rather than over examples.
        total_loss += outputs.loss.item() * valid_tokens
        total_tokens += valid_tokens

    if total_tokens == 0:
        return float("nan"), float("nan")

    avg_loss = total_loss / total_tokens
    return avg_loss, math.exp(avg_loss)
def save_metrics(metrics):
    """Write *metrics* as indented UTF-8 JSON to ``METRICS_FILE`` and report the path.

    Args:
        metrics: JSON-serializable mapping of metric names to values.
    """
    with METRICS_FILE.open("w", encoding="utf-8") as handle:
        json.dump(metrics, handle, ensure_ascii=False, indent=2)

    for line in ("Výsledné metriky uložené do:", METRICS_FILE):
        print(line)
def main():
    """Run the whole evaluation: load, generate, score, print and save."""
    model, tokenizer = load_model()
    eval_dataset = load_eval_dataset()
    predictions, references = generate_predictions(model, tokenizer, eval_dataset)

    print("Počítam ROUGE...")
    rouge_scores = compute_rouge(predictions, references)

    print("Počítam BLEU...")
    bleu_score = compute_bleu(predictions, references)

    print("Počítam perplexitu...")
    eval_loss, perplexity = compute_perplexity(model, tokenizer, eval_dataset)

    metrics = {
        "num_eval_samples": len(eval_dataset),
        **{name: rouge_scores[name] for name in ("rouge1", "rouge2", "rougeL")},
        "bleu": bleu_score,
        "eval_loss": eval_loss,
        "perplexity": perplexity,
        "adapter_dir": ADAPTER_DIR,
        "model_name": MODEL_NAME,
        "dataset_name": DATASET_NAME,
    }

    banner = "=" * 80
    report = (
        banner,
        "FINÁLNE METRIKY",
        banner,
        f"ROUGE-1: {metrics['rouge1']:.4f}",
        f"ROUGE-2: {metrics['rouge2']:.4f}",
        f"ROUGE-L: {metrics['rougeL']:.4f}",
        f"BLEU: {metrics['bleu']:.4f}",
        f"Eval loss: {metrics['eval_loss']:.4f}",
        f"Perplexity: {metrics['perplexity']:.4f}",
        banner,
    )
    for line in report:
        print(line)

    save_metrics(metrics)
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()