"""Evaluate a LoRA-adapted causal language model on a Slovak Alpaca test split.

The script loads the 4-bit quantized base model plus the LoRA adapter,
generates answers for a sample of the test split, computes ROUGE, BLEU and
perplexity, prints the results and stores them as JSON in ``METRICS_FILE``.
"""

import json
import math
import os
import re
from pathlib import Path

# Set before torch / transformers are imported so the libraries see these
# values when they initialise.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from rouge_score import rouge_scorer
import sacrebleu


MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"

# Your Unsloth + TRL + QLoRA adapter
ADAPTER_DIR = "/home/schwarc/diplomovka/mistral_sk_alpaca/mistral-sk-7b-alpaca-slovak-unsloth-lora-full"

PROJECT_DIR = Path.home() / "diplomovka" / "evaluation_results"
PROJECT_DIR.mkdir(parents=True, exist_ok=True)

METRICS_FILE = PROJECT_DIR / "unsloth_lora_metrics.json"

NUM_EVAL_SAMPLES = 1000
MAX_LENGTH = 1024
MAX_NEW_TOKENS = 300

# Deterministic generation is preferable for evaluation
DO_SAMPLE = False

# Matches runs of Unicode word characters (letters with diacritics included).
_WORD_RE = re.compile(r"\w+")


class _UnicodeWordTokenizer:
    """Lower-casing word tokenizer for ROUGE that keeps non-ASCII letters.

    The default ``rouge_score`` tokenizer discards every character outside
    ``[a-z0-9]``, which splits or truncates Slovak words containing
    diacritics. This tokenizer exposes the same ``tokenize(text)`` method
    but keeps such words intact.
    """

    def tokenize(self, text):
        """Return the lower-cased word tokens of ``text``."""
        return _WORD_RE.findall(text.lower())


def is_empty(value) -> bool:
    """Return True if ``value`` is None, blank, or the string "nan".

    The value is converted with ``str`` and stripped first; the "nan"
    comparison is case-insensitive.
    """
    if value is None:
        return True

    value = str(value).strip()
    return value == "" or value.lower() == "nan"


def make_prompt(example) -> str:
    """Build the instruction prompt for one dataset example.

    Args:
        example: Mapping with an "instruction" key and an optional "input" key.

    Returns:
        The prompt ending with the answer header. The "### Vstup:" section
        is included only when the input is not empty according to
        ``is_empty``.
    """
    instruction = str(example["instruction"]).strip()
    input_text = example.get("input")

    if is_empty(input_text):
        return (
            "### Inštrukcia:\n"
            f"{instruction}\n\n"
            "### Odpoveď:\n"
        )

    return (
        "### Inštrukcia:\n"
        f"{instruction}\n\n"
        "### Vstup:\n"
        f"{str(input_text).strip()}\n\n"
        "### Odpoveď:\n"
    )


def make_full_text(example, tokenizer) -> str:
    """Return prompt + reference answer + the tokenizer's EOS token string."""
    prompt = make_prompt(example)
    reference = str(example["output"]).strip()
    return prompt + reference + tokenizer.eos_token


def load_model():
    """Load the tokenizer, the 4-bit base model and attach the LoRA adapter.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is switched to eval mode.
    """
    print("CUDA available:", torch.cuda.is_available())

    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))

    print("Načítavam tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False)

    # generate() needs a pad token id; fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Načítavam základný model v 4-bit režime...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map={"": 0},
        dtype=torch.float16,
    )

    print("Pripájam LoRA adaptér...")
    model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
    model.eval()

    print("Model pripravený.")
    print("-" * 80)

    return model, tokenizer


def load_eval_dataset():
    """Load the "test" split, shuffle it with a fixed seed and subsample it.

    At most ``NUM_EVAL_SAMPLES`` examples are kept; ``None`` keeps them all.
    """
    raw_dataset = load_dataset(DATASET_NAME)
    eval_dataset = raw_dataset["test"].shuffle(seed=42)

    if NUM_EVAL_SAMPLES is not None:
        eval_dataset = eval_dataset.select(
            range(min(NUM_EVAL_SAMPLES, len(eval_dataset)))
        )

    print("Eval vzoriek:", len(eval_dataset))
    return eval_dataset


def generate_predictions(model, tokenizer, eval_dataset):
    """Generate one answer per example.

    Args:
        model: Causal LM used for generation.
        tokenizer: Tokenizer matching ``model``.
        eval_dataset: Iterable of examples with "instruction", optional
            "input" and "output" fields.

    Returns:
        ``(predictions, references)`` - two equally long lists of stripped
        strings.
    """
    # Generation settings do not change between examples, so build them once.
    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": DO_SAMPLE,
        "repetition_penalty": 1.1,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if DO_SAMPLE:
        generation_kwargs.update(temperature=0.7, top_p=0.9)

    predictions = []
    references = []

    for example in tqdm(eval_dataset, desc="Generujem odpovede"):
        prompt = make_prompt(example)
        reference = str(example["output"]).strip()

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(model.device)

        input_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            output_ids = model.generate(**inputs, **generation_kwargs)

        # Decode only the newly generated tokens, not the echoed prompt.
        prediction = tokenizer.decode(
            output_ids[0][input_length:],
            skip_special_tokens=True,
        ).strip()

        predictions.append(prediction)
        references.append(reference)

    return predictions, references


def compute_rouge(predictions, references):
    """Return mean ROUGE-1/2/L F-measures over all prediction/reference pairs.

    Uses a Unicode-aware tokenizer so that Slovak words with diacritics are
    scored as whole words.

    Raises:
        ValueError: If ``predictions`` is empty.
    """
    if not predictions:
        raise ValueError("compute_rouge needs at least one prediction")

    rouge_types = ["rouge1", "rouge2", "rougeL"]
    scorer = rouge_scorer.RougeScorer(
        rouge_types,
        use_stemmer=False,
        tokenizer=_UnicodeWordTokenizer(),
    )

    totals = dict.fromkeys(rouge_types, 0.0)
    count = 0

    for prediction, reference in zip(predictions, references):
        scores = scorer.score(reference, prediction)
        for rouge_type in rouge_types:
            totals[rouge_type] += scores[rouge_type].fmeasure
        count += 1

    return {rouge_type: totals[rouge_type] / count for rouge_type in rouge_types}


def compute_bleu(predictions, references):
    """Return the corpus-level sacreBLEU score with one reference per item."""
    bleu = sacrebleu.corpus_bleu(
        predictions,
        [references],
    )
    return bleu.score


def compute_perplexity(model, tokenizer, eval_dataset):
    """Compute token-weighted mean loss and perplexity on the reference answers.

    Only answer tokens contribute; prompt tokens are masked with -100.

    Returns:
        ``(avg_loss, perplexity)`` where ``perplexity == exp(avg_loss)``.

    Raises:
        ValueError: If no example contributes any answer token.
    """
    total_loss = 0.0
    total_tokens = 0

    for example in tqdm(eval_dataset, desc="Počítam perplexitu"):
        prompt = make_prompt(example)
        full_text = make_full_text(example, tokenizer)

        # NOTE(review): special tokens (e.g. BOS) are not added here, while
        # generate_predictions tokenizes with the defaults - confirm which
        # variant matches the training format.
        prompt_ids = tokenizer(
            prompt,
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
        )["input_ids"]

        # One example per forward pass, so no padding is needed. Padding to
        # MAX_LENGTH only wasted compute and, with a left-padding tokenizer,
        # shifted the prompt away from the positions masked below.
        encoded = tokenizer(
            full_text,
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )

        input_ids = encoded["input_ids"].to(model.device)
        attention_mask = encoded["attention_mask"].to(model.device)

        labels = input_ids.clone()
        labels[:, :len(prompt_ids)] = -100

        # The model shifts labels by one position internally, so the label
        # at position 0 never contributes to the loss.
        valid_tokens = (labels[:, 1:] != -100).sum().item()

        if valid_tokens == 0:
            continue

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

        # outputs.loss is a mean over the valid tokens; re-weight it so the
        # final average is per token rather than per example.
        total_loss += outputs.loss.item() * valid_tokens
        total_tokens += valid_tokens

    if total_tokens == 0:
        raise ValueError("compute_perplexity found no answer tokens to score")

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)

    return avg_loss, perplexity


def save_metrics(metrics):
    """Write ``metrics`` as indented UTF-8 JSON to ``METRICS_FILE``."""
    with open(METRICS_FILE, "w", encoding="utf-8") as file:
        json.dump(metrics, file, ensure_ascii=False, indent=2)

    print("Výsledné metriky uložené do:")
    print(METRICS_FILE)


def main():
    """Run the full evaluation: load, generate, score, print and save."""
    model, tokenizer = load_model()
    eval_dataset = load_eval_dataset()

    predictions, references = generate_predictions(model, tokenizer, eval_dataset)

    print("Počítam ROUGE...")
    rouge_scores = compute_rouge(predictions, references)

    print("Počítam BLEU...")
    bleu_score = compute_bleu(predictions, references)

    print("Počítam perplexitu...")
    eval_loss, perplexity = compute_perplexity(model, tokenizer, eval_dataset)

    metrics = {
        "num_eval_samples": len(eval_dataset),
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"],
        "bleu": bleu_score,
        "eval_loss": eval_loss,
        "perplexity": perplexity,
        "adapter_dir": ADAPTER_DIR,
        "model_name": MODEL_NAME,
        "dataset_name": DATASET_NAME,
    }

    print("=" * 80)
    print("FINÁLNE METRIKY")
    print("=" * 80)
    print(f"ROUGE-1: {metrics['rouge1']:.4f}")
    print(f"ROUGE-2: {metrics['rouge2']:.4f}")
    print(f"ROUGE-L: {metrics['rougeL']:.4f}")
    print(f"BLEU: {metrics['bleu']:.4f}")
    print(f"Eval loss: {metrics['eval_loss']:.4f}")
    print(f"Perplexity: {metrics['perplexity']:.4f}")
    print("=" * 80)

    save_metrics(metrics)


if __name__ == "__main__":
    main()