def is_empty(value):
    """Return True when *value* carries no usable text.

    ``None``, blank or whitespace-only text, and the literal ``"nan"`` in any
    letter case (what a missing float turns into once stringified) all count
    as empty.
    """
    if value is None:
        return True

    text = str(value).strip()
    return text == "" or text.lower() == "nan"


def make_prompt(example):
    """Build the Alpaca-style Slovak prompt for one dataset example.

    Args:
        example: Mapping with an ``"instruction"`` key and an optional
            ``"input"`` key.

    Returns:
        The prompt text ending with the ``### Odpoveď:`` header. The
        ``### Vstup:`` section is included only when the input is non-empty
        according to ``is_empty``.
    """
    instruction = str(example["instruction"]).strip()
    input_text = example.get("input")

    # Assemble the template once instead of keeping two near-identical copies
    # that could drift apart.
    sections = [f"### Inštrukcia:\n{instruction}\n\n"]
    if not is_empty(input_text):
        sections.append(f"### Vstup:\n{str(input_text).strip()}\n\n")
    sections.append("### Odpoveď:\n")

    return "".join(sections)


def make_full_text(example, tokenizer):
    """Return prompt + reference answer + end-of-sequence marker.

    Args:
        example: Mapping with ``"instruction"``, ``"output"`` and an optional
            ``"input"`` key.
        tokenizer: Object exposing an ``eos_token`` attribute.

    Returns:
        The full text of the example. When the tokenizer defines no EOS token
        (``eos_token`` is ``None`` or empty) nothing is appended.
    """
    reference = str(example["output"]).strip()

    # eos_token may be None; concatenating None to a str raises TypeError.
    eos_token = tokenizer.eos_token or ""

    return make_prompt(example) + reference + eos_token
def load_eval_dataset():
    """Load the evaluation subset of the configured dataset.

    The ``"test"`` split of ``DATASET_NAME`` is shuffled with a fixed seed
    (42) and, unless ``NUM_EVAL_SAMPLES`` is ``None``, cut down to at most
    ``NUM_EVAL_SAMPLES`` examples.

    Returns:
        The shuffled (and possibly shortened) test split.
    """
    shuffled_test = load_dataset(DATASET_NAME)["test"].shuffle(seed=42)

    if NUM_EVAL_SAMPLES is None:
        subset = shuffled_test
    else:
        # Never ask for more rows than the split actually has.
        keep = min(NUM_EVAL_SAMPLES, len(shuffled_test))
        subset = shuffled_test.select(range(keep))

    print("Eval vzoriek:", len(subset))

    return subset
class _UnicodeWordTokenizer:
    """Lowercasing word tokenizer that keeps accented (non-ASCII) letters."""

    def tokenize(self, text):
        """Return the lowercase word tokens of *text*."""
        # Local import: the module's import block does not bring in ``re``.
        import re

        # For str patterns ``\w`` is Unicode-aware, so letters such as
        # "č", "ď" or "ž" stay inside their word.
        return re.findall(r"\w+", text.lower())


def compute_rouge(predictions, references):
    """Compute mean ROUGE-1 / ROUGE-2 / ROUGE-L F-measures.

    The scorer is given a Unicode-aware tokenizer. The default ``rouge_score``
    tokenizer keeps only ``[a-z0-9]`` characters, which breaks Slovak words
    apart at every accented letter and distorts the n-gram overlap.

    Args:
        predictions: Generated answers.
        references: Reference answers, aligned with *predictions*.

    Returns:
        Dict with keys ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` holding the
        average F-measure over all pairs. With no pairs, every score is
        ``0.0`` instead of a ``ZeroDivisionError``.
    """
    rouge_types = ("rouge1", "rouge2", "rougeL")
    pairs = list(zip(predictions, references))

    # Nothing to score: report zeros rather than dividing by zero below.
    if not pairs:
        return {name: 0.0 for name in rouge_types}

    scorer = rouge_scorer.RougeScorer(
        list(rouge_types),
        use_stemmer=False,
        tokenizer=_UnicodeWordTokenizer(),
    )

    totals = dict.fromkeys(rouge_types, 0.0)

    for prediction, reference in pairs:
        # RougeScorer.score takes the reference (target) first.
        scores = scorer.score(reference, prediction)

        for name in rouge_types:
            totals[name] += scores[name].fmeasure

    return {name: total / len(pairs) for name, total in totals.items()}
def save_metrics(metrics):
    """Write *metrics* to ``METRICS_FILE`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``). The
    target path is printed once the file has been written.

    Args:
        metrics: JSON-serializable mapping of metric names to values.
    """
    with METRICS_FILE.open("w", encoding="utf-8") as handle:
        json.dump(metrics, handle, ensure_ascii=False, indent=2)

    print("Výsledné metriky uložené do:", METRICS_FILE, sep="\n")
+top.quantization_method: bnb +top.rope_scaling: none +top.template: alpaca +train.additional_target: '' +train.apollo_rank: 16 +train.apollo_scale: 32 +train.apollo_target: all +train.apollo_update_interval: 200 +train.badam_mode: layer +train.badam_switch_interval: 50 +train.badam_switch_mode: ascending +train.badam_update_ratio: 0.05 +train.batch_size: 1 +train.compute_type: fp16 +train.create_new_adapter: false +train.cutoff_len: 1024 +train.dataset: +- alpaca_slovak_cleaned +train.dataset_dir: data +train.ds_offload: false +train.ds_stage: none +train.enable_thinking: false +train.extra_args: '{"optim": "adamw_8bit", "eval_steps": 1000, "eval_strategy": "steps", + "save_total_limit": 2}' +train.freeze_extra_modules: '' +train.freeze_language_model: false +train.freeze_multi_modal_projector: true +train.freeze_trainable_layers: 2 +train.freeze_trainable_modules: all +train.freeze_vision_tower: true +train.galore_rank: 16 +train.galore_scale: 2 +train.galore_target: all +train.galore_update_interval: 200 +train.gradient_accumulation_steps: 8 +train.hub_private_repo: false +train.image_max_pixels: 768*768 +train.image_min_pixels: 32*32 +train.learning_rate: 2e-4 +train.logging_steps: 5 +train.lora_alpha: 32 +train.lora_dropout: 0.05 +train.lora_rank: 16 +train.lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj +train.loraplus_lr_ratio: 0 +train.lr_scheduler_type: cosine +train.mask_history: false +train.max_grad_norm: '1.0' +train.max_samples: '50000' +train.neat_packing: false +train.neftune_alpha: 0 +train.num_train_epochs: '1.0' +train.packing: false +train.ppo_score_norm: false +train.ppo_whiten_rewards: false +train.pref_beta: 0.1 +train.pref_ftx: 0 +train.pref_loss: sigmoid +train.project: huggingface +train.report_to: none +train.resize_vocab: false +train.reward_model: [] +train.save_steps: 1000 +train.swanlab_api_key: '' +train.swanlab_link: null +train.swanlab_mode: cloud +train.swanlab_project: llamafactory +train.swanlab_run_name: '' 
def make_prompt(instruction, input_text=""):
    """Build the Alpaca-style Slovak prompt for an instruction.

    Args:
        instruction: Instruction text; surrounding whitespace is removed.
        input_text: Optional additional input. ``None``, blank text and the
            literal ``"nan"`` (any letter case) are treated as "no input",
            matching how the training and evaluation scripts detect a
            missing input.

    Returns:
        The prompt ending with the ``### Odpoveď:`` header. The
        ``### Vstup:`` section is present only when an input was given.
    """
    instruction = instruction.strip()

    # None used to crash on .strip(); normalise it to "no input" instead.
    cleaned_input = "" if input_text is None else str(input_text).strip()
    if cleaned_input.lower() == "nan":
        cleaned_input = ""

    sections = [f"### Inštrukcia:\n{instruction}\n\n"]
    if cleaned_input:
        sections.append(f"### Vstup:\n{cleaned_input}\n\n")
    sections.append("### Odpoveď:\n")

    return "".join(sections)
def generate_answer(model, tokenizer, instruction, input_text="", max_input_tokens=1024):
    """Generate one sampled answer for an instruction.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching *model*.
        instruction: Instruction text placed into the prompt.
        input_text: Optional additional input forwarded to ``make_prompt``;
            the default keeps the previous instruction-only behaviour.
        max_input_tokens: Upper bound on prompt tokens; longer prompts are
            truncated by the tokenizer. Defaults to the previously
            hard-coded 1024.

    Returns:
        The decoded continuation without the prompt tokens and without
        special tokens, stripped of surrounding whitespace.
    """
    prompt = make_prompt(instruction, input_text)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(model.device)

    # The output sequence starts with the prompt tokens; remember where the
    # prompt ends so only the newly generated part is decoded.
    prompt_token_count = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    answer = tokenizer.decode(
        output_ids[0][prompt_token_count:],
        skip_special_tokens=True,
    )

    return answer.strip()
def is_empty(value):
    """Tell whether *value* should be treated as a missing field.

    ``None`` is empty. Anything else is stringified and stripped, and is
    empty when the result is blank or spells ``"nan"`` in any letter case.
    """
    if value is None:
        return True

    normalized = str(value).strip().lower()
    return normalized in ("", "nan")
def tokenize_example(example):
    """Tokenize one formatted example and build loss-masked labels.

    The full text is tokenized without special tokens, truncated to
    ``MAX_LENGTH`` and padded to ``MAX_LENGTH``. Labels copy the input ids,
    except that the first ``prompt_len`` positions (the token count of the
    separately tokenized prompt) and every position whose attention mask is
    not 1 are set to -100.

    Args:
        example: Mapping with ``"prompt"`` and ``"text"`` strings.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
    """
    shared_kwargs = dict(
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_LENGTH,
    )

    prompt_token_count = len(tokenizer(example["prompt"], **shared_kwargs)["input_ids"])
    encoded = tokenizer(example["text"], padding="max_length", **shared_kwargs)

    token_ids = encoded["input_ids"]
    mask = encoded["attention_mask"]

    masked_prefix = min(prompt_token_count, MAX_LENGTH)

    # Keep a label only for non-prompt positions that are real (unpadded)
    # tokens; everything else becomes -100.
    labels = [
        token if position >= masked_prefix and keep == 1 else -100
        for position, (token, keep) in enumerate(zip(token_ids, mask))
    ]

    return {
        "input_ids": token_ids,
        "attention_mask": mask,
        "labels": labels,
    }
+) + +model = get_peft_model(model, lora_config) +model.print_trainable_parameters() + + +training_args = TrainingArguments( + output_dir=str(OUTPUT_DIR), + + per_device_train_batch_size=BATCH_SIZE, + per_device_eval_batch_size=BATCH_SIZE, + gradient_accumulation_steps=GRAD_ACCUM, + + learning_rate=LEARNING_RATE, + num_train_epochs=NUM_EPOCHS, + max_steps=MAX_STEPS, + + fp16=True, + bf16=False, + + logging_steps=10, + save_steps=SAVE_STEPS, + save_total_limit=2, + + eval_strategy="steps", + eval_steps=EVAL_STEPS, + + optim="paged_adamw_8bit", + warmup_steps=WARMUP_STEPS, + lr_scheduler_type="cosine", + max_grad_norm=0.3, + + gradient_checkpointing=True, + report_to="none", + remove_unused_columns=False, +) + + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_dataset["train"], + eval_dataset=tokenized_dataset["test"], + data_collator=default_data_collator, +) + + +last_checkpoint = None +if OUTPUT_DIR.exists(): + last_checkpoint = get_last_checkpoint(str(OUTPUT_DIR)) + +if last_checkpoint is not None: + print("Pokračujem z checkpointu:", last_checkpoint) +else: + print("Začínam nový tréning.") + + +trainer.train(resume_from_checkpoint=last_checkpoint) + +metrics = trainer.evaluate() +print(metrics) + +trainer.save_model(str(ADAPTER_DIR)) +tokenizer.save_pretrained(str(ADAPTER_DIR)) + +print("Hotovo.") +print("LoRA adaptér uložený do:") +print(ADAPTER_DIR) diff --git a/kody/train_mistral_unsloth_trl.py b/kody/train_mistral_unsloth_trl.py new file mode 100644 index 0000000..e1e75c2 --- /dev/null +++ b/kody/train_mistral_unsloth_trl.py @@ -0,0 +1,248 @@ +import os +from pathlib import Path + +# Použijeme GPU 0 +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["WANDB_DISABLED"] = "true" + +import torch +from datasets import load_dataset +from unsloth import FastLanguageModel +from trl import SFTTrainer, SFTConfig +from transformers.trainer_utils import get_last_checkpoint + + 
# Base model and instruction dataset identifiers passed to the loaders below.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"

PROJECT_DIR = Path.home() / "diplomovka" / "mistral_sk_alpaca"

# Outputs of the FULL Unsloth experiment: OUTPUT_DIR is the trainer output
# directory, ADAPTER_DIR receives the final adapter and tokenizer.
OUTPUT_DIR = PROJECT_DIR / "outputs-unsloth-full"
ADAPTER_DIR = PROJECT_DIR / "mistral-sk-7b-alpaca-slovak-unsloth-lora-full"

# Create the directories up front; an already existing directory is fine.
PROJECT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Training settings
MAX_SEQ_LENGTH = 1024

MAX_TRAIN_SAMPLES = None  # None = use the whole train split
MAX_EVAL_SAMPLES = 1000  # validate on at most 1000 examples

NUM_EPOCHS = 1
MAX_STEPS = -1  # -1 = the run length follows the number of epochs

# Passed to the trainer as the per-device batch size and the number of
# gradient accumulation steps.
BATCH_SIZE = 1
GRAD_ACCUM = 8
LEARNING_RATE = 2e-4

SAVE_STEPS = 1000
EVAL_STEPS = 1000
WARMUP_STEPS = 150


# Print a banner with the environment and experiment configuration.
print("=" * 80)
print("Experiment: Unsloth + TRL SFTTrainer + QLoRA")
print("=" * 80)

print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    # total_memory is divided by 1024**3 to report GiB.
    print("VRAM GB:", round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 2))

print("Torch:", torch.__version__)
print("Model:", MODEL_NAME)
print("Dataset:", DATASET_NAME)
print("Output dir:", OUTPUT_DIR)
print("Adapter dir:", ADAPTER_DIR)
print("=" * 80)


# 1. Load the model through Unsloth in 4-bit mode
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.float16,
    load_in_4bit=True,
)

# Fall back to the EOS token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# 2. Attach the LoRA adapter (rank 16, alpha 32, dropout 0.05) to the listed
# projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # NOTE(review): presumably selects Unsloth's own gradient checkpointing
    # implementation - confirm against the Unsloth documentation.
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

print("LoRA adaptér pridaný.")
def format_example(example):
    """Render one dataset row as a single Alpaca-style training text.

    Args:
        example: Mapping with ``"instruction"``, ``"output"`` and an optional
            ``"input"`` key.

    Returns:
        Dict with one ``"text"`` entry: the instruction section, the input
        section (only when the input is not empty according to ``is_empty``),
        the answer section and finally the tokenizer's EOS token.
    """
    instruction = str(example["instruction"]).strip()
    input_text = example.get("input")
    output = str(example["output"]).strip()

    parts = ["### Inštrukcia:\n", instruction, "\n\n"]

    if not is_empty(input_text):
        parts += ["### Vstup:\n", str(input_text).strip(), "\n\n"]

    parts += ["### Odpoveď:\n", output]

    text = "".join(parts)

    return {"text": text + tokenizer.eos_token}
# 5. TRL SFTTrainer configuration: step-based evaluation and checkpointing,
# fp16 training, cosine learning-rate schedule, no experiment tracker.
training_args = SFTConfig(
    output_dir=str(OUTPUT_DIR),

    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,

    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    max_steps=MAX_STEPS,

    logging_steps=10,

    eval_strategy="steps",
    eval_steps=EVAL_STEPS,

    save_strategy="steps",
    save_steps=SAVE_STEPS,
    # NOTE(review): presumably keeps only the two most recent checkpoints -
    # confirm against the transformers TrainingArguments documentation.
    save_total_limit=2,

    warmup_steps=WARMUP_STEPS,
    optim="adamw_8bit",
    fp16=True,
    bf16=False,

    lr_scheduler_type="cosine",
    report_to="none",

    max_length=MAX_SEQ_LENGTH,
    packing=False,
    # The formatted examples carry their full training text in the "text" column.
    dataset_text_field="text",
)


# NOTE(review): newer TRL releases name the tokenizer argument
# `processing_class` - confirm that the installed TRL/Unsloth versions
# still accept `tokenizer=`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_args,
)


# 6. Resume from a checkpoint if one exists
last_checkpoint = None

if OUTPUT_DIR.exists():
    last_checkpoint = get_last_checkpoint(str(OUTPUT_DIR))

if last_checkpoint is not None:
    print("Pokračujem z checkpointu:", last_checkpoint)
else:
    print("Začínam nový Unsloth full tréning.")


# 7. Training (last_checkpoint is None for a fresh run)
train_result = trainer.train(resume_from_checkpoint=last_checkpoint)

print("Výsledok tréningu:")
print(train_result)


# 8. Final evaluation on the eval dataset given to the trainer
metrics = trainer.evaluate()

print("Finálne metriky:")
print(metrics)


# 9. Save the LoRA adapter together with the tokenizer into ADAPTER_DIR
model.save_pretrained(str(ADAPTER_DIR))
tokenizer.save_pretrained(str(ADAPTER_DIR))

print("=" * 80)
print("Hotovo.")
print("Unsloth LoRA adaptér uložený do:")
print(ADAPTER_DIR)
print("=" * 80)