import os
from pathlib import Path

# Use GPU 0 only, and switch off tokenizer parallelism and W&B reporting.
# The variables are set before the ML libraries below are imported, which
# preserves the ordering of the original script.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
        "WANDB_DISABLED": "true",
    }
)

import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from transformers.trainer_utils import get_last_checkpoint

MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"

PROJECT_DIR = Path.home().joinpath("diplomovka", "mistral_sk_alpaca")

# Outputs of the FULL Unsloth experiment: trainer checkpoints and the final
# LoRA adapter live in separate directories.
OUTPUT_DIR = PROJECT_DIR / "outputs-unsloth-full"
ADAPTER_DIR = PROJECT_DIR / "mistral-sk-7b-alpaca-slovak-unsloth-lora-full"

for directory in (PROJECT_DIR, OUTPUT_DIR):
    directory.mkdir(parents=True, exist_ok=True)

# Training settings
MAX_SEQ_LENGTH = 1024
MAX_TRAIN_SAMPLES = None  # None = use the whole train split
MAX_EVAL_SAMPLES = 1000  # validate on 1000 examples
NUM_EPOCHS = 1
MAX_STEPS = -1  # -1 = run length is governed by NUM_EPOCHS
BATCH_SIZE = 1
GRAD_ACCUM = 8
LEARNING_RATE = 2e-4
SAVE_STEPS = 1000
EVAL_STEPS = 1000
WARMUP_STEPS = 150

# Start-up banner describing the environment and the experiment.
rule = "=" * 80
print(rule)
print("Experiment: Unsloth + TRL SFTTrainer + QLoRA")
print(rule)

cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
if cuda_ok:
    print("GPU:", torch.cuda.get_device_name(0))
    total_vram_bytes = torch.cuda.get_device_properties(0).total_memory
    print("VRAM GB:", round(total_vram_bytes / 1024**3, 2))

for label, value in (
    ("Torch:", torch.__version__),
    ("Model:", MODEL_NAME),
    ("Dataset:", DATASET_NAME),
    ("Output dir:", OUTPUT_DIR),
    ("Adapter dir:", ADAPTER_DIR),
):
    print(label, value)
print(rule)

# 1. Load the model through Unsloth in 4-bit mode
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.float16,
    load_in_4bit=True,
)

# Fall back to the EOS token for padding when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# 2. Attach the LoRA adapter
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)
print("LoRA adaptér pridaný.")

# 3. Load the dataset
raw_dataset = load_dataset(DATASET_NAME)
print("Pôvodný dataset:")
print(raw_dataset)


# 4. Format the examples in Alpaca style
def is_empty(value):
    """Return True when *value* carries no usable text.

    ``None``, blank / whitespace-only strings and the literal ``"nan"``
    (case-insensitive, after stripping) all count as empty.
    """
    if value is None:
        return True
    return str(value).strip().lower() in ("", "nan")


def format_example(example):
    """Render one dataset row as a single Alpaca-style training string.

    Reads the ``instruction``, ``input`` (optional) and ``output`` fields of
    *example*. The ``### Vstup:`` section is emitted only when the input is
    not empty according to ``is_empty``. The tokenizer's EOS token is
    appended to the rendered prompt.

    Returns:
        A dict with the single key ``"text"``.
    """
    instruction = str(example["instruction"]).strip()
    context = example.get("input")
    answer = str(example["output"]).strip()

    sections = [f"### Inštrukcia:\n{instruction}"]
    if not is_empty(context):
        sections.append(f"### Vstup:\n{str(context).strip()}")
    sections.append(f"### Odpoveď:\n{answer}")

    prompt = "\n\n".join(sections)
    return {"text": prompt + tokenizer.eos_token}


source_columns = raw_dataset["train"].column_names
dataset = raw_dataset.map(format_example, remove_columns=source_columns)

# Shuffle each split for better representativeness, then optionally keep
# only the first `limit` rows of it (None keeps the whole split).
for split_name, limit in (
    ("train", MAX_TRAIN_SAMPLES),
    ("test", MAX_EVAL_SAMPLES),
):
    shuffled = dataset[split_name].shuffle(seed=42)
    if limit is not None:
        keep = min(limit, len(shuffled))
        shuffled = shuffled.select(range(keep))
    dataset[split_name] = shuffled

print("Použitý dataset:")
print(dataset)

print("Ukážka tréningového textu:")
first_text = dataset["train"][0]["text"]
print(first_text[:1200])
print("=" * 80)
# 5. TRL SFTTrainer
training_args = SFTConfig(
    output_dir=str(OUTPUT_DIR),
    # Batching and schedule
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    max_steps=MAX_STEPS,
    warmup_steps=WARMUP_STEPS,
    lr_scheduler_type="cosine",
    # Logging, evaluation and checkpointing
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    report_to="none",
    # Optimizer and precision
    optim="adamw_8bit",
    fp16=True,
    bf16=False,
    # Data handling
    max_length=MAX_SEQ_LENGTH,
    packing=False,
    dataset_text_field="text",
)

trainer = SFTTrainer(
    model=model,
    # `processing_class` is TRL's current name for this argument. The legacy
    # `tokenizer=` keyword is deprecated, and this config already uses the
    # newer `max_length` / `eval_strategy` names, so use the matching keyword
    # rather than relying on a compatibility shim.
    processing_class=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_args,
)

# 6. Resume from the latest checkpoint, if one exists.
# get_last_checkpoint() is only called when the directory is present.
last_checkpoint = (
    get_last_checkpoint(str(OUTPUT_DIR)) if OUTPUT_DIR.exists() else None
)

if last_checkpoint is not None:
    print("Pokračujem z checkpointu:", last_checkpoint)
else:
    print("Začínam nový Unsloth full tréning.")

# 7. Training (resume_from_checkpoint=None starts a fresh run)
train_result = trainer.train(resume_from_checkpoint=last_checkpoint)

print("Výsledok tréningu:")
print(train_result)

# 8. Final evaluation on the eval split given to the trainer
metrics = trainer.evaluate()

print("Finálne metriky:")
print(metrics)

# 9. Save the LoRA adapter together with the tokenizer
model.save_pretrained(str(ADAPTER_DIR))
tokenizer.save_pretrained(str(ADAPTER_DIR))

print("=" * 80)
print("Hotovo.")
print("Unsloth LoRA adaptér uložený do:")
print(ADAPTER_DIR)
print("=" * 80)