# Diplomovka/kody/train_mistral_unsloth_trl.py
# 2026-06-11 21:42:51 +02:00
#
# 248 lines
# 5.5 KiB
# Python

import os
from pathlib import Path
# Use GPU 0 only. These variables are assigned before `import torch` (next
# import below), so the CUDA runtime is initialised with just that device visible.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Turn off the Hugging Face tokenizers' own thread parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Turn off Weights & Biases logging for this run.
os.environ["WANDB_DISABLED"] = "true"
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from transformers.trainer_utils import get_last_checkpoint
# Hub identifiers of the base model and of the instruction dataset.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"
# Root folder for every artefact of this experiment, under the user's home directory.
PROJECT_DIR = Path.home() / "diplomovka" / "mistral_sk_alpaca"
# Outputs of the FULL Unsloth experiment: OUTPUT_DIR is handed to the trainer
# (checkpoints), ADAPTER_DIR receives the final adapter and tokenizer.
OUTPUT_DIR = PROJECT_DIR / "outputs-unsloth-full"
ADAPTER_DIR = PROJECT_DIR / "mistral-sk-7b-alpaca-slovak-unsloth-lora-full"
# Create the folders up front; exist_ok=True makes re-runs (e.g. resuming) safe.
PROJECT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Training settings
MAX_SEQ_LENGTH = 1024
MAX_TRAIN_SAMPLES = None # None = use the whole train split
MAX_EVAL_SAMPLES = 1000 # validate on at most 1000 examples
NUM_EPOCHS = 1
MAX_STEPS = -1 # -1 = run length is governed by NUM_EPOCHS
# Per-device batch size and gradient accumulation steps, as passed to the trainer config.
BATCH_SIZE = 1
GRAD_ACCUM = 8
LEARNING_RATE = 2e-4
# Checkpointing and evaluation both happen every 1000 steps.
SAVE_STEPS = 1000
EVAL_STEPS = 1000
WARMUP_STEPS = 150
# Print a banner describing the runtime environment and the experiment
# settings, so each training log records what it was produced with.
print("=" * 80)
print("Experiment: Unsloth + TRL SFTTrainer + QLoRA")
print("=" * 80)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    # Device queries are only valid when CUDA is present, hence the guard.
    # Index 0 is the single device exposed through CUDA_VISIBLE_DEVICES.
    print("GPU:", torch.cuda.get_device_name(0))
    # total_memory is reported in bytes; convert to GiB and round for display.
    print("VRAM GB:", round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 2))
print("Torch:", torch.__version__)
print("Model:", MODEL_NAME)
print("Dataset:", DATASET_NAME)
print("Output dir:", OUTPUT_DIR)
print("Adapter dir:", ADAPTER_DIR)
print("=" * 80)
# 1. Load the model through Unsloth in 4-bit mode
# dtype is float16 here, in line with fp16=True / bf16=False in the trainer
# configuration further down.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.float16,
    load_in_4bit=True,
)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# 2. Attach the LoRA adapter
# The loaded model is wrapped in place: `model` is rebound to the PEFT version
# returned by Unsloth. Rank 16 with alpha 32, no bias terms, fixed seed 42.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP projections
        "gate_proj", "up_proj", "down_proj",
    ],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    random_state=42,
    use_gradient_checkpointing="unsloth",
)
print("LoRA adaptér pridaný.")
# 3. Load the dataset
# Called without a split argument; the result is later indexed by the
# "train" and "test" split names.
raw_dataset = load_dataset(DATASET_NAME)
print("Pôvodný dataset:")
# Printing the object shows what was loaded before any formatting is applied.
print(raw_dataset)
# 4. Format examples in the Alpaca style
def is_empty(value: object) -> bool:
    """Return True when *value* carries no usable text.

    A value counts as empty when it is ``None``, or when its string form,
    after stripping surrounding whitespace, is blank or equals ``"nan"`` in
    any letter case (which is also how ``float("nan")`` stringifies).

    Args:
        value: Any object; non-strings are converted with ``str()``.

    Returns:
        ``True`` for empty values as defined above, ``False`` otherwise.
    """
    if value is None:
        return True
    text = str(value).strip()
    return text == "" or text.lower() == "nan"
def format_example(example):
    """Render one dataset row as an Alpaca-style training text.

    The prompt consists of an instruction section, an optional input section
    (left out when ``is_empty`` reports the input as empty) and a response
    section, separated by blank lines. The tokenizer's EOS token is appended
    to the end of the text.

    Args:
        example: Mapping with ``"instruction"`` and ``"output"`` entries and
            an optional ``"input"`` entry. Values are converted with
            ``str()`` and stripped of surrounding whitespace.

    Returns:
        A dict with a single ``"text"`` key holding the formatted example.

    Raises:
        KeyError: If ``"instruction"`` or ``"output"`` is missing.
    """
    instruction = str(example["instruction"]).strip()
    input_text = example.get("input")
    output = str(example["output"]).strip()

    # Assemble the sections once instead of duplicating the whole template
    # for the with-input and without-input variants.
    sections = [f"### Inštrukcia:\n{instruction}"]
    if not is_empty(input_text):
        sections.append(f"### Vstup:\n{str(input_text).strip()}")
    sections.append(f"### Odpoveď:\n{output}")

    return {"text": "\n\n".join(sections) + tokenizer.eos_token}
# Apply the prompt template to every split. The raw columns (named after the
# train split's columns) are removed, leaving only the generated "text" field.
dataset = raw_dataset.map(
    format_example,
    remove_columns=raw_dataset["train"].column_names,
)
# Shuffle (fixed seed) so that the capped subsets below are more representative
dataset["train"] = dataset["train"].shuffle(seed=42)
dataset["test"] = dataset["test"].shuffle(seed=42)
# Optional caps: keep the first N shuffled rows, never more than the split holds.
if MAX_TRAIN_SAMPLES is not None:
    dataset["train"] = dataset["train"].select(
        range(min(MAX_TRAIN_SAMPLES, len(dataset["train"])))
    )
if MAX_EVAL_SAMPLES is not None:
    dataset["test"] = dataset["test"].select(
        range(min(MAX_EVAL_SAMPLES, len(dataset["test"])))
    )
print("Použitý dataset:")
print(dataset)
# Show the start of one formatted example as a sanity check of the template.
print("Ukážka tréningového textu:")
print(dataset["train"][0]["text"][:1200])
print("=" * 80)
# 5. TRL SFTTrainer
training_args = SFTConfig(
    # Where checkpoints are written and how many are kept.
    output_dir=str(OUTPUT_DIR),
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    # Run length and batching.
    num_train_epochs=NUM_EPOCHS,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    # Optimiser and learning-rate schedule.
    optim="adamw_8bit",
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_steps=WARMUP_STEPS,
    # Mixed precision: float16, the same dtype the model was loaded with.
    fp16=True,
    bf16=False,
    # Periodic evaluation and console logging; no external experiment tracker.
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    logging_steps=10,
    report_to="none",
    # Data handling: read the formatted "text" column, no sequence packing.
    dataset_text_field="text",
    max_length=MAX_SEQ_LENGTH,
    packing=False,
)
# NOTE(review): recent TRL releases name the tokenizer argument
# `processing_class`; confirm the installed Unsloth/TRL combination still
# accepts `tokenizer=` before upgrading either package.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
# 6. Resume from a checkpoint if one exists
# last_checkpoint stays None when the output folder is missing or
# get_last_checkpoint finds nothing in it; training then starts from scratch.
last_checkpoint = None
if OUTPUT_DIR.exists():
    last_checkpoint = get_last_checkpoint(str(OUTPUT_DIR))
if last_checkpoint is not None:
    print("Pokračujem z checkpointu:", last_checkpoint)
else:
    print("Začínam nový Unsloth full tréning.")
# 7. Training
train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
print("Výsledok tréningu:")
print(train_result)
# 8. Final evaluation on the evaluation dataset given to the trainer
metrics = trainer.evaluate()
print("Finálne metriky:")
print(metrics)
# 9. Save the LoRA adapter
# The tokenizer is stored in the same folder as the adapter.
model.save_pretrained(str(ADAPTER_DIR))
tokenizer.save_pretrained(str(ADAPTER_DIR))
print("=" * 80)
print("Hotovo.")
print("Unsloth LoRA adaptér uložený do:")
print(ADAPTER_DIR)
print("=" * 80)