# File-viewer header: 248 lines, 5.5 KiB, Python
import os
from pathlib import Path

# Process-wide environment configuration. These assignments run before torch
# and the training libraries are imported further down in this file.
# We use GPU 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

# Third-party imports, kept in their original order.
# NOTE(review): unsloth is imported before trl and transformers here; Unsloth
# is understood to patch those libraries at import time - confirm before
# reordering these lines.
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from transformers.trainer_utils import get_last_checkpoint

# Hub identifiers of the base model and of the instruction dataset.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"

# Root directory of the project, located under the current user's home.
PROJECT_DIR = Path.home() / "diplomovka" / "mistral_sk_alpaca"

# Outputs of the FULL Unsloth experiment: OUTPUT_DIR is passed to the trainer
# as its output directory, ADAPTER_DIR receives the final adapter and tokenizer.
OUTPUT_DIR = PROJECT_DIR / "outputs-unsloth-full"
ADAPTER_DIR = PROJECT_DIR / "mistral-sk-7b-alpaca-slovak-unsloth-lora-full"

# Create the project and trainer-output directories up front; an already
# existing directory is not an error (exist_ok=True).
PROJECT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Training settings
MAX_SEQ_LENGTH = 1024

MAX_TRAIN_SAMPLES = None  # None = the whole train split
MAX_EVAL_SAMPLES = 1000  # validate on 1000 examples

NUM_EPOCHS = 1
MAX_STEPS = -1  # -1 = run length is driven by the number of epochs

BATCH_SIZE = 1
GRAD_ACCUM = 8
LEARNING_RATE = 2e-4

SAVE_STEPS = 1000
EVAL_STEPS = 1000
WARMUP_STEPS = 150

# Startup banner: print the experiment name, the CUDA/GPU situation and the
# main configuration values so that they end up in the run log.
print("=" * 80)
print("Experiment: Unsloth + TRL SFTTrainer + QLoRA")
print("=" * 80)

print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    # Device index 0 is queried for both the name and the memory size.
    print("GPU:", torch.cuda.get_device_name(0))
    print("VRAM GB:", round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 2))

print("Torch:", torch.__version__)
print("Model:", MODEL_NAME)
print("Dataset:", DATASET_NAME)
print("Output dir:", OUTPUT_DIR)
print("Adapter dir:", ADAPTER_DIR)
print("=" * 80)

# 1. Load the model through Unsloth in 4-bit mode
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.float16,
    load_in_4bit=True,
)

# Fall back to the EOS token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2. Attach the LoRA adapter
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    # Projection modules that receive LoRA weights.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=32,
    # NOTE(review): Unsloth's documentation describes lora_dropout=0 as the
    # optimized setting - confirm that 0.05 is intended here.
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

print("LoRA adaptér pridaný.")

# 3. Load the dataset
raw_dataset = load_dataset(DATASET_NAME)

# Print the untouched dataset object for the run log.
print("Pôvodný dataset:")
print(raw_dataset)

# 4. Formatting into the Alpaca style
def is_empty(value):
    """Return True when *value* carries no usable text.

    ``None`` counts as empty. Any other value is converted with ``str()``
    and stripped of surrounding whitespace; it counts as empty when the
    result is the empty string or equals ``"nan"`` case-insensitively.

    Args:
        value: The raw field value to inspect; any type is accepted.

    Returns:
        bool: True for an empty value, False otherwise.
    """
    if value is None:
        return True

    normalized = str(value).strip().lower()
    return normalized in ("", "nan")


def format_example(example):
    """Render one dataset row as a single Alpaca-style training string.

    The prompt consists of an ``### Inštrukcia:`` section, an optional
    ``### Vstup:`` section (emitted only when ``is_empty`` reports that the
    row's ``input`` field has content) and an ``### Odpoveď:`` section.
    Sections are separated by a blank line and the tokenizer's EOS token is
    appended at the end.

    Args:
        example: Mapping with ``"instruction"`` and ``"output"`` keys and an
            optional ``"input"`` key.

    Returns:
        dict: ``{"text": <formatted prompt + EOS token>}``.

    Raises:
        KeyError: If ``"instruction"`` or ``"output"`` is missing.
    """

    def _as_text(value):
        # A missing (None) field becomes an empty string instead of the
        # literal text "None" that str(None) would produce.
        return "" if value is None else str(value).strip()

    instruction = _as_text(example["instruction"])
    input_text = example.get("input")
    output = _as_text(example["output"])

    sections = [f"### Inštrukcia:\n{instruction}"]
    if not is_empty(input_text):
        sections.append(f"### Vstup:\n{str(input_text).strip()}")
    sections.append(f"### Odpoveď:\n{output}")

    # The module-level tokenizer supplies the end-of-sequence marker.
    return {"text": "\n\n".join(sections) + tokenizer.eos_token}


# Replace the original columns of every split with the single "text" column
# produced by format_example.
dataset = raw_dataset.map(
    format_example,
    remove_columns=raw_dataset["train"].column_names,
)

# Shuffle for better representativeness
dataset["train"] = dataset["train"].shuffle(seed=42)
dataset["test"] = dataset["test"].shuffle(seed=42)

# Optionally cap each split; the cap is clamped to the split's actual size.
if MAX_TRAIN_SAMPLES is not None:
    train_count = min(MAX_TRAIN_SAMPLES, len(dataset["train"]))
    dataset["train"] = dataset["train"].select(range(train_count))

if MAX_EVAL_SAMPLES is not None:
    eval_count = min(MAX_EVAL_SAMPLES, len(dataset["test"]))
    dataset["test"] = dataset["test"].select(range(eval_count))

print("Použitý dataset:")
print(dataset)

# Show at most the first 1200 characters of the first training example.
print("Ukážka tréningového textu:")
print(dataset["train"][0]["text"][:1200])
print("=" * 80)

# 5. TRL SFTTrainer
# Trainer configuration assembled from the constants defined near the top of
# the file.
training_args = SFTConfig(
    output_dir=str(OUTPUT_DIR),

    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,

    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    max_steps=MAX_STEPS,

    logging_steps=10,

    # Evaluate and checkpoint on a step schedule.
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,

    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,

    warmup_steps=WARMUP_STEPS,
    optim="adamw_8bit",
    # fp16 mixed precision, consistent with dtype=torch.float16 at model load.
    fp16=True,
    bf16=False,

    lr_scheduler_type="cosine",
    report_to="none",

    max_length=MAX_SEQ_LENGTH,
    packing=False,
    # Column written by format_example.
    dataset_text_field="text",
)

# Build the trainer from the LoRA-wrapped model, the formatted train/test
# splits and the configuration above.
# NOTE(review): recent TRL releases name this argument `processing_class`;
# `tokenizer=` is kept unchanged here - confirm it is accepted by the
# installed TRL/Unsloth combination.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_args,
)

# 6. Resume from a checkpoint if one exists
# last_checkpoint stays None when OUTPUT_DIR is absent or when
# get_last_checkpoint finds nothing to return.
last_checkpoint = None

if OUTPUT_DIR.exists():
    last_checkpoint = get_last_checkpoint(str(OUTPUT_DIR))

if last_checkpoint is not None:
    print("Pokračujem z checkpointu:", last_checkpoint)
else:
    print("Začínam nový Unsloth full tréning.")

# 7. Training
# last_checkpoint is either a checkpoint path or None, as determined above.
train_result = trainer.train(resume_from_checkpoint=last_checkpoint)

print("Výsledok tréningu:")
print(train_result)

# 8. Final evaluation
metrics = trainer.evaluate()

print("Finálne metriky:")
print(metrics)

# 9. Save the LoRA adapter
# The model and the tokenizer are written to the same directory.
model.save_pretrained(str(ADAPTER_DIR))
tokenizer.save_pretrained(str(ADAPTER_DIR))

print("=" * 80)
print("Hotovo.")
print("Unsloth LoRA adaptér uložený do:")
print(ADAPTER_DIR)
print("=" * 80)