Pridanie_Kodov

This commit is contained in:
Jakub Schwarc 2026-06-11 21:42:51 +02:00
parent 0ef6cf229c
commit ae919106c1
5 changed files with 1042 additions and 0 deletions

View File

@ -0,0 +1,317 @@
import os
import json
import math
from pathlib import Path
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from rouge_score import rouge_scorer
import sacrebleu
# Hugging Face id of the base model the LoRA adapter is attached to.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
# Hugging Face dataset id; this script reads only its "test" split.
DATASET_NAME = "saillab/alpaca-slovak-cleaned"
# Your Unsloth + TRL + QLoRA adapter
ADAPTER_DIR = "/home/schwarc/diplomovka/mistral_sk_alpaca/mistral-sk-7b-alpaca-slovak-unsloth-lora-full"
# Directory for evaluation results; created on import if it does not exist yet.
PROJECT_DIR = Path.home() / "diplomovka" / "evaluation_results"
PROJECT_DIR.mkdir(parents=True, exist_ok=True)
# JSON file that receives the final metrics.
METRICS_FILE = PROJECT_DIR / "unsloth_lora_metrics.json"
# Upper bound on the number of evaluated examples; None evaluates the whole split.
NUM_EVAL_SAMPLES = 1000
# Token limit applied when prompts and full texts are tokenized.
MAX_LENGTH = 1024
# Maximum number of tokens generated per answer.
MAX_NEW_TOKENS = 300
# Deterministic generation is preferable for evaluation
DO_SAMPLE = False
def is_empty(value):
    """Tell whether *value* carries no usable text.

    ``None``, blank or whitespace-only text and the literal ``"nan"`` (any
    letter case) are treated as empty. Non-string values are stringified
    before the check.
    """
    return value is None or str(value).strip().lower() in ("", "nan")
def make_prompt(example):
    """Build the Slovak Alpaca-style prompt for one dataset record.

    Args:
        example: Mapping with an ``"instruction"`` entry and an optional
            ``"input"`` entry.

    Returns:
        The prompt text ending with the answer header. The input section is
        left out when ``is_empty`` reports the input as empty.
    """
    instruction = str(example["instruction"]).strip()
    extra = example.get("input")

    pieces = ["### Inštrukcia:\n", f"{instruction}\n\n"]
    if not is_empty(extra):
        pieces.append("### Vstup:\n")
        pieces.append(f"{str(extra).strip()}\n\n")
    pieces.append("### Odpoveď:\n")
    return "".join(pieces)
def make_full_text(example, tokenizer):
    """Return prompt, stripped reference answer and the EOS token as one string."""
    head = make_prompt(example)
    answer = str(example["output"]).strip()
    return "".join((head, answer, tokenizer.eos_token))
def load_model():
    """Load the tokenizer and the 4-bit base model, then attach the LoRA adapter.

    Returns:
        tuple: ``(model, tokenizer)``; the model is switched to eval mode.

    Raises:
        FileNotFoundError: If ``ADAPTER_DIR`` does not exist.
    """
    # Fail early with a clear message instead of a confusing loader error
    # after the base model has already been loaded.
    if not Path(ADAPTER_DIR).exists():
        raise FileNotFoundError(f"Adaptér neexistuje: {ADAPTER_DIR}")

    print("CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))

    print("Načítavam tokenizer...")
    # The tokenizer is read from the adapter directory, not from the base model id.
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False)
    if tokenizer.pad_token is None:
        # Reuse EOS as the padding token when none is configured.
        tokenizer.pad_token = tokenizer.eos_token

    print("Načítavam základný model v 4-bit režime...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map={"": 0},
        dtype=torch.float16,
    )

    print("Pripájam LoRA adaptér...")
    model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
    model.eval()

    print("Model pripravený.")
    print("-" * 80)
    return model, tokenizer
def load_eval_dataset():
    """Return the shuffled test split, capped at ``NUM_EVAL_SAMPLES`` rows.

    The split is shuffled with the fixed seed 42 before the cap is applied.
    When ``NUM_EVAL_SAMPLES`` is ``None`` the whole split is returned.
    """
    test_split = load_dataset(DATASET_NAME)["test"].shuffle(seed=42)
    if NUM_EVAL_SAMPLES is not None:
        keep = min(NUM_EVAL_SAMPLES, len(test_split))
        test_split = test_split.select(range(keep))
    print("Eval vzoriek:", len(test_split))
    return test_split
def generate_predictions(model, tokenizer, eval_dataset):
    """Generate one answer per example and pair it with the reference answer.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode answers.
        eval_dataset: Iterable of records with ``"instruction"``, optional
            ``"input"`` and ``"output"`` entries.

    Returns:
        tuple: ``(predictions, references)``, two lists of stripped strings
        in dataset order.
    """
    # Settings shared by both decoding modes.
    generation_kwargs = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    if DO_SAMPLE:
        # Sampling-only settings.
        generation_kwargs.update(do_sample=True, temperature=0.7, top_p=0.9)

    predictions, references = [], []
    for record in tqdm(eval_dataset, desc="Generujem odpovede"):
        encoded = tokenizer(
            make_prompt(record),
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(model.device)
        prompt_tokens = encoded["input_ids"].shape[-1]

        with torch.no_grad():
            generated = model.generate(**encoded, **generation_kwargs)

        # Decode only the newly generated tokens, not the echoed prompt.
        answer = tokenizer.decode(
            generated[0][prompt_tokens:],
            skip_special_tokens=True,
        ).strip()

        predictions.append(answer)
        references.append(str(record["output"]).strip())
    return predictions, references
class _UnicodeWordTokenizer:
    """Word tokenizer for ``RougeScorer`` that keeps letters with diacritics."""

    def tokenize(self, text):
        """Return the lower-cased Unicode word tokens of *text*."""
        import re

        return re.findall(r"\w+", text.lower())


def compute_rouge(predictions, references):
    """Compute mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    The scorer is given a Unicode-aware tokenizer. The default ``rouge_score``
    tokenizer keeps only ``[a-z0-9]``, which would split Slovak words at every
    accented letter and distort the n-gram overlap.

    Args:
        predictions: Generated answers.
        references: Reference answers, aligned with ``predictions``.

    Returns:
        dict: Keys ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` mapped to the
        mean F-measure. All values are ``0.0`` when there is nothing to score.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references "
            f"({len(references)}) must have the same length"
        )

    metric_names = ("rouge1", "rouge2", "rougeL")
    if not predictions:
        # Nothing to score; avoid dividing by zero below.
        return dict.fromkeys(metric_names, 0.0)

    scorer = rouge_scorer.RougeScorer(
        list(metric_names),
        use_stemmer=False,
        tokenizer=_UnicodeWordTokenizer(),
    )

    totals = dict.fromkeys(metric_names, 0.0)
    for prediction, reference in zip(predictions, references):
        scores = scorer.score(reference, prediction)
        for name in metric_names:
            totals[name] += scores[name].fmeasure

    count = len(predictions)
    return {name: totals[name] / count for name in metric_names}
def compute_bleu(predictions, references):
    """Return the corpus-level BLEU score with one reference per prediction."""
    result = sacrebleu.corpus_bleu(predictions, [references])
    return result.score
def compute_perplexity(model, tokenizer, eval_dataset):
    """Compute the token-weighted loss and perplexity over reference answers.

    Only answer tokens are scored: label positions that belong to the prompt
    are set to -100. Each example is run on its own without padding, so the
    prompt mask cannot be shifted by a left-padding tokenizer and no compute
    is spent on pad tokens.

    Args:
        model: Causal language model returning ``.loss`` when given labels.
        tokenizer: Tokenizer matching the model.
        eval_dataset: Iterable of records with ``"instruction"``, optional
            ``"input"`` and ``"output"`` entries.

    Returns:
        tuple: ``(avg_loss, perplexity)`` where ``perplexity = exp(avg_loss)``.

    Raises:
        ValueError: If no example contributes any answer token.
    """
    total_loss = 0.0
    total_tokens = 0

    for example in tqdm(eval_dataset, desc="Počítam perplexitu"):
        prompt_ids = tokenizer(
            make_prompt(example),
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
        )["input_ids"]

        # No padding: the batch holds one sequence, so padding would only add
        # wasted positions and make the prefix mask depend on the padding side.
        encoded = tokenizer(
            make_full_text(example, tokenizer),
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        input_ids = encoded["input_ids"].to(model.device)
        attention_mask = encoded["attention_mask"].to(model.device)

        labels = input_ids.clone()
        labels[:, :len(prompt_ids)] = -100

        # Count the label positions after the first one, assuming the model
        # shifts labels by one internally before averaging the loss.
        valid_tokens = (labels[:, 1:] != -100).sum().item()
        if valid_tokens == 0:
            # The prompt alone filled the window; nothing to score.
            continue

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

        # outputs.loss is a per-token mean; re-weight it by the token count so
        # that long answers count proportionally in the corpus average.
        total_loss += outputs.loss.item() * valid_tokens
        total_tokens += valid_tokens

    if total_tokens == 0:
        raise ValueError(
            "Perplexitu nemožno vypočítať: žiadna vzorka neobsahuje tokeny odpovede."
        )

    avg_loss = total_loss / total_tokens
    return avg_loss, math.exp(avg_loss)
def save_metrics(metrics):
    """Write *metrics* to ``METRICS_FILE`` as indented UTF-8 JSON and report the path."""
    with METRICS_FILE.open("w", encoding="utf-8") as handle:
        handle.write(json.dumps(metrics, ensure_ascii=False, indent=2))
    print("Výsledné metriky uložené do:")
    print(METRICS_FILE)
def main():
    """Run the evaluation: generate answers, score them, print and save metrics."""
    model, tokenizer = load_model()
    eval_dataset = load_eval_dataset()
    predictions, references = generate_predictions(model, tokenizer, eval_dataset)

    print("Počítam ROUGE...")
    rouge_scores = compute_rouge(predictions, references)
    print("Počítam BLEU...")
    bleu_score = compute_bleu(predictions, references)
    print("Počítam perplexitu...")
    eval_loss, perplexity = compute_perplexity(model, tokenizer, eval_dataset)

    metrics = {
        "num_eval_samples": len(eval_dataset),
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"],
        "bleu": bleu_score,
        "eval_loss": eval_loss,
        "perplexity": perplexity,
        "adapter_dir": ADAPTER_DIR,
        "model_name": MODEL_NAME,
        "dataset_name": DATASET_NAME,
    }

    separator = "=" * 80
    print(separator)
    print("FINÁLNE METRIKY")
    print(separator)
    # (label, metrics key) pairs in the order they are reported.
    report_rows = (
        ("ROUGE-1", "rouge1"),
        ("ROUGE-2", "rouge2"),
        ("ROUGE-L", "rougeL"),
        ("BLEU", "bleu"),
        ("Eval loss", "eval_loss"),
        ("Perplexity", "perplexity"),
    )
    for label, key in report_rows:
        print(f"{label}: {metrics[key]:.4f}")
    print(separator)

    save_metrics(metrics)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,89 @@
top.booster: unsloth
top.checkpoint_path: []
top.finetuning_type: lora
top.model_name: Custom
top.quantization_bit: '4'
top.quantization_method: bnb
top.rope_scaling: none
top.template: alpaca
train.additional_target: ''
train.apollo_rank: 16
train.apollo_scale: 32
train.apollo_target: all
train.apollo_update_interval: 200
train.badam_mode: layer
train.badam_switch_interval: 50
train.badam_switch_mode: ascending
train.badam_update_ratio: 0.05
train.batch_size: 1
train.compute_type: fp16
train.create_new_adapter: false
train.cutoff_len: 1024
train.dataset:
- alpaca_slovak_cleaned
train.dataset_dir: data
train.ds_offload: false
train.ds_stage: none
train.enable_thinking: false
train.extra_args: '{"optim": "adamw_8bit", "eval_steps": 1000, "eval_strategy": "steps",
"save_total_limit": 2}'
train.freeze_extra_modules: ''
train.freeze_language_model: false
train.freeze_multi_modal_projector: true
train.freeze_trainable_layers: 2
train.freeze_trainable_modules: all
train.freeze_vision_tower: true
train.galore_rank: 16
train.galore_scale: 2
train.galore_target: all
train.galore_update_interval: 200
train.gradient_accumulation_steps: 8
train.hub_private_repo: false
train.image_max_pixels: 768*768
train.image_min_pixels: 32*32
train.learning_rate: 2e-4
train.logging_steps: 5
train.lora_alpha: 32
train.lora_dropout: 0.05
train.lora_rank: 16
train.lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
train.loraplus_lr_ratio: 0
train.lr_scheduler_type: cosine
train.mask_history: false
train.max_grad_norm: '1.0'
train.max_samples: '50000'
train.neat_packing: false
train.neftune_alpha: 0
train.num_train_epochs: '1.0'
train.packing: false
train.ppo_score_norm: false
train.ppo_whiten_rewards: false
train.pref_beta: 0.1
train.pref_ftx: 0
train.pref_loss: sigmoid
train.project: huggingface
train.report_to: none
train.resize_vocab: false
train.reward_model: []
train.save_steps: 1000
train.swanlab_api_key: ''
train.swanlab_link: null
train.swanlab_mode: cloud
train.swanlab_project: llamafactory
train.swanlab_run_name: ''
train.swanlab_workspace: ''
train.trackio_space_id: trackio
train.train_on_prompt: false
train.training_stage: Supervised Fine-Tuning
train.use_apollo: false
train.use_badam: false
train.use_dora: false
train.use_galore: false
train.use_llama_pro: false
train.use_pissa: false
train.use_rslora: false
train.use_swanlab: false
train.val_size: 0.025
train.video_max_pixels: 256*256
train.video_min_pixels: 16*16
train.warmup_steps: 150

141
kody/test_mistral_lora.py Normal file
View File

@ -0,0 +1,141 @@
import os
from pathlib import Path
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
# Hugging Face id of the base model.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
# Local directory from which both the LoRA adapter and the tokenizer are loaded.
ADAPTER_DIR = "/home/schwarc/diplomovka/mistral_sk_alpaca/mistral-sk-7b-alpaca-slovak-unsloth-lora-full"
# Maximum number of tokens generated per answer.
MAX_NEW_TOKENS = 300
def make_prompt(instruction, input_text=""):
    """Build the Slovak Alpaca-style prompt for an instruction.

    Args:
        instruction: Instruction text; surrounding whitespace is stripped.
        input_text: Optional extra input. ``None``, an empty string or
            whitespace-only text leaves the input section out.

    Returns:
        The prompt text ending with the answer header.
    """
    instruction = instruction.strip()
    # Treat None like "no input" instead of failing on None.strip().
    input_text = (input_text or "").strip()

    if input_text:
        return (
            "### Inštrukcia:\n"
            f"{instruction}\n\n"
            "### Vstup:\n"
            f"{input_text}\n\n"
            "### Odpoveď:\n"
        )
    return (
        "### Inštrukcia:\n"
        f"{instruction}\n\n"
        "### Odpoveď:\n"
    )
def load_model():
    """Load the tokenizer and the 4-bit base model, then attach the LoRA adapter.

    Returns:
        tuple: ``(model, tokenizer)``; the model is switched to eval mode.

    Raises:
        FileNotFoundError: If ``ADAPTER_DIR`` does not exist.
    """
    if not Path(ADAPTER_DIR).exists():
        raise FileNotFoundError(f"Adaptér neexistuje: {ADAPTER_DIR}")

    has_cuda = torch.cuda.is_available()
    print("CUDA available:", has_cuda)
    if has_cuda:
        print("GPU:", torch.cuda.get_device_name(0))

    print("Načítavam tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False)
    if tokenizer.pad_token is None:
        # Reuse EOS as the padding token when none is configured.
        tokenizer.pad_token = tokenizer.eos_token

    print("Načítavam základný model v 4-bit režime...")
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    backbone = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization,
        device_map={"": 0},
        dtype=torch.float16,
    )

    print("Pripájam LoRA adaptér...")
    model = PeftModel.from_pretrained(backbone, ADAPTER_DIR)
    model.eval()

    print("Model je pripravený.")
    print("-" * 80)
    return model, tokenizer
def generate_answer(model, tokenizer, instruction):
    """Sample one answer for *instruction* and return it as stripped text.

    The prompt is built without an input section and truncated to 1024
    tokens. Only the newly generated tokens are decoded.
    """
    encoded = tokenizer(
        make_prompt(instruction),
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(model.device)
    prompt_tokens = encoded["input_ids"].shape[-1]

    sampling_kwargs = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    with torch.no_grad():
        generated = model.generate(**encoded, **sampling_kwargs)

    # Skip the echoed prompt and keep only the continuation.
    continuation = generated[0][prompt_tokens:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()
def main():
    """Run an interactive loop that answers instructions typed by the user.

    The loop ends on ``exit``, ``quit`` or ``koniec``, and also on end of
    input (Ctrl-D, closed pipe) or Ctrl-C, so the session never ends with a
    traceback. Empty lines are ignored.
    """
    model, tokenizer = load_model()

    print("Napíš inštrukciu.")
    print("Ukončenie: exit, quit alebo koniec")
    print("-" * 80)

    while True:
        try:
            instruction = input("\nInštrukcia: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Treat end of input or an interrupt like an explicit exit command.
            print("\nKoniec.")
            break

        if instruction.lower() in ("exit", "quit", "koniec"):
            print("Koniec.")
            break
        if not instruction:
            continue

        answer = generate_answer(model, tokenizer, instruction)
        print("\nOdpoveď:")
        print(answer)
        print("-" * 80)


if __name__ == "__main__":
    main()

247
kody/train_mistral_full.py Normal file
View File

@ -0,0 +1,247 @@
import os
from pathlib import Path
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
import torch
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
BitsAndBytesConfig,
Trainer,
TrainingArguments,
default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# Model and dataset identifiers on the Hugging Face Hub.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"

# Working directories; both are created up front.
PROJECT_DIR = Path.home().joinpath("diplomovka", "mistral_sk_alpaca")
OUTPUT_DIR = PROJECT_DIR.joinpath("outputs-full")
ADAPTER_DIR = PROJECT_DIR.joinpath("mistral-sk-7b-alpaca-slovak-lora-full")
PROJECT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Dataset limits; None keeps the whole split.
MAX_TRAIN_SAMPLES = None
MAX_EVAL_SAMPLES = 1000
MAX_LENGTH = 1024

# Optimisation schedule.
BATCH_SIZE = 1
GRAD_ACCUM = 8
LEARNING_RATE = 2e-4
NUM_EPOCHS = 1
SAVE_STEPS = 1000
EVAL_STEPS = 1000
WARMUP_STEPS = 150
MAX_STEPS = -1

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print(
        "VRAM GB:",
        round(torch.cuda.get_device_properties(0).total_memory / 2**30, 2),
    )

print("Project dir:", PROJECT_DIR)
print("Output dir:", OUTPUT_DIR)
print("Adapter dir:", ADAPTER_DIR)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when none is configured.
    tokenizer.pad_token = tokenizer.eos_token

raw_dataset = load_dataset(DATASET_NAME)
print(raw_dataset)
def is_empty(value):
    """Report whether *value* is missing, blank or the literal ``"nan"``.

    Non-string values are stringified first; the ``"nan"`` comparison ignores
    letter case.
    """
    if value is None:
        return True
    text = str(value).strip()
    if text == "":
        return True
    return text.lower() == "nan"
def build_prompt(example):
    """Turn one raw record into prompt, completion and full training text.

    Returns:
        dict: ``"prompt"`` (ends with the answer header), ``"completion"``
        (stripped output followed by the EOS token) and ``"text"`` (both
        concatenated).
    """
    instruction = str(example["instruction"]).strip()
    input_text = example.get("input")

    prompt = f"### Inštrukcia:\n{instruction}\n\n"
    if not is_empty(input_text):
        prompt += f"### Vstup:\n{str(input_text).strip()}\n\n"
    prompt += "### Odpoveď:\n"

    completion = str(example["output"]).strip() + tokenizer.eos_token
    return dict(prompt=prompt, completion=completion, text=prompt + completion)
# Replace the raw columns by the prompt / completion / text columns.
dataset = raw_dataset.map(
    build_prompt,
    remove_columns=raw_dataset["train"].column_names,
)

# Shuffle with a fixed seed before taking subsets. The Unsloth training script
# and the evaluation script both select the first rows after shuffle(seed=42),
# so this keeps all experiments on the same evaluation examples.
dataset["train"] = dataset["train"].shuffle(seed=42)
dataset["test"] = dataset["test"].shuffle(seed=42)

if MAX_TRAIN_SAMPLES is not None:
    dataset["train"] = dataset["train"].select(
        range(min(MAX_TRAIN_SAMPLES, len(dataset["train"])))
    )
if MAX_EVAL_SAMPLES is not None:
    dataset["test"] = dataset["test"].select(
        range(min(MAX_EVAL_SAMPLES, len(dataset["test"])))
    )
print(dataset)
def tokenize_example(example):
    """Tokenize one formatted record into fixed-length training features.

    The loss is restricted to the completion: label positions that belong to
    the prompt or to padding are set to -100. Padding is appended manually on
    the right, so the prompt mask stays aligned whatever ``padding_side`` the
    tokenizer is configured with.

    Args:
        example: Record with ``"prompt"`` and ``"text"`` entries.

    Returns:
        dict: ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each a
        list of exactly ``MAX_LENGTH`` integers.
    """
    prompt_ids = tokenizer(
        example["prompt"],
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_LENGTH,
    )["input_ids"]

    # Tokenize without padding; padding is added below on a known side.
    full = tokenizer(
        example["text"],
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_LENGTH,
    )
    input_ids = list(full["input_ids"])
    attention_mask = list(full["attention_mask"])

    # Mask the prompt prefix so only completion tokens contribute to the loss.
    prompt_len = min(len(prompt_ids), len(input_ids))
    labels = [-100] * prompt_len + input_ids[prompt_len:]

    pad_count = MAX_LENGTH - len(input_ids)
    if pad_count > 0:
        input_ids += [tokenizer.pad_token_id] * pad_count
        attention_mask += [0] * pad_count
        labels += [-100] * pad_count

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }
# Tokenize both splits and drop the text columns.
tokenized_dataset = dataset.map(
    tokenize_example,
    remove_columns=dataset["train"].column_names,
    num_proc=1,
)
print(tokenized_dataset)
print("Tokenizácia hotová.")

# Base model in 4-bit NF4 quantization with fp16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map={"": 0},
    dtype=torch.float16,
)
model.config.use_cache = False
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapter on the attention and MLP projection layers.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    # Batching and schedule
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    max_steps=MAX_STEPS,
    warmup_steps=WARMUP_STEPS,
    lr_scheduler_type="cosine",
    # Precision and optimizer
    fp16=True,
    bf16=False,
    optim="paged_adamw_8bit",
    max_grad_norm=0.3,
    gradient_checkpointing=True,
    # Logging, checkpoints and evaluation
    logging_steps=10,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    report_to="none",
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=default_data_collator,
)

# Resume from the newest checkpoint in OUTPUT_DIR when one exists.
last_checkpoint = (
    get_last_checkpoint(str(OUTPUT_DIR)) if OUTPUT_DIR.exists() else None
)
if last_checkpoint is None:
    print("Začínam nový tréning.")
else:
    print("Pokračujem z checkpointu:", last_checkpoint)

trainer.train(resume_from_checkpoint=last_checkpoint)

metrics = trainer.evaluate()
print(metrics)

# Store the adapter together with the tokenizer files.
trainer.save_model(str(ADAPTER_DIR))
tokenizer.save_pretrained(str(ADAPTER_DIR))

print("Hotovo.")
print("LoRA adaptér uložený do:")
print(ADAPTER_DIR)

View File

@ -0,0 +1,248 @@
import os
from pathlib import Path
# Použijeme GPU 0
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from transformers.trainer_utils import get_last_checkpoint
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"

PROJECT_DIR = Path.home().joinpath("diplomovka", "mistral_sk_alpaca")

# Outputs of the FULL Unsloth experiment
OUTPUT_DIR = PROJECT_DIR.joinpath("outputs-unsloth-full")
ADAPTER_DIR = PROJECT_DIR.joinpath("mistral-sk-7b-alpaca-slovak-unsloth-lora-full")
PROJECT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Training settings
MAX_SEQ_LENGTH = 1024
MAX_TRAIN_SAMPLES = None  # None = the whole train split
MAX_EVAL_SAMPLES = 1000  # validate on 1000 examples
NUM_EPOCHS = 1
MAX_STEPS = -1  # -1 = run length is governed by the epoch count
BATCH_SIZE = 1
GRAD_ACCUM = 8
LEARNING_RATE = 2e-4
SAVE_STEPS = 1000
EVAL_STEPS = 1000
WARMUP_STEPS = 150

print("=" * 80)
print("Experiment: Unsloth + TRL SFTTrainer + QLoRA")
print("=" * 80)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print(
        "VRAM GB:",
        round(torch.cuda.get_device_properties(0).total_memory / 2**30, 2),
    )
print("Torch:", torch.__version__)
print("Model:", MODEL_NAME)
print("Dataset:", DATASET_NAME)
print("Output dir:", OUTPUT_DIR)
print("Adapter dir:", ADAPTER_DIR)
print("=" * 80)

# 1. Load the model through Unsloth in 4-bit mode
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.float16,
    load_in_4bit=True,
)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when none is configured.
    tokenizer.pad_token = tokenizer.eos_token

# 2. Add the LoRA adapter
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    random_state=42,
)
print("LoRA adaptér pridaný.")

# 3. Load the dataset
raw_dataset = load_dataset(DATASET_NAME)
print("Pôvodný dataset:")
print(raw_dataset)
# 4. Alpaca-style formatting
def is_empty(value):
    """Return True for ``None``, blank text or the literal ``"nan"`` (any case).

    Non-string values are stringified before the check.
    """
    if value is None:
        return True
    normalized = str(value).strip().lower()
    return normalized in ("", "nan")
def format_example(example):
    """Format one raw record as a single Alpaca-style training text.

    Returns:
        dict: A single ``"text"`` entry holding instruction, optional input
        section, answer and the tokenizer's EOS token.
    """
    instruction = str(example["instruction"]).strip()
    input_text = example.get("input")
    output = str(example["output"]).strip()

    sections = [f"### Inštrukcia:\n{instruction}\n\n"]
    if not is_empty(input_text):
        sections.append(f"### Vstup:\n{str(input_text).strip()}\n\n")
    sections.append(f"### Odpoveď:\n{output}")

    return {"text": "".join(sections) + tokenizer.eos_token}
dataset = raw_dataset.map(
    format_example,
    remove_columns=raw_dataset["train"].column_names,
)

# Shuffle so that any subset taken below is more representative
dataset["train"] = dataset["train"].shuffle(seed=42)
dataset["test"] = dataset["test"].shuffle(seed=42)

if MAX_TRAIN_SAMPLES is not None:
    dataset["train"] = dataset["train"].select(
        range(min(MAX_TRAIN_SAMPLES, len(dataset["train"])))
    )
if MAX_EVAL_SAMPLES is not None:
    dataset["test"] = dataset["test"].select(
        range(min(MAX_EVAL_SAMPLES, len(dataset["test"])))
    )

print("Použitý dataset:")
print(dataset)
print("Ukážka tréningového textu:")
print(dataset["train"][0]["text"][:1200])
print("=" * 80)

# 5. TRL SFTTrainer
training_args = SFTConfig(
    output_dir=str(OUTPUT_DIR),
    # Batching and schedule
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    max_steps=MAX_STEPS,
    warmup_steps=WARMUP_STEPS,
    lr_scheduler_type="cosine",
    # Precision and optimizer
    optim="adamw_8bit",
    fp16=True,
    bf16=False,
    # Logging, evaluation and checkpoints
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    report_to="none",
    # Dataset handling
    max_length=MAX_SEQ_LENGTH,
    packing=False,
    dataset_text_field="text",
)

# NOTE(review): newer TRL releases name this argument `processing_class`;
# `tokenizer=` presumably relies on Unsloth's patched trainer - confirm
# against the installed versions.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_args,
)

# 6. Resume from a checkpoint when one exists
last_checkpoint = (
    get_last_checkpoint(str(OUTPUT_DIR)) if OUTPUT_DIR.exists() else None
)
if last_checkpoint is None:
    print("Začínam nový Unsloth full tréning.")
else:
    print("Pokračujem z checkpointu:", last_checkpoint)

# 7. Training
train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
print("Výsledok tréningu:")
print(train_result)

# 8. Final evaluation
metrics = trainer.evaluate()
print("Finálne metriky:")
print(metrics)

# 9. Save the LoRA adapter
model.save_pretrained(str(ADAPTER_DIR))
tokenizer.save_pretrained(str(ADAPTER_DIR))

print("=" * 80)
print("Hotovo.")
print("Unsloth LoRA adaptér uložený do:")
print(ADAPTER_DIR)
print("=" * 80)