Pridanie_Kodov
This commit is contained in:
parent
0ef6cf229c
commit
ae919106c1
317
kody/evaluate_mistral_lora_metrics.py
Normal file
317
kody/evaluate_mistral_lora_metrics.py
Normal file
@ -0,0 +1,317 @@
|
||||
import os
|
||||
import json
|
||||
import math
|
||||
from pathlib import Path
|
||||
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
||||
from peft import PeftModel
|
||||
from rouge_score import rouge_scorer
|
||||
import sacrebleu
|
||||
|
||||
|
||||
# Base checkpoint and evaluation dataset on the Hugging Face Hub.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"

# The Unsloth + TRL + QLoRA adapter that is being evaluated.
ADAPTER_DIR = "/home/schwarc/diplomovka/mistral_sk_alpaca/mistral-sk-7b-alpaca-slovak-unsloth-lora-full"

# Metrics are written below ~/diplomovka/evaluation_results (created on import).
PROJECT_DIR = Path.home().joinpath("diplomovka", "evaluation_results")
PROJECT_DIR.mkdir(parents=True, exist_ok=True)
METRICS_FILE = PROJECT_DIR.joinpath("unsloth_lora_metrics.json")

NUM_EVAL_SAMPLES = 1000  # None = use the whole test split
MAX_LENGTH = 1024        # token limit applied when tokenizing prompts / full texts
MAX_NEW_TOKENS = 300     # generation budget per answer

# Deterministic (greedy) generation is preferable for evaluation.
DO_SAMPLE = False
|
||||
|
||||
|
||||
def is_empty(value):
    """Return True when *value* carries no usable text.

    ``None``, strings that are blank after stripping, and anything whose
    string form is "nan" (case-insensitive) count as empty.
    """
    if value is None:
        return True

    text = str(value).strip()
    # "".lower() is still "", so one membership test covers both cases.
    return text.lower() in ("", "nan")
|
||||
|
||||
|
||||
def make_prompt(example):
    """Build the Alpaca-style Slovak prompt for one dataset record.

    Args:
        example: Mapping with an "instruction" key and an optional "input" key.

    Returns:
        The prompt text ending in the "### Odpoveď:" header; the "### Vstup:"
        section is included only when the input is not empty per ``is_empty``.
    """
    sections = ["### Inštrukcia:\n" + str(example["instruction"]).strip() + "\n\n"]

    context = example.get("input")
    if not is_empty(context):
        sections.append("### Vstup:\n" + str(context).strip() + "\n\n")

    sections.append("### Odpoveď:\n")
    return "".join(sections)
|
||||
|
||||
|
||||
def make_full_text(example, tokenizer):
    """Return prompt + stripped reference answer + the tokenizer's EOS token.

    Args:
        example: Mapping accepted by ``make_prompt`` that also has an "output" key.
        tokenizer: Object exposing an ``eos_token`` string attribute.
    """
    # The tuple is evaluated left to right: prompt first, then the reference.
    pieces = (
        make_prompt(example),
        str(example["output"]).strip(),
        tokenizer.eos_token,
    )
    return "".join(pieces)
|
||||
|
||||
|
||||
def load_model():
    """Load the tokenizer, the 4-bit base model, and attach the LoRA adapter.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is the ``PeftModel`` wrapper
        after ``eval()`` has been called on it.
    """
    has_cuda = torch.cuda.is_available()
    print("CUDA available:", has_cuda)
    if has_cuda:
        print("GPU:", torch.cuda.get_device_name(0))

    print("Načítavam tokenizer...")
    # The tokenizer is read from the adapter directory, not from MODEL_NAME.
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False)
    if tokenizer.pad_token is None:
        # Reuse EOS as the pad token so a pad id is always available.
        tokenizer.pad_token = tokenizer.eos_token

    print("Načítavam základný model v 4-bit režime...")
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    backbone = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization,
        device_map={"": 0},  # place the whole model on device index 0
        dtype=torch.float16,
    )

    print("Pripájam LoRA adaptér...")
    model = PeftModel.from_pretrained(backbone, ADAPTER_DIR)
    model.eval()

    print("Model pripravený.")
    print("-" * 80)
    return model, tokenizer
|
||||
|
||||
|
||||
def load_eval_dataset():
    """Load the "test" split, shuffle it with seed 42, and optionally subsample it.

    Returns:
        The shuffled test split, cut to the first ``NUM_EVAL_SAMPLES`` rows
        unless that constant is ``None``.
    """
    shuffled = load_dataset(DATASET_NAME)["test"].shuffle(seed=42)

    if NUM_EVAL_SAMPLES is None:
        eval_dataset = shuffled
    else:
        # Never ask for more rows than the split actually has.
        keep = min(NUM_EVAL_SAMPLES, len(shuffled))
        eval_dataset = shuffled.select(range(keep))

    print("Eval vzoriek:", len(eval_dataset))
    return eval_dataset
|
||||
|
||||
|
||||
def generate_predictions(model, tokenizer, eval_dataset):
    """Generate one answer per evaluation example.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode answers.
        eval_dataset: Iterable of records with "instruction", optional "input",
            and "output".

    Returns:
        ``(predictions, references)`` - two parallel lists of stripped strings.
    """
    # Settings shared by both decoding modes; sampling adds temperature/top_p.
    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": bool(DO_SAMPLE),
        "repetition_penalty": 1.1,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if DO_SAMPLE:
        generation_kwargs.update(temperature=0.7, top_p=0.9)

    predictions, references = [], []

    for record in tqdm(eval_dataset, desc="Generujem odpovede"):
        prompt = make_prompt(record)
        reference = str(record["output"]).strip()

        encoded_prompt = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(model.device)
        prompt_token_count = encoded_prompt["input_ids"].shape[-1]

        with torch.no_grad():
            generated = model.generate(**encoded_prompt, **generation_kwargs)

        # generate() returns prompt + continuation; decode only the new tokens.
        new_tokens = generated[0][prompt_token_count:]
        answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        predictions.append(answer)
        references.append(reference)

    return predictions, references
|
||||
|
||||
|
||||
def compute_rouge(predictions, references):
    """Compute mean ROUGE-1 / ROUGE-2 / ROUGE-L F-measures over paired texts.

    The default ``rouge_score`` tokenizer keeps only ``[a-z0-9]`` and therefore
    splits Slovak words at every accented letter. A Unicode-aware word
    tokenizer is passed to the scorer instead, so words with diacritics are
    compared as whole tokens.

    Args:
        predictions: Generated answers.
        references: Reference answers, parallel to ``predictions``.

    Returns:
        Dict with "rouge1", "rouge2" and "rougeL" mean F-measures; all values
        are 0.0 when there are no pairs to score.
    """
    import re  # local import: `re` is not among this file's module-level imports

    class _UnicodeWordTokenizer:
        """Lower-cases text and splits it into runs of Unicode letters/digits."""

        # [^\W_] = any Unicode word character except the underscore.
        _word = re.compile(r"[^\W_]+", re.UNICODE)

        def tokenize(self, text):
            return self._word.findall(text.lower())

    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=False,
        tokenizer=_UnicodeWordTokenizer(),
    )

    totals = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
    pair_count = 0

    for prediction, reference in zip(predictions, references):
        # RougeScorer.score takes (target, prediction) in that order.
        scores = scorer.score(reference, prediction)
        for name in totals:
            totals[name] += scores[name].fmeasure
        pair_count += 1

    if pair_count == 0:
        # Nothing to average; avoid ZeroDivisionError.
        return {name: 0.0 for name in totals}

    return {name: total / pair_count for name, total in totals.items()}
|
||||
|
||||
|
||||
def compute_bleu(predictions, references):
    """Return the corpus-level sacreBLEU score for the generated answers.

    Args:
        predictions: Generated answers.
        references: Reference answers, parallel to ``predictions``.
    """
    # sacreBLEU expects a list of reference streams; only one is available here.
    reference_streams = [references]
    return sacrebleu.corpus_bleu(predictions, reference_streams).score
|
||||
|
||||
|
||||
def compute_perplexity(model, tokenizer, eval_dataset):
    """Compute token-weighted loss and perplexity over reference answers only.

    Each example is scored on its own without padding. Prompt positions are
    excluded from the loss by setting their labels to -100.

    Args:
        model: Causal LM called with ``input_ids``, ``attention_mask`` and
            ``labels``; it must return an object with a scalar ``loss``.
        tokenizer: Tokenizer used for both the prompt and the full text.
        eval_dataset: Iterable of records accepted by ``make_prompt`` /
            ``make_full_text``.

    Returns:
        ``(avg_loss, perplexity)`` where ``perplexity = exp(avg_loss)``.

    Raises:
        ValueError: If no example contributes any answer token (for example an
            empty dataset, or every answer truncated away by ``MAX_LENGTH``).
    """
    total_loss = 0.0
    total_tokens = 0

    for example in tqdm(eval_dataset, desc="Počítam perplexitu"):
        prompt = make_prompt(example)
        full_text = make_full_text(example, tokenizer)

        prompt_ids = tokenizer(
            prompt,
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
        )["input_ids"]

        # No padding: examples are scored one at a time. Padding to MAX_LENGTH
        # made the prompt mask below depend on tokenizer.padding_side (with
        # left padding the first `prompt_len` positions are pads, not the
        # prompt) and processed MAX_LENGTH positions for every example.
        encoded = tokenizer(
            full_text,
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )

        input_ids = encoded["input_ids"].to(model.device)
        attention_mask = encoded["attention_mask"].to(model.device)

        labels = input_ids.clone()
        prompt_len = min(len(prompt_ids), input_ids.shape[-1])
        labels[:, :prompt_len] = -100

        # Hugging Face causal-LM heads shift labels by one position, so the
        # label at position 0 never enters the loss; count from position 1 so
        # the weight matches the number of tokens the mean loss covers.
        valid_tokens = (labels[:, 1:] != -100).sum().item()
        if valid_tokens == 0:
            continue

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

        # outputs.loss is a per-token mean; re-weight it to get a corpus mean.
        total_loss += outputs.loss.item() * valid_tokens
        total_tokens += valid_tokens

    if total_tokens == 0:
        raise ValueError(
            "Cannot compute perplexity: no example contains answer tokens "
            "within MAX_LENGTH."
        )

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)

    return avg_loss, perplexity
|
||||
|
||||
|
||||
def save_metrics(metrics):
    """Write *metrics* as indented UTF-8 JSON to ``METRICS_FILE`` and print the path."""
    with METRICS_FILE.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(metrics, handle, ensure_ascii=False, indent=2)

    print("Výsledné metriky uložené do:")
    print(METRICS_FILE)
|
||||
|
||||
|
||||
def main():
    """Run the evaluation: generate answers, score them, print and save metrics."""
    model, tokenizer = load_model()
    eval_dataset = load_eval_dataset()

    predictions, references = generate_predictions(model, tokenizer, eval_dataset)

    print("Počítam ROUGE...")
    rouge_scores = compute_rouge(predictions, references)

    print("Počítam BLEU...")
    bleu_score = compute_bleu(predictions, references)

    print("Počítam perplexitu...")
    eval_loss, perplexity = compute_perplexity(model, tokenizer, eval_dataset)

    metrics = {
        "num_eval_samples": len(eval_dataset),
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"],
        "bleu": bleu_score,
        "eval_loss": eval_loss,
        "perplexity": perplexity,
        "adapter_dir": ADAPTER_DIR,
        "model_name": MODEL_NAME,
        "dataset_name": DATASET_NAME,
    }

    # (label shown in the report, key in `metrics`), in display order.
    report_rows = (
        ("ROUGE-1", "rouge1"),
        ("ROUGE-2", "rouge2"),
        ("ROUGE-L", "rougeL"),
        ("BLEU", "bleu"),
        ("Eval loss", "eval_loss"),
        ("Perplexity", "perplexity"),
    )
    separator = "=" * 80

    print(separator)
    print("FINÁLNE METRIKY")
    print(separator)
    for label, key in report_rows:
        print(f"{label}: {metrics[key]:.4f}")
    print(separator)

    save_metrics(metrics)


if __name__ == "__main__":
    main()
|
||||
89
kody/mistral_sk_llamafactory_train.yaml
Normal file
89
kody/mistral_sk_llamafactory_train.yaml
Normal file
@ -0,0 +1,89 @@
|
||||
top.booster: unsloth
|
||||
top.checkpoint_path: []
|
||||
top.finetuning_type: lora
|
||||
top.model_name: Custom
|
||||
top.quantization_bit: '4'
|
||||
top.quantization_method: bnb
|
||||
top.rope_scaling: none
|
||||
top.template: alpaca
|
||||
train.additional_target: ''
|
||||
train.apollo_rank: 16
|
||||
train.apollo_scale: 32
|
||||
train.apollo_target: all
|
||||
train.apollo_update_interval: 200
|
||||
train.badam_mode: layer
|
||||
train.badam_switch_interval: 50
|
||||
train.badam_switch_mode: ascending
|
||||
train.badam_update_ratio: 0.05
|
||||
train.batch_size: 1
|
||||
train.compute_type: fp16
|
||||
train.create_new_adapter: false
|
||||
train.cutoff_len: 1024
|
||||
train.dataset:
|
||||
- alpaca_slovak_cleaned
|
||||
train.dataset_dir: data
|
||||
train.ds_offload: false
|
||||
train.ds_stage: none
|
||||
train.enable_thinking: false
|
||||
train.extra_args: '{"optim": "adamw_8bit", "eval_steps": 1000, "eval_strategy": "steps",
|
||||
"save_total_limit": 2}'
|
||||
train.freeze_extra_modules: ''
|
||||
train.freeze_language_model: false
|
||||
train.freeze_multi_modal_projector: true
|
||||
train.freeze_trainable_layers: 2
|
||||
train.freeze_trainable_modules: all
|
||||
train.freeze_vision_tower: true
|
||||
train.galore_rank: 16
|
||||
train.galore_scale: 2
|
||||
train.galore_target: all
|
||||
train.galore_update_interval: 200
|
||||
train.gradient_accumulation_steps: 8
|
||||
train.hub_private_repo: false
|
||||
train.image_max_pixels: 768*768
|
||||
train.image_min_pixels: 32*32
|
||||
train.learning_rate: 2e-4
|
||||
train.logging_steps: 5
|
||||
train.lora_alpha: 32
|
||||
train.lora_dropout: 0.05
|
||||
train.lora_rank: 16
|
||||
train.lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
|
||||
train.loraplus_lr_ratio: 0
|
||||
train.lr_scheduler_type: cosine
|
||||
train.mask_history: false
|
||||
train.max_grad_norm: '1.0'
|
||||
train.max_samples: '50000'
|
||||
train.neat_packing: false
|
||||
train.neftune_alpha: 0
|
||||
train.num_train_epochs: '1.0'
|
||||
train.packing: false
|
||||
train.ppo_score_norm: false
|
||||
train.ppo_whiten_rewards: false
|
||||
train.pref_beta: 0.1
|
||||
train.pref_ftx: 0
|
||||
train.pref_loss: sigmoid
|
||||
train.project: huggingface
|
||||
train.report_to: none
|
||||
train.resize_vocab: false
|
||||
train.reward_model: []
|
||||
train.save_steps: 1000
|
||||
train.swanlab_api_key: ''
|
||||
train.swanlab_link: null
|
||||
train.swanlab_mode: cloud
|
||||
train.swanlab_project: llamafactory
|
||||
train.swanlab_run_name: ''
|
||||
train.swanlab_workspace: ''
|
||||
train.trackio_space_id: trackio
|
||||
train.train_on_prompt: false
|
||||
train.training_stage: Supervised Fine-Tuning
|
||||
train.use_apollo: false
|
||||
train.use_badam: false
|
||||
train.use_dora: false
|
||||
train.use_galore: false
|
||||
train.use_llama_pro: false
|
||||
train.use_pissa: false
|
||||
train.use_rslora: false
|
||||
train.use_swanlab: false
|
||||
train.val_size: 0.025
|
||||
train.video_max_pixels: 256*256
|
||||
train.video_min_pixels: 16*16
|
||||
train.warmup_steps: 150
|
||||
141
kody/test_mistral_lora.py
Normal file
141
kody/test_mistral_lora.py
Normal file
@ -0,0 +1,141 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
||||
from peft import PeftModel
|
||||
|
||||
|
||||
# Base checkpoint the adapter is attached to.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"

# Directory holding the trained LoRA adapter (the tokenizer is loaded from it too).
ADAPTER_DIR = "/home/schwarc/diplomovka/mistral_sk_alpaca/mistral-sk-7b-alpaca-slovak-unsloth-lora-full"

# Generation budget per answer.
MAX_NEW_TOKENS = 300
|
||||
|
||||
|
||||
def make_prompt(instruction, input_text=""):
    """Build the Alpaca-style Slovak prompt for an interactive request.

    Args:
        instruction: Task description; surrounding whitespace is stripped.
        input_text: Optional context; when blank after stripping, the
            "### Vstup:" section is left out.

    Returns:
        The prompt text ending in the "### Odpoveď:" header.
    """
    task = instruction.strip()
    context = input_text.strip()

    header = f"### Inštrukcia:\n{task}\n\n"
    footer = "### Odpoveď:\n"

    if not context:
        return header + footer

    return f"{header}### Vstup:\n{context}\n\n{footer}"
|
||||
|
||||
|
||||
def load_model():
    """Load the tokenizer, the 4-bit base model, and attach the LoRA adapter.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is the ``PeftModel`` wrapper
        after ``eval()`` has been called on it.

    Raises:
        FileNotFoundError: If ``ADAPTER_DIR`` does not exist.
    """
    # Fail before any download or model load when the adapter is missing.
    if not Path(ADAPTER_DIR).exists():
        raise FileNotFoundError(f"Adaptér neexistuje: {ADAPTER_DIR}")

    has_cuda = torch.cuda.is_available()
    print("CUDA available:", has_cuda)
    if has_cuda:
        print("GPU:", torch.cuda.get_device_name(0))

    print("Načítavam tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False)
    if tokenizer.pad_token is None:
        # Reuse EOS as the pad token so a pad id is always available.
        tokenizer.pad_token = tokenizer.eos_token

    print("Načítavam základný model v 4-bit režime...")
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    backbone = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization,
        device_map={"": 0},  # place the whole model on device index 0
        dtype=torch.float16,
    )

    print("Pripájam LoRA adaptér...")
    model = PeftModel.from_pretrained(backbone, ADAPTER_DIR)
    model.eval()

    print("Model je pripravený.")
    print("-" * 80)
    return model, tokenizer
|
||||
|
||||
|
||||
def generate_answer(model, tokenizer, instruction, input_text=""):
    """Sample one answer from the model for an instruction.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompt and decode the answer.
        instruction: Task description.
        input_text: Optional context placed in the prompt's "### Vstup:"
            section. Defaults to "" (no context), which matches the previous
            three-argument behavior.

    Returns:
        The generated continuation (prompt excluded), stripped of surrounding
        whitespace.
    """
    prompt = make_prompt(instruction, input_text)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(model.device)

    input_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            # Sampling settings for interactive use.
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # generate() returns prompt + continuation; decode only the new tokens.
    answer = tokenizer.decode(
        output_ids[0][input_length:],
        skip_special_tokens=True,
    )

    return answer.strip()
|
||||
|
||||
|
||||
def main():
    """Run an interactive loop: read an instruction, print the model's answer.

    The loop ends on "exit", "quit" or "koniec" (case-insensitive), and also
    when standard input is closed (Ctrl-D) or the user presses Ctrl-C at the
    prompt. Blank lines are ignored.
    """
    model, tokenizer = load_model()

    print("Napíš inštrukciu.")
    print("Ukončenie: exit, quit alebo koniec")
    print("-" * 80)

    while True:
        try:
            instruction = input("\nInštrukcia: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt: leave the same way as an
            # explicit exit command instead of ending with a traceback.
            print("\nKoniec.")
            break

        if instruction.lower() in ("exit", "quit", "koniec"):
            print("Koniec.")
            break

        if not instruction:
            continue

        answer = generate_answer(model, tokenizer, instruction)

        print("\nOdpoveď:")
        print(answer)
        print("-" * 80)


if __name__ == "__main__":
    main()
|
||||
247
kody/train_mistral_full.py
Normal file
247
kody/train_mistral_full.py
Normal file
@ -0,0 +1,247 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
os.environ["WANDB_DISABLED"] = "true"
|
||||
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForCausalLM,
|
||||
BitsAndBytesConfig,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
default_data_collator,
|
||||
)
|
||||
from transformers.trainer_utils import get_last_checkpoint
|
||||
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
||||
|
||||
|
||||
# --- Experiment configuration -------------------------------------------------
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"

# Checkpoints go to OUTPUT_DIR, the final LoRA adapter to ADAPTER_DIR.
PROJECT_DIR = Path.home().joinpath("diplomovka", "mistral_sk_alpaca")
OUTPUT_DIR = PROJECT_DIR.joinpath("outputs-full")
ADAPTER_DIR = PROJECT_DIR.joinpath("mistral-sk-7b-alpaca-slovak-lora-full")

for _directory in (PROJECT_DIR, OUTPUT_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

MAX_TRAIN_SAMPLES = None  # None = keep the whole train split
MAX_EVAL_SAMPLES = 1000   # cap on the number of test rows used for evaluation

MAX_LENGTH = 1024
BATCH_SIZE = 1
GRAD_ACCUM = 8
LEARNING_RATE = 2e-4
NUM_EPOCHS = 1

SAVE_STEPS = 1000
EVAL_STEPS = 1000
WARMUP_STEPS = 150
MAX_STEPS = -1

# --- Environment report -------------------------------------------------------
_has_cuda = torch.cuda.is_available()
print("CUDA available:", _has_cuda)
if _has_cuda:
    print("GPU:", torch.cuda.get_device_name(0))
    _vram_bytes = torch.cuda.get_device_properties(0).total_memory
    print("VRAM GB:", round(_vram_bytes / 1024**3, 2))

print("Project dir:", PROJECT_DIR)
print("Output dir:", OUTPUT_DIR)
print("Adapter dir:", ADAPTER_DIR)

# --- Tokenizer and raw dataset ------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
if tokenizer.pad_token is None:
    # Reuse EOS as the pad token so padded batches can be built.
    tokenizer.pad_token = tokenizer.eos_token

raw_dataset = load_dataset(DATASET_NAME)
print(raw_dataset)
|
||||
|
||||
|
||||
def is_empty(value):
    """Return True for ``None``, blank text, or a value whose string form is "nan"."""
    if value is None:
        return True
    normalized = str(value).strip().lower()
    # A blank string stays blank after lower(), so both checks use the same value.
    return not normalized or normalized == "nan"
|
||||
|
||||
|
||||
def build_prompt(example):
    """Turn one raw record into prompt / completion / full-text strings.

    Args:
        example: Mapping with "instruction", optional "input", and "output".

    Returns:
        Dict with "prompt" (ends in the "### Odpoveď:" header), "completion"
        (stripped output + the module tokenizer's EOS token) and "text"
        (prompt followed by completion).
    """
    instruction = str(example["instruction"]).strip()
    context = example.get("input")

    parts = [f"### Inštrukcia:\n{instruction}\n\n"]
    if not is_empty(context):
        parts.append(f"### Vstup:\n{str(context).strip()}\n\n")
    parts.append("### Odpoveď:\n")
    prompt = "".join(parts)

    completion = str(example["output"]).strip() + tokenizer.eos_token

    return dict(prompt=prompt, completion=completion, text=prompt + completion)
|
||||
|
||||
|
||||
# Replace the raw columns with the formatted prompt / completion / text columns.
dataset = raw_dataset.map(
    build_prompt,
    remove_columns=raw_dataset["train"].column_names,
)

# Optionally keep only the first N rows of each split (None = keep everything).
for _split, _limit in (("train", MAX_TRAIN_SAMPLES), ("test", MAX_EVAL_SAMPLES)):
    if _limit is not None:
        _keep = min(_limit, len(dataset[_split]))
        dataset[_split] = dataset[_split].select(range(_keep))

print(dataset)
|
||||
|
||||
|
||||
def tokenize_example(example):
    """Tokenize one formatted record into fixed-length training features.

    The text is tokenized without tokenizer-side padding and then padded on
    the right by hand. The previous version let the tokenizer pad and then
    masked ``labels[:prompt_len]``; with a left-padding tokenizer those
    positions are pads, so the prompt tokens stayed in the loss. Padding
    explicitly makes the mask independent of ``tokenizer.padding_side``.

    Args:
        example: Mapping with "prompt" and "text" strings.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", each a list of
        ``MAX_LENGTH`` ints. Labels are -100 on prompt and padding positions.
    """
    prompt_ids = tokenizer(
        example["prompt"],
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_LENGTH,
    )["input_ids"]

    token_ids = list(
        tokenizer(
            example["text"],
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
        )["input_ids"]
    )

    seq_len = len(token_ids)
    prompt_len = min(len(prompt_ids), seq_len)
    pad_len = MAX_LENGTH - seq_len  # >= 0 because of truncation above

    # Loss is computed on completion tokens only. The mask is positional, so
    # the real EOS is kept even when pad_token == eos_token.
    labels = [-100] * prompt_len + token_ids[prompt_len:] + [-100] * pad_len
    input_ids = token_ids + [tokenizer.pad_token_id] * pad_len
    attention_mask = [1] * seq_len + [0] * pad_len

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }
|
||||
|
||||
|
||||
# Tokenize every split in a single process and drop the text columns.
_text_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    tokenize_example,
    remove_columns=_text_columns,
    num_proc=1,
)

print(tokenized_dataset)
print("Tokenizácia hotová.")
|
||||
|
||||
|
||||
# 4-bit NF4 quantization with double quantization and fp16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map={"": 0},  # place the whole model on device index 0
    dtype=torch.float16,
)

# Prepare the quantized model for training before the adapter is attached.
model.config.use_cache = False
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA is applied to the attention and MLP projection modules listed below.
_lora_targets = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=_lora_targets,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
|
||||
|
||||
|
||||
# Trainer hyper-parameters, grouped by concern.
_training_settings = {
    "output_dir": str(OUTPUT_DIR),
    # batching
    "per_device_train_batch_size": BATCH_SIZE,
    "per_device_eval_batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": GRAD_ACCUM,
    # schedule
    "learning_rate": LEARNING_RATE,
    "num_train_epochs": NUM_EPOCHS,
    "max_steps": MAX_STEPS,
    # precision
    "fp16": True,
    "bf16": False,
    # logging and checkpoints
    "logging_steps": 10,
    "save_steps": SAVE_STEPS,
    "save_total_limit": 2,
    # evaluation
    "eval_strategy": "steps",
    "eval_steps": EVAL_STEPS,
    # optimizer
    "optim": "paged_adamw_8bit",
    "warmup_steps": WARMUP_STEPS,
    "lr_scheduler_type": "cosine",
    "max_grad_norm": 0.3,
    # misc
    "gradient_checkpointing": True,
    "report_to": "none",
    "remove_unused_columns": False,
}
training_args = TrainingArguments(**_training_settings)

# The examples are already padded to a fixed length, so the default collator is used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=default_data_collator,
)
|
||||
|
||||
|
||||
# Resume from the newest checkpoint in OUTPUT_DIR when one exists.
last_checkpoint = (
    get_last_checkpoint(str(OUTPUT_DIR)) if OUTPUT_DIR.exists() else None
)

if last_checkpoint is None:
    print("Začínam nový tréning.")
else:
    print("Pokračujem z checkpointu:", last_checkpoint)

trainer.train(resume_from_checkpoint=last_checkpoint)

# Final evaluation on the evaluation split.
metrics = trainer.evaluate()
print(metrics)

# Save the trained adapter and the tokenizer to the same directory.
_adapter_path = str(ADAPTER_DIR)
trainer.save_model(_adapter_path)
tokenizer.save_pretrained(_adapter_path)

print("Hotovo.")
print("LoRA adaptér uložený do:")
print(ADAPTER_DIR)
|
||||
248
kody/train_mistral_unsloth_trl.py
Normal file
248
kody/train_mistral_unsloth_trl.py
Normal file
@ -0,0 +1,248 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Použijeme GPU 0
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
os.environ["WANDB_DISABLED"] = "true"
|
||||
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from unsloth import FastLanguageModel
|
||||
from trl import SFTTrainer, SFTConfig
|
||||
from transformers.trainer_utils import get_last_checkpoint
|
||||
|
||||
|
||||
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
DATASET_NAME = "saillab/alpaca-slovak-cleaned"

PROJECT_DIR = Path.home().joinpath("diplomovka", "mistral_sk_alpaca")

# Outputs of the FULL Unsloth experiment.
OUTPUT_DIR = PROJECT_DIR.joinpath("outputs-unsloth-full")
ADAPTER_DIR = PROJECT_DIR.joinpath("mistral-sk-7b-alpaca-slovak-unsloth-lora-full")

for _directory in (PROJECT_DIR, OUTPUT_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

# Training settings.
MAX_SEQ_LENGTH = 1024

MAX_TRAIN_SAMPLES = None  # None = the whole train split
MAX_EVAL_SAMPLES = 1000   # validate on 1000 examples

NUM_EPOCHS = 1
MAX_STEPS = -1            # -1 = run by epochs

BATCH_SIZE = 1
GRAD_ACCUM = 8
LEARNING_RATE = 2e-4

SAVE_STEPS = 1000
EVAL_STEPS = 1000
WARMUP_STEPS = 150

# Start-up banner with the environment and experiment settings.
_banner = "=" * 80
print(_banner)
print("Experiment: Unsloth + TRL SFTTrainer + QLoRA")
print(_banner)

_has_cuda = torch.cuda.is_available()
print("CUDA available:", _has_cuda)
if _has_cuda:
    print("GPU:", torch.cuda.get_device_name(0))
    _vram_bytes = torch.cuda.get_device_properties(0).total_memory
    print("VRAM GB:", round(_vram_bytes / 1024**3, 2))

print("Torch:", torch.__version__)
print("Model:", MODEL_NAME)
print("Dataset:", DATASET_NAME)
print("Output dir:", OUTPUT_DIR)
print("Adapter dir:", ADAPTER_DIR)
print(_banner)
|
||||
|
||||
|
||||
# 1. Load the model through Unsloth in 4-bit mode.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.float16,
    load_in_4bit=True,
)

if tokenizer.pad_token is None:
    # Reuse EOS as the pad token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

# 2. Attach the LoRA adapter to the attention and MLP projection modules.
_lora_targets = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=_lora_targets,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)
print("LoRA adaptér pridaný.")

# 3. Load the dataset.
raw_dataset = load_dataset(DATASET_NAME)
print("Pôvodný dataset:")
print(raw_dataset)
|
||||
|
||||
|
||||
# 4. Formatting into the Alpaca style.
def is_empty(value):
    """Return True for ``None``, blank text, or a value whose string form is "nan"."""
    if value is None:
        return True

    stripped = str(value).strip()
    if stripped == "":
        return True
    return stripped.lower() == "nan"
|
||||
|
||||
|
||||
def format_example(example):
    """Format one raw record as a single Alpaca-style training string.

    Args:
        example: Mapping with "instruction", optional "input", and "output".

    Returns:
        Dict with one key, "text": instruction section, optional "### Vstup:"
        section, answer section, then the module tokenizer's EOS token.
    """
    instruction = str(example["instruction"]).strip()
    input_text = example.get("input")
    output = str(example["output"]).strip()

    segments = ["### Inštrukcia:\n", f"{instruction}\n\n"]
    if not is_empty(input_text):
        segments += ["### Vstup:\n", f"{str(input_text).strip()}\n\n"]
    segments += ["### Odpoveď:\n", output, tokenizer.eos_token]

    return {"text": "".join(segments)}
|
||||
|
||||
|
||||
# Replace the raw columns with the single formatted "text" column.
dataset = raw_dataset.map(
    format_example,
    remove_columns=raw_dataset["train"].column_names,
)

# Shuffle for better representativeness.
for _split in ("train", "test"):
    dataset[_split] = dataset[_split].shuffle(seed=42)

# Optionally keep only the first N shuffled rows of each split (None = keep all).
for _split, _limit in (("train", MAX_TRAIN_SAMPLES), ("test", MAX_EVAL_SAMPLES)):
    if _limit is not None:
        _keep = min(_limit, len(dataset[_split]))
        dataset[_split] = dataset[_split].select(range(_keep))

print("Použitý dataset:")
print(dataset)

# Print the start of one formatted example as a sanity check.
print("Ukážka tréningového textu:")
print(dataset["train"][0]["text"][:1200])
print("=" * 80)
|
||||
|
||||
|
||||
# 5. TRL SFTTrainer configuration, grouped by concern.
_sft_settings = {
    "output_dir": str(OUTPUT_DIR),
    # batching
    "per_device_train_batch_size": BATCH_SIZE,
    "per_device_eval_batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": GRAD_ACCUM,
    # schedule
    "learning_rate": LEARNING_RATE,
    "num_train_epochs": NUM_EPOCHS,
    "max_steps": MAX_STEPS,
    "logging_steps": 10,
    # evaluation
    "eval_strategy": "steps",
    "eval_steps": EVAL_STEPS,
    # checkpoints
    "save_strategy": "steps",
    "save_steps": SAVE_STEPS,
    "save_total_limit": 2,
    # optimizer and precision
    "warmup_steps": WARMUP_STEPS,
    "optim": "adamw_8bit",
    "fp16": True,
    "bf16": False,
    "lr_scheduler_type": "cosine",
    "report_to": "none",
    # SFT-specific options: read the pre-formatted "text" column, no packing
    "max_length": MAX_SEQ_LENGTH,
    "packing": False,
    "dataset_text_field": "text",
}
training_args = SFTConfig(**_sft_settings)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_args,
)
|
||||
|
||||
|
||||
# 6. Resume from the newest checkpoint in OUTPUT_DIR when one exists.
last_checkpoint = (
    get_last_checkpoint(str(OUTPUT_DIR)) if OUTPUT_DIR.exists() else None
)

if last_checkpoint is None:
    print("Začínam nový Unsloth full tréning.")
else:
    print("Pokračujem z checkpointu:", last_checkpoint)

# 7. Training.
train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
print("Výsledok tréningu:")
print(train_result)

# 8. Final evaluation.
metrics = trainer.evaluate()
print("Finálne metriky:")
print(metrics)

# 9. Save the LoRA adapter and the tokenizer to the same directory.
_adapter_path = str(ADAPTER_DIR)
model.save_pretrained(_adapter_path)
tokenizer.save_pretrained(_adapter_path)

_banner = "=" * 80
print(_banner)
print("Hotovo.")
print("Unsloth LoRA adaptér uložený do:")
print(ADAPTER_DIR)
print(_banner)
|
||||
Loading…
Reference in New Issue
Block a user