Diplomovka/kody/test_mistral_lora.py
2026-06-11 21:42:51 +02:00

141 lines
3.3 KiB
Python

import os
from pathlib import Path
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
# Name passed to AutoModelForCausalLM.from_pretrained as the base model.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
# Directory with the LoRA adapter; the tokenizer is loaded from this path too.
ADAPTER_DIR = "/home/schwarc/diplomovka/mistral_sk_alpaca/mistral-sk-7b-alpaca-slovak-unsloth-lora-full"
# Value passed as max_new_tokens to model.generate for every answer.
MAX_NEW_TOKENS = 300
def make_prompt(instruction, input_text=""):
    """Build the text prompt that is fed to the model.

    The prompt consists of an "### Inštrukcia:" section, an optional
    "### Vstup:" section, and a trailing "### Odpoveď:" header after which
    the model is expected to continue.
    NOTE(review): presumably this mirrors the format used for fine-tuning the
    adapter -- confirm against the training script.

    Args:
        instruction: The task description; surrounding whitespace is removed.
        input_text: Optional extra context. Empty, whitespace-only, or None
            means the "### Vstup:" section is omitted.

    Returns:
        The assembled prompt string, ending with "### Odpoveď:\\n".
    """
    instruction = instruction.strip()
    # Treat None like an empty string so callers may pass a missing field as-is.
    input_text = (input_text or "").strip()

    sections = [f"### Inštrukcia:\n{instruction}\n\n"]
    if input_text:
        sections.append(f"### Vstup:\n{input_text}\n\n")
    sections.append("### Odpoveď:\n")
    return "".join(sections)
def load_model():
    """Load the tokenizer, the 4-bit base model, and attach the LoRA adapter.

    Progress messages are printed to stdout while loading.

    Returns:
        A ``(model, tokenizer)`` tuple: the PEFT-wrapped model switched to
        eval mode, and the tokenizer loaded from ``ADAPTER_DIR``.

    Raises:
        FileNotFoundError: If ``ADAPTER_DIR`` does not exist.
    """
    adapter_path = Path(ADAPTER_DIR)
    if not adapter_path.exists():
        raise FileNotFoundError(f"Adaptér neexistuje: {ADAPTER_DIR}")

    cuda_available = torch.cuda.is_available()
    print("CUDA available:", cuda_available)
    if cuda_available:
        print("GPU:", torch.cuda.get_device_name(0))

    print("Načítavam tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False)
    # generate() is later given pad_token_id, so fall back to EOS when the
    # tokenizer defines no dedicated padding token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Načítavam základný model v 4-bit režime...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        # Place the whole model on device index 0.
        device_map={"": 0},
        dtype=torch.float16,
    )

    print("Pripájam LoRA adaptér...")
    model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
    model.eval()

    print("Model je pripravený.")
    print("-" * 80)
    return model, tokenizer
def generate_answer(model, tokenizer, instruction):
    """Generate the model's answer to a single instruction.

    Args:
        model: Model exposing ``generate`` and ``device`` (as returned by
            ``load_model``).
        tokenizer: Tokenizer matching ``model``.
        instruction: Instruction text; it is wrapped by ``make_prompt``
            without an input section.

    Returns:
        The decoded text of the newly generated tokens only, with special
        tokens skipped and surrounding whitespace stripped.
    """
    prompt = make_prompt(instruction)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(model.device)
    # Remember the prompt length so it can be cut off from the output below.
    input_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Skip the first input_length tokens (the prompt) and decode the rest.
    answer = tokenizer.decode(
        output_ids[0][input_length:],
        skip_special_tokens=True,
    )
    return answer.strip()
def main():
    """Run an interactive prompt loop against the loaded model.

    Loads the model once, then repeatedly reads an instruction from stdin and
    prints the generated answer. The loop ends when the user types ``exit``,
    ``quit`` or ``koniec`` (case-insensitive), or when input ends
    (Ctrl+D / Ctrl+C at the prompt). Empty lines are ignored.
    """
    model, tokenizer = load_model()
    exit_commands = {"exit", "quit", "koniec"}

    print("Napíš inštrukciu.")
    print("Ukončenie: exit, quit alebo koniec")
    print("-" * 80)

    while True:
        try:
            instruction = input("\nInštrukcia: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl+C while waiting for input: leave quietly
            # instead of ending the session with a traceback.
            print("\nKoniec.")
            break

        if instruction.lower() in exit_commands:
            print("Koniec.")
            break
        if not instruction:
            continue

        answer = generate_answer(model, tokenizer, instruction)
        print("\nOdpoveď:")
        print(answer)
        print("-" * 80)
# Start the interactive session only when the file is executed as a script.
if __name__ == "__main__":
    main()