141 lines
3.3 KiB
Python
141 lines
3.3 KiB
Python
import os
|
|
from pathlib import Path
|
|
|
|
# Expose only GPU index 0 to this process. Set here, ahead of the torch
# import further down, so the value is in place before CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
|
# Switch off parallelism in the Hugging Face tokenizers library; set before
# transformers is imported further down.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
|
import torch
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
|
from peft import PeftModel
|
|
|
|
|
|
# Identifier of the base causal LM that load_model() loads in 4-bit mode.
MODEL_NAME = "slovak-nlp/mistral-sk-7b"
|
|
|
|
# Directory holding the LoRA adapter; load_model() also loads the tokenizer
# from this path and raises FileNotFoundError when it does not exist.
ADAPTER_DIR = "/home/schwarc/diplomovka/mistral_sk_alpaca/mistral-sk-7b-alpaca-slovak-unsloth-lora-full"
|
|
|
|
# Passed as max_new_tokens to model.generate() in generate_answer().
MAX_NEW_TOKENS = 300
|
|
|
|
|
|
def make_prompt(instruction, input_text=""):
    """Build the instruction prompt that the model is asked to complete.

    Args:
        instruction: Task description; surrounding whitespace is stripped.
        input_text: Optional extra input for the task. ``None``, an empty
            string, or whitespace-only text leaves out the ``### Vstup:``
            section entirely.

    Returns:
        The prompt string: an ``### Inštrukcia:`` section, an optional
        ``### Vstup:`` section, and a trailing ``### Odpoveď:`` header
        followed by a newline.
    """
    instruction = instruction.strip()
    # Treat None the same as "no input" instead of failing on None.strip().
    input_text = (input_text or "").strip()

    sections = [f"### Inštrukcia:\n{instruction}\n\n"]
    if input_text:
        sections.append(f"### Vstup:\n{input_text}\n\n")
    sections.append("### Odpoveď:\n")
    return "".join(sections)
|
|
|
|
|
|
def load_model():
    """Load the tokenizer and the 4-bit base model with the LoRA adapter.

    Prints progress messages and CUDA availability along the way.

    Returns:
        A ``(model, tokenizer)`` tuple; ``model`` is the adapter-wrapped
        base model after ``eval()`` has been called on it.

    Raises:
        FileNotFoundError: If ``ADAPTER_DIR`` does not exist.
    """
    if not Path(ADAPTER_DIR).exists():
        raise FileNotFoundError(f"Adaptér neexistuje: {ADAPTER_DIR}")

    has_cuda = torch.cuda.is_available()
    print("CUDA available:", has_cuda)
    if has_cuda:
        print("GPU:", torch.cuda.get_device_name(0))

    print("Načítavam tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False)
    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Načítavam základný model v 4-bit režime...")
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    backbone = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization,
        device_map={"": 0},  # place the whole model on device 0
        dtype=torch.float16,
    )

    print("Pripájam LoRA adaptér...")
    model = PeftModel.from_pretrained(backbone, ADAPTER_DIR)
    model.eval()

    print("Model je pripravený.")
    print("-" * 80)

    return model, tokenizer
|
|
|
|
|
|
def generate_answer(model, tokenizer, instruction):
    """Generate the model's answer to a single instruction.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        instruction: Instruction text; wrapped into a prompt by
            ``make_prompt`` without any additional input section.

    Returns:
        The decoded continuation (prompt tokens excluded, special tokens
        skipped) with surrounding whitespace stripped.
    """
    encoded = tokenizer(
        make_prompt(instruction),
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(model.device)

    # Remember the prompt length so only the generated tail is decoded.
    prompt_len = encoded["input_ids"].shape[-1]

    sampling = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "repetition_penalty": 1.1,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
    }

    with torch.no_grad():
        generated = model.generate(**encoded, **sampling)

    continuation = generated[0][prompt_len:]
    text = tokenizer.decode(continuation, skip_special_tokens=True)
    return text.strip()
|
|
|
|
|
|
def main():
    """Run an interactive loop that answers instructions typed by the user.

    The model is loaded once. Each non-empty line read from standard input
    is sent to ``generate_answer`` and the answer is printed. The loop ends
    when the user types ``exit``, ``quit`` or ``koniec`` (case-insensitive),
    when standard input is closed, or when Ctrl+C is pressed at the prompt.
    """
    exit_commands = {"exit", "quit", "koniec"}

    model, tokenizer = load_model()

    print("Napíš inštrukciu.")
    print("Ukončenie: exit, quit alebo koniec")
    print("-" * 80)

    while True:
        try:
            instruction = input("\nInštrukcia: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin (Ctrl+D, exhausted pipe) or Ctrl+C at the prompt
            # ends the session cleanly instead of with a traceback.
            print("\nKoniec.")
            break

        if instruction.lower() in exit_commands:
            print("Koniec.")
            break

        # Ignore blank lines and prompt again.
        if not instruction:
            continue

        answer = generate_answer(model, tokenizer, instruction)

        print("\nOdpoveď:")
        print(answer)
        print("-" * 80)
|
|
|
|
|
|
# Start the interactive session only when this file is executed as a script.
if __name__ == "__main__":
    main()