"""Interactive console for a LoRA-adapted causal language model.

The script loads the base model named by ``MODEL_NAME`` in 4-bit mode,
attaches the LoRA adapter stored in ``ADAPTER_DIR`` and then answers
instructions typed on standard input until the user quits.
"""

import os
from pathlib import Path

# These are set before torch/transformers are imported so the libraries see
# them when they initialise; keep this ordering when editing the imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch  # noqa: E402
from transformers import (  # noqa: E402
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from peft import PeftModel  # noqa: E402

MODEL_NAME = "slovak-nlp/mistral-sk-7b"
ADAPTER_DIR = "/home/schwarc/diplomovka/mistral_sk_alpaca/mistral-sk-7b-alpaca-slovak-unsloth-lora-full"
MAX_NEW_TOKENS = 300

# Upper bound on the tokenized prompt length passed to the tokenizer.
MAX_PROMPT_TOKENS = 1024

# Words (compared case-insensitively) that end the interactive session.
EXIT_COMMANDS = frozenset({"exit", "quit", "koniec"})


def make_prompt(instruction, input_text=""):
    """Build the text prompt for one instruction.

    Both arguments are stripped of surrounding whitespace. When
    ``input_text`` is non-empty after stripping, a ``### Vstup:`` section is
    inserted between the instruction and the answer marker.

    Args:
        instruction: The task description.
        input_text: Optional additional context for the task.

    Returns:
        The prompt string, ending with the ``### Odpoveď:`` marker and a
        newline so that the model continues with the answer.
    """
    # NOTE(review): presumably this mirrors the template used when the adapter
    # was fine-tuned - verify against the training script before changing it.
    instruction = instruction.strip()
    input_text = input_text.strip()

    if input_text:
        return (
            "### Inštrukcia:\n"
            f"{instruction}\n\n"
            "### Vstup:\n"
            f"{input_text}\n\n"
            "### Odpoveď:\n"
        )

    return (
        "### Inštrukcia:\n"
        f"{instruction}\n\n"
        "### Odpoveď:\n"
    )


def load_model():
    """Load the tokenizer, the 4-bit base model and the LoRA adapter.

    Progress messages are printed to standard output.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is switched to eval mode.

    Raises:
        FileNotFoundError: If ``ADAPTER_DIR`` is not an existing directory.
    """
    adapter_path = Path(ADAPTER_DIR)
    # The adapter is loaded from a directory, so a plain file at this path is
    # rejected here instead of failing later inside the loaders.
    if not adapter_path.is_dir():
        raise FileNotFoundError(f"Adaptér neexistuje: {ADAPTER_DIR}")

    print("CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))

    print("Načítavam tokenizer...")
    # The tokenizer is read from the adapter directory, not from MODEL_NAME.
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False)
    if tokenizer.pad_token is None:
        # generate() is given pad_token_id below, so make sure one is defined.
        tokenizer.pad_token = tokenizer.eos_token

    print("Načítavam základný model v 4-bit režime...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map={"": 0},  # place the whole model on device index 0
        dtype=torch.float16,
    )

    print("Pripájam LoRA adaptér...")
    model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
    model.eval()

    print("Model je pripravený.")
    print("-" * 80)
    return model, tokenizer


def generate_answer(model, tokenizer, instruction, input_text=""):
    """Generate a sampled answer for one instruction.

    Args:
        model: The model returned by ``load_model``.
        tokenizer: The tokenizer returned by ``load_model``.
        instruction: The task description typed by the user.
        input_text: Optional additional context forwarded to ``make_prompt``.

    Returns:
        The decoded continuation (prompt tokens excluded, special tokens
        skipped), stripped of surrounding whitespace.
    """
    prompt = make_prompt(instruction, input_text)

    # NOTE(review): with the tokenizer's default truncation side a prompt
    # longer than MAX_PROMPT_TOKENS would lose its trailing answer marker -
    # confirm whether such long prompts need dedicated handling.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_PROMPT_TOKENS,
    ).to(model.device)

    # Remember the prompt length so only newly generated tokens are decoded.
    input_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    answer = tokenizer.decode(
        output_ids[0][input_length:],
        skip_special_tokens=True,
    )
    return answer.strip()


def main():
    """Run the interactive loop: read an instruction, print the answer.

    The loop ends when the user types one of ``EXIT_COMMANDS``, or when
    standard input is closed (EOF) or interrupted with Ctrl+C at the prompt.
    Empty input lines are ignored.
    """
    model, tokenizer = load_model()

    print("Napíš inštrukciu.")
    print("Ukončenie: exit, quit alebo koniec")
    print("-" * 80)

    while True:
        try:
            instruction = input("\nInštrukcia: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl+D / closed stdin / Ctrl+C at the prompt: leave quietly
            # instead of ending the session with a traceback.
            print("\nKoniec.")
            break

        if instruction.lower() in EXIT_COMMANDS:
            print("Koniec.")
            break
        if not instruction:
            continue

        answer = generate_answer(model, tokenizer, instruction)
        print("\nOdpoveď:")
        print(answer)
        print("-" * 80)


if __name__ == "__main__":
    main()