From 3bb4286763258b4beeb18fad9ceaa990d05ee251 Mon Sep 17 00:00:00 2001 From: Artur Hyrenko Date: Wed, 4 Feb 2026 21:01:03 +0000 Subject: [PATCH 1/4] Added comments --- program/response_evaluate.py | 66 +++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/program/response_evaluate.py b/program/response_evaluate.py index f8c3ffe..4698258 100644 --- a/program/response_evaluate.py +++ b/program/response_evaluate.py @@ -22,7 +22,7 @@ LLAMA_INPUT_DIR = "/home/hyrenko/Diploma/response/llama" GEMMA_INPUT_DIR = "/home/hyrenko/Diploma/response/gemma" QWEN_INPUT_DIR = "/home/hyrenko/Diploma/response/qwen" -# NEW: mistral-sk translated outputs root (where your translated runs live) +# Root for translated mistral-sk runs (each run dir contains responses.json) MISTRAL_SK_TRANSLATED_ROOT = "/home/hyrenko/Diploma/outputs_translated" MISTRAL_RESPONSES_FILENAME = "responses.json" @@ -39,6 +39,7 @@ os.makedirs(OUTPUT_ROOT, exist_ok=True) # Interactive pickers # ========================= def pick_gpu_interactive() -> str: + """Interactive device picker: returns GPU id string or empty string for CPU.""" def fmt_gb(b): return f"{b / (1024**3):.1f} GB" print("\n=== Device selection ===") if not torch.cuda.is_available(): @@ -59,16 +60,15 @@ def pick_gpu_interactive() -> str: def _is_mistral_sk_dir(name: str) -> bool: + """Heuristic: directory name contains 'mistral' and 'sk' (in any form).""" n = name.lower() - # matches: "mistral-sk", "mistral_sk", or any name containing both "mistral" and "sk" return ("mistral" in n and "sk" in n) def _list_mistral_sk_run_dirs(root: str) -> List[str]: """ - Returns all directories under root that look like mistral-sk runs - and contain responses.json. - Sorted newest-first by timestamp prefix or mtime. + List directories under 'root' that look like mistral-sk runs and contain responses.json. + Sorted newest-first by timestamp prefix (YYYY-MM-DD_HH-MM-SS) if present, otherwise by mtime. 
""" if not os.path.isdir(root): return [] @@ -86,7 +86,6 @@ def _list_mistral_sk_run_dirs(root: str) -> List[str]: def sort_key(path: str) -> Tuple[int, float, str]: base = os.path.basename(path) - # prefer YYYY-MM-DD_HH-MM-SS prefix try: dt = datetime.datetime.strptime(base[:19], "%Y-%m-%d_%H-%M-%S") return (1, dt.timestamp(), base) @@ -100,6 +99,10 @@ def _list_mistral_sk_run_dirs(root: str) -> List[str]: def pick_mistral_sk_run_dirs_interactive(root: str) -> List[str]: + """ + Interactive picker for mistral-sk runs under 'root'. + Returns a list of selected run directories. + """ runs = _list_mistral_sk_run_dirs(root) if not runs: return [] @@ -157,9 +160,9 @@ def pick_mistral_sk_run_dirs_interactive(root: str) -> List[str]: def pick_input_dirs_interactive() -> List[str]: """ - Returns list of input directories. - - For llama/gemma/qwen: returns [that_dir] - - For mistral-sk: returns [run_dir1, run_dir2, ...] chosen by user + Returns a list of input directories to evaluate. + - llama/gemma/qwen: returns [that_dir] + - mistral-sk: returns selected run dir(s) under MISTRAL_SK_TRANSLATED_ROOT """ options = { "1": LLAMA_INPUT_DIR, @@ -199,9 +202,15 @@ def pick_input_dirs_interactive() -> List[str]: def resolve_model_key(input_dir: str) -> str: + """ + Map an input directory to a stable model key: + - mistral_sk for runs under MISTRAL_SK_TRANSLATED_ROOT + - llama/gemma/qwen for fixed input dirs + - fallback: basename(input_dir) + """ norm = os.path.abspath(input_dir) - # detect mistral-sk run dir + # Detect mistral-sk run dir under outputs_translated if os.path.abspath(os.path.dirname(norm)) == os.path.abspath(MISTRAL_SK_TRANSLATED_ROOT) and _is_mistral_sk_dir(os.path.basename(norm)): return "mistral_sk" @@ -219,10 +228,14 @@ def resolve_model_key(input_dir: str) -> str: # Model loader # ========================= def init_model_and_tokenizer(model_path: str, cuda_visible_devices: str): + """ + Load the classifier model and tokenizer. 
+ Also verifies that weights exist in the model directory (supports sharded safetensors). + """ if not os.path.isdir(model_path): raise FileNotFoundError(f"[ERROR] Model path not found: {model_path}") - # --- Verify model weights (supports sharded safetensors) --- + # Verify weight files weight_files = [ f for f in os.listdir(model_path) if f.endswith(".bin") or f.endswith(".safetensors") or f.endswith(".index.json") @@ -261,7 +274,8 @@ def init_model_and_tokenizer(model_path: str, cuda_visible_devices: str): # ========================= def list_json_files(input_dir: str) -> List[str]: """ - Берём все *.json в директории (включая responses.json у mistral-sk run). + Return all *.json files in the directory. + For mistral-sk runs this typically includes responses.json. """ files = sorted(glob.glob(os.path.join(input_dir, "*.json"))) if not files: @@ -270,6 +284,13 @@ def list_json_files(input_dir: str) -> List[str]: def load_records(path: str) -> List[Dict[str, Any]]: + """ + Load records from a file that can be either: + - a JSON list + - a single JSON object + - JSONL (one JSON object per line) + Returns a list of dict records. 
+ """ with open(path, "r", encoding="utf-8") as f: text = f.read().strip() if not text: @@ -287,6 +308,7 @@ def load_records(path: str) -> List[Dict[str, Any]]: def ensure_dir(path: str): + """Create output directory if missing.""" os.makedirs(path, exist_ok=True) @@ -314,6 +336,7 @@ REFUSAL_PATTERNS = [ def is_prompt_heuristic_unsafe(prompt: str) -> bool: + """Heuristic prompt classifier: marks prompt unsafe based on regex patterns.""" text = (prompt or "").lower() for pat in UNSAFE_PROMPT_PATTERNS: if re.search(pat, text): @@ -322,6 +345,7 @@ def is_prompt_heuristic_unsafe(prompt: str) -> bool: def is_refusal_response(resp: str) -> bool: + """Heuristic response classifier: marks response as refusal based on regex patterns.""" if not resp: return False for pat in REFUSAL_PATTERNS: @@ -335,8 +359,10 @@ def is_refusal_response(resp: str) -> bool: # ========================= def extract_label_from_text(txt: str) -> str: """ - Извлекаем финальное решение Guard: - идём с конца и ищем последнее 'safe' или 'unsafe'. + Extract the final decision token from classifier output. + Strategy: + - scan tokens from the end and return the last occurrence of 'safe' or 'unsafe' + - default to 'safe' if nothing is found """ tokens = (txt or "").lower().split() for token in reversed(tokens): @@ -353,9 +379,11 @@ def extract_label_from_text(txt: str) -> str: # ========================= def evaluate_dual_batch(batch, tokenizer, model): """ - Assistant Behavior Evaluation (Variant C + эвристики): - - Prompt: Guard + правила (dirty joke/sex/nude -> unsafe) - - Response: Guard + правила (refusal -> safe) + Dual evaluation: + - Prompt classification: model + heuristic override (keyword rules -> unsafe) + - Response classification: model + heuristic override (refusal patterns -> safe) + + The model is asked to return 'safe' or 'unsafe' (optionally with a short category). """ prompt_texts = [] for item in batch: @@ -467,7 +495,7 @@ def main(): print(f"[WARN] All files empty in {input_dir}. 
Skipping.") continue - # Чтобы при оценке нескольких mistral-sk run ничего не перетиралось: + # For multiple mistral-sk run dirs, keep outputs separated per run if model_key == "mistral_sk": run_name = os.path.basename(os.path.abspath(input_dir)) model_out_root = os.path.join(OUTPUT_ROOT, "mistral_sk", run_name) @@ -476,7 +504,7 @@ def main(): ensure_dir(model_out_root) - # Глобальные метрики по текущему input_dir + # Global metrics for the current input_dir global_prompt_safe = 0 global_prompt_unsafe = 0 global_response_safe = 0 From d0db4121eef47e83877915130d83486fecea8898 Mon Sep 17 00:00:00 2001 From: Artur Hyrenko Date: Thu, 5 Feb 2026 15:28:46 +0000 Subject: [PATCH 2/4] Added the necessary info --- read_me.md | 281 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 227 insertions(+), 54 deletions(-) diff --git a/read_me.md b/read_me.md index f94391b..38b9de5 100644 --- a/read_me.md +++ b/read_me.md @@ -1,98 +1,271 @@ -============================================================================================================================================================ -1. LLM_test.py – Generovanie odpovedí modelov +# Diploma Thesis Repository — SFT + DPO Training, Translation, and Safety Evaluation (SK/EN) - Tento skript je prvý krok celého procesu. - Spúšťaš ho, keď chceš nechať model (Gemma, LLaMA alebo Qwen) odpovedať na rôzne “nepríjemné” datasety (harmful prompts). Podľa toho sa potom hodnotí jeho bezpečnosť. +This repository contains the scripts I used in my thesis experiments to: -1.1 Ako to funguje +- prepare **PKU-SafeRLHF-30K** for **SFT** and **DPO**, +- translate datasets and model outputs with **NLLB (SK ↔ EN)**, +- train QLoRA adapters (SFT and DPO) for selected base models, +- generate model responses on safety datasets, +- evaluate responses with **Llama Guard 3**. - Pri spustení si script od teba vypýta: +Most scripts assume a local folder layout under `/home/hyrenko/Diploma/...` (models, datasets, outputs). 
If your paths are different, update the constants at the top of each script. - aký model chceš použiť, +--- - aké GPU (ak máš), +## Repository structure - aký dataset chceš otestovať, +The scripts are grouped by purpose: - koľko promptov chceš spracovať. +```text +preparation/ + prepar_dat_pku_dpo.py - Dataset si script načíta automaticky. - - Ak je gated, vypýta si HF token. +program/ + copymaster.py + Llama_test_trained.py + LLM_test.py + response_evaluate.py - Model každému promptu vygeneruje odpoveď a script kontroluje, či to náhodou nebola odpoveď v štýle “nemôžem odpovedať, som AI”. Toto sa počíta ako refusal. +Training/ + convert_dpo_sft.py + training_dpo_sft_llama.py + training_dpo_sft_mistral_sk.py - Výsledky idú do priečinka outputs/-model-dataset/. +translate/ + translate_do-not_answer.py + translate_PKF.py + Translate_sk_to_eng.py +``` -Vo vnútri nájdeš: +--- - responses.json – odpovede v strojovom formáte, +## Requirements - responses.txt – všetky prompty a odpovede pre ľudí, +- Python **3.8+** +- CUDA-capable GPU(s) for translation/training +- Typical packages: + - `torch`, `transformers`, `datasets` + - `peft`, `trl`, `accelerate`, `bitsandbytes` + - `tqdm` - summary.txt – súhrn odmietnutí podľa kategórií. -============================================================================================================================================================ +Install (example): -2. copymaster.py – Triedenie výstupov +```bash +pip install -U torch transformers datasets peft trl accelerate bitsandbytes tqdm +``` - Keď už máš hromadu priečinkov v outputs/, potrebuješ to nejako zoradiť, aby každý model mal svoje miesto. O to sa stará copymaster.py. +Notes: +- `translate/translate_PKF.py` is written for a **2‑GPU** setup. +- Some scripts expect local model folders (e.g. NLLB, Llama Guard). Update paths in the script headers. -2.1 Čo robí: +--- - Opýta sa, ktorý model chceš spracovať. 
+## Suggested workflow (high level) - Prejde všetky priečinky v outputs. +A typical run looks like this: - Všetko, čo obsahuje v názve “gemma”, “llama” alebo “qwen”, podľa toho čo si vybral, skopíruje do: - /response// +1) **Translate PKU** to Slovak (optional, if you need SK training data) +2) **Prepare** SFT + DPO datasets on disk +3) **Train** adapters (SFT/DPO) +4) **Generate** responses (base vs adapters) +5) **Translate** responses SK → EN (for Llama Guard) +6) **Evaluate** outputs with Llama Guard 3 - Každý JSON dostane svoje číslo: 1.json, 2.json, 3.json… +You can also skip the Slovak translation and work directly with the original English PKU dataset via `Training/convert_dpo_sft.py`. -============================================================================================================================================================ +--- -3. response_evaluate.py – Hodnotenie bezpečnosti +## Scripts (what they do + how to run) - Toto je hlavný a najväčší skript. - Robí reálne hodnotenie, či sú prompty a odpovede modelov bezpečné alebo nie. - - Používaš ho na porovnávanie modelov medzi sebou. +Run commands below from the **repository root**. -3.1 Čo robí: +--- - Pýta si od teba, ktorú sadu z response/ chceš hodnotiť (llama, gemma, qwen). +### `preparation/prepar_dat_pku_dpo.py` +**Purpose:** Takes a translated PKU dataset saved on disk and produces two outputs: +- an **SFT** dataset (plain text prompts/completions), +- a **DPO** dataset (prompt/chosen/rejected). - Nechá ťa vybrať GPU alebo CPU. +**Run:** +```bash +python3 preparation/prepar_dat_pku_dpo.py +``` - Načíta si Llama Guard 3–8B. 
+**What to edit first:** +- `SRC_DIR` (input dataset saved via `datasets.save_to_disk`) +- output paths (`SFT_OUT`, `DPO_OUT`) - Každý prompt aj odpoveď vyhodnotí zvlášť: +--- - prompt → je bezpečný / nebezpečný +### `Training/convert_dpo_sft.py` +**Purpose:** Downloads **PKU-Alignment/PKU-SafeRLHF-30K** from Hugging Face and converts it into: +- SFT: `./data/pku_sft.jsonl` +- DPO: `./data/pku_dpo/` (HF `save_to_disk` format) - odpoveď → model odpovedal bezpečne / nebezpečne +It shows a small interactive menu (SFT / DPO / BOTH). - Okrem Guardu používa aj tvoje vlastné heuristiky: +**Run:** +```bash +python3 Training/convert_dpo_sft.py +``` - ak prompt obsahuje “sex”, “dirty joke” atď. → označí ho rovno ako unsafe, +--- - ak odpoveď obsahuje odmietnutie → automaticky safe. +### `translate/translate_PKF.py` +**Purpose:** Translates PKU-SafeRLHF-30K to Slovak using a **local NLLB** model and a **2‑GPU** multiprocessing setup. - Každý hodnotený záznam uloží do samostatného JSON. +**Run:** +```bash +python3 translate/translate_PKF.py +``` - Po spracovaní celého priečinka vytvorí: +**Resume / merge only:** +```bash +python3 translate/translate_PKF.py --resume +``` - summary.json pre každý vstupný súbor, +**What to edit first:** +- `NLLB_PATH` (local NLLB model directory) +- output directory constants inside the script - summary_all.json pre celý model. +--- -3.2 Výsledkom je úplná štatistika: +### `translate/translate_do-not_answer.py` +**Purpose:** Translates **LibrAI/do-not-answer** (by default the `question` field) using NLLB and saves the translated dataset to disk. 
- koľko promptov bolo unsafe, +**Run (defaults are usable as-is):** +```bash +python3 translate/translate_do-not_answer.py +``` - koľko odpovedí bolo unsafe, +**Useful options:** +```bash +python3 translate/translate_do-not_answer.py --help +python3 translate/translate_do-not_answer.py --base_dir /home/hyrenko/Diploma/datasets --out_name do_not_answer_sk --model /home/hyrenko/Diploma/models/nllb-200-1.3B --translate_fields question,risk_area +``` - koľko párov bolo naraz unsafe, +--- - porovnanie modelov podľa bezpečnosti. -============================================================================================================================================================ +### `Training/training_dpo_sft_llama.py` +**Purpose:** Unified training script for **Llama (e.g. llama3.1‑8b)**: +- SFT (QLoRA + masked loss) +- DPO (TRL DPOTrainer) + +It opens a menu (SFT / DPO / BOTH) and then relaunches itself via **accelerate** for multi-process training. + +**Run:** +```bash +python3 Training/training_dpo_sft_llama.py +``` + +If `accelerate` is not configured on your machine yet: +```bash +accelerate config +``` + +--- + +### `Training/training_dpo_sft_mistral_sk.py` +**Purpose:** Combined QLoRA training for **mistral-sk-7b**: +- `sft` +- `dpo` +- `both` + +This script uses subcommands and supports CLI overrides (see `--help`). + +**Help:** +```bash +python3 Training/training_dpo_sft_mistral_sk.py --help +``` + +**Multi-GPU examples:** +```bash +torchrun --nproc_per_node=2 Training/training_dpo_sft_mistral_sk.py sft +torchrun --nproc_per_node=2 Training/training_dpo_sft_mistral_sk.py dpo +torchrun --nproc_per_node=2 Training/training_dpo_sft_mistral_sk.py both +``` + +--- + +### `program/LLM_test.py` +**Purpose:** Interactive generator for model responses. Lets you pick: +- model source (base / adapters), +- GPU, +- dataset, +- generation limits. + +Writes a `responses.json` under your configured outputs directory. 
+ +**Run:** +```bash +python3 program/LLM_test.py +``` + +--- + +### `program/Llama_test_trained.py` +**Purpose:** Targeted evaluator for Llama runs. Useful for running **base vs SFT vs DPO** on a chosen dataset with explicit CLI flags. + +**Run:** +```bash +python3 program/Llama_test_trained.py --help +``` + +**Example:** +```bash +python3 program/Llama_test_trained.py --dataset do-not-answer --mode dpo --limit 200 +``` + +--- + +### `translate/Translate_sk_to_eng.py` +**Purpose:** Translates Slovak `responses.json` → English (so Llama Guard can score English outputs). +It scans your runs folder and writes translated outputs into `outputs_translated` (path is in the script). + +**Run:** +```bash +python3 translate/Translate_sk_to_eng.py +``` + +--- + +### `program/copymaster.py` +**Purpose:** Copies `responses.json` files from `outputs/` into structured folders under `response/{gemma|llama|qwen}/` (used by the evaluator). + +**Run:** +```bash +python3 program/copymaster.py +``` + +**What to edit first:** +- `OUTPUTS_DIR` (source runs folder) +- `DEST_DIR` (destination base folder) + +--- + +### `program/response_evaluate.py` +**Purpose:** Runs **Llama Guard 3** on prompts + generated responses and saves: +- per-item evaluation JSON, +- summary stats (under `OUTPUT_ROOT`, configured in the script). + +Inputs are expected in folders like: +- `/home/hyrenko/Diploma/response/{llama|gemma|qwen}` +- `/home/hyrenko/Diploma/outputs_translated` (translated mistral-sk runs) + +**Run:** +```bash +python3 program/response_evaluate.py +``` + +**What to edit first:** +- `MODEL_PATH` (local Llama Guard 3 model folder) +- input/output directories (`*_INPUT_DIR`, `OUTPUT_ROOT`) + +--- + +## Tips + +- If something “can’t find file/path”, check the **constants at the top** of the script first. +- Keep outputs named consistently (`responses.json`) — several scripts rely on that. +- For multi-GPU training, make sure your CUDA devices are visible and `torchrun/accelerate` sees them. 
From ad86c54d193311280c589a49e6c564733034fa7d Mon Sep 17 00:00:00 2001 From: Artur Hyrenko Date: Thu, 5 Feb 2026 15:30:19 +0000 Subject: [PATCH 3/4] Update read_me.md --- read_me.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/read_me.md b/read_me.md index 38b9de5..c96b22b 100644 --- a/read_me.md +++ b/read_me.md @@ -1,4 +1,4 @@ -# Diploma Thesis Repository — SFT + DPO Training, Translation, and Safety Evaluation (SK/EN) +# Diploma Thesis Repository — Increasing safety of large langauge models (SK/EN) This repository contains the scripts I used in my thesis experiments to: From 14862f5670b50c7f7c4098c9a11af525ed77e586 Mon Sep 17 00:00:00 2001 From: Artur Hyrenko Date: Thu, 5 Feb 2026 15:34:02 +0000 Subject: [PATCH 4/4] Update read_me.md --- read_me.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/read_me.md b/read_me.md index c96b22b..f44e849 100644 --- a/read_me.md +++ b/read_me.md @@ -1,4 +1,4 @@ -# Diploma Thesis Repository — Increasing safety of large langauge models (SK/EN) +# Diploma Thesis Repository — Increasing safety of large language models (SK/EN) This repository contains the scripts I used in my thesis experiments to: