From 3bb4286763258b4beeb18fad9ceaa990d05ee251 Mon Sep 17 00:00:00 2001
From: Artur Hyrenko <artur.hyrenko@student.tuke.sk>
Date: Wed, 4 Feb 2026 21:01:03 +0000
Subject: [PATCH 1/4] Added comments

---
 program/response_evaluate.py | 66 +++++++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 19 deletions(-)

diff --git a/program/response_evaluate.py b/program/response_evaluate.py
index f8c3ffe..4698258 100644
--- a/program/response_evaluate.py
+++ b/program/response_evaluate.py
@@ -22,7 +22,7 @@ LLAMA_INPUT_DIR = "/home/hyrenko/Diploma/response/llama"
 GEMMA_INPUT_DIR = "/home/hyrenko/Diploma/response/gemma"
 QWEN_INPUT_DIR  = "/home/hyrenko/Diploma/response/qwen"
 
-# NEW: mistral-sk translated outputs root (where your translated runs live)
+# Root for translated mistral-sk runs (each run dir contains responses.json)
 MISTRAL_SK_TRANSLATED_ROOT = "/home/hyrenko/Diploma/outputs_translated"
 MISTRAL_RESPONSES_FILENAME = "responses.json"
 
@@ -39,6 +39,7 @@ os.makedirs(OUTPUT_ROOT, exist_ok=True)
 # Interactive pickers
 # =========================
 def pick_gpu_interactive() -> str:
+    """Interactive device picker: returns GPU id string or empty string for CPU."""
     def fmt_gb(b): return f"{b / (1024**3):.1f} GB"
     print("\n=== Device selection ===")
     if not torch.cuda.is_available():
@@ -59,16 +60,15 @@ def pick_gpu_interactive() -> str:
 
 
 def _is_mistral_sk_dir(name: str) -> bool:
+    """Heuristic: directory name contains 'mistral' and 'sk' (in any form)."""
     n = name.lower()
-    # matches: "mistral-sk", "mistral_sk", or any name containing both "mistral" and "sk"
     return ("mistral" in n and "sk" in n)
 
 
 def _list_mistral_sk_run_dirs(root: str) -> List[str]:
     """
-    Returns all directories under root that look like mistral-sk runs
-    and contain responses.json.
-    Sorted newest-first by timestamp prefix or mtime.
+    List directories under 'root' that look like mistral-sk runs and contain responses.json.
+    Sorted newest-first by timestamp prefix (YYYY-MM-DD_HH-MM-SS) if present, otherwise by mtime.
     """
     if not os.path.isdir(root):
         return []
@@ -86,7 +86,6 @@ def _list_mistral_sk_run_dirs(root: str) -> List[str]:
 
     def sort_key(path: str) -> Tuple[int, float, str]:
         base = os.path.basename(path)
-        # prefer YYYY-MM-DD_HH-MM-SS prefix
         try:
             dt = datetime.datetime.strptime(base[:19], "%Y-%m-%d_%H-%M-%S")
             return (1, dt.timestamp(), base)
@@ -100,6 +99,10 @@ def _list_mistral_sk_run_dirs(root: str) -> List[str]:
 
 
 def pick_mistral_sk_run_dirs_interactive(root: str) -> List[str]:
+    """
+    Interactive picker for mistral-sk runs under 'root'.
+    Returns a list of selected run directories.
+    """
     runs = _list_mistral_sk_run_dirs(root)
     if not runs:
         return []
@@ -157,9 +160,9 @@ def pick_mistral_sk_run_dirs_interactive(root: str) -> List[str]:
 
 def pick_input_dirs_interactive() -> List[str]:
     """
-    Returns list of input directories.
-    - For llama/gemma/qwen: returns [that_dir]
-    - For mistral-sk: returns [run_dir1, run_dir2, ...] chosen by user
+    Returns a list of input directories to evaluate.
+    - llama/gemma/qwen: returns [that_dir]
+    - mistral-sk: returns selected run dir(s) under MISTRAL_SK_TRANSLATED_ROOT
     """
     options = {
         "1": LLAMA_INPUT_DIR,
@@ -199,9 +202,15 @@ def pick_input_dirs_interactive() -> List[str]:
 
 
 def resolve_model_key(input_dir: str) -> str:
+    """
+    Map an input directory to a stable model key:
+    - mistral_sk for runs under MISTRAL_SK_TRANSLATED_ROOT
+    - llama/gemma/qwen for fixed input dirs
+    - fallback: basename(input_dir)
+    """
     norm = os.path.abspath(input_dir)
 
-    # detect mistral-sk run dir
+    # Detect mistral-sk run dir under outputs_translated
     if os.path.abspath(os.path.dirname(norm)) == os.path.abspath(MISTRAL_SK_TRANSLATED_ROOT) and _is_mistral_sk_dir(os.path.basename(norm)):
         return "mistral_sk"
 
@@ -219,10 +228,14 @@ def resolve_model_key(input_dir: str) -> str:
 # Model loader
 # =========================
 def init_model_and_tokenizer(model_path: str, cuda_visible_devices: str):
+    """
+    Load the classifier model and tokenizer.
+    Also verifies that weights exist in the model directory (supports sharded safetensors).
+    """
     if not os.path.isdir(model_path):
         raise FileNotFoundError(f"[ERROR] Model path not found: {model_path}")
 
-    # --- Verify model weights (supports sharded safetensors) ---
+    # Verify weight files
     weight_files = [
         f for f in os.listdir(model_path)
         if f.endswith(".bin") or f.endswith(".safetensors") or f.endswith(".index.json")
@@ -261,7 +274,8 @@ def init_model_and_tokenizer(model_path: str, cuda_visible_devices: str):
 # =========================
 def list_json_files(input_dir: str) -> List[str]:
     """
-    Берём все *.json в директории (включая responses.json у mistral-sk run).
+    Return all *.json files in the directory.
+    For mistral-sk runs this typically includes responses.json.
     """
     files = sorted(glob.glob(os.path.join(input_dir, "*.json")))
     if not files:
@@ -270,6 +284,13 @@ def list_json_files(input_dir: str) -> List[str]:
 
 
 def load_records(path: str) -> List[Dict[str, Any]]:
+    """
+    Load records from a file that can be either:
+    - a JSON list
+    - a single JSON object
+    - JSONL (one JSON object per line)
+    Returns a list of dict records.
+    """
     with open(path, "r", encoding="utf-8") as f:
         text = f.read().strip()
     if not text:
@@ -287,6 +308,7 @@ def load_records(path: str) -> List[Dict[str, Any]]:
 
 
 def ensure_dir(path: str):
+    """Create output directory if missing."""
     os.makedirs(path, exist_ok=True)
 
 
@@ -314,6 +336,7 @@ REFUSAL_PATTERNS = [
 
 
 def is_prompt_heuristic_unsafe(prompt: str) -> bool:
+    """Heuristic prompt classifier: marks prompt unsafe based on regex patterns."""
     text = (prompt or "").lower()
     for pat in UNSAFE_PROMPT_PATTERNS:
         if re.search(pat, text):
@@ -322,6 +345,7 @@ def is_prompt_heuristic_unsafe(prompt: str) -> bool:
 
 
 def is_refusal_response(resp: str) -> bool:
+    """Heuristic response classifier: marks response as refusal based on regex patterns."""
     if not resp:
         return False
     for pat in REFUSAL_PATTERNS:
@@ -335,8 +359,10 @@ def is_refusal_response(resp: str) -> bool:
 # =========================
 def extract_label_from_text(txt: str) -> str:
     """
-    Извлекаем финальное решение Guard:
-    идём с конца и ищем последнее 'safe' или 'unsafe'.
+    Extract the final decision token from classifier output.
+    Strategy:
+    - scan tokens from the end and return the last occurrence of 'safe' or 'unsafe'
+    - default to 'safe' if nothing is found
     """
     tokens = (txt or "").lower().split()
     for token in reversed(tokens):
@@ -353,9 +379,11 @@ def extract_label_from_text(txt: str) -> str:
 # =========================
 def evaluate_dual_batch(batch, tokenizer, model):
     """
-    Assistant Behavior Evaluation (Variant C + эвристики):
-    - Prompt: Guard + правила (dirty joke/sex/nude -> unsafe)
-    - Response: Guard + правила (refusal -> safe)
+    Dual evaluation:
+    - Prompt classification: model + heuristic override (keyword rules -> unsafe)
+    - Response classification: model + heuristic override (refusal patterns -> safe)
+
+    The model is asked to return 'safe' or 'unsafe' (optionally with a short category).
     """
     prompt_texts = []
     for item in batch:
@@ -467,7 +495,7 @@ def main():
             print(f"[WARN] All files empty in {input_dir}. Skipping.")
             continue
 
-        # Чтобы при оценке нескольких mistral-sk run ничего не перетиралось:
+        # For multiple mistral-sk run dirs, keep outputs separated per run
         if model_key == "mistral_sk":
             run_name = os.path.basename(os.path.abspath(input_dir))
             model_out_root = os.path.join(OUTPUT_ROOT, "mistral_sk", run_name)
@@ -476,7 +504,7 @@ def main():
 
         ensure_dir(model_out_root)
 
-        # Глобальные метрики по текущему input_dir
+        # Global metrics for the current input_dir
         global_prompt_safe = 0
         global_prompt_unsafe = 0
         global_response_safe = 0

From d0db4121eef47e83877915130d83486fecea8898 Mon Sep 17 00:00:00 2001
From: Artur Hyrenko <artur.hyrenko@student.tuke.sk>
Date: Thu, 5 Feb 2026 15:28:46 +0000
Subject: [PATCH 2/4] Added the necessary info

---
 read_me.md | 281 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 227 insertions(+), 54 deletions(-)

diff --git a/read_me.md b/read_me.md
index f94391b..38b9de5 100644
--- a/read_me.md
+++ b/read_me.md
@@ -1,98 +1,271 @@
-============================================================================================================================================================
-1. LLM_test.py – Generovanie odpovedí modelov																      
+# Diploma Thesis Repository — SFT + DPO Training, Translation, and Safety Evaluation (SK/EN)
 
-	Tento skript je prvý krok celého procesu.
-	Spúšťaš ho, keď chceš nechať model (Gemma, LLaMA alebo Qwen) odpovedať na rôzne “nepríjemné” datasety (harmful prompts). Podľa toho sa potom hodnotí jeho bezpečnosť.
+This repository contains the scripts I used in my thesis experiments to:
 
-1.1 Ako to funguje
+- prepare **PKU-SafeRLHF-30K** for **SFT** and **DPO**,
+- translate datasets and model outputs with **NLLB (SK ↔ EN)**,
+- train QLoRA adapters (SFT and DPO) for selected base models,
+- generate model responses on safety datasets,
+- evaluate responses with **Llama Guard 3**.
 
-	Pri spustení si script od teba vypýta:
+Most scripts assume a local folder layout under `/home/hyrenko/Diploma/...` (models, datasets, outputs). If your paths are different, update the constants at the top of each script.
 
-		aký model chceš použiť,
+---
 
-		aké GPU (ak máš),
+## Repository structure
 
-		aký dataset chceš otestovať,
+The scripts are grouped by purpose:
 
-		koľko promptov chceš spracovať.
+```text
+preparation/
+  prepar_dat_pku_dpo.py
 
-		Dataset si script načíta automaticky.
-		
-		Ak je gated, vypýta si HF token.
+program/
+  copymaster.py
+  Llama_test_trained.py
+  LLM_test.py
+  response_evaluate.py
 
-	Model každému promptu vygeneruje odpoveď a script kontroluje, či to náhodou nebola odpoveď v štýle “nemôžem odpovedať, som AI”. Toto sa počíta ako refusal.
+Training/
+  convert_dpo_sft.py
+  training_dpo_sft_llama.py
+  training_dpo_sft_mistral_sk.py
 
-	Výsledky idú do priečinka outputs/<timestamp>-model-dataset/.
+translate/
+  translate_do-not_answer.py
+  translate_PKF.py
+  Translate_sk_to_eng.py
+```
 
-Vo vnútri nájdeš:
+---
 
-	responses.json – odpovede v strojovom formáte,
+## Requirements
 
-	responses.txt – všetky prompty a odpovede pre ľudí,
+- Python **3.8+**
+- CUDA-capable GPU(s) for translation/training
+- Typical packages:
+  - `torch`, `transformers`, `datasets`
+  - `peft`, `trl`, `accelerate`, `bitsandbytes`
+  - `tqdm`
 
-	summary.txt – súhrn odmietnutí podľa kategórií.
-============================================================================================================================================================
+Install (example):
 
-2. copymaster.py – Triedenie výstupov
+```bash
+pip install -U torch transformers datasets peft trl accelerate bitsandbytes tqdm
+```
 
-	Keď už máš hromadu priečinkov v outputs/, potrebuješ to nejako zoradiť, aby každý model mal svoje miesto. O to sa stará copymaster.py.
+Notes:
+- `translate/translate_PKF.py` is written for a **2‑GPU** setup.
+- Some scripts expect local model folders (e.g. NLLB, Llama Guard). Update paths in the script headers.
 
-2.1 Čo robí:
+---
 
-	Opýta sa, ktorý model chceš spracovať.
+## Suggested workflow (high level)
 
-	Prejde všetky priečinky v outputs.
+A typical run looks like this:
 
-	Všetko, čo obsahuje v názve “gemma”, “llama” alebo “qwen”, podľa toho čo si vybral, skopíruje do:
-	/response/<model>/
+1) **Translate PKU** to Slovak (optional, if you need SK training data)  
+2) **Prepare** SFT + DPO datasets on disk  
+3) **Train** adapters (SFT/DPO)  
+4) **Generate** responses (base vs adapters)  
+5) **Translate** responses SK → EN (for Llama Guard)  
+6) **Evaluate** outputs with Llama Guard 3  
 
-	Každý JSON dostane svoje číslo: 1.json, 2.json, 3.json…
+You can also skip the Slovak translation and work directly with the original English PKU dataset via `Training/convert_dpo_sft.py`.
 
-============================================================================================================================================================
+---
 
-3. response_evaluate.py – Hodnotenie bezpečnosti
+## Scripts (what they do + how to run)
 
-	Toto je hlavný a najväčší skript.
-	Robí reálne hodnotenie, či sú prompty a odpovede modelov bezpečné alebo nie.
-	
-	Používaš ho na porovnávanie modelov medzi sebou.
+Run commands below from the **repository root**.
 
-3.1 Čo robí:
+---
 
-	Pýta si od teba, ktorú sadu z response/ chceš hodnotiť (llama, gemma, qwen).
+### `preparation/prepar_dat_pku_dpo.py`
+**Purpose:** Takes a translated PKU dataset saved on disk and produces two outputs:
+- an **SFT** dataset (plain text prompts/completions),
+- a **DPO** dataset (prompt/chosen/rejected).
 
-	Nechá ťa vybrať GPU alebo CPU.
+**Run:**
+```bash
+python3 preparation/prepar_dat_pku_dpo.py
+```
 
-	Načíta si Llama Guard 3–8B.
+**What to edit first:**
+- `SRC_DIR` (input dataset saved via `datasets.save_to_disk`)
+- output paths (`SFT_OUT`, `DPO_OUT`)
 
-	Každý prompt aj odpoveď vyhodnotí zvlášť:
+---
 
-	prompt → je bezpečný / nebezpečný
+### `Training/convert_dpo_sft.py`
+**Purpose:** Downloads **PKU-Alignment/PKU-SafeRLHF-30K** from Hugging Face and converts it into:
+- SFT: `./data/pku_sft.jsonl`
+- DPO: `./data/pku_dpo/` (HF `save_to_disk` format)
 
-	odpoveď → model odpovedal bezpečne / nebezpečne
+It shows a small interactive menu (SFT / DPO / BOTH).
 
-	Okrem Guardu používa aj tvoje vlastné heuristiky:
+**Run:**
+```bash
+python3 Training/convert_dpo_sft.py
+```
 
-	ak prompt obsahuje “sex”, “dirty joke” atď. → označí ho rovno ako unsafe,
+---
 
-	ak odpoveď obsahuje odmietnutie → automaticky safe.
+### `translate/translate_PKF.py`
+**Purpose:** Translates PKU-SafeRLHF-30K to Slovak using a **local NLLB** model and a **2‑GPU** multiprocessing setup.
 
-	Každý hodnotený záznam uloží do samostatného JSON.
+**Run:**
+```bash
+python3 translate/translate_PKF.py
+```
 
-	Po spracovaní celého priečinka vytvorí:
+**Resume / merge only:**
+```bash
+python3 translate/translate_PKF.py --resume
+```
 
-	summary.json pre každý vstupný súbor,
+**What to edit first:**
+- `NLLB_PATH` (local NLLB model directory)
+- output directory constants inside the script
 
-	summary_all.json pre celý model.
+---
 
-3.2 Výsledkom je úplná štatistika:
+### `translate/translate_do-not_answer.py`
+**Purpose:** Translates **LibrAI/do-not-answer** (by default the `question` field) using NLLB and saves the translated dataset to disk.
 
-	koľko promptov bolo unsafe,
+**Run (defaults are usable as-is):**
+```bash
+python3 translate/translate_do-not_answer.py
+```
 
-	koľko odpovedí bolo unsafe,
+**Useful options:**
+```bash
+python3 translate/translate_do-not_answer.py --help
+python3 translate/translate_do-not_answer.py --base_dir /home/hyrenko/Diploma/datasets --out_name do_not_answer_sk --model /home/hyrenko/Diploma/models/nllb-200-1.3B --translate_fields question,risk_area
+```
 
-	koľko párov bolo naraz unsafe,
+---
 
-	porovnanie modelov podľa bezpečnosti.
-============================================================================================================================================================
+### `Training/training_dpo_sft_llama.py`
+**Purpose:** Unified training script for **Llama (e.g. llama3.1‑8b)**:
+- SFT (QLoRA + masked loss)
+- DPO (TRL DPOTrainer)
+
+It opens a menu (SFT / DPO / BOTH) and then relaunches itself via **accelerate** for multi-process training.
+
+**Run:**
+```bash
+python3 Training/training_dpo_sft_llama.py
+```
+
+If `accelerate` is not configured on your machine yet:
+```bash
+accelerate config
+```
+
+---
+
+### `Training/training_dpo_sft_mistral_sk.py`
+**Purpose:** Combined QLoRA training for **mistral-sk-7b**:
+- `sft`
+- `dpo`
+- `both`
+
+This script uses subcommands and supports CLI overrides (see `--help`).
+
+**Help:**
+```bash
+python3 Training/training_dpo_sft_mistral_sk.py --help
+```
+
+**Multi-GPU examples:**
+```bash
+torchrun --nproc_per_node=2 Training/training_dpo_sft_mistral_sk.py sft
+torchrun --nproc_per_node=2 Training/training_dpo_sft_mistral_sk.py dpo
+torchrun --nproc_per_node=2 Training/training_dpo_sft_mistral_sk.py both
+```
+
+---
+
+### `program/LLM_test.py`
+**Purpose:** Interactive generator for model responses. Lets you pick:
+- model source (base / adapters),
+- GPU,
+- dataset,
+- generation limits.
+
+Writes a `responses.json` under your configured outputs directory.
+
+**Run:**
+```bash
+python3 program/LLM_test.py
+```
+
+---
+
+### `program/Llama_test_trained.py`
+**Purpose:** Targeted evaluator for Llama runs. Useful for running **base vs SFT vs DPO** on a chosen dataset with explicit CLI flags.
+
+**Run:**
+```bash
+python3 program/Llama_test_trained.py --help
+```
+
+**Example:**
+```bash
+python3 program/Llama_test_trained.py --dataset do-not-answer --mode dpo --limit 200
+```
+
+---
+
+### `translate/Translate_sk_to_eng.py`
+**Purpose:** Translates Slovak `responses.json` → English (so Llama Guard can score English outputs).  
+It scans your runs folder and writes translated outputs into `outputs_translated` (path is in the script).
+
+**Run:**
+```bash
+python3 translate/Translate_sk_to_eng.py
+```
+
+---
+
+### `program/copymaster.py`
+**Purpose:** Copies `responses.json` files from `outputs/` into structured folders under `response/{gemma|llama|qwen}/` (used by the evaluator).
+
+**Run:**
+```bash
+python3 program/copymaster.py
+```
+
+**What to edit first:**
+- `OUTPUTS_DIR` (source runs folder)
+- `DEST_DIR` (destination base folder)
+
+---
+
+### `program/response_evaluate.py`
+**Purpose:** Runs **Llama Guard 3** on prompts + generated responses and saves:
+- per-item evaluation JSON,
+- summary stats (under `OUTPUT_ROOT`, configured in the script).
+
+Inputs are expected in folders like:
+- `/home/hyrenko/Diploma/response/{llama|gemma|qwen}`
+- `/home/hyrenko/Diploma/outputs_translated` (translated mistral-sk runs)
+
+**Run:**
+```bash
+python3 program/response_evaluate.py
+```
+
+**What to edit first:**
+- `MODEL_PATH` (local Llama Guard 3 model folder)
+- input/output directories (`*_INPUT_DIR`, `OUTPUT_ROOT`)
+
+---
+
+## Tips
+
+- If a script cannot find a file or path, check the **constants at the top** of that script first.
+- Keep outputs named consistently (`responses.json`) — several scripts rely on that.
+- For multi-GPU training, make sure your CUDA devices are visible and that `torchrun` / `accelerate` can see them.
 

From ad86c54d193311280c589a49e6c564733034fa7d Mon Sep 17 00:00:00 2001
From: Artur Hyrenko <artur.hyrenko@student.tuke.sk>
Date: Thu, 5 Feb 2026 15:30:19 +0000
Subject: [PATCH 3/4] Update read_me.md

---
 read_me.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/read_me.md b/read_me.md
index 38b9de5..c96b22b 100644
--- a/read_me.md
+++ b/read_me.md
@@ -1,4 +1,4 @@
-# Diploma Thesis Repository — SFT + DPO Training, Translation, and Safety Evaluation (SK/EN)
+# Diploma Thesis Repository — Increasing safety of large langauge models (SK/EN)
 
 This repository contains the scripts I used in my thesis experiments to:
 

From 14862f5670b50c7f7c4098c9a11af525ed77e586 Mon Sep 17 00:00:00 2001
From: Artur Hyrenko <artur.hyrenko@student.tuke.sk>
Date: Thu, 5 Feb 2026 15:34:02 +0000
Subject: [PATCH 4/4] Update read_me.md

---
 read_me.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/read_me.md b/read_me.md
index c96b22b..f44e849 100644
--- a/read_me.md
+++ b/read_me.md
@@ -1,4 +1,4 @@
-# Diploma Thesis Repository — Increasing safety of large langauge models (SK/EN)
+# Diploma Thesis Repository — Increasing safety of large language models (SK/EN)
 
 This repository contains the scripts I used in my thesis experiments to: