Added comments
This commit is contained in:
parent
4ef5f31659
commit
cf50efba3c
@ -2,37 +2,18 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Interactive translator for Mistral-SK outputs -> English using LOCAL NLLB.
|
||||
Translate Slovak 'responses.json' outputs into English using a local NLLB model.
|
||||
|
||||
What you asked (final behavior):
|
||||
- Auto-detect Mistral-SK run dirs in:
|
||||
/home/hyrenko/Diploma/outputs
|
||||
(folders containing 'mistral' and 'sk' and a responses.json inside)
|
||||
|
||||
- Translate BOTH 'prompt' and 'response' from Slovak -> English using local NLLB:
|
||||
/home/hyrenko/Diploma/models/nllb-200-1.3B
|
||||
|
||||
- Write results into:
|
||||
/home/hyrenko/Diploma/outputs_translated
|
||||
|
||||
- Create the SAME run folder name, except:
|
||||
Core behavior:
|
||||
- Auto-detect run directories in /home/hyrenko/Diploma/outputs that:
|
||||
- look like "mistral-sk" runs (name heuristic)
|
||||
- contain responses.json
|
||||
- Translate both 'prompt' and 'response' fields from Slovak -> English
|
||||
- Save translated runs into /home/hyrenko/Diploma/outputs_translated
|
||||
- Keep the same run folder name, with a special rename rule:
|
||||
do_not_answer_sk -> do_not_answer_eng
|
||||
Example:
|
||||
2026-01-25_06-33-31-mistral-sk-7b-do_not_answer_sk-prompt:939-4bit
|
||||
-> 2026-01-25_06-33-31-mistral-sk-7b-do_not_answer_eng-prompt:939-4bit
|
||||
|
||||
- In the destination folder, the main file 'responses.json' contains ONLY ENGLISH:
|
||||
{
|
||||
"id": ...,
|
||||
"category": ...,
|
||||
"prompt": "<ENGLISH>",
|
||||
"response": "<ENGLISH>",
|
||||
"refusal": ...,
|
||||
...
|
||||
}
|
||||
No *_original fields, no duplicates. Clean EN-only.
|
||||
|
||||
- Optional: also saves a translate.log.txt for reproducibility.
|
||||
- Write a clean EN-only responses.json (prompt/response overwritten with English)
|
||||
- Also write translate.log.txt with parameters and paths
|
||||
"""
|
||||
|
||||
import os
|
||||
@ -47,7 +28,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Your fixed paths
|
||||
# Fixed paths / constants
|
||||
# -------------------------
|
||||
NLLB_PATH = "/home/hyrenko/Diploma/models/nllb-200-1.3B"
|
||||
OUTPUTS_ROOT = "/home/hyrenko/Diploma/outputs"
|
||||
@ -60,20 +41,29 @@ RESPONSES_FILENAME = "responses.json"
|
||||
|
||||
|
||||
def safe_mkdir(path: str):
    """Ensure *path* exists as a directory, creating parents as needed.

    Silently succeeds when the directory is already present.
    """
    os.makedirs(path, exist_ok=True)
|
||||
|
||||
|
||||
def load_json(path: str) -> Any:
    """Read the UTF-8 JSON file at *path* and return the parsed object."""
    with open(path, "r", encoding="utf-8") as fh:
        raw = fh.read()
    return json.loads(raw)
|
||||
|
||||
|
||||
def save_json(path: str, obj: Any):
    """Serialize *obj* as pretty-printed UTF-8 JSON and write it to *path*.

    Non-ASCII characters are written verbatim (ensure_ascii=False).
    """
    payload = json.dumps(obj, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(payload)
|
||||
|
||||
|
||||
def normalize_text(s: Any) -> str:
    """Coerce *s* into a clean string suitable for translation.

    Rules:
    - None becomes the empty string
    - the Unicode replacement character (U+FFFD) is dropped
    - surrounding whitespace is stripped
    """
    if s is None:
        return ""
    cleaned = str(s).replace("\uFFFD", "")
    return cleaned.strip()
|
||||
@ -81,8 +71,13 @@ def normalize_text(s: Any) -> str:
|
||||
|
||||
def split_text_safely(text: str, max_chars: int) -> List[str]:
|
||||
"""
|
||||
Robust chunking: paragraphs -> lines -> sentences -> hard split.
|
||||
Character-based splitting is stable for NLLB.
|
||||
Split text into chunks to keep translation stable on long inputs.
|
||||
|
||||
Strategy (coarse -> fine):
|
||||
- paragraphs (2+ newlines)
|
||||
- lines
|
||||
- sentence-ish boundaries by punctuation regex
|
||||
- hard character split fallback
|
||||
"""
|
||||
text = normalize_text(text)
|
||||
if not text:
|
||||
@ -122,6 +117,7 @@ def split_text_safely(text: str, max_chars: int) -> List[str]:
|
||||
else:
|
||||
push(buf)
|
||||
buf = ln
|
||||
|
||||
if buf:
|
||||
if len(buf) <= max_chars:
|
||||
chunks.append(buf)
|
||||
@ -146,11 +142,19 @@ def split_text_safely(text: str, max_chars: int) -> List[str]:
|
||||
|
||||
|
||||
def batchify(items: List[str], bs: int) -> List[List[str]]:
    """Partition *items* into consecutive slices of at most *bs* elements.

    The final batch may be shorter than *bs*; an empty input yields [].
    """
    batches: List[List[str]] = []
    for start in range(0, len(items), bs):
        batches.append(items[start:start + bs])
    return batches
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def nllb_translate_batch(tokenizer, model, texts: List[str], max_new_tokens: int, num_beams: int) -> List[str]:
|
||||
"""
|
||||
Translate a batch of texts using NLLB from SRC_LANG -> TGT_LANG.
|
||||
|
||||
Implementation details:
|
||||
- tokenizer.src_lang sets the source language
|
||||
- forced_bos_token_id forces generation to start in the target language
|
||||
"""
|
||||
tokenizer.src_lang = SRC_LANG
|
||||
forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
|
||||
|
||||
@ -175,6 +179,13 @@ def nllb_translate_batch(tokenizer, model, texts: List[str], max_new_tokens: int
|
||||
|
||||
|
||||
def translate_text(tokenizer, model, text: str, chunk_chars: int, batch_size: int, max_new_tokens: int, num_beams: int) -> str:
|
||||
"""
|
||||
Translate a single text safely:
|
||||
- normalize
|
||||
- chunk by characters
|
||||
- translate chunks in batches
|
||||
- join translated chunks
|
||||
"""
|
||||
text = normalize_text(text)
|
||||
if not text:
|
||||
return ""
|
||||
@ -188,11 +199,17 @@ def translate_text(tokenizer, model, text: str, chunk_chars: int, batch_size: in
|
||||
|
||||
|
||||
def is_mistral_sk_dir(name: str) -> bool:
    """Name heuristic: True for plain directory names mentioning both
    'mistral' and 'sk' (case-insensitive), rejecting anything that looks
    like a path (contains a separator)."""
    lowered = name.lower()
    if os.path.sep in lowered:
        return False
    return "mistral" in lowered and "sk" in lowered
|
||||
|
||||
|
||||
def list_candidate_run_dirs(outputs_root: str) -> List[str]:
|
||||
"""
|
||||
Scan OUTPUTS_ROOT and return run directories that:
|
||||
- match the mistral-sk naming heuristic
|
||||
- contain responses.json
|
||||
"""
|
||||
candidates = []
|
||||
for name in os.listdir(outputs_root):
|
||||
full = os.path.join(outputs_root, name)
|
||||
@ -206,6 +223,11 @@ def list_candidate_run_dirs(outputs_root: str) -> List[str]:
|
||||
|
||||
|
||||
def pick_latest_dir(dirs: List[str]) -> Optional[str]:
|
||||
"""
|
||||
Choose the most recent directory by:
|
||||
1) parsing a timestamp prefix (YYYY-MM-DD_HH-MM-SS) if present
|
||||
2) falling back to filesystem mtime
|
||||
"""
|
||||
def parse_ts(path: str) -> Tuple[int, float]:
|
||||
base = os.path.basename(path)
|
||||
try:
|
||||
@ -221,13 +243,26 @@ def pick_latest_dir(dirs: List[str]) -> Optional[str]:
|
||||
|
||||
|
||||
def make_translated_dirname(src_dirname: str) -> str:
    """Derive the destination folder name for a translated run.

    Preferred rule: rewrite the dataset token 'do_not_answer_sk' to
    'do_not_answer_eng'. If that token is absent, fall back to replacing
    any '_sk' followed by a word boundary with '_eng'.
    """
    marker = "do_not_answer_sk"
    if marker in src_dirname:
        return src_dirname.replace(marker, "do_not_answer_eng")
    # Fallback: swap a word-boundary-terminated "_sk" token for "_eng".
    return re.sub(r"_sk\b", "_eng", src_dirname)
|
||||
|
||||
|
||||
def interactive_choice() -> str:
|
||||
"""
|
||||
Ask which runs to translate:
|
||||
1) latest run
|
||||
2) all detected runs
|
||||
"""
|
||||
print("\n=== What to translate? ===")
|
||||
print("1) Latest mistral-sk run (auto-detected)")
|
||||
print("2) All mistral-sk runs (auto-detected)")
|
||||
@ -236,6 +271,7 @@ def interactive_choice() -> str:
|
||||
|
||||
|
||||
def main():
|
||||
# Validate required directories
|
||||
if not os.path.isdir(NLLB_PATH):
|
||||
raise SystemExit(f"[ERROR] NLLB model path not found: {NLLB_PATH}")
|
||||
if not os.path.isdir(OUTPUTS_ROOT):
|
||||
@ -256,7 +292,7 @@ def main():
|
||||
for d in run_dirs:
|
||||
print(f" - {d}")
|
||||
|
||||
# Interactive translation params (simple)
|
||||
# Interactive translation parameters
|
||||
print("\n=== Translation parameters ===")
|
||||
bs_in = input("batch_size (default 8): ").strip()
|
||||
batch_size = int(bs_in) if bs_in.isdigit() and int(bs_in) > 0 else 8
|
||||
@ -270,7 +306,7 @@ def main():
|
||||
nb_in = input("num_beams (default 4): ").strip()
|
||||
num_beams = int(nb_in) if nb_in.isdigit() and int(nb_in) > 0 else 4
|
||||
|
||||
# Load NLLB
|
||||
# Load NLLB model/tokenizer
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
dtype = torch.float16 if device == "cuda" else torch.float32
|
||||
|
||||
@ -286,9 +322,9 @@ def main():
|
||||
model = model.to("cpu")
|
||||
model.eval()
|
||||
|
||||
# Translate each run
|
||||
for src_run_dir in run_dirs:
|
||||
src_dirname = os.path.basename(src_run_dir)
|
||||
|
||||
dst_dirname = make_translated_dirname(src_dirname)
|
||||
dst_run_dir = os.path.join(OUTPUTS_TRANSLATED_ROOT, dst_dirname)
|
||||
safe_mkdir(dst_run_dir)
|
||||
@ -300,6 +336,7 @@ def main():
|
||||
print(f"\n[INFO] Translating:\n src: {src_run_dir}\n dst: {dst_run_dir}\n file: {src_json}")
|
||||
|
||||
data = load_json(src_json)
|
||||
|
||||
if not isinstance(data, list):
|
||||
raise SystemExit(f"[ERROR] {src_json} must be a list of objects.")
|
||||
|
||||
@ -314,7 +351,6 @@ def main():
|
||||
|
||||
for item in tqdm(data, desc=f"Translating {src_dirname}", total=len(data)):
|
||||
if not isinstance(item, dict):
|
||||
# keep non-dict items as-is (rare)
|
||||
en_only.append({"_raw": item})
|
||||
continue
|
||||
|
||||
@ -324,15 +360,12 @@ def main():
|
||||
prompt_en = translate_text(tokenizer, model, prompt_sk, chunk_chars, batch_size, max_new_tokens, num_beams)
|
||||
resp_en = translate_text(tokenizer, model, resp_sk, chunk_chars, batch_size, max_new_tokens, num_beams)
|
||||
|
||||
# EN-only output: copy everything but replace prompt/response with English
|
||||
out_item = dict(item)
|
||||
out_item["prompt"] = prompt_en
|
||||
out_item["response"] = resp_en
|
||||
|
||||
# Ensure no leftover original fields if they existed
|
||||
for k in list(out_item.keys()):
|
||||
if k.endswith("_original") or k.endswith("_sk_original") or k.endswith("_eng") or k.endswith("_en"):
|
||||
# remove any prior translation artifacts
|
||||
out_item.pop(k, None)
|
||||
|
||||
en_only.append(out_item)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user