diff --git a/translate/Translate_sk_to_eng.py b/translate/Translate_sk_to_eng.py
index 53fad23..51da226 100644
--- a/translate/Translate_sk_to_eng.py
+++ b/translate/Translate_sk_to_eng.py
@@ -2,37 +2,18 @@
 # -*- coding: utf-8 -*-
 
 """
-Interactive translator for Mistral-SK outputs -> English using LOCAL NLLB.
+Translate Slovak 'responses.json' outputs into English using a local NLLB model.
 
-What you asked (final behavior):
-- Auto-detect Mistral-SK run dirs in:
-    /home/hyrenko/Diploma/outputs
-  (folders containing 'mistral' and 'sk' and a responses.json inside)
-
-- Translate BOTH 'prompt' and 'response' from Slovak -> English using local NLLB:
-    /home/hyrenko/Diploma/models/nllb-200-1.3B
-
-- Write results into:
-    /home/hyrenko/Diploma/outputs_translated
-
-- Create the SAME run folder name, except:
+Core behavior:
+- Auto-detect run directories in /home/hyrenko/Diploma/outputs that:
+  - look like "mistral-sk" runs (name heuristic)
+  - contain responses.json
+- Translate both 'prompt' and 'response' fields from Slovak -> English
+- Save translated runs into /home/hyrenko/Diploma/outputs_translated
+- Keep the same run folder name, with a special rename rule:
     do_not_answer_sk -> do_not_answer_eng
-  Example:
-    2026-01-25_06-33-31-mistral-sk-7b-do_not_answer_sk-prompt:939-4bit
-    -> 2026-01-25_06-33-31-mistral-sk-7b-do_not_answer_eng-prompt:939-4bit
-
-- In the destination folder, the main file 'responses.json' contains ONLY ENGLISH:
-    {
-      "id": ...,
-      "category": ...,
-      "prompt": "",
-      "response": "",
-      "refusal": ...,
-      ...
-    }
-  No *_original fields, no duplicates. Clean EN-only.
-
-- Optional: also saves a translate.log.txt for reproducibility.
+- Write a clean EN-only responses.json (prompt/response overwritten with English)
+- Also write translate.log.txt with parameters and paths
 """
 
 import os
@@ -47,7 +28,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 # -------------------------
-# Your fixed paths
+# Fixed paths / constants
 # -------------------------
 NLLB_PATH = "/home/hyrenko/Diploma/models/nllb-200-1.3B"
 OUTPUTS_ROOT = "/home/hyrenko/Diploma/outputs"
 
@@ -60,20 +41,29 @@ RESPONSES_FILENAME = "responses.json"
 
 
 def safe_mkdir(path: str):
+    """Create a directory (and any missing parents) if it does not exist."""
     os.makedirs(path, exist_ok=True)
 
 
 def load_json(path: str) -> Any:
+    """Load JSON from disk and return the parsed Python object."""
     with open(path, "r", encoding="utf-8") as f:
         return json.load(f)
 
 
 def save_json(path: str, obj: Any):
+    """Write a Python object to disk as pretty-printed JSON (UTF-8)."""
     with open(path, "w", encoding="utf-8") as f:
         json.dump(obj, f, ensure_ascii=False, indent=2)
 
 
 def normalize_text(s: Any) -> str:
+    """
+    Normalize an input value into a safe string for translation:
+    - map None -> ""
+    - remove the Unicode replacement char (U+FFFD)
+    - strip surrounding whitespace
+    """
     if s is None:
         return ""
     return str(s).replace("\uFFFD", "").strip()
@@ -81,8 +71,13 @@ def normalize_text(s: Any) -> str:
 def split_text_safely(text: str, max_chars: int) -> List[str]:
     """
-    Robust chunking: paragraphs -> lines -> sentences -> hard split.
-    Character-based splitting is stable for NLLB.
+    Split text into chunks to keep translation stable on long inputs.
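+
+    Illustrative example (assumes each short paragraph is kept as its own chunk):
+        >>> split_text_safely("First paragraph.\\n\\nSecond paragraph.", max_chars=20)
+        ['First paragraph.', 'Second paragraph.']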
+
+    Strategy (coarse -> fine):
+    - paragraphs (2+ newlines)
+    - lines
+    - sentence-ish boundaries by punctuation regex
+    - hard character split fallback
     """
     text = normalize_text(text)
     if not text:
         return []
@@ -122,6 +117,7 @@ def split_text_safely(text: str, max_chars: int) -> List[str]:
         else:
             push(buf)
             buf = ln
+
     if buf:
         if len(buf) <= max_chars:
             chunks.append(buf)
@@ -146,11 +142,19 @@
 
 
 def batchify(items: List[str], bs: int) -> List[List[str]]:
+    """Split a list into consecutive batches of size bs (the last batch may be shorter)."""
     return [items[i:i + bs] for i in range(0, len(items), bs)]
 
 
 @torch.inference_mode()
 def nllb_translate_batch(tokenizer, model, texts: List[str], max_new_tokens: int, num_beams: int) -> List[str]:
+    """
+    Translate a batch of texts using NLLB from SRC_LANG -> TGT_LANG.
+
+    Implementation details:
+    - tokenizer.src_lang sets the source language
+    - forced_bos_token_id forces generation to start in the target language
+    """
     tokenizer.src_lang = SRC_LANG
     forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
 
@@ -175,6 +179,13 @@ def translate_text(tokenizer, model, text: str, chunk_chars: int, batch_size: in
+    """
+    Translate a single text safely:
+    - normalize
+    - chunk by characters
+    - translate chunks in batches
+    - join translated chunks
+    """
     text = normalize_text(text)
     if not text:
         return ""
@@ -188,11 +199,17 @@
 
 
 def is_mistral_sk_dir(name: str) -> bool:
+    """Heuristic: the directory name contains both 'mistral' and 'sk' (substring match)."""
     n = name.lower()
     return ("mistral" in n and "sk" in n and os.path.sep not in n)
 
 
 def list_candidate_run_dirs(outputs_root: str) -> List[str]:
+    """
+    Scan OUTPUTS_ROOT and return run directories that:
+    - match the mistral-sk naming heuristic
+    - contain responses.json
+    """
     candidates = []
     for name in os.listdir(outputs_root):
         full = os.path.join(outputs_root, name)
@@ -206,6 +223,11 @@
 
 
 def pick_latest_dir(dirs: List[str]) -> Optional[str]:
+    """
+    Choose the most recent directory by:
+    1) parsing a timestamp prefix (YYYY-MM-DD_HH-MM-SS) if present
+    2) falling back to filesystem mtime
+    """
     def parse_ts(path: str) -> Tuple[int, float]:
         base = os.path.basename(path)
         try:
@@ -221,13 +243,26 @@
 
 
 def make_translated_dirname(src_dirname: str) -> str:
+    """
+    Produce the destination folder name for the translated run.
+
+    Special rule:
+    - Replace 'do_not_answer_sk' with 'do_not_answer_eng'
+
+    Fallback:
+    - Replace any '_sk' token at a word boundary with '_eng'
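+
+    Example:
+        2026-01-25_06-33-31-mistral-sk-7b-do_not_answer_sk-prompt:939-4bit
+        -> 2026-01-25_06-33-31-mistral-sk-7b-do_not_answer_eng-prompt:939-4bit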
===") print("1) Latest mistral-sk run (auto-detected)") print("2) All mistral-sk runs (auto-detected)") @@ -236,6 +271,7 @@ def interactive_choice() -> str: def main(): + # Validate required directories if not os.path.isdir(NLLB_PATH): raise SystemExit(f"[ERROR] NLLB model path not found: {NLLB_PATH}") if not os.path.isdir(OUTPUTS_ROOT): @@ -256,7 +292,7 @@ def main(): for d in run_dirs: print(f" - {d}") - # Interactive translation params (simple) + # Interactive translation parameters print("\n=== Translation parameters ===") bs_in = input("batch_size (default 8): ").strip() batch_size = int(bs_in) if bs_in.isdigit() and int(bs_in) > 0 else 8 @@ -270,7 +306,7 @@ def main(): nb_in = input("num_beams (default 4): ").strip() num_beams = int(nb_in) if nb_in.isdigit() and int(nb_in) > 0 else 4 - # Load NLLB + # Load NLLB model/tokenizer device = "cuda" if torch.cuda.is_available() else "cpu" dtype = torch.float16 if device == "cuda" else torch.float32 @@ -286,9 +322,9 @@ def main(): model = model.to("cpu") model.eval() - # Translate each run for src_run_dir in run_dirs: src_dirname = os.path.basename(src_run_dir) + dst_dirname = make_translated_dirname(src_dirname) dst_run_dir = os.path.join(OUTPUTS_TRANSLATED_ROOT, dst_dirname) safe_mkdir(dst_run_dir) @@ -300,6 +336,7 @@ def main(): print(f"\n[INFO] Translating:\n src: {src_run_dir}\n dst: {dst_run_dir}\n file: {src_json}") data = load_json(src_json) + if not isinstance(data, list): raise SystemExit(f"[ERROR] {src_json} must be a list of objects.") @@ -314,7 +351,6 @@ def main(): for item in tqdm(data, desc=f"Translating {src_dirname}", total=len(data)): if not isinstance(item, dict): - # keep non-dict items as-is (rare) en_only.append({"_raw": item}) continue @@ -324,15 +360,12 @@ def main(): prompt_en = translate_text(tokenizer, model, prompt_sk, chunk_chars, batch_size, max_new_tokens, num_beams) resp_en = translate_text(tokenizer, model, resp_sk, chunk_chars, batch_size, max_new_tokens, num_beams) - # EN-only output: copy everything but replace prompt/response with English out_item = dict(item) out_item["prompt"] = prompt_en out_item["response"] = resp_en - # Ensure no leftover original fields if they existed for k in list(out_item.keys()): if k.endswith("_original") or k.endswith("_sk_original") or k.endswith("_eng") or k.endswith("_en"): - # remove any prior translation artifacts out_item.pop(k, None) en_only.append(out_item)