diff --git a/translate/translate_do-not_answer.py b/translate/translate_do-not_answer.py
index 58a5f98..c621bb8 100644
--- a/translate/translate_do-not_answer.py
+++ b/translate/translate_do-not_answer.py
@@ -6,15 +6,51 @@
 import torch
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
+
+# -----------------------------------------------------------------------------
+# chunk_list(lst, n)
+# Purpose:
+#   Splits a list into consecutive chunks of size n.
+#
+# How it works:
+#   - Iterates from 0 to len(lst) in steps of n.
+#   - Yields slices lst[i:i+n].
+#
+# Why it exists:
+#   - The dataset mapping can provide large batches, but translation should be
+#     performed in smaller sub-batches to control GPU memory usage and latency.
+# -----------------------------------------------------------------------------
 def chunk_list(lst, n):
     for i in range(0, len(lst), n):
         yield lst[i:i+n]
 
+
+# -----------------------------------------------------------------------------
+# translate_batch(...)
+# Purpose:
+#   Translates a list of texts in one batched generate() call using an NLLB
+#   seq2seq model and tokenizer.
+#
+# Key details:
+#   - Sets tokenizer.src_lang to the source language code.
+#   - Uses forced_bos_token_id to force generation to start in the target language.
+#   - Tokenizes with padding/truncation, moves tensors to the target device.
+#   - Calls model.generate() with beam search parameters.
+#   - Decodes tokens back into strings.
+#
+# Torch behavior:
+#   - @torch.inference_mode() disables gradient tracking for faster inference and
+#     reduced memory usage.
+# -----------------------------------------------------------------------------
 @torch.inference_mode()
 def translate_batch(texts, tokenizer, model, src_lang, tgt_lang, max_length, num_beams, device):
     tokenizer.src_lang = src_lang
     forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
+    # Tokenize inputs:
+    #   - return_tensors="pt" produces PyTorch tensors
+    #   - padding=True aligns batch sequences
+    #   - truncation=True ensures max_length is respected
     inputs = tokenizer(
         texts,
         return_tensors="pt",
@@ -23,46 +59,79 @@ def translate_batch(texts, tokenizer, model, src_lang, tgt_lang, max_length, num
         max_length=max_length,
     ).to(device)
 
+    # Generate translated sequences
     generated = model.generate(
         **inputs,
         forced_bos_token_id=forced_bos_token_id,
         max_length=max_length,
         num_beams=num_beams,
     )
+
+    # Convert generated token IDs into readable strings
     return tokenizer.batch_decode(generated, skip_special_tokens=True)
 
 
+
+# -----------------------------------------------------------------------------
+# main()
+# Purpose:
+#   End-to-end pipeline to translate selected fields of the "LibrAI/do-not-answer"
+#   dataset into Slovak (or any NLLB-supported target language) and save the result
+#   to disk.
+#
+# Pipeline steps:
+#   1) Parse CLI arguments (paths, model, split, translation fields, generation params).
+#   2) Load the dataset split from Hugging Face Hub.
+#   3) Load NLLB tokenizer + model, choose device (CUDA if available).
+#   4) Translate requested fields via datasets.map() in batched mode:
+#      - The datasets.map batch size (batch_size=128) is separate from translation sub-batches.
+#      - Translation sub-batches are controlled by args.batch_size.
+#   5) Save the translated dataset to disk and print the output path.
+# -----------------------------------------------------------------------------
 def main():
+    # CLI arguments
     p = argparse.ArgumentParser()
-    p.add_argument("--base_dir", default="/home/hyrenko/Diploma/datasets",
-                   help="Базовая директория для сохранения результата")
-    p.add_argument("--out_name", default="do_not_answer_sk",
-                   help="Имя папки, которая будет создана внутри base_dir")
+    p.add_argument(
+        "--base_dir",
+        default="/home/hyrenko/Diploma/datasets",
+        help="Base directory where the translated dataset will be saved",
+    )
+    p.add_argument(
+        "--out_name",
+        default="do_not_answer_sk",
+        help="Name of the output folder to create inside base_dir",
+    )
     p.add_argument("--model", default="facebook/nllb-200-1.3B")
     p.add_argument("--split", default="train")
-    p.add_argument("--translate_fields", default="question",
-                   help="Поля для перевода через запятую. Например: question,risk_area,types_of_harm,specific_harms")
+    p.add_argument(
+        "--translate_fields",
+        default="question",
+        help="Comma-separated list of fields to translate (e.g., question,risk_area,types_of_harm,specific_harms)",
+    )
 
-    # Параметры генерации/производительности
+    # Generation / performance parameters
     p.add_argument("--batch_size", type=int, default=32)
     p.add_argument("--max_length", type=int, default=256)
     p.add_argument("--num_beams", type=int, default=4)
 
-    # Языковые коды NLLB
+    # NLLB language codes
     p.add_argument("--src_lang", default="eng_Latn")
     p.add_argument("--tgt_lang", default="slk_Latn")
 
     args = p.parse_args()
 
+    # Output directory setup
     out_dir = os.path.join(args.base_dir, args.out_name)
     os.makedirs(out_dir, exist_ok=True)
 
+    # Parse fields to translate
     fields = [x.strip() for x in args.translate_fields.split(",") if x.strip()]
 
-    # 1) Load dataset
+    # 1) Load dataset split from Hugging Face Hub
     ds = load_dataset("LibrAI/do-not-answer", split=args.split)
 
-    # 2) Load NLLB (FP16 на GPU)
+    # 2) Load NLLB model/tokenizer
+    #    If CUDA is available, use FP16 for better performance and lower VRAM usage.
     device = "cuda" if torch.cuda.is_available() else "cpu"
 
     tok = AutoTokenizer.from_pretrained(args.model)
@@ -75,6 +144,19 @@ def main():
     mdl.eval()
 
     # 3) Translate
+    # map_fn(batch)
+    # Purpose:
+    #   For each requested field, translate the field values and store them into
+    #   a new column named "<field>_sk".
+    #
+    # Behavior:
+    #   - Copies the incoming batch dict to out.
+    #   - For each field in 'fields':
+    #       * Takes batch[f] as a list of strings.
+    #       * Splits into translation sub-batches of size args.batch_size.
+    #       * Calls translate_batch() and concatenates results.
+    #       * Writes translated list to out[f"{f}_sk"].
+    #   - Returns out, which datasets.map merges into the dataset.
     def map_fn(batch):
         out = dict(batch)
         for f in fields:
@@ -94,12 +176,15 @@ def main():
             out[f"{f}_sk"] = translated_all
         return out
 
-    # datasets.map батч: можно больше, чем batch_size перевода (это разные уровни)
+    # datasets.map batch size can be larger than translation batch_size
+    # (these are two different levels of batching).
     ds_sk = ds.map(map_fn, batched=True, batch_size=128, desc="Translating to Slovak")
 
-    # 4) Save
+    # 4) Save to disk
     ds_sk.save_to_disk(out_dir)
     print(f"Saved translated dataset to: {out_dir}")
+
+
 if __name__ == "__main__":
+    # Standard entry point guard: run main() only when executed as a script
     main()
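
A minimal usage sketch, assuming the defaults above are kept (--base_dir,
--out_name, and --translate_fields="question"): save_to_disk pairs with
datasets.load_from_disk, and the translated column follows the f"{f}_sk"
naming used in map_fn:

    from datasets import load_from_disk

    # Path below is the out_dir produced by the default --base_dir/--out_name
    ds = load_from_disk("/home/hyrenko/Diploma/datasets/do_not_answer_sk")
    print(ds[0]["question"])     # original English question
    print(ds[0]["question_sk"])  # Slovak translation written by map_fn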