Added comments
This commit is contained in:
parent
10f4002231
commit
de2336183b
@ -6,15 +6,51 @@ import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
# chunk_list(lst, n)
# Split a sequence into consecutive chunks of size n; the final chunk may be
# shorter. The datasets.map() stage can hand us large batches, but translation
# is done in smaller sub-batches to bound GPU memory usage and latency.
# -----------------------------------------------------------------------------
def chunk_list(lst, n):
    start = 0
    total = len(lst)
    while start < total:
        yield lst[start:start + n]
        start += n
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
# translate_batch(...)
# Translate a list of texts in a single generate() call using an NLLB seq2seq
# model and its tokenizer.
#
# Key details:
# - tokenizer.src_lang is set to the source language code (NLLB convention:
#   the tokenizer reads it when encoding).
# - The target language code is itself a token; its id is passed as
#   forced_bos_token_id so generation starts in the target language.
# - Inputs are tokenized with padding/truncation and moved to `device`.
# - Beam search is controlled by num_beams; output length by max_length.
#
# @torch.inference_mode() disables gradient tracking for faster inference and
# reduced memory usage.
# -----------------------------------------------------------------------------
@torch.inference_mode()
def translate_batch(texts, tokenizer, model, src_lang, tgt_lang, max_length, num_beams, device):
    tokenizer.src_lang = src_lang
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    # Encode the batch as padded/truncated PyTorch tensors on the target device.
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(device)

    # Beam-search generation of translated token sequences, forced to begin
    # with the target-language token.
    output_ids = model.generate(
        **encoded,
        forced_bos_token_id=forced_bos_token_id,
        max_length=max_length,
        num_beams=num_beams,
    )

    # Decode generated token ids back into plain strings.
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
# main()
# End-to-end pipeline: translate selected fields of the "LibrAI/do-not-answer"
# dataset into Slovak (or any NLLB-supported target language) and save the
# result to disk.
#
# Pipeline steps:
# 1) Parse CLI arguments (paths, model, split, fields, generation params).
# 2) Load the dataset split from the Hugging Face Hub.
# 3) Load NLLB tokenizer + model; choose CUDA (FP16) when available.
# 4) Translate requested fields via datasets.map() in batched mode:
#    the datasets.map batch (batch_size=128) is split further into
#    translation sub-batches of args.batch_size.
# 5) Save the translated dataset to disk and print the output path.
# -----------------------------------------------------------------------------
def main():
    # --- CLI arguments -------------------------------------------------------
    # Each option is registered exactly once; duplicate registrations of the
    # same flag would raise argparse.ArgumentError at startup.
    p = argparse.ArgumentParser()
    p.add_argument(
        "--base_dir",
        default="/home/hyrenko/Diploma/datasets",
        help="Base directory where the translated dataset will be saved",
    )
    p.add_argument(
        "--out_name",
        default="do_not_answer_sk",
        help="Name of the output folder to create inside base_dir",
    )
    p.add_argument("--model", default="facebook/nllb-200-1.3B")
    p.add_argument("--split", default="train")
    p.add_argument(
        "--translate_fields",
        default="question",
        help="Comma-separated list of fields to translate (e.g., question,risk_area,types_of_harm,specific_harms)",
    )

    # Generation / performance parameters
    p.add_argument("--batch_size", type=int, default=32)
    p.add_argument("--max_length", type=int, default=256)
    p.add_argument("--num_beams", type=int, default=4)

    # NLLB language codes
    p.add_argument("--src_lang", default="eng_Latn")
    p.add_argument("--tgt_lang", default="slk_Latn")

    args = p.parse_args()

    # Output directory setup
    out_dir = os.path.join(args.base_dir, args.out_name)
    os.makedirs(out_dir, exist_ok=True)

    # Parse the comma-separated field list, dropping empty entries.
    fields = [x.strip() for x in args.translate_fields.split(",") if x.strip()]

    # 1) Load dataset split from the Hugging Face Hub
    ds = load_dataset("LibrAI/do-not-answer", split=args.split)

    # 2) Load NLLB tokenizer/model; use FP16 on GPU for better performance
    # and lower VRAM usage.
    # NOTE(review): the model-loading lines fall inside a diff-hunk gap in
    # the reviewed source; this reconstruction follows the surrounding
    # comments ("FP16 on GPU", mdl.eval()) — confirm against the full file.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tok = AutoTokenizer.from_pretrained(args.model)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(
        args.model,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    ).to(device)
    mdl.eval()

    # 3) Translate
    # map_fn(batch): for each requested field, translate the field values in
    # sub-batches of args.batch_size and store them in a new "<field>_sk"
    # column; datasets.map merges the returned dict back into the dataset.
    # NOTE(review): the sub-batch loop falls inside a diff-hunk gap; it is
    # reconstructed from the block comments (chunk_list + translate_batch,
    # results concatenated into translated_all) — confirm against the full file.
    def map_fn(batch):
        out = dict(batch)
        for f in fields:
            texts = list(batch[f])
            translated_all = []
            for chunk in chunk_list(texts, args.batch_size):
                translated_all.extend(
                    translate_batch(
                        chunk, tok, mdl,
                        args.src_lang, args.tgt_lang,
                        args.max_length, args.num_beams, device,
                    )
                )
            out[f"{f}_sk"] = translated_all
        return out

    # datasets.map batch size can be larger than the translation batch_size
    # (these are two different levels of batching).
    ds_sk = ds.map(map_fn, batched=True, batch_size=128, desc="Translating to Slovak")

    # 4) Save to disk
    ds_sk.save_to_disk(out_dir)
    print(f"Saved translated dataset to: {out_dir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Standard entry point guard: run main() only when executed as a script
    main()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user