Added comments
This commit is contained in:
parent
10f4002231
commit
de2336183b
@ -6,15 +6,51 @@ import torch
|
|||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# chunk_list(lst, n)
|
||||||
|
# Purpose:
|
||||||
|
# Splits a list into consecutive chunks of size n.
|
||||||
|
#
|
||||||
|
# How it works:
|
||||||
|
# - Iterates from 0 to len(lst) in steps of n.
|
||||||
|
# - Yields slices lst[i:i+n].
|
||||||
|
#
|
||||||
|
# Why it exists:
|
||||||
|
# - The dataset mapping can provide large batches, but translation should be
|
||||||
|
# performed in smaller sub-batches to control GPU memory usage and latency.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
def chunk_list(lst, n):
    """Yield consecutive slices of *lst*, each of length *n*.

    The final chunk is shorter when ``len(lst)`` is not a multiple of *n*.
    Used to split large dataset batches into smaller translation sub-batches
    so GPU memory usage and latency stay bounded.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# translate_batch(...)
|
||||||
|
# Purpose:
|
||||||
|
# Translates a list of texts in one forward pass (generation) using an NLLB
|
||||||
|
# seq2seq model and tokenizer.
|
||||||
|
#
|
||||||
|
# Key details:
|
||||||
|
# - Sets tokenizer.src_lang to the source language code.
|
||||||
|
# - Uses forced_bos_token_id to force generation to start in the target language.
|
||||||
|
# - Tokenizes with padding/truncation, moves tensors to the target device.
|
||||||
|
# - Calls model.generate() with beam search parameters.
|
||||||
|
# - Decodes tokens back into strings.
|
||||||
|
#
|
||||||
|
# Torch behavior:
|
||||||
|
# - @torch.inference_mode() disables gradient tracking for faster inference and
|
||||||
|
# reduced memory usage.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def translate_batch(texts, tokenizer, model, src_lang, tgt_lang, max_length, num_beams, device):
|
def translate_batch(texts, tokenizer, model, src_lang, tgt_lang, max_length, num_beams, device):
|
||||||
tokenizer.src_lang = src_lang
|
tokenizer.src_lang = src_lang
|
||||||
forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
|
forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
|
||||||
|
|
||||||
|
# Tokenize inputs:
|
||||||
|
# - return_tensors="pt" produces PyTorch tensors
|
||||||
|
# - padding=True aligns batch sequences
|
||||||
|
# - truncation=True ensures max_length is respected
|
||||||
inputs = tokenizer(
|
inputs = tokenizer(
|
||||||
texts,
|
texts,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
@ -23,46 +59,79 @@ def translate_batch(texts, tokenizer, model, src_lang, tgt_lang, max_length, num
|
|||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
).to(device)
|
).to(device)
|
||||||
|
|
||||||
|
# Generate translated sequences
|
||||||
generated = model.generate(
|
generated = model.generate(
|
||||||
**inputs,
|
**inputs,
|
||||||
forced_bos_token_id=forced_bos_token_id,
|
forced_bos_token_id=forced_bos_token_id,
|
||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
num_beams=num_beams,
|
num_beams=num_beams,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Convert generated token IDs into readable strings
|
||||||
return tokenizer.batch_decode(generated, skip_special_tokens=True)
|
return tokenizer.batch_decode(generated, skip_special_tokens=True)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# main()
|
||||||
|
# Purpose:
|
||||||
|
# End-to-end pipeline to translate selected fields of the "LibrAI/do-not-answer"
|
||||||
|
# dataset into Slovak (or any NLLB-supported target language) and save the result
|
||||||
|
# to disk.
|
||||||
|
#
|
||||||
|
# Pipeline steps:
|
||||||
|
# 1) Parse CLI arguments (paths, model, split, translation fields, generation params).
|
||||||
|
# 2) Load the dataset split from Hugging Face Hub.
|
||||||
|
# 3) Load NLLB tokenizer + model, choose device (CUDA if available).
|
||||||
|
# 4) Translate requested fields via datasets.map() in batched mode:
|
||||||
|
# - The datasets.map batch size (batch_size=128) is separate from translation sub-batches.
|
||||||
|
# - Translation sub-batches are controlled by args.batch_size.
|
||||||
|
# 5) Save the translated dataset to disk and print the output path.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
def main():
|
def main():
|
||||||
|
# CLI arguments
|
||||||
p = argparse.ArgumentParser()
|
p = argparse.ArgumentParser()
|
||||||
p.add_argument("--base_dir", default="/home/hyrenko/Diploma/datasets",
|
p.add_argument(
|
||||||
help="Базовая директория для сохранения результата")
|
"--base_dir",
|
||||||
p.add_argument("--out_name", default="do_not_answer_sk",
|
default="/home/hyrenko/Diploma/datasets",
|
||||||
help="Имя папки, которая будет создана внутри base_dir")
|
help="Base directory where the translated dataset will be saved",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--out_name",
|
||||||
|
default="do_not_answer_sk",
|
||||||
|
help="Name of the output folder to create inside base_dir",
|
||||||
|
)
|
||||||
p.add_argument("--model", default="facebook/nllb-200-1.3B")
|
p.add_argument("--model", default="facebook/nllb-200-1.3B")
|
||||||
p.add_argument("--split", default="train")
|
p.add_argument("--split", default="train")
|
||||||
|
|
||||||
p.add_argument("--translate_fields", default="question",
|
p.add_argument(
|
||||||
help="Поля для перевода через запятую. Например: question,risk_area,types_of_harm,specific_harms")
|
"--translate_fields",
|
||||||
|
default="question",
|
||||||
|
help="Comma-separated list of fields to translate (e.g., question,risk_area,types_of_harm,specific_harms)",
|
||||||
|
)
|
||||||
|
|
||||||
# Параметры генерации/производительности
|
# Generation / performance parameters
|
||||||
p.add_argument("--batch_size", type=int, default=32)
|
p.add_argument("--batch_size", type=int, default=32)
|
||||||
p.add_argument("--max_length", type=int, default=256)
|
p.add_argument("--max_length", type=int, default=256)
|
||||||
p.add_argument("--num_beams", type=int, default=4)
|
p.add_argument("--num_beams", type=int, default=4)
|
||||||
|
|
||||||
# Языковые коды NLLB
|
# NLLB language codes
|
||||||
p.add_argument("--src_lang", default="eng_Latn")
|
p.add_argument("--src_lang", default="eng_Latn")
|
||||||
p.add_argument("--tgt_lang", default="slk_Latn")
|
p.add_argument("--tgt_lang", default="slk_Latn")
|
||||||
|
|
||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
|
|
||||||
|
# Output directory setup
|
||||||
out_dir = os.path.join(args.base_dir, args.out_name)
|
out_dir = os.path.join(args.base_dir, args.out_name)
|
||||||
os.makedirs(out_dir, exist_ok=True)
|
os.makedirs(out_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# Parse fields to translate
|
||||||
fields = [x.strip() for x in args.translate_fields.split(",") if x.strip()]
|
fields = [x.strip() for x in args.translate_fields.split(",") if x.strip()]
|
||||||
|
|
||||||
# 1) Load dataset
|
# 1) Load dataset split from Hugging Face Hub
|
||||||
ds = load_dataset("LibrAI/do-not-answer", split=args.split)
|
ds = load_dataset("LibrAI/do-not-answer", split=args.split)
|
||||||
|
|
||||||
# 2) Load NLLB (FP16 на GPU)
|
# 2) Load NLLB model/tokenizer
|
||||||
|
# If CUDA is available, use FP16 for better performance and lower VRAM usage.
|
||||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
tok = AutoTokenizer.from_pretrained(args.model)
|
tok = AutoTokenizer.from_pretrained(args.model)
|
||||||
|
|
||||||
@ -75,6 +144,19 @@ def main():
|
|||||||
mdl.eval()
|
mdl.eval()
|
||||||
|
|
||||||
# 3) Translate
|
# 3) Translate
|
||||||
|
# map_fn(batch)
|
||||||
|
# Purpose:
|
||||||
|
# For each requested field, translate the field values and store them into
|
||||||
|
# a new column named "<field>_sk".
|
||||||
|
#
|
||||||
|
# Behavior:
|
||||||
|
# - Copies the incoming batch dict to out.
|
||||||
|
# - For each field in 'fields':
|
||||||
|
# * Takes batch[f] as a list of strings.
|
||||||
|
# * Splits into translation sub-batches of size args.batch_size.
|
||||||
|
# * Calls translate_batch() and concatenates results.
|
||||||
|
# * Writes translated list to out[f"{f}_sk"].
|
||||||
|
# - Returns out, which datasets.map merges into the dataset.
|
||||||
def map_fn(batch):
|
def map_fn(batch):
|
||||||
out = dict(batch)
|
out = dict(batch)
|
||||||
for f in fields:
|
for f in fields:
|
||||||
@ -94,12 +176,15 @@ def main():
|
|||||||
out[f"{f}_sk"] = translated_all
|
out[f"{f}_sk"] = translated_all
|
||||||
return out
|
return out
|
||||||
|
|
||||||
# datasets.map батч: можно больше, чем batch_size перевода (это разные уровни)
|
# datasets.map batch size can be larger than translation batch_size
|
||||||
|
# (these are two different levels of batching).
|
||||||
ds_sk = ds.map(map_fn, batched=True, batch_size=128, desc="Translating to Slovak")
|
ds_sk = ds.map(map_fn, batched=True, batch_size=128, desc="Translating to Slovak")
|
||||||
|
|
||||||
# 4) Save
|
# 4) Save to disk
|
||||||
ds_sk.save_to_disk(out_dir)
|
ds_sk.save_to_disk(out_dir)
|
||||||
print(f"Saved translated dataset to: {out_dir}")
|
print(f"Saved translated dataset to: {out_dir}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
# Standard entry point guard: run main() only when executed as a script
|
||||||
main()
|
main()
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user