From 13136dd010ebc85a0830d8795ad14db3d7f3b6b0 Mon Sep 17 00:00:00 2001
From: Andrii Pervashov
Date: Thu, 26 Sep 2024 10:59:38 +0000
Subject: [PATCH] training script, 25.9.2024 version

---
 trainingscript.py | 66 +++++++++++++++++++++++------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/trainingscript.py b/trainingscript.py
index 24d0467..4662d38 100644
--- a/trainingscript.py
+++ b/trainingscript.py
@@ -1,54 +1,54 @@
+from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
 from datasets import load_dataset
-from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
 
-# Initialize the tokenizer
-model_name = "t5-small"
+model_name = "t5-base"
 tokenizer = T5Tokenizer.from_pretrained(model_name)
+model = T5ForConditionalGeneration.from_pretrained(model_name)
 
-# Load the dataset with the specific configuration
-dataset = load_dataset("wiki_atomic_edits", "english_insertions", trust_remote_code=True)
-
-# Inspect the dataset splits
-print(dataset.keys()) # Print available dataset splits
-
-# Preprocessing Function
 def preprocess_function(examples):
-    inputs = examples["base_sentence"]
-    targets = examples["edited_sentence"]
-    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
-    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
-    labels["input_ids"] = [
-        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
-        for labels_example in labels["input_ids"]
-    ]
+    before_list = []
+    after_list = []
+    for ex in examples["before after"]:
+        if ex is not None:
+            splits = ex.split(" before after ")
+            if len(splits) == 2:
+                before_list.append(splits[0])
+                after_list.append(splits[1])
+            else:
+                before_list.append(ex)
+                after_list.append('')
+        else:
+            before_list.append('')
+            after_list.append('')
+
+    model_inputs = tokenizer(before_list, padding="max_length", truncation=True)
+    labels = tokenizer(after_list, padding="max_length", truncation=True)
     model_inputs["labels"] = labels["input_ids"]
     return model_inputs
 
-# Apply the preprocessing function to the dataset
+
+dataset = load_dataset("csv", data_files={"train": "converted.csv"}, delimiter=" ", column_names=["before after"])
+
 tokenized_datasets = dataset.map(preprocess_function, batched=True)
 
-# Initialize the model
-model = T5ForConditionalGeneration.from_pretrained(model_name)
-
-# Set up training arguments
 training_args = TrainingArguments(
-    output_dir="./results",
-    evaluation_strategy="epoch", # Updated from eval_strategy to evaluation_strategy
+    output_dir="./results1",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
     learning_rate=2e-5,
-    per_device_train_batch_size=4,
-    per_device_eval_batch_size=4,
-    num_train_epochs=3,
+    per_device_train_batch_size=64,
+    per_device_eval_batch_size=64,
+    num_train_epochs=1,
     weight_decay=0.01,
-    logging_dir="./logs",
 )
 
-# Initialize Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_datasets["train"],
-    eval_dataset=tokenized_datasets.get("validation") # Use .get() to avoid KeyError
+    tokenizer=tokenizer,
 )
 
-# Start training
-trainer.train()
\ No newline at end of file
+trainer.train()
+model.save_pretrained("T5Autocorrection")
+tokenizer.save_pretrained("T5TokenizerAutocorrection")
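
Notes on the new script (each snippet below is a sketch against this version, not part of the commit):

Data loading: the script reads converted.csv with delimiter=" " into a single "before after" column and then splits each row on the literal " before after " marker inside preprocess_function. With a space delimiter, any line containing ordinary spaces parses into more fields than the one declared column. If converted.csv is really one example per line with that literal marker as the separator (an assumption about the file, which the patch does not state), the plain "text" loader avoids delimiter handling entirely:

    # Sketch, assuming one "<before> before after <after>" pair per line in
    # converted.csv; the "text" builder yields each whole line under a "text"
    # column, so preprocess_function would iterate examples["text"] instead.
    dataset = load_dataset("text", data_files={"train": "converted.csv"})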
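
Tokenization length: the removed preprocess_function capped both sides at max_length=128; the new calls keep padding="max_length" but drop the cap, so inputs and labels are padded out to the tokenizer's model_max_length (512 for t5-base), quadrupling the tokens per example at batch size 64. Reinstating the old cap:

    # Same calls as in the patch, with the removed 128-token cap restored.
    model_inputs = tokenizer(before_list, max_length=128, padding="max_length", truncation=True)
    labels = tokenizer(after_list, max_length=128, padding="max_length", truncation=True)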
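
Label masking: the removed version replaced pad token ids in labels["input_ids"] with -100 so the loss ignores padding positions; the new version feeds the padded ids straight through, which also trains the model to emit pad tokens. The masking step can be reinstated at the end of the new preprocess_function:

    # Mask padding in the labels so the cross-entropy loss skips those
    # positions, exactly as the removed version of preprocess_function did.
    labels["input_ids"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]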
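
Evaluation: TrainingArguments keeps evaluation_strategy="epoch" (and per_device_eval_batch_size=64), but the Trainer no longer receives an eval_dataset, so per-epoch evaluation has nothing to run on; current transformers releases reject this combination when the Trainer is constructed. Either set evaluation_strategy="no" or hold out part of the train split, e.g.:

    # Sketch: carve a validation set out of the single "train" split for
    # per-epoch evaluation; the 0.1 fraction and seed are illustration values.
    split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split["train"],
        eval_dataset=split["test"],
        tokenizer=tokenizer,
    )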