# bkpc/train.py

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['WANDB_DISABLED'] = 'true'
import pandas as pd
from datasets import Dataset
from transformers import ByT5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Load the sentence pairs. The CSV is semicolon-separated and must provide
# the 'incorrect' and 'correct' columns used throughout this script.
df = pd.read_csv("dataset_file_name.csv", sep=";")

# Discard rows where either side of a pair is missing, then force both
# columns to str so later string handling never sees another type.
df.dropna(subset=['incorrect', 'correct'], inplace=True)
df['incorrect'] = df['incorrect'].astype(str)
df['correct'] = df['correct'].astype(str)

# Hold out 20% of the rows, then halve that hold-out into validation and
# test sets (80/10/10 overall). The fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

# preserve_index=False: after dropna and the shuffled split the frames carry
# a non-default index, which from_pandas would otherwise store as an extra
# "__index_level_0__" column in every dataset (and in the copies saved to disk).
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Tokenizer and model are loaded from the same checkpoint identifier.
tokenizer = ByT5Tokenizer.from_pretrained("your-model/name")
model = T5ForConditionalGeneration.from_pretrained("your-model/name")

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for sequence-to-sequence training.

    Args:
        examples: Batch mapping with an "incorrect" entry (model inputs) and
            a "correct" entry (targets), each a list of strings.

    Returns:
        The tokenizer output for the inputs, extended with a "labels" entry
        holding the target token ids. Padding positions in the labels are
        replaced by -100 so they are ignored by the loss.
    """
    input_texts = examples["incorrect"]
    target_texts = examples["correct"]
    model_inputs = tokenizer(input_texts, max_length=128, truncation=True, padding="max_length")
    # NOTE(review): as_target_tokenizer is deprecated in recent transformers
    # releases in favour of the `text_target` argument; confirm the pinned
    # version before switching.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_texts, max_length=128, truncation=True, padding="max_length")
    # Targets are padded to max_length; without masking, the model would be
    # trained to predict pad tokens and the reported loss would be diluted.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split in batched mode, in the order train, validation, test.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = [
    raw_split.map(preprocess_function, batched=True)
    for raw_split in (train_dataset, val_dataset, test_dataset)
]

# Persist each tokenized split to its own directory.
for tokenized_split, target_dir in (
    (tokenized_train_dataset, "./tokenized_train_dataset"),
    (tokenized_val_dataset, "./tokenized_val_dataset"),
    (tokenized_test_dataset, "./tokenized_test_dataset"),
):
    tokenized_split.save_to_disk(target_dir)

def save_sentences_to_separate_files(df, incorrect_file_path, correct_file_path):
    """Dump the sentence pairs of ``df`` into two line-aligned text files.

    For every row, the "incorrect" value is written as one line of
    ``incorrect_file_path`` and the "correct" value as the matching line of
    ``correct_file_path``. Both files are opened in write mode as UTF-8, so
    existing content is overwritten.

    Args:
        df: DataFrame with "incorrect" and "correct" string columns.
        incorrect_file_path: Destination for the "incorrect" sentences.
        correct_file_path: Destination for the "correct" sentences.
    """
    with open(incorrect_file_path, "w", encoding="utf-8") as wrong_out:
        with open(correct_file_path, "w", encoding="utf-8") as right_out:
            # Walk the rows as plain dicts; the row index is not needed.
            for record in df.to_dict("records"):
                wrong_out.write(record["incorrect"] + "\n")
                right_out.write(record["correct"] + "\n")

# Write plain-text copies of every split: one file for the "incorrect"
# sentences and one for the "correct" sentences of each.
for split_frame, wrong_path, right_path in (
    (train_df, "./train_incorrect.txt", "./train_correct.txt"),
    (val_df, "./val_incorrect.txt", "./val_correct.txt"),
    (test_df, "./test_incorrect.txt", "./test_correct.txt"),
):
    save_sentences_to_separate_files(split_frame, wrong_path, right_path)

# Training configuration: checkpoints go to ./results and logs to ./logs,
# with a log entry every 10 steps and an evaluation pass after each epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=12,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
)

# The validation split is used for the per-epoch evaluation during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

trainer.train()

# Evaluate once on the held-out test split and print the returned metrics,
# so the header below is followed by actual numbers.
print("Evaluation on the test set:")
test_metrics = trainer.evaluate(tokenized_test_dataset)
print(test_metrics)

# Save the fine-tuned weights and the tokenizer side by side so the directory
# can be loaded again with from_pretrained.
model_path = "./fine_tuned_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
# No results found.