import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['WANDB_DISABLED'] = 'true'

import pandas as pd
from datasets import Dataset
from transformers import ByT5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Load the semicolon-separated dataset and drop rows missing either column.
df = pd.read_csv("dataset_file_name.csv", sep=";")
df.dropna(subset=['incorrect', 'correct'], inplace=True)
df['incorrect'] = df['incorrect'].astype(str)
df['correct'] = df['correct'].astype(str)

# 80/10/10 train/validation/test split.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

tokenizer = ByT5Tokenizer.from_pretrained("your-model/name")
model = T5ForConditionalGeneration.from_pretrained("your-model/name")

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["incorrect"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    # Tokenize the targets; text_target= replaces the deprecated
    # as_target_tokenizer() context manager.
    labels = tokenizer(
        text_target=examples["correct"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    # Replace padding token ids in the labels with -100 so the loss
    # function ignores the padded positions.
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label]
        for label in labels["input_ids"]
    ]
    return model_inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

tokenized_train_dataset.save_to_disk("./tokenized_train_dataset")
tokenized_val_dataset.save_to_disk("./tokenized_val_dataset")
tokenized_test_dataset.save_to_disk("./tokenized_test_dataset")

def save_sentences_to_separate_files(df, incorrect_file_path, correct_file_path):
    # Write the raw sentence pairs to plain-text files, one sentence per line.
    with open(incorrect_file_path, "w", encoding="utf-8") as incorrect_file, \
         open(correct_file_path, "w", encoding="utf-8") as correct_file:
        for _, row in df.iterrows():
            incorrect_file.write(row["incorrect"] + "\n")
            correct_file.write(row["correct"] + "\n")

save_sentences_to_separate_files(train_df, "./train_incorrect.txt", "./train_correct.txt")
save_sentences_to_separate_files(val_df, "./val_incorrect.txt", "./val_correct.txt")
save_sentences_to_separate_files(test_df, "./test_incorrect.txt", "./test_correct.txt")

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=12,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

trainer.train()

print("Evaluation on the test set:")
print(trainer.evaluate(tokenized_test_dataset))

model_path = "./fine_tuned_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
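
# --- Quick inference check (a minimal sketch) ---
# Reload the fine-tuned checkpoint saved above and run a single correction
# with model.generate(). The input sentence below is a hypothetical
# placeholder, not taken from the original dataset.
corrector_tokenizer = ByT5Tokenizer.from_pretrained(model_path)
corrector = T5ForConditionalGeneration.from_pretrained(model_path)

sample = "an exampel sentense with erors"  # hypothetical input
inputs = corrector_tokenizer(sample, return_tensors="pt", max_length=128, truncation=True)
output_ids = corrector.generate(**inputs, max_length=128)
print(corrector_tokenizer.decode(output_ids[0], skip_special_tokens=True))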