"""Fine-tune a T5 model on "before"/"after" text pairs and save the result.

Each row of ``converted.csv`` is read into a single column named
``"before after"``.  A row is expected to hold the source text and the target
text joined by the literal separator ``" before after "``.
"""

from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)

model_name = "t5-base"

# Name of the single CSV column and the separator that joins source and target.
PAIR_COLUMN = "before after"
PAIR_SEPARATOR = " before after "

# Label value that the loss ignores; used to mask out padding positions.
IGNORE_INDEX = -100

# Fraction of rows held out for the per-epoch evaluation requested below.
EVAL_FRACTION = 0.1
SPLIT_SEED = 42

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def _split_pair(text):
    """Return ``(before, after)`` for one raw row value.

    A missing value yields two empty strings.  A value that does not contain
    the separator exactly once is kept whole as the source with an empty
    target.
    """
    if text is None:
        return "", ""
    parts = text.split(PAIR_SEPARATOR)
    if len(parts) == 2:
        return parts[0], parts[1]
    # NOTE(review): such rows train the model to emit an empty target;
    # consider filtering them out upstream if that is not intended.
    return text, ""


def preprocess_function(examples):
    """Tokenize a batch of raw rows into model inputs and labels.

    Args:
        examples: Batch mapping that contains the ``"before after"`` column
            as a list of strings (or ``None`` for missing values).

    Returns:
        The tokenizer output for the source texts with an added ``"labels"``
        entry holding the target token ids, where padding positions are
        replaced by -100 so they do not contribute to the loss.
    """
    pairs = [_split_pair(text) for text in examples[PAIR_COLUMN]]
    before_list = [before for before, _ in pairs]
    after_list = [after for _, after in pairs]

    model_inputs = tokenizer(before_list, padding="max_length", truncation=True)
    labels = tokenizer(after_list, padding="max_length", truncation=True)

    # Mask padding in the targets; otherwise the loss is dominated by the
    # (trivial) prediction of pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else IGNORE_INDEX for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs


# NOTE(review): a space delimiter with a single column name is unusual for
# rows that contain spaces — confirm this matches the layout of converted.csv.
dataset = load_dataset(
    "csv",
    data_files={"train": "converted.csv"},
    delimiter=" ",
    column_names=[PAIR_COLUMN],
)

# Per-epoch evaluation needs an evaluation set, so hold out part of the data.
# The result has a "train" and a "test" split.
dataset = dataset["train"].train_test_split(
    test_size=EVAL_FRACTION,
    seed=SPLIT_SEED,
)

tokenized_datasets = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

trainer.train()

model.save_pretrained("T5Autocorrection")
tokenizer.save_pretrained("T5TokenizerAutocorrection")