from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

model_name = "T5Autocorrection_Book18102024"
tokenizer = T5Tokenizer.from_pretrained("T5Autocorrection_Book_Typos18102024")
model = T5ForConditionalGeneration.from_pretrained(model_name)


def preprocess_function(examples):
    # Each CSV row stores both sides of a training pair in a single column,
    # joined by the literal separator " before after ".
    before_list = []
    after_list = []
    for ex in examples["before after"]:
        if ex is not None:
            splits = ex.split(" before after ")
            before_list.append(splits[0] if len(splits) == 2 else ex)
            after_list.append(splits[1] if len(splits) == 2 else "")
        else:
            before_list.append("")
            after_list.append("")

    model_inputs = tokenizer(before_list, padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(after_list, padding="max_length", truncation=True, max_length=512)

    # Replace pad token ids in the labels with -100 so that padding
    # positions are ignored by the cross-entropy loss.
    model_inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs


dataset = load_dataset(
    "csv",
    data_files={"train": "filtered_book_typos.csv"},
    delimiter=",",
    column_names=["before after"],
)
tokenized_datasets = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results_book5_3",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
    gradient_accumulation_steps=64,  # effective batch size of 64 despite per-device batch size 1
    fp16=True,  # mixed-precision training; requires a CUDA GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
)

trainer.train()

# Make all parameter tensors contiguous before saving; some T5 weights are
# shared or transposed views, which can break serialization otherwise.
for param in model.parameters():
    param.data = param.data.contiguous()

model.save_pretrained("T5Autocorrection_Book18102024_3", safe_serialization=False)  # disable safetensors; save plain PyTorch weights
tokenizer.save_pretrained("T5Autocorrection_Book_Typos18102024_3")
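
# A minimal inference sketch for sanity-checking the saved checkpoint.
# The directory names come from the save_pretrained calls above; the sample
# sentence and the generation settings (beam search, max_length) are
# illustrative assumptions, not part of the original training script.
corrector = T5ForConditionalGeneration.from_pretrained("T5Autocorrection_Book18102024_3")
corrector_tokenizer = T5Tokenizer.from_pretrained("T5Autocorrection_Book_Typos18102024_3")

sample = "Ther was a problm with the delivary."  # hypothetical input with typos
inputs = corrector_tokenizer(sample, return_tensors="pt", truncation=True, max_length=512)
output_ids = corrector.generate(**inputs, max_length=512, num_beams=4)
print(corrector_tokenizer.decode(output_ids[0], skip_special_tokens=True))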