import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['WANDB_DISABLED'] = 'true'

import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load dataset
df = pd.read_csv("dataset_book.csv")

# Preprocess dataset: remove NaNs and cast columns to strings
df.dropna(subset=['incorrect', 'correct'], inplace=True)
df['incorrect'] = df['incorrect'].astype(str)
df['correct'] = df['correct'].astype(str)

# Split dataset into train (80%), validation (10%), and test (10%) sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("T5Autocorrection228")
model = AutoModelForSeq2SeqLM.from_pretrained("T5Autocorrection228")

# Preprocess function to tokenize the data
def preprocess_function(examples):
    input_texts = examples["incorrect"]
    target_texts = examples["correct"]
    model_inputs = tokenizer(input_texts, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(text_target=target_texts, max_length=128, truncation=True, padding="max_length")
    # Replace padding token ids in the labels with -100 so that padding
    # positions are ignored by the loss function
    model_inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels["input_ids"]
    ]
    return model_inputs

# Tokenize datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results_book",
    num_train_epochs=30,
    per_device_train_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs_book",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=5e-5,
)

# Function to ensure all tensors are contiguous before saving the model
# (safetensors serialization rejects non-contiguous tensors)
def save_contiguous_model(model, output_dir):
    # Make all parameters contiguous
    for param in model.parameters():
        if not param.is_contiguous():
            param.data = param.data.contiguous()
    # Save the model
    model.save_pretrained(output_dir)

# Custom Trainer that makes tensors contiguous before saving checkpoints
class CustomTrainer(Trainer):
    def _save(self, output_dir: str = None, state_dict=None):
        # Fall back to the configured output directory, as the base Trainer does
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        # Ensure tensors are contiguous, then save
        save_contiguous_model(self.model, output_dir)

# Initialize the custom Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

# Train the model
trainer.train()

# Evaluate the model on the test set
print("Evaluation on the test set:")
test_metrics = trainer.evaluate(tokenized_test_dataset)
print(test_metrics)

# Save the final model and tokenizer
save_contiguous_model(model, "T5Autocorrection_Book")
tokenizer.save_pretrained("T5Autocorrection_Book")
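
# Optional inference sanity check: a minimal sketch of how the saved model
# could be used for correction. Not part of the original script; the variable
# names (`corrector`, `corrector_tokenizer`) and the sample sentence are
# hypothetical placeholders.
corrector_tokenizer = AutoTokenizer.from_pretrained("T5Autocorrection_Book")
corrector = AutoModelForSeq2SeqLM.from_pretrained("T5Autocorrection_Book")

sample = "Ths sentnce has sevral typos."
inputs = corrector_tokenizer(sample, return_tensors="pt", max_length=128, truncation=True)
output_ids = corrector.generate(**inputs, max_length=128)
print(corrector_tokenizer.decode(output_ids[0], skip_special_tokens=True))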