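"""Fine-tune the "T5Autocorrection228" seq2seq checkpoint on the
incorrect/correct text pairs in dataset_book.csv and save the result to
"T5Autocorrection_Book".
"""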
import os

# Ordering kept from the original: the environment is configured before the
# ML libraries below are imported (presumably so they see these values at
# import time -- confirm before moving the imports above this point).
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',
    'WANDB_DISABLED': 'true',
})

import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Load the sentence pairs, drop rows where either side is missing, and force
# both text columns to str so the tokenizer only ever receives strings.
df = (
    pd.read_csv("dataset_book.csv")
    .dropna(subset=['incorrect', 'correct'])
    .astype({'incorrect': str, 'correct': str})
)

# Split 80% train / 20% holdout, then halve the holdout into validation and
# test. The fixed random_state keeps the split reproducible.
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

# Convert each pandas DataFrame to a Hugging Face Dataset. The frames carry a
# shuffled, non-default index after splitting; preserve_index=False stops it
# from being stored as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
# Tokenizer and seq2seq model are both loaded from the same checkpoint name.
BASE_CHECKPOINT = "T5Autocorrection228"
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)
# Preprocess function to tokenize the data
def preprocess_function(examples):
    """Tokenize sentence pairs into model inputs and loss-ready labels.

    Args:
        examples: Mapping with an "incorrect" entry (model input text) and a
            "correct" entry (target text). With ``Dataset.map(batched=True)``
            each entry is a list of strings.

    Returns:
        The tokenizer output for the "incorrect" text, truncated/padded to
        128 tokens, with an added "labels" entry containing the token ids of
        the "correct" text in which every padding id is replaced by -100.
    """
    model_inputs = tokenizer(
        examples["incorrect"], max_length=128, truncation=True, padding="max_length"
    )
    targets = tokenizer(
        examples["correct"], max_length=128, truncation=True, padding="max_length"
    )

    # Targets are padded to a fixed length. -100 is the label value the
    # cross-entropy loss ignores, so masking the pad ids keeps padding
    # positions from contributing to the training/eval loss.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    target_ids = targets["input_ids"]
    if target_ids and isinstance(target_ids[0], int):
        # Single, un-batched example: input_ids is one flat sequence.
        model_inputs["labels"] = _mask_padding(target_ids)
    else:
        model_inputs["labels"] = [_mask_padding(sequence) for sequence in target_ids]

    return model_inputs
# Tokenize datasets: apply preprocess_function to each split in batched mode,
# in train / validation / test order.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]
# Define training arguments
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./results_book",
    logging_dir="./logs_book",
    # Optimisation settings.
    num_train_epochs=30,
    per_device_train_batch_size=1,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Log every 10 steps; evaluate and checkpoint once per epoch, with
    # save_total_limit capping the number of checkpoints kept at one.
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
)
# Function to ensure all tensors are contiguous before saving the model
def save_contiguous_model(model, output_dir):
    """Make every parameter of ``model`` contiguous, then save it.

    Args:
        model: Object exposing ``parameters()`` and ``save_pretrained()``.
        output_dir: Destination handed unchanged to ``model.save_pretrained``.
    """
    non_contiguous = (p for p in model.parameters() if not p.is_contiguous())
    for tensor in non_contiguous:
        # Only the underlying data is swapped for a contiguous copy; the
        # parameter object itself is left in place.
        tensor.data = tensor.data.contiguous()

    model.save_pretrained(output_dir)
# Custom Trainer to make tensors contiguous before saving
class CustomTrainer(Trainer):
    """Trainer whose model saves go through ``save_contiguous_model``."""

    def _save(self, output_dir: str = None, state_dict=None):
        """Save ``self.model`` with contiguous parameters.

        Args:
            output_dir: Directory to save into. When ``None``, the trainer's
                configured ``self.args.output_dir`` is used, so the declared
                default no longer passes ``None`` on to ``save_pretrained``.
            state_dict: Accepted so the signature matches the method being
                overridden; unused, the weights always come from
                ``self.model``.
        """
        # NOTE(review): this replaces the base implementation instead of
        # extending it, so anything else the base ``_save`` writes is
        # skipped -- confirm that is intended.
        if output_dir is None:
            output_dir = self.args.output_dir

        # Ensure tensors are contiguous, then write the model.
        save_contiguous_model(self.model, output_dir)
# Initialize the custom Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

# Train the model
trainer.train()

# Evaluate the model on the test set. The metrics returned by evaluate() are
# captured and printed; previously the return value was discarded, so the
# heading below was followed by no results when run as a script.
print("Evaluation on the test set:")
test_metrics = trainer.evaluate(tokenized_test_dataset)
print(test_metrics)

# Save the final model and tokenizer side by side in the same directory.
save_contiguous_model(model, "T5Autocorrection_Book")
tokenizer.save_pretrained("T5Autocorrection_Book")