diff --git a/few_shot/few_shot_eval_mt5_1B.py b/few_shot/few_shot_eval_mt5_1B.py
new file mode 100644
index 0000000..8d1fd28
--- /dev/null
+++ b/few_shot/few_shot_eval_mt5_1B.py
@@ -0,0 +1,215 @@
+from datasets import load_dataset, concatenate_datasets
+import numpy as np
+import torch
+
+from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    Trainer,
+    TrainingArguments,
+    set_seed,
+)
+
+# Set seed for reproducibility
+set_seed(42)
+
+# Load and preprocess data
+ds = load_dataset("TUKE-KEMT/hate_speech_slovak")
+label_0 = ds['train'].filter(lambda example: example['label'] == 0)
+label_1 = ds['train'].filter(lambda example: example['label'] == 1)
+
+# Create stratified few-shot splits. Each class is shuffled once with a fixed
+# seed and sliced into disjoint windows, so train/val/test never share examples
+# (shuffling with a different seed per split would not guarantee this).
+def create_stratified_split(label_0, label_1, n_samples, offset=0, seed=42):
+    few_shot_0 = label_0.shuffle(seed=seed).select(range(offset, offset + n_samples))
+    few_shot_1 = label_1.shuffle(seed=seed).select(range(offset, offset + n_samples))
+    return concatenate_datasets([few_shot_0, few_shot_1]).shuffle(seed=seed)
+
+# Create disjoint train/val/test splits
+train_dataset = create_stratified_split(label_0, label_1, n_samples=40)
+val_dataset = create_stratified_split(label_0, label_1, n_samples=10, offset=40)
+test_dataset = create_stratified_split(label_0, label_1, n_samples=50, offset=50)
+
+# Initialize tokenizer and model: an mT5 checkpoint loaded with a freshly
+# initialized sequence classification head, hence the fine-tuning below
+tokenizer = AutoTokenizer.from_pretrained("hlillemark/mt5-1B-flores200-baseline")
+model = AutoModelForSequenceClassification.from_pretrained(
+    "hlillemark/mt5-1B-flores200-baseline",
+    num_labels=2,
+)
+
+# Tokenization function with padding
+def tokenize(batch):
+    return tokenizer(
+        batch["text"],
+        padding="max_length",
+        truncation=True,
+        max_length=256,
+    )
+
+# Prepare datasets
+def prepare_dataset(dataset):
+    dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
+    print(dataset[0])  # debug: inspect one tokenized example
+    return dataset.rename_column("label", "labels")
+
+train_dataset = prepare_dataset(train_dataset)
+val_dataset = prepare_dataset(val_dataset)
+test_dataset = prepare_dataset(test_dataset)
+
+# Set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+
+# Training arguments
+training_args = TrainingArguments(
+    output_dir="./hate_speech_model",
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=16,
+    learning_rate=3e-5,
+    num_train_epochs=7,
+    eval_strategy="epoch",          # evaluate once per epoch
+    save_strategy="epoch",          # must match eval_strategy for load_best_model_at_end
+    load_best_model_at_end=True,
+    metric_for_best_model="f1",
+    greater_is_better=True,
+    warmup_steps=100,
+    weight_decay=0.01,
+    report_to="none",
+    seed=42,
+    logging_steps=10,
+    gradient_accumulation_steps=2,  # effective train batch size of 16
+    lr_scheduler_type="cosine",
+    logging_dir='./logs',
+)
+
+# Custom metrics computation
+def compute_metrics(pred):
+    logits = pred.predictions
+    # Seq2seq classifiers can return a tuple (logits, encoder states, ...);
+    # keep only the classification logits
+    if isinstance(logits, tuple):
+        logits = logits[0]
+    preds = logits.argmax(-1)
+    labels = pred.label_ids
+    precision, recall, f1, _ = precision_recall_fscore_support(
+        labels,
+        preds,
+        average='binary',
+        zero_division=0,
+    )
+    return {
+        'precision': precision,
+        'recall': recall,
+        'f1': f1,
+    }
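+
+# Optional, illustrative addition (not part of the original setup): because
+# load_best_model_at_end and metric_for_best_model are set above, early stopping
+# could be enabled via, e.g.:
+#     from transformers import EarlyStoppingCallback
+#     callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # patience is an assumption
+# passed to the Trainer below.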
+
+# Initialize trainer with validation data and metrics
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    compute_metrics=compute_metrics,
+)
+
+# Train model
+trainer.train()
+
+# Tune the decision threshold for the positive class on a labelled dataset
+def find_optimal_threshold(trainer, dataset):
+    predictions = trainer.predict(dataset)
+
+    # predictions.predictions may be a tuple (logits, ...); keep only the logits
+    logits = predictions.predictions
+    if isinstance(logits, tuple):
+        logits = logits[0]
+
+    # Guard: keep only the two class columns for binary classification
+    if logits.shape[-1] != 2:
+        logits = logits[:, :2]
+
+    if not isinstance(logits, torch.Tensor):
+        logits = torch.tensor(logits)
+
+    # Softmax over the two classes, then take P(label=1)
+    probs = torch.nn.functional.softmax(logits, dim=-1)
+    positive_probs = probs[:, 1].numpy()
+
+    true_labels = predictions.label_ids
+
+    # precisions/recalls have one more entry than thresholds; drop the final
+    # point (precision=1, recall=0) so the F1 indices align with thresholds
+    precisions, recalls, thresholds = precision_recall_curve(true_labels, positive_probs)
+    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
+    optimal_idx = np.argmax(f1_scores[:-1])
+    optimal_threshold = thresholds[optimal_idx]
+
+    return optimal_threshold, precisions[optimal_idx], recalls[optimal_idx], f1_scores[optimal_idx]
+
+def evaluate_with_threshold(trainer, dataset, threshold=0.5):
+    predictions = trainer.predict(dataset)
+
+    logits = predictions.predictions
+    if isinstance(logits, tuple):
+        logits = logits[0]
+    if not isinstance(logits, torch.Tensor):
+        logits = torch.tensor(logits)
+
+    # Softmax probabilities, then threshold P(label=1)
+    probs = torch.nn.functional.softmax(logits, dim=-1)
+    predicted_labels = (probs[:, 1] > threshold).numpy().astype(int)
+
+    true_labels = predictions.label_ids
+
+    precision, recall, f1, _ = precision_recall_fscore_support(
+        true_labels,
+        predicted_labels,
+        average='binary',
+        zero_division=0,
+    )
+
+    return {
+        'precision': precision,
+        'recall': recall,
+        'f1': f1,
+    }
+
+# Find the optimal threshold on the validation data
+print("\nFinding optimal threshold...")
+optimal_threshold, best_precision, best_recall, best_f1 = find_optimal_threshold(trainer, val_dataset)
+print(f"Optimal threshold: {optimal_threshold:.4f}")
+
+# Evaluate on the test set with the optimal threshold
+print("\nEvaluating with optimal threshold:")
+optimized_results = evaluate_with_threshold(trainer, test_dataset, threshold=optimal_threshold)
+print(f"Precision: {optimized_results['precision']:.4f}")
+print(f"Recall: {optimized_results['recall']:.4f}")
+print(f"F1: {optimized_results['f1']:.4f}")
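+
+# Illustrative inference helper (an editorial addition, not part of the original
+# script): applies the tuned threshold to a single new text. It must be called
+# before the cleanup below, while `model` is still in memory.
+def predict_text(text, threshold=optimal_threshold):
+    inputs = tokenizer(text, truncation=True, max_length=256, return_tensors="pt").to(device)
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    prob = torch.nn.functional.softmax(logits, dim=-1)[0, 1].item()
+    return int(prob > threshold)
+
+# Example (hypothetical input): predict_text("nejaký text") -> 1 if predicted hateful, else 0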
+
+# Save the model
+#trainer.save_model("./hate_speech_model/best_model")
+
+del model
+torch.cuda.empty_cache()