diff --git a/few_shot/few_shot_eval_mt5_1B.py b/few_shot/few_shot_eval_mt5_1B.py new file mode 100644 index 0000000..8d1fd28 --- /dev/null +++ b/few_shot/few_shot_eval_mt5_1B.py @@ -0,0 +1,212 @@ +import sys +import codecs +from datasets import load_dataset, concatenate_datasets +import numpy as np +import torch + +from sklearn.metrics import precision_recall_fscore_support,precision_recall_curve +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + Trainer, + TrainingArguments, + AutoModelForSeq2SeqLM, + set_seed, + T5Tokenizer +) + +from lm_eval import evaluate + +# Set seed for reproducibility +set_seed(42) + +# Load and preprocess data +ds = load_dataset("TUKE-KEMT/hate_speech_slovak") +label_0 = ds['train'].filter(lambda example: example['label'] == 0) +label_1 = ds['train'].filter(lambda example: example['label'] == 1) + +# Create stratified few-shot splits +def create_stratified_split(label_0, label_1, n_samples, seed=42): + few_shot_0 = label_0.shuffle(seed=seed).select(range(n_samples)) + few_shot_1 = label_1.shuffle(seed=seed).select(range(n_samples)) + return concatenate_datasets([few_shot_0, few_shot_1]).shuffle(seed=seed) + +# Create train/val/test splits +train_dataset = create_stratified_split(label_0, label_1, n_samples=40) +val_dataset = create_stratified_split(label_0, label_1, n_samples=10, seed=43) +test_dataset = create_stratified_split(label_0, label_1, n_samples=50, seed=44) + +# Initialize tokenizer and model +tokenizer = AutoTokenizer.from_pretrained("hlillemark/mt5-1B-flores200-baseline", force_download=True) +model = AutoModelForSequenceClassification.from_pretrained("hlillemark/mt5-1B-flores200-baseline", num_labels=2) + + + +# Tokenization function with padding +def tokenize(batch): + return tokenizer( + batch["text"], + padding="max_length", + truncation=True, + max_length=256 + ) + + +# Prepare datasets +def prepare_dataset(dataset): + dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) + print(dataset[0]) + return dataset.rename_column("label", "labels") + +train_dataset = prepare_dataset(train_dataset) +val_dataset = prepare_dataset(val_dataset) +test_dataset = prepare_dataset(test_dataset) + +# Set device +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# Training arguments with improved settings +training_args = TrainingArguments( + output_dir="./hate_speech_model", + per_device_train_batch_size=8, + per_device_eval_batch_size=16, + learning_rate=3e-5, # Adjust as needed + num_train_epochs=7, # Increased epochs for better training + eval_strategy="epoch", # Use "epoch" for both strategies + save_strategy="epoch", # Matching the evaluation strategy + load_best_model_at_end=True, + metric_for_best_model="f1", + greater_is_better=True, + warmup_steps=100, # Increased warmup steps + weight_decay=0.01, + report_to="none", + seed=42, + logging_steps=10, + gradient_accumulation_steps=2, # For more effective training on larger datasets + lr_scheduler_type="cosine", # Using cosine scheduler for learning rate + logging_dir='./logs', +) + +# Custom metrics computation +def compute_metrics(pred): + logits = pred.predictions[0] # Ensure only the logits are used + preds = logits.argmax(-1) + labels = pred.label_ids + precision, recall, f1, _ = precision_recall_fscore_support( + labels, + preds, + average='binary' + ) + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Initialize trainer with validation data and metrics +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, +) + +# Train model +trainer.train() + +# Evaluate on test set +def find_optimal_threshold(trainer, dataset): + # Get predictions + predictions = trainer.predict(dataset) + + # Extract logits (the first element in the predictions tuple) + logits = predictions.predictions # This is likely a tuple (logits, other_info) + + # If logits is a tuple, extract only the logits + if isinstance(logits, tuple): + logits = logits[0] # Extract the logits from the tuple + + # Check the shape of logits to debug the issue + print(f"Logits shape: {logits.shape}") + + # Ensure logits has the shape (batch_size, 2) for binary classification + if logits.shape[-1] != 2: + logits = logits[:, :2] # Take only the first two columns (logits for the two classes) + print(f"Logits shape after slicing: {logits.shape}") + + # Convert logits to tensor if necessary + if not isinstance(logits, torch.Tensor): + logits = torch.tensor(logits) # Convert logits to a tensor if needed + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get probabilities for the positive class (label=1) + positive_probs = probs[:, 1].numpy() # The probabilities for the positive class (label=1) + + # Get true labels from predictions + true_labels = predictions.label_ids + + # Calculate precision-recall curve + precisions, recalls, thresholds = precision_recall_curve(true_labels, positive_probs) + f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8) + + # Find the optimal threshold based on F1-score + optimal_idx = np.argmax(f1_scores[:-1]) # Exclude last threshold (it is always 1) + optimal_threshold = thresholds[optimal_idx] + + return optimal_threshold, precisions[optimal_idx], recalls[optimal_idx], f1_scores[optimal_idx] + + +def evaluate_with_threshold(trainer, dataset, threshold=0.5): + predictions = trainer.predict(dataset) + + # Ensure that logits are properly reshaped to (batch_size, 2) before applying softmax + logits = predictions.predictions + if isinstance(logits, tuple): + logits = logits[0] # Extract logits if it's a tuple + + logits = torch.tensor(logits) if not isinstance(logits, torch.Tensor) else logits + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get predicted labels based on the threshold for the positive class (label=1) + predicted_labels = (probs[:, 1] > threshold).numpy().astype(int) + + true_labels = predictions.label_ids + + # Calculate precision-recall-fscore + precision, recall, f1, _ = precision_recall_fscore_support( + true_labels, + predicted_labels, + average='binary', + zero_division=0 + ) + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Example usage: +# Find the optimal threshold using validation data +print("\nFinding optimal threshold...") +optimal_threshold, best_precision, best_recall, best_f1 = find_optimal_threshold(trainer, val_dataset) +print(f"Optimal threshold: {optimal_threshold:.4f}") + +# Evaluate with optimal threshold +print("\nEvaluating with optimal threshold:") +optimized_results = evaluate_with_threshold(trainer, test_dataset, threshold=optimal_threshold) +print(f"Precision: {optimized_results['precision']:.4f}") +print(f"Recall: {optimized_results['recall']:.4f}") +print(f"F1: {optimized_results['f1']:.4f}") + + +# Save the model +#trainer.save_model("./hate_speech_model/best_model") +del model +torch.cuda.empty_cache() \ No newline at end of file diff --git a/few_shot/few_shot_eval_mt5_3b.py b/few_shot/few_shot_eval_mt5_3b.py new file mode 100644 index 0000000..7587062 --- /dev/null +++ b/few_shot/few_shot_eval_mt5_3b.py @@ -0,0 +1,208 @@ +import sys +import codecs +from datasets import load_dataset, concatenate_datasets +import numpy as np +import torch + +from sklearn.metrics import precision_recall_fscore_support,precision_recall_curve +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + Trainer, + TrainingArguments, + AutoModelForSeq2SeqLM, + set_seed, + T5Tokenizer +) + +# Set seed for reproducibility +set_seed(42) + +# Load and preprocess data +ds = load_dataset("TUKE-KEMT/hate_speech_slovak") +label_0 = ds['train'].filter(lambda example: example['label'] == 0) +label_1 = ds['train'].filter(lambda example: example['label'] == 1) + +# Create stratified few-shot splits +def create_stratified_split(label_0, label_1, n_samples, seed=42): + few_shot_0 = label_0.shuffle(seed=seed).select(range(n_samples)) + few_shot_1 = label_1.shuffle(seed=seed).select(range(n_samples)) + return concatenate_datasets([few_shot_0, few_shot_1]).shuffle(seed=seed) + +# Create train/val/test splits +train_dataset = create_stratified_split(label_0, label_1, n_samples=40) +val_dataset = create_stratified_split(label_0, label_1, n_samples=10, seed=43) +test_dataset = create_stratified_split(label_0, label_1, n_samples=50, seed=44) + +# Initialize tokenizer and model +tokenizer = AutoTokenizer.from_pretrained("unicamp-dl/mt5-3B-mmarco-en-pt", force_download=True) +model = AutoModelForSequenceClassification.from_pretrained("unicamp-dl/mt5-3B-mmarco-en-pt", num_labels=2) + + + +# Tokenization function with padding +def tokenize(batch): + return tokenizer( + batch["text"], + padding="max_length", + truncation=True, + max_length=256 + ) + + +# Prepare datasets +def prepare_dataset(dataset): + dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) + print(dataset[0]) + return dataset.rename_column("label", "labels") + +train_dataset = prepare_dataset(train_dataset) +val_dataset = prepare_dataset(val_dataset) +test_dataset = prepare_dataset(test_dataset) + +# Set device +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# Training arguments with improved settings +training_args = TrainingArguments( + output_dir="./hate_speech_model", + per_device_train_batch_size=8, + per_device_eval_batch_size=16, + learning_rate=3e-5, # Adjust as needed + num_train_epochs=7, # Increased epochs for better training + eval_strategy="epoch", # Use "epoch" for both strategies + save_strategy="epoch", # Matching the evaluation strategy + load_best_model_at_end=True, + metric_for_best_model="f1", + greater_is_better=True, + warmup_steps=100, # Increased warmup steps + weight_decay=0.01, + report_to="none", + seed=42, + logging_steps=10, + gradient_accumulation_steps=2, # For more effective training on larger datasets + lr_scheduler_type="cosine", # Using cosine scheduler for learning rate + logging_dir='./logs', +) + +# Custom metrics computation +def compute_metrics(pred): + logits = pred.predictions[0] # Ensure only the logits are used + preds = logits.argmax(-1) + labels = pred.label_ids + precision, recall, f1, _ = precision_recall_fscore_support( + labels, + preds, + average='binary' + ) + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Initialize trainer with validation data and metrics +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, +) + +# Train model +trainer.train() + +# Evaluate on test set +def find_optimal_threshold(trainer, dataset): + # Get predictions + predictions = trainer.predict(dataset) + + # Extract logits (the first element in the predictions tuple) + logits = predictions.predictions # This is likely a tuple (logits, other_info) + + # If logits is a tuple, extract only the logits + if isinstance(logits, tuple): + logits = logits[0] # Extract the logits from the tuple + + # Check the shape of logits to debug the issue + print(f"Logits shape: {logits.shape}") + + # Ensure logits has the shape (batch_size, 2) for binary classification + if logits.shape[-1] != 2: + logits = logits[:, :2] # Take only the first two columns (logits for the two classes) + print(f"Logits shape after slicing: {logits.shape}") + + # Convert logits to tensor if necessary + if not isinstance(logits, torch.Tensor): + logits = torch.tensor(logits) # Convert logits to a tensor if needed + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get probabilities for the positive class (label=1) + positive_probs = probs[:, 1].numpy() # The probabilities for the positive class (label=1) + + # Get true labels from predictions + true_labels = predictions.label_ids + + # Calculate precision-recall curve + precisions, recalls, thresholds = precision_recall_curve(true_labels, positive_probs) + f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8) + + # Find the optimal threshold based on F1-score + optimal_idx = np.argmax(f1_scores[:-1]) # Exclude last threshold (it is always 1) + optimal_threshold = thresholds[optimal_idx] + + return optimal_threshold, precisions[optimal_idx], recalls[optimal_idx], f1_scores[optimal_idx] + + +def evaluate_with_threshold(trainer, dataset, threshold=0.5): + predictions = trainer.predict(dataset) + + # Ensure that logits are properly reshaped to (batch_size, 2) before applying softmax + logits = predictions.predictions + if isinstance(logits, tuple): + logits = logits[0] # Extract logits if it's a tuple + + logits = torch.tensor(logits) if not isinstance(logits, torch.Tensor) else logits + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get predicted labels based on the threshold for the positive class (label=1) + predicted_labels = (probs[:, 1] > threshold).numpy().astype(int) + + true_labels = predictions.label_ids + + # Calculate precision-recall-fscore + precision, recall, f1, _ = precision_recall_fscore_support( + true_labels, + predicted_labels, + average='binary', + zero_division=0 + ) + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Example usage: +# Find the optimal threshold using validation data +print("\nFinding optimal threshold...") +optimal_threshold, best_precision, best_recall, best_f1 = find_optimal_threshold(trainer, val_dataset) +print(f"Optimal threshold: {optimal_threshold:.4f}") + +# Evaluate with optimal threshold +print("\nEvaluating with optimal threshold:") +optimized_results = evaluate_with_threshold(trainer, test_dataset, threshold=optimal_threshold) +print(f"Precision: {optimized_results['precision']:.4f}") +print(f"Recall: {optimized_results['recall']:.4f}") +print(f"F1: {optimized_results['f1']:.4f}") + + +# Save the model +#trainer.save_model("./hate_speech_model/best_model") \ No newline at end of file diff --git a/few_shot/few_shot_eval_mt5_base.py b/few_shot/few_shot_eval_mt5_base.py new file mode 100644 index 0000000..f1721f9 --- /dev/null +++ b/few_shot/few_shot_eval_mt5_base.py @@ -0,0 +1,210 @@ +import sys +import codecs +from datasets import load_dataset, concatenate_datasets +import numpy as np +import torch + +from sklearn.metrics import precision_recall_fscore_support,precision_recall_curve +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + Trainer, + TrainingArguments, + AutoModelForSeq2SeqLM, + set_seed, + T5Tokenizer +) + +# Set seed for reproducibility +set_seed(42) + +# Load and preprocess data +ds = load_dataset("TUKE-KEMT/hate_speech_slovak") +label_0 = ds['train'].filter(lambda example: example['label'] == 0) +label_1 = ds['train'].filter(lambda example: example['label'] == 1) + +# Create stratified few-shot splits +def create_stratified_split(label_0, label_1, n_samples, seed=42): + few_shot_0 = label_0.shuffle(seed=seed).select(range(n_samples)) + few_shot_1 = label_1.shuffle(seed=seed).select(range(n_samples)) + return concatenate_datasets([few_shot_0, few_shot_1]).shuffle(seed=seed) + +# Create train/val/test splits +train_dataset = create_stratified_split(label_0, label_1, n_samples=40) +val_dataset = create_stratified_split(label_0, label_1, n_samples=10, seed=43) +test_dataset = create_stratified_split(label_0, label_1, n_samples=50, seed=44) + +# Initialize tokenizer and model +tokenizer = AutoTokenizer.from_pretrained("google/mt5-base", force_download=True) +model = AutoModelForSequenceClassification.from_pretrained("google/mt5-base", num_labels=2) + + + +# Tokenization function with padding +def tokenize(batch): + return tokenizer( + batch["text"], + padding="max_length", + truncation=True, + max_length=128 + ) + + +# Prepare datasets +def prepare_dataset(dataset): + dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) + print(dataset[0]) + return dataset.rename_column("label", "labels") + +train_dataset = prepare_dataset(train_dataset) +val_dataset = prepare_dataset(val_dataset) +test_dataset = prepare_dataset(test_dataset) + +# Set device +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# Training arguments with improved settings +training_args = TrainingArguments( + output_dir="./hate_speech_model", + per_device_train_batch_size=8, + per_device_eval_batch_size=16, + learning_rate=3e-5, # Adjust as needed + num_train_epochs=7, # Increased epochs for better training + eval_strategy="epoch", # Use "epoch" for both strategies + save_strategy="epoch", # Matching the evaluation strategy + load_best_model_at_end=True, + metric_for_best_model="f1", + greater_is_better=True, + warmup_steps=100, # Increased warmup steps + weight_decay=0.01, + report_to="none", + seed=42, + logging_steps=10, + gradient_accumulation_steps=2, # For more effective training on larger datasets + lr_scheduler_type="cosine", # Using cosine scheduler for learning rate + logging_dir='./logs', +) + + + +# Custom metrics computation +def compute_metrics(pred): + logits = pred.predictions[0] # Ensure only the logits are used + preds = logits.argmax(-1) + labels = pred.label_ids + precision, recall, f1, _ = precision_recall_fscore_support( + labels, + preds, + average='binary' + ) + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Initialize trainer with validation data and metrics +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, +) + +# Train model +trainer.train() + +# Evaluate on test set +def find_optimal_threshold(trainer, dataset): + # Get predictions + predictions = trainer.predict(dataset) + + # Extract logits (the first element in the predictions tuple) + logits = predictions.predictions # This is likely a tuple (logits, other_info) + + # If logits is a tuple, extract only the logits + if isinstance(logits, tuple): + logits = logits[0] # Extract the logits from the tuple + + # Check the shape of logits to debug the issue + print(f"Logits shape: {logits.shape}") + + # Ensure logits has the shape (batch_size, 2) for binary classification + if logits.shape[-1] != 2: + logits = logits[:, :2] # Take only the first two columns (logits for the two classes) + print(f"Logits shape after slicing: {logits.shape}") + + # Convert logits to tensor if necessary + if not isinstance(logits, torch.Tensor): + logits = torch.tensor(logits) # Convert logits to a tensor if needed + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get probabilities for the positive class (label=1) + positive_probs = probs[:, 1].numpy() # The probabilities for the positive class (label=1) + + # Get true labels from predictions + true_labels = predictions.label_ids + + # Calculate precision-recall curve + precisions, recalls, thresholds = precision_recall_curve(true_labels, positive_probs) + f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8) + + # Find the optimal threshold based on F1-score + optimal_idx = np.argmax(f1_scores[:-1]) # Exclude last threshold (it is always 1) + optimal_threshold = thresholds[optimal_idx] + + return optimal_threshold, precisions[optimal_idx], recalls[optimal_idx], f1_scores[optimal_idx] + + +def evaluate_with_threshold(trainer, dataset, threshold=0.5): + predictions = trainer.predict(dataset) + + # Ensure that logits are properly reshaped to (batch_size, 2) before applying softmax + logits = predictions.predictions + if isinstance(logits, tuple): + logits = logits[0] # Extract logits if it's a tuple + + logits = torch.tensor(logits) if not isinstance(logits, torch.Tensor) else logits + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get predicted labels based on the threshold for the positive class (label=1) + predicted_labels = (probs[:, 1] > threshold).numpy().astype(int) + + true_labels = predictions.label_ids + + # Calculate precision-recall-fscore + precision, recall, f1, _ = precision_recall_fscore_support( + true_labels, + predicted_labels, + average='binary', + zero_division=0 + ) + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Example usage: +# Find the optimal threshold using validation data +print("\nFinding optimal threshold...") +optimal_threshold, best_precision, best_recall, best_f1 = find_optimal_threshold(trainer, val_dataset) +print(f"Optimal threshold: {optimal_threshold:.4f}") + +# Evaluate with optimal threshold +print("\nEvaluating with optimal threshold:") +optimized_results = evaluate_with_threshold(trainer, test_dataset, threshold=optimal_threshold) +print(f"Precision: {optimized_results['precision']:.4f}") +print(f"Recall: {optimized_results['recall']:.4f}") +print(f"F1: {optimized_results['f1']:.4f}") + + +# Save the model +trainer.save_model("./hate_speech_model/best_model") \ No newline at end of file diff --git a/few_shot/few_shot_eval_mt5_large.py b/few_shot/few_shot_eval_mt5_large.py new file mode 100644 index 0000000..c80214e --- /dev/null +++ b/few_shot/few_shot_eval_mt5_large.py @@ -0,0 +1,208 @@ +import sys +import codecs +from datasets import load_dataset, concatenate_datasets +import numpy as np +import torch + +from sklearn.metrics import precision_recall_fscore_support,precision_recall_curve +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + Trainer, + TrainingArguments, + AutoModelForSeq2SeqLM, + set_seed, + T5Tokenizer +) + +# Set seed for reproducibility +set_seed(42) + +# Load and preprocess data +ds = load_dataset("TUKE-KEMT/hate_speech_slovak") +label_0 = ds['train'].filter(lambda example: example['label'] == 0) +label_1 = ds['train'].filter(lambda example: example['label'] == 1) + +# Create stratified few-shot splits +def create_stratified_split(label_0, label_1, n_samples, seed=42): + few_shot_0 = label_0.shuffle(seed=seed).select(range(n_samples)) + few_shot_1 = label_1.shuffle(seed=seed).select(range(n_samples)) + return concatenate_datasets([few_shot_0, few_shot_1]).shuffle(seed=seed) + +# Create train/val/test splits +train_dataset = create_stratified_split(label_0, label_1, n_samples=40) +val_dataset = create_stratified_split(label_0, label_1, n_samples=10, seed=43) +test_dataset = create_stratified_split(label_0, label_1, n_samples=50, seed=44) + +# Initialize tokenizer and model +tokenizer = AutoTokenizer.from_pretrained("google/mt5-large", force_download=True) +model = AutoModelForSequenceClassification.from_pretrained("google/mt5-large", num_labels=2) + + + +# Tokenization function with padding +def tokenize(batch): + return tokenizer( + batch["text"], + padding="max_length", + truncation=True, + max_length=128 + ) + + +# Prepare datasets +def prepare_dataset(dataset): + dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) + print(dataset[0]) + return dataset.rename_column("label", "labels") + +train_dataset = prepare_dataset(train_dataset) +val_dataset = prepare_dataset(val_dataset) +test_dataset = prepare_dataset(test_dataset) + +# Set device +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# Training arguments with improved settings +training_args = TrainingArguments( + output_dir="./hate_speech_model", + per_device_train_batch_size=8, + per_device_eval_batch_size=16, + learning_rate=3e-5, # Adjust as needed + num_train_epochs=7, # Increased epochs for better training + eval_strategy="epoch", # Use "epoch" for both strategies + save_strategy="epoch", # Matching the evaluation strategy + load_best_model_at_end=True, + metric_for_best_model="f1", + greater_is_better=True, + warmup_steps=100, # Increased warmup steps + weight_decay=0.01, + report_to="none", + seed=42, + logging_steps=10, + gradient_accumulation_steps=2, # For more effective training on larger datasets + lr_scheduler_type="cosine", # Using cosine scheduler for learning rate + logging_dir='./logs', +) + +# Custom metrics computation +def compute_metrics(pred): + logits = pred.predictions[0] # Ensure only the logits are used + preds = logits.argmax(-1) + labels = pred.label_ids + precision, recall, f1, _ = precision_recall_fscore_support( + labels, + preds, + average='binary' + ) + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Initialize trainer with validation data and metrics +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, +) + +# Train model +trainer.train() + +# Evaluate on test set +def find_optimal_threshold(trainer, dataset): + # Get predictions + predictions = trainer.predict(dataset) + + # Extract logits (the first element in the predictions tuple) + logits = predictions.predictions # This is likely a tuple (logits, other_info) + + # If logits is a tuple, extract only the logits + if isinstance(logits, tuple): + logits = logits[0] # Extract the logits from the tuple + + # Check the shape of logits to debug the issue + print(f"Logits shape: {logits.shape}") + + # Ensure logits has the shape (batch_size, 2) for binary classification + if logits.shape[-1] != 2: + logits = logits[:, :2] # Take only the first two columns (logits for the two classes) + print(f"Logits shape after slicing: {logits.shape}") + + # Convert logits to tensor if necessary + if not isinstance(logits, torch.Tensor): + logits = torch.tensor(logits) # Convert logits to a tensor if needed + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get probabilities for the positive class (label=1) + positive_probs = probs[:, 1].numpy() # The probabilities for the positive class (label=1) + + # Get true labels from predictions + true_labels = predictions.label_ids + + # Calculate precision-recall curve + precisions, recalls, thresholds = precision_recall_curve(true_labels, positive_probs) + f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8) + + # Find the optimal threshold based on F1-score + optimal_idx = np.argmax(f1_scores[:-1]) # Exclude last threshold (it is always 1) + optimal_threshold = thresholds[optimal_idx] + + return optimal_threshold, precisions[optimal_idx], recalls[optimal_idx], f1_scores[optimal_idx] + + +def evaluate_with_threshold(trainer, dataset, threshold=0.5): + predictions = trainer.predict(dataset) + + # Ensure that logits are properly reshaped to (batch_size, 2) before applying softmax + logits = predictions.predictions + if isinstance(logits, tuple): + logits = logits[0] # Extract logits if it's a tuple + + logits = torch.tensor(logits) if not isinstance(logits, torch.Tensor) else logits + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get predicted labels based on the threshold for the positive class (label=1) + predicted_labels = (probs[:, 1] > threshold).numpy().astype(int) + + true_labels = predictions.label_ids + + # Calculate precision-recall-fscore + precision, recall, f1, _ = precision_recall_fscore_support( + true_labels, + predicted_labels, + average='binary', + zero_division=0 + ) + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Example usage: +# Find the optimal threshold using validation data +print("\nFinding optimal threshold...") +optimal_threshold, best_precision, best_recall, best_f1 = find_optimal_threshold(trainer, val_dataset) +print(f"Optimal threshold: {optimal_threshold:.4f}") + +# Evaluate with optimal threshold +print("\nEvaluating with optimal threshold:") +optimized_results = evaluate_with_threshold(trainer, test_dataset, threshold=optimal_threshold) +print(f"Precision: {optimized_results['precision']:.4f}") +print(f"Recall: {optimized_results['recall']:.4f}") +print(f"F1: {optimized_results['f1']:.4f}") + + +# Save the model +trainer.save_model("./hate_speech_model/best_model") \ No newline at end of file diff --git a/few_shot/few_shot_eval_slovak_bert.py b/few_shot/few_shot_eval_slovak_bert.py new file mode 100644 index 0000000..f269988 --- /dev/null +++ b/few_shot/few_shot_eval_slovak_bert.py @@ -0,0 +1,214 @@ + +import sys +import codecs +from datasets import load_dataset, concatenate_datasets +import numpy as np +import torch + +from sklearn.metrics import precision_recall_fscore_support,precision_recall_curve +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + Trainer, + TrainingArguments, + AutoModelForSeq2SeqLM, + set_seed, + T5Tokenizer +) + +# Set seed for reproducibility +set_seed(42) + +# Load and preprocess data +ds = load_dataset("TUKE-KEMT/hate_speech_slovak") +label_0 = ds['train'].filter(lambda example: example['label'] == 0) +label_1 = ds['train'].filter(lambda example: example['label'] == 1) + +# Create stratified few-shot splits +def create_stratified_split(label_0, label_1, n_samples, seed=42): + few_shot_0 = label_0.shuffle(seed=seed).select(range(n_samples)) + few_shot_1 = label_1.shuffle(seed=seed).select(range(n_samples)) + return concatenate_datasets([few_shot_0, few_shot_1]).shuffle(seed=seed) + +# Create train/val/test splits +train_dataset = create_stratified_split(label_0, label_1, n_samples=40) +val_dataset = create_stratified_split(label_0, label_1, n_samples=10, seed=43) +test_dataset = create_stratified_split(label_0, label_1, n_samples=50, seed=44) + +# Load the tokenizer and model from "gerulata/slovakbert" +tokenizer = AutoTokenizer.from_pretrained("gerulata/slovakbert", force_download=True) +model = AutoModelForSequenceClassification.from_pretrained("gerulata/slovakbert", num_labels=2) + +# Tokenization function with padding +def tokenize(batch): + return tokenizer( + batch["text"], + padding="max_length", + truncation=True, + max_length=128 + ) + +# Prepare datasets +def prepare_dataset(dataset): + dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) + print(dataset[0]) + return dataset.rename_column("label", "labels") + +train_dataset = prepare_dataset(train_dataset) +val_dataset = prepare_dataset(val_dataset) +test_dataset = prepare_dataset(test_dataset) + +# Set device +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# Training arguments with improved settings +training_args = TrainingArguments( + output_dir="./hate_speech_model", + per_device_train_batch_size=8, + per_device_eval_batch_size=16, + learning_rate=3e-5, # Adjust as needed + num_train_epochs=7, # Increased epochs for better training + eval_strategy="epoch", # Use "epoch" for both strategies + save_strategy="epoch", # Matching the evaluation strategy + load_best_model_at_end=True, + metric_for_best_model="f1", + greater_is_better=True, + warmup_steps=100, # Increased warmup steps + weight_decay=0.01, + report_to="none", + seed=42, + logging_steps=10, + gradient_accumulation_steps=2, # For more effective training on larger datasets + lr_scheduler_type="cosine", # Using cosine scheduler for learning rate + logging_dir='./logs', +) + +# Custom metrics computation +def compute_metrics(pred): + # Ensure predictions are properly extracted + logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions + preds = logits.argmax(axis=-1) # Ensure you get the correct predictions (argmax for logits) + labels = pred.label_ids + + # Debugging: Check if preds and labels are arrays and have the same length + print(f"Predictions: {preds[:10]}") + print(f"Labels: {labels[:10]}") + + # Calculate precision, recall, and F1 score + precision, recall, f1, _ = precision_recall_fscore_support( + labels, + preds, + average='binary', + zero_division=0 + ) + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Initialize trainer with validation data and metrics +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, +) + +# Train model +trainer.train() + +# Evaluate on test set +def find_optimal_threshold(trainer, dataset): + # Get predictions + predictions = trainer.predict(dataset) + + # Extract logits (the first element in the predictions tuple) + logits = predictions.predictions # This is likely a tuple (logits, other_info) + + # If logits is a tuple, extract only the logits + if isinstance(logits, tuple): + logits = logits[0] # Extract the logits from the tuple + + # Check the shape of logits to debug the issue + print(f"Logits shape: {logits.shape}") + + # Ensure logits has the shape (batch_size, 2) for binary classification + if logits.shape[-1] != 2: + logits = logits[:, :2] # Take only the first two columns (logits for the two classes) + print(f"Logits shape after slicing: {logits.shape}") + + # Convert logits to tensor if necessary + if not isinstance(logits, torch.Tensor): + logits = torch.tensor(logits) # Convert logits to a tensor if needed + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get probabilities for the positive class (label=1) + positive_probs = probs[:, 1].numpy() # The probabilities for the positive class (label=1) + + # Get true labels from predictions + true_labels = predictions.label_ids + + # Calculate precision-recall curve + precisions, recalls, thresholds = precision_recall_curve(true_labels, positive_probs) + f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8) + + # Find the optimal threshold based on F1-score + optimal_idx = np.argmax(f1_scores[:-1]) # Exclude last threshold (it is always 1) + optimal_threshold = thresholds[optimal_idx] + + return optimal_threshold, precisions[optimal_idx], recalls[optimal_idx], f1_scores[optimal_idx] + + +def evaluate_with_threshold(trainer, dataset, threshold=0.5): + predictions = trainer.predict(dataset) + + # Ensure that logits are properly reshaped to (batch_size, 2) before applying softmax + logits = predictions.predictions + if isinstance(logits, tuple): + logits = logits[0] # Extract logits if it's a tuple + + logits = torch.tensor(logits) if not isinstance(logits, torch.Tensor) else logits + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get predicted labels based on the threshold for the positive class (label=1) + predicted_labels = (probs[:, 1] > threshold).numpy().astype(int) + + true_labels = predictions.label_ids + + # Calculate precision-recall-fscore + precision, recall, f1, _ = precision_recall_fscore_support( + true_labels, + predicted_labels, + average='binary', + zero_division=0 + ) + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Example usage: +# Find the optimal threshold using validation data +print("\nFinding optimal threshold...") +optimal_threshold, best_precision, best_recall, best_f1 = find_optimal_threshold(trainer, val_dataset) +print(f"Optimal threshold: {optimal_threshold:.4f}") + +# Evaluate with optimal threshold +print("\nEvaluating with optimal threshold:") +optimized_results = evaluate_with_threshold(trainer, test_dataset, threshold=optimal_threshold) +print(f"Precision: {optimized_results['precision']:.4f}") +print(f"Recall: {optimized_results['recall']:.4f}") +print(f"F1: {optimized_results['f1']:.4f}") + +# Save the model +trainer.save_model("./hate_speech_model/best_model_slovakbert") diff --git a/few_shot/few_shot_eval_slovak_t5_base.py b/few_shot/few_shot_eval_slovak_t5_base.py new file mode 100644 index 0000000..0a1c240 --- /dev/null +++ b/few_shot/few_shot_eval_slovak_t5_base.py @@ -0,0 +1,205 @@ +import sys +import codecs +from datasets import load_dataset, concatenate_datasets +import numpy as np +import torch + +from sklearn.metrics import precision_recall_fscore_support,precision_recall_curve +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + Trainer, + TrainingArguments, + AutoModelForSeq2SeqLM, + set_seed, + T5Tokenizer +) + +# Set seed for reproducibility +set_seed(42) + +# Load and preprocess data +ds = load_dataset("TUKE-KEMT/hate_speech_slovak") +label_0 = ds['train'].filter(lambda example: example['label'] == 0) +label_1 = ds['train'].filter(lambda example: example['label'] == 1) + +# Create stratified few-shot splits +def create_stratified_split(label_0, label_1, n_samples, seed=42): + few_shot_0 = label_0.shuffle(seed=seed).select(range(n_samples)) + few_shot_1 = label_1.shuffle(seed=seed).select(range(n_samples)) + return concatenate_datasets([few_shot_0, few_shot_1]).shuffle(seed=seed) + +# Create train/val/test splits +train_dataset = create_stratified_split(label_0, label_1, n_samples=40) +val_dataset = create_stratified_split(label_0, label_1, n_samples=10, seed=43) +test_dataset = create_stratified_split(label_0, label_1, n_samples=50, seed=44) + +# Initialize tokenizer and model +tokenizer = AutoTokenizer.from_pretrained("TUKE-KEMT/slovak-t5-base", use_fast=False, legacy=False) +model = AutoModelForSequenceClassification.from_pretrained("TUKE-KEMT/slovak-t5-base", num_labels=2) + + +# Tokenization function with padding +def tokenize(batch): + return tokenizer( + batch["text"], + padding="max_length", + truncation=True, + max_length=128 + ) + +# Prepare datasets +def prepare_dataset(dataset): + dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) + return dataset.rename_column("label", "labels") + +train_dataset = prepare_dataset(train_dataset) +val_dataset = prepare_dataset(val_dataset) +test_dataset = prepare_dataset(test_dataset) + +# Set device +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# Training arguments with improved settings +training_args = TrainingArguments( + output_dir="./hate_speech_model", + per_device_train_batch_size=8, + per_device_eval_batch_size=16, + learning_rate=3e-5, # Adjust as needed + num_train_epochs=7, # Increased epochs for better training + eval_strategy="epoch", # Use "epoch" for both strategies + save_strategy="epoch", # Matching the evaluation strategy + load_best_model_at_end=True, + metric_for_best_model="f1", + greater_is_better=True, + warmup_steps=100, # Increased warmup steps + weight_decay=0.01, + report_to="none", + seed=42, + logging_steps=10, + gradient_accumulation_steps=2, # For more effective training on larger datasets + lr_scheduler_type="cosine", # Using cosine scheduler for learning rate + logging_dir='./logs', +) + +# Custom metrics computation +def compute_metrics(pred): + logits = pred.predictions[0] # Ensure only the logits are used + preds = logits.argmax(-1) + labels = pred.label_ids + precision, recall, f1, _ = precision_recall_fscore_support( + labels, + preds, + average='binary' + ) + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Initialize trainer with validation data and metrics +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, +) + +# Train model +trainer.train() + +# Evaluate on test set +def find_optimal_threshold(trainer, dataset): + # Get predictions + predictions = trainer.predict(dataset) + + # Extract logits (the first element in the predictions tuple) + logits = predictions.predictions # This is likely a tuple (logits, other_info) + + # If logits is a tuple, extract only the logits + if isinstance(logits, tuple): + logits = logits[0] # Extract the logits from the tuple + + # Check the shape of logits to debug the issue + print(f"Logits shape: {logits.shape}") + + # Ensure logits has the shape (batch_size, 2) for binary classification + if logits.shape[-1] != 2: + logits = logits[:, :2] # Take only the first two columns (logits for the two classes) + print(f"Logits shape after slicing: {logits.shape}") + + # Convert logits to tensor if necessary + if not isinstance(logits, torch.Tensor): + logits = torch.tensor(logits) # Convert logits to a tensor if needed + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get probabilities for the positive class (label=1) + positive_probs = probs[:, 1].numpy() # The probabilities for the positive class (label=1) + + # Get true labels from predictions + true_labels = predictions.label_ids + + # Calculate precision-recall curve + precisions, recalls, thresholds = precision_recall_curve(true_labels, positive_probs) + f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8) + + # Find the optimal threshold based on F1-score + optimal_idx = np.argmax(f1_scores[:-1]) # Exclude last threshold (it is always 1) + optimal_threshold = thresholds[optimal_idx] + + return optimal_threshold, precisions[optimal_idx], recalls[optimal_idx], f1_scores[optimal_idx] + + +def evaluate_with_threshold(trainer, dataset, threshold=0.5): + predictions = trainer.predict(dataset) + + # Ensure that logits are properly reshaped to (batch_size, 2) before applying softmax + logits = predictions.predictions + if isinstance(logits, tuple): + logits = logits[0] # Extract logits if it's a tuple + + logits = torch.tensor(logits) if not isinstance(logits, torch.Tensor) else logits + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get predicted labels based on the threshold for the positive class (label=1) + predicted_labels = (probs[:, 1] > threshold).numpy().astype(int) + + true_labels = predictions.label_ids + + # Calculate precision-recall-fscore + precision, recall, f1, _ = precision_recall_fscore_support( + true_labels, + predicted_labels, + average='binary', + zero_division=0 + ) + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Example usage: +# Find the optimal threshold using validation data +print("\nFinding optimal threshold...") +optimal_threshold, best_precision, best_recall, best_f1 = find_optimal_threshold(trainer, val_dataset) +print(f"Optimal threshold: {optimal_threshold:.4f}") + +# Evaluate with optimal threshold +print("\nEvaluating with optimal threshold:") +optimized_results = evaluate_with_threshold(trainer, test_dataset, threshold=optimal_threshold) +print(f"Precision: {optimized_results['precision']:.4f}") +print(f"Recall: {optimized_results['recall']:.4f}") +print(f"F1: {optimized_results['f1']:.4f}") + + +# Save the model +trainer.save_model("./hate_speech_model/best_model") \ No newline at end of file diff --git a/few_shot/few_shot_eval_slovak_t5_small.py b/few_shot/few_shot_eval_slovak_t5_small.py index e0add4f..cbb291e 100644 --- a/few_shot/few_shot_eval_slovak_t5_small.py +++ b/few_shot/few_shot_eval_slovak_t5_small.py @@ -1,180 +1,180 @@ -import numpy as np -import torch -from datasets import load_dataset, concatenate_datasets -from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve -from transformers import ( - AutoTokenizer, - AutoModelForSequenceClassification, - Trainer, - TrainingArguments, - set_seed -) - -# šŸ”¹ 1. Set seed for reproducibility -set_seed(42) - -# šŸ”¹ 2. Load dataset -ds = load_dataset("TUKE-KEMT/hate_speech_slovak") -label_0 = ds['train'].filter(lambda example: example['label'] == 0) -label_1 = ds['train'].filter(lambda example: example['label'] == 1) - -# Function to create stratified splits -def create_stratified_split(label_0, label_1, n_samples, seed=42): - few_shot_0 = label_0.shuffle(seed=seed).select(range(n_samples)) - few_shot_1 = label_1.shuffle(seed=seed).select(range(n_samples)) - return concatenate_datasets([few_shot_0, few_shot_1]).shuffle(seed=seed) - -# Create train, validation, and test splits -train_dataset = create_stratified_split(label_0, label_1, n_samples=40) -val_dataset = create_stratified_split(label_0, label_1, n_samples=10, seed=43) -test_dataset = create_stratified_split(label_0, label_1, n_samples=50, seed=44) - -# šŸ”¹ 3. Load tokenizer and model -model_name = "ApoTro/slovak-t5-small" -tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True) -model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) - -# Tokenization function -def tokenize(batch): - return tokenizer( - batch["text"], - padding="max_length", - truncation=True, - max_length=128 - ) - -# Function to prepare datasets -def prepare_dataset(dataset): - dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) - return dataset.rename_column("label", "labels") - -# Apply preparation to datasets -train_dataset = prepare_dataset(train_dataset) -val_dataset = prepare_dataset(val_dataset) -test_dataset = prepare_dataset(test_dataset) - -# Set device (GPU if available) -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model.to(device) - -# šŸ”¹ 4. Define training arguments -training_args = TrainingArguments( - output_dir="./hate_speech_model", - per_device_train_batch_size=8, - per_device_eval_batch_size=16, - learning_rate=3e-5, - num_train_epochs=7, - evaluation_strategy="epoch", - save_strategy="epoch", - load_best_model_at_end=True, - metric_for_best_model="f1", - greater_is_better=True, - warmup_steps=100, - weight_decay=0.01, - report_to="none", - seed=42, - logging_steps=10, - gradient_accumulation_steps=2, - lr_scheduler_type="cosine", - logging_dir='./logs', -) - -# šŸ”¹ 5. Define evaluation metrics -def compute_metrics(pred): - logits = pred.predictions[0] - preds = logits.argmax(-1) - labels = pred.label_ids - precision, recall, f1, _ = precision_recall_fscore_support( - labels, preds, average='binary' - ) - return { - 'precision': precision, - 'recall': recall, - 'f1': f1 - } - -# šŸ”¹ 6. Create Trainer instance -trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_dataset, - eval_dataset=val_dataset, - compute_metrics=compute_metrics, -) - -# šŸ”¹ 7. Train the model -trainer.train() - -# šŸ”¹ 8. Save the trained model -trainer.save_model("./hate_speech_model/best_model") - -# šŸ”¹ 9. Load the trained model before testing -model = AutoModelForSequenceClassification.from_pretrained("./hate_speech_model/best_model") -trainer = Trainer( - model=model, - args=training_args, - eval_dataset=val_dataset, - compute_metrics=compute_metrics, -) - -# šŸ”¹ 10. Function to find the optimal threshold -def find_optimal_threshold(trainer, dataset): - predictions = trainer.predict(dataset) - logits = predictions.predictions - - # Ensure logits are properly shaped - if isinstance(logits, tuple): - logits = logits[0] - logits = torch.tensor(logits) - - # Apply softmax - probs = torch.nn.functional.softmax(logits, dim=-1) - positive_probs = probs[:, 1].numpy() - true_labels = predictions.label_ids - - # Compute Precision-Recall curve - precisions, recalls, thresholds = precision_recall_curve(true_labels, positive_probs) - f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8) - - # Find the best threshold based on F1-score - optimal_idx = np.argmax(f1_scores[:-1]) - optimal_threshold = thresholds[optimal_idx] - - return optimal_threshold, precisions[optimal_idx], recalls[optimal_idx], f1_scores[optimal_idx] - -# šŸ”¹ 11. Function to evaluate the model using a custom threshold -def evaluate_with_threshold(trainer, dataset, threshold=0.5): - predictions = trainer.predict(dataset) - logits = predictions.predictions - - if isinstance(logits, tuple): - logits = logits[0] - logits = torch.tensor(logits) - - # Apply softmax - probs = torch.nn.functional.softmax(logits, dim=-1) - predicted_labels = (probs[:, 1] > threshold).numpy().astype(int) - true_labels = predictions.label_ids - - # Compute Precision, Recall, F1-score - precision, recall, f1, _ = precision_recall_fscore_support( - true_labels, predicted_labels, average='binary', zero_division=0 - ) - - return { - 'precision': precision, - 'recall': recall, - 'f1': f1 - } - -# šŸ”¹ 12. Find the optimal threshold using validation set -print("\nšŸ” Finding optimal threshold...") -optimal_threshold, best_precision, best_recall, best_f1 = find_optimal_threshold(trainer, val_dataset) -print(f"āœ… Optimal threshold: {optimal_threshold:.4f}") - -# šŸ”¹ 13. Final evaluation on the test set using the best threshold -print("\nšŸ“Š Evaluating on the test set with the optimal threshold:") -optimized_results = evaluate_with_threshold(trainer, test_dataset, threshold=optimal_threshold) -print(f"šŸŽÆ Precision: {optimized_results['precision']:.4f}") -print(f"šŸŽÆ Recall: {optimized_results['recall']:.4f}") -print(f"šŸŽÆ F1-score: {optimized_results['f1']:.4f}") +import numpy as np +import torch +from datasets import load_dataset, concatenate_datasets +from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + Trainer, + TrainingArguments, + set_seed +) + +# šŸ”¹ 1. Set seed for reproducibility +set_seed(42) + +# šŸ”¹ 2. Load dataset +ds = load_dataset("TUKE-KEMT/hate_speech_slovak") +label_0 = ds['train'].filter(lambda example: example['label'] == 0) +label_1 = ds['train'].filter(lambda example: example['label'] == 1) + +# Function to create stratified splits +def create_stratified_split(label_0, label_1, n_samples, seed=42): + few_shot_0 = label_0.shuffle(seed=seed).select(range(n_samples)) + few_shot_1 = label_1.shuffle(seed=seed).select(range(n_samples)) + return concatenate_datasets([few_shot_0, few_shot_1]).shuffle(seed=seed) + +# Create train, validation, and test splits +train_dataset = create_stratified_split(label_0, label_1, n_samples=40) +val_dataset = create_stratified_split(label_0, label_1, n_samples=10, seed=43) +test_dataset = create_stratified_split(label_0, label_1, n_samples=50, seed=44) + +# šŸ”¹ 3. Load tokenizer and model +model_name = "ApoTro/slovak-t5-small" +tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True) +model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) + +# Tokenization function +def tokenize(batch): + return tokenizer( + batch["text"], + padding="max_length", + truncation=True, + max_length=128 + ) + +# Function to prepare datasets +def prepare_dataset(dataset): + dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) + return dataset.rename_column("label", "labels") + +# Apply preparation to datasets +train_dataset = prepare_dataset(train_dataset) +val_dataset = prepare_dataset(val_dataset) +test_dataset = prepare_dataset(test_dataset) + +# Set device (GPU if available) +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# šŸ”¹ 4. Define training arguments +training_args = TrainingArguments( + output_dir="./hate_speech_model", + per_device_train_batch_size=8, + per_device_eval_batch_size=16, + learning_rate=3e-5, + num_train_epochs=7, + evaluation_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="f1", + greater_is_better=True, + warmup_steps=100, + weight_decay=0.01, + report_to="none", + seed=42, + logging_steps=10, + gradient_accumulation_steps=2, + lr_scheduler_type="cosine", + logging_dir='./logs', +) + +# šŸ”¹ 5. Define evaluation metrics +def compute_metrics(pred): + logits = pred.predictions[0] + preds = logits.argmax(-1) + labels = pred.label_ids + precision, recall, f1, _ = precision_recall_fscore_support( + labels, preds, average='binary' + ) + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# šŸ”¹ 6. Create Trainer instance +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, +) + +# šŸ”¹ 7. Train the model +trainer.train() + +# šŸ”¹ 8. Save the trained model +trainer.save_model("./hate_speech_model/best_model") + +# šŸ”¹ 9. Load the trained model before testing +model = AutoModelForSequenceClassification.from_pretrained("./hate_speech_model/best_model") +trainer = Trainer( + model=model, + args=training_args, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, +) + +# šŸ”¹ 10. Function to find the optimal threshold +def find_optimal_threshold(trainer, dataset): + predictions = trainer.predict(dataset) + logits = predictions.predictions + + # Ensure logits are properly shaped + if isinstance(logits, tuple): + logits = logits[0] + logits = torch.tensor(logits) + + # Apply softmax + probs = torch.nn.functional.softmax(logits, dim=-1) + positive_probs = probs[:, 1].numpy() + true_labels = predictions.label_ids + + # Compute Precision-Recall curve + precisions, recalls, thresholds = precision_recall_curve(true_labels, positive_probs) + f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8) + + # Find the best threshold based on F1-score + optimal_idx = np.argmax(f1_scores[:-1]) + optimal_threshold = thresholds[optimal_idx] + + return optimal_threshold, precisions[optimal_idx], recalls[optimal_idx], f1_scores[optimal_idx] + +# šŸ”¹ 11. Function to evaluate the model using a custom threshold +def evaluate_with_threshold(trainer, dataset, threshold=0.5): + predictions = trainer.predict(dataset) + logits = predictions.predictions + + if isinstance(logits, tuple): + logits = logits[0] + logits = torch.tensor(logits) + + # Apply softmax + probs = torch.nn.functional.softmax(logits, dim=-1) + predicted_labels = (probs[:, 1] > threshold).numpy().astype(int) + true_labels = predictions.label_ids + + # Compute Precision, Recall, F1-score + precision, recall, f1, _ = precision_recall_fscore_support( + true_labels, predicted_labels, average='binary', zero_division=0 + ) + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# šŸ”¹ 12. Find the optimal threshold using validation set +print("\nšŸ” Finding optimal threshold...") +optimal_threshold, best_precision, best_recall, best_f1 = find_optimal_threshold(trainer, val_dataset) +print(f"āœ… Optimal threshold: {optimal_threshold:.4f}") + +# šŸ”¹ 13. Final evaluation on the test set using the best threshold +print("\nšŸ“Š Evaluating on the test set with the optimal threshold:") +optimized_results = evaluate_with_threshold(trainer, test_dataset, threshold=optimal_threshold) +print(f"šŸŽÆ Precision: {optimized_results['precision']:.4f}") +print(f"šŸŽÆ Recall: {optimized_results['recall']:.4f}") +print(f"šŸŽÆ F1-score: {optimized_results['f1']:.4f}") diff --git a/few_shot/few_shot_mt5_small.py b/few_shot/few_shot_mt5_small.py new file mode 100644 index 0000000..033c8f0 --- /dev/null +++ b/few_shot/few_shot_mt5_small.py @@ -0,0 +1,209 @@ +import sys +import codecs +from datasets import load_dataset, concatenate_datasets +import numpy as np +import torch + +from sklearn.metrics import precision_recall_fscore_support,precision_recall_curve +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + Trainer, + TrainingArguments, + AutoModelForSeq2SeqLM, + set_seed, + T5Tokenizer +) + +# Set seed for reproducibility +set_seed(42) + +# Load and preprocess data +ds = load_dataset("TUKE-KEMT/hate_speech_slovak") +label_0 = ds['train'].filter(lambda example: example['label'] == 0) +label_1 = ds['train'].filter(lambda example: example['label'] == 1) + +# Create stratified few-shot splits +def create_stratified_split(label_0, label_1, n_samples, seed=42): + few_shot_0 = label_0.shuffle(seed=seed).select(range(n_samples)) + few_shot_1 = label_1.shuffle(seed=seed).select(range(n_samples)) + return concatenate_datasets([few_shot_0, few_shot_1]).shuffle(seed=seed) + +# Create train/val/test splits +train_dataset = create_stratified_split(label_0, label_1, n_samples=40) +val_dataset = create_stratified_split(label_0, label_1, n_samples=10, seed=43) +test_dataset = create_stratified_split(label_0, label_1, n_samples=50, seed=44) + +# Initialize tokenizer and model +tokenizer = AutoTokenizer.from_pretrained("google/mt5-small", force_download=True) +model = AutoModelForSequenceClassification.from_pretrained("google/mt5-small", num_labels=2) + + + + +# Tokenization function with padding +def tokenize(batch): + return tokenizer( + batch["text"], + padding="max_length", + truncation=True, + max_length=128 + ) + + +# Prepare datasets +def prepare_dataset(dataset): + dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) + print(dataset[0]) + return dataset.rename_column("label", "labels") + +train_dataset = prepare_dataset(train_dataset) +val_dataset = prepare_dataset(val_dataset) +test_dataset = prepare_dataset(test_dataset) + +# Set device +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# Training arguments with improved settings +training_args = TrainingArguments( + output_dir="./hate_speech_model", + per_device_train_batch_size=8, + per_device_eval_batch_size=16, + learning_rate=3e-5, # Adjust as needed + num_train_epochs=7, # Increased epochs for better training + eval_strategy="epoch", # Use "epoch" for both strategies + save_strategy="epoch", # Matching the evaluation strategy + load_best_model_at_end=True, + metric_for_best_model="f1", + greater_is_better=True, + warmup_steps=100, # Increased warmup steps + weight_decay=0.01, + report_to="none", + seed=42, + logging_steps=10, + gradient_accumulation_steps=2, # For more effective training on larger datasets + lr_scheduler_type="cosine", # Using cosine scheduler for learning rate + logging_dir='./logs', +) + +# Custom metrics computation +def compute_metrics(pred): + logits = pred.predictions[0] # Ensure only the logits are used + preds = logits.argmax(-1) + labels = pred.label_ids + precision, recall, f1, _ = precision_recall_fscore_support( + labels, + preds, + average='binary' + ) + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Initialize trainer with validation data and metrics +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, +) + +# Train model +trainer.train() + +# Evaluate on test set +def find_optimal_threshold(trainer, dataset): + # Get predictions + predictions = trainer.predict(dataset) + + # Extract logits (the first element in the predictions tuple) + logits = predictions.predictions # This is likely a tuple (logits, other_info) + + # If logits is a tuple, extract only the logits + if isinstance(logits, tuple): + logits = logits[0] # Extract the logits from the tuple + + # Check the shape of logits to debug the issue + print(f"Logits shape: {logits.shape}") + + # Ensure logits has the shape (batch_size, 2) for binary classification + if logits.shape[-1] != 2: + logits = logits[:, :2] # Take only the first two columns (logits for the two classes) + print(f"Logits shape after slicing: {logits.shape}") + + # Convert logits to tensor if necessary + if not isinstance(logits, torch.Tensor): + logits = torch.tensor(logits) # Convert logits to a tensor if needed + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get probabilities for the positive class (label=1) + positive_probs = probs[:, 1].numpy() # The probabilities for the positive class (label=1) + + # Get true labels from predictions + true_labels = predictions.label_ids + + # Calculate precision-recall curve + precisions, recalls, thresholds = precision_recall_curve(true_labels, positive_probs) + f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8) + + # Find the optimal threshold based on F1-score + optimal_idx = np.argmax(f1_scores[:-1]) # Exclude last threshold (it is always 1) + optimal_threshold = thresholds[optimal_idx] + + return optimal_threshold, precisions[optimal_idx], recalls[optimal_idx], f1_scores[optimal_idx] + + +def evaluate_with_threshold(trainer, dataset, threshold=0.5): + predictions = trainer.predict(dataset) + + # Ensure that logits are properly reshaped to (batch_size, 2) before applying softmax + logits = predictions.predictions + if isinstance(logits, tuple): + logits = logits[0] # Extract logits if it's a tuple + + logits = torch.tensor(logits) if not isinstance(logits, torch.Tensor) else logits + + # Apply softmax to get probabilities + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get predicted labels based on the threshold for the positive class (label=1) + predicted_labels = (probs[:, 1] > threshold).numpy().astype(int) + + true_labels = predictions.label_ids + + # Calculate precision-recall-fscore + precision, recall, f1, _ = precision_recall_fscore_support( + true_labels, + predicted_labels, + average='binary', + zero_division=0 + ) + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Example usage: +# Find the optimal threshold using validation data +print("\nFinding optimal threshold...") +optimal_threshold, best_precision, best_recall, best_f1 = find_optimal_threshold(trainer, val_dataset) +print(f"Optimal threshold: {optimal_threshold:.4f}") + +# Evaluate with optimal threshold +print("\nEvaluating with optimal threshold:") +optimized_results = evaluate_with_threshold(trainer, test_dataset, threshold=optimal_threshold) +print(f"Precision: {optimized_results['precision']:.4f}") +print(f"Recall: {optimized_results['recall']:.4f}") +print(f"F1: {optimized_results['f1']:.4f}") + + +# Save the model +trainer.save_model("./hate_speech_model/best_model") \ No newline at end of file