from datasets import load_dataset
from sklearn.metrics import precision_recall_fscore_support

from lm_eval.api.instance import Instance
from lm_eval.api.task import Task


class SlovakToxicClassification(Task):
    VERSION = 1
    DATASET_PATH = "TUKE-KEMT/hate_speech_slovak"
    DATASET_NAME = None

    def __init__(self):
        # Load the dataset eagerly. Note: this skips the base class's own
        # __init__/download machinery; depending on the lm-eval version you
        # may prefer calling super().__init__() and letting the harness
        # download DATASET_PATH itself.
        self.dataset = load_dataset(self.DATASET_PATH)

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def validation_docs(self):
        # The dataset ships no validation split, so the test split is reused.
        return list(self.dataset["test"])

    def test_docs(self):
        return list(self.dataset["test"])

    def doc_to_text(self, doc):
        return doc["text"]

    def doc_to_target(self, doc):
        return int(doc["label"])

    def construct_requests(self, doc, ctx, **kwargs):
        # Score the continuations " 0" (non-toxic) and " 1" (toxic) after the
        # context. This assumes the lm-eval 0.4.x Instance-based request API,
        # matching the lm_eval.api.task import above; the original
        # self.loglikelihood(...) call does not exist on Task.
        return [
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " 0"),
                idx=0,
                **kwargs,
            ),
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " 1"),
                idx=1,
                **kwargs,
            ),
        ]

    def process_results(self, doc, results):
        # Each loglikelihood result is a (log-probability, is_greedy) pair;
        # predict the label whose continuation the model finds more likely.
        (ll0, _), (ll1, _) = results
        pred = int(ll1 > ll0)
        gold = self.doc_to_target(doc)
        return {
            "f1": (gold, pred),
            "precision": (gold, pred),
            "recall": (gold, pred),
        }

    def aggregation(self):
        return {
            "f1": self.f1_score,
            "precision": self.precision_score,
            "recall": self.recall_score,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "precision": True,
            "recall": True,
        }

    def _binary_prf(self, gold_and_pred):
        # gold_and_pred is the list of (gold, pred) pairs collected from
        # process_results across all documents; returns
        # (precision, recall, f1, support).
        golds, preds = zip(*gold_and_pred)
        return precision_recall_fscore_support(
            golds, preds, average="binary", zero_division=0
        )

    def f1_score(self, gold_and_pred):
        return self._binary_prf(gold_and_pred)[2]

    def precision_score(self, gold_and_pred):
        return self._binary_prf(gold_and_pred)[0]

    def recall_score(self, gold_and_pred):
        return self._binary_prf(gold_and_pred)[1]
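
# A minimal offline check of the metric plumbing (illustrative sketch, not
# part of the task itself). It feeds hand-made (gold, pred) pairs straight
# into the aggregation functions, skipping model scoring and the dataset
# download; the toy pairs below are invented for the example.
if __name__ == "__main__":
    # __new__ bypasses __init__ so no dataset is fetched for this smoke test.
    task = SlovakToxicClassification.__new__(SlovakToxicClassification)
    toy_pairs = [(1, 1), (0, 0), (1, 0), (0, 1), (1, 1)]
    for name, aggregate in task.aggregation().items():
        print(f"{name}: {aggregate(toy_pairs):.3f}")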