"""lm-eval task: binary toxic-speech classification on Slovak text."""
# Originally added as lm-eval-harness/slovak_toxic_classification/__init__.py

from datasets import load_dataset
from lm_eval.api.task import Task
from sklearn.metrics import precision_recall_fscore_support


class SlovakToxicClassification(Task):
    """Score a model on the ``TUKE-KEMT/hate_speech_slovak`` dataset.

    For every document the task compares the log-likelihood of the
    continuations ``" 0"`` and ``" 1"`` and predicts the more likely one.
    Predictions are aggregated into binary F1, precision and recall.
    """

    VERSION = 1
    DATASET_PATH = "TUKE-KEMT/hate_speech_slovak"
    DATASET_NAME = None

    def __init__(self, *args, **kwargs):
        """Initialise base-class state and make sure the dataset is loaded.

        Any arguments are forwarded unchanged to ``Task.__init__`` so the
        harness can construct this task the same way as any other one.
        """
        # The base initializer must run so that framework-managed attributes
        # exist; skipping it leaves the instance only half constructed.
        super().__init__(*args, **kwargs)
        # NOTE(review): the base initializer is expected to populate
        # ``self.dataset`` from DATASET_PATH -- confirm for the installed
        # lm-eval version. Load explicitly only when it did not.
        if getattr(self, "dataset", None) is None:
            self.dataset = load_dataset(self.DATASET_PATH)

    def has_training_docs(self):
        """Report that no training documents are exposed."""
        return False

    def has_validation_docs(self):
        """Report that validation documents are available."""
        return True

    def has_test_docs(self):
        """Report that test documents are available."""
        return True

    def validation_docs(self):
        """Return the validation documents as a list.

        This reads the same ``"test"`` split as :meth:`test_docs`.
        """
        # NOTE(review): validation and test are the same data here --
        # confirm whether the dataset offers a separate validation split.
        return list(self.dataset["test"])

    def test_docs(self):
        """Return the documents of the ``"test"`` split as a list."""
        return list(self.dataset["test"])

    def doc_to_text(self, doc):
        """Return the prompt for *doc*: its raw ``"text"`` field."""
        return doc["text"]

    def doc_to_target(self, doc):
        """Return the gold label of *doc* (its ``"label"`` field) as an int."""
        # NOTE(review): if the harness concatenates targets into few-shot
        # prompts it may need a string such as " 1" here -- confirm.
        return int(doc["label"])

    def construct_requests(self, doc, ctx):
        """Request the log-likelihood of ``" 0"`` and of ``" 1"`` after *ctx*.

        Returns a 2-tuple of requests, in the order (label 0, label 1),
        which is the order :meth:`process_results` unpacks.
        """
        # NOTE(review): ``loglikelihood`` is not defined in this class and
        # must be provided by the ``Task`` base -- confirm it exists there.
        return self.loglikelihood(ctx, " 0"), self.loglikelihood(ctx, " 1")

    def process_results(self, doc, results):
        """Turn the two request results into a (gold, prediction) pair.

        *results* holds the values for ``" 0"`` and ``" 1"`` in that order.
        The prediction is 1 when the second is strictly greater, else 0.
        The same pair is reported under every metric name because each
        metric is computed later, over all pairs, by its aggregation.
        """
        ll0, ll1 = results
        pred = int(ll1 > ll0)
        gold = self.doc_to_target(doc)
        return {
            "f1": (gold, pred),
            "precision": (gold, pred),
            "recall": (gold, pred),
        }

    def aggregation(self):
        """Map each metric name to the function that aggregates its pairs."""
        return {
            "f1": self.f1_score,
            "precision": self.precision_score,
            "recall": self.recall_score,
        }

    def higher_is_better(self):
        """Declare that larger values are better for every metric."""
        return {
            "f1": True,
            "precision": True,
            "recall": True,
        }

    def _binary_prf(self, gold_and_pred):
        """Return ``(precision, recall, f1)`` for (gold, pred) pairs.

        Scores are computed with ``average="binary"``. An empty input
        yields ``(0.0, 0.0, 0.0)`` instead of failing while unzipping.
        """
        pairs = list(gold_and_pred)
        if not pairs:
            # zip(*[]) produces nothing to unpack; report zeros instead.
            return 0.0, 0.0, 0.0
        golds, preds = zip(*pairs)
        precision, recall, f1, _ = precision_recall_fscore_support(
            golds, preds, average="binary"
        )
        return precision, recall, f1

    def f1_score(self, gold_and_pred):
        """Return the binary F1 score over (gold, pred) pairs."""
        return self._binary_prf(gold_and_pred)[2]

    def precision_score(self, gold_and_pred):
        """Return the binary precision over (gold, pred) pairs."""
        return self._binary_prf(gold_and_pred)[0]

    def recall_score(self, gold_and_pred):
        """Return the binary recall over (gold, pred) pairs."""
        return self._binary_prf(gold_and_pred)[1]