bakalarka_praca/lm-eval-harness/slovak_toxic_classification/__init__.py

73 lines
2.0 KiB
Python

from lm_eval.api.task import Task
from datasets import load_dataset
from sklearn.metrics import precision_recall_fscore_support
class SlovakToxicClassification(Task):
    """Binary toxic / hate-speech classification task for Slovak text.

    Wraps the ``TUKE-KEMT/hate_speech_slovak`` dataset and scores a model
    by comparing the loglikelihood of the continuations ``" 0"``
    (non-toxic) and ``" 1"`` (toxic) after the raw text prompt.  Reports
    binary precision, recall and F1 aggregated over the evaluation split.
    """

    VERSION = 1
    DATASET_PATH = "TUKE-KEMT/hate_speech_slovak"
    DATASET_NAME = None

    def __init__(self):
        # NOTE(review): deliberately skips super().__init__() and loads the
        # dataset directly, bypassing the base Task download machinery —
        # confirm this matches the installed harness version.
        self.dataset = load_dataset(self.DATASET_PATH)

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def validation_docs(self):
        # The dataset appears to ship no separate validation split, so the
        # test split doubles as validation — TODO confirm against the
        # dataset card.
        return list(self.dataset["test"])

    def test_docs(self):
        return list(self.dataset["test"])

    def doc_to_text(self, doc):
        """Return the raw document text as the prompt."""
        return doc["text"]

    def doc_to_target(self, doc):
        """Return the gold label as an int (0 = non-toxic, 1 = toxic)."""
        return int(doc["label"])

    def construct_requests(self, doc, ctx):
        # NOTE(review): ``self.loglikelihood`` is not part of the standard
        # Task API (older harnesses use ``rf.loglikelihood``, newer ones
        # return Instance objects) — verify against the installed version.
        return self.loglikelihood(ctx, " 0"), self.loglikelihood(ctx, " 1")

    def process_results(self, doc, results):
        """Map the two continuation loglikelihoods to a (gold, pred) pair.

        Prediction is 1 (toxic) iff the ``" 1"`` continuation is more
        likely.  The same pair is emitted under every metric key so each
        aggregator receives the full list of (gold, pred) tuples.
        """
        # NOTE(review): harness loglikelihood requests typically resolve to
        # (logprob, is_greedy) tuples; if so, comparing them directly is
        # lexicographic tuple comparison — confirm results are bare floats.
        ll0, ll1 = results
        pred = int(ll1 > ll0)
        gold = self.doc_to_target(doc)
        return {
            "f1": (gold, pred),
            "precision": (gold, pred),
            "recall": (gold, pred),
        }

    def aggregation(self):
        """Map each metric key to its aggregation callable."""
        return {
            "f1": self.f1_score,
            "precision": self.precision_score,
            "recall": self.recall_score,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "precision": True,
            "recall": True,
        }

    def _prf(self, gold_and_pred):
        """Compute (precision, recall, f1) once for a list of (gold, pred).

        Returns all-zero scores for an empty input instead of letting
        ``zip(*[])`` raise, and pins undefined scores to 0 via
        ``zero_division=0`` (the value sklearn would return anyway, minus
        the UndefinedMetricWarning).
        """
        if not gold_and_pred:
            return 0.0, 0.0, 0.0
        golds, preds = zip(*gold_and_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            golds, preds, average="binary", zero_division=0
        )
        return precision, recall, f1

    def f1_score(self, gold_and_pred):
        """Binary F1 over accumulated (gold, pred) pairs."""
        return self._prf(gold_and_pred)[2]

    def precision_score(self, gold_and_pred):
        """Binary precision over accumulated (gold, pred) pairs."""
        return self._prf(gold_and_pred)[0]

    def recall_score(self, gold_and_pred):
        """Binary recall over accumulated (gold, pred) pairs."""
        return self._prf(gold_and_pred)[1]