bkpc/metric.py
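"""Sentence-level evaluation metrics for reference/prediction file pairs.

Compares two files line by line and reports GLEU, n-gram F0.5, word error
rate (WER), character error rate (CER), exact-match accuracy, and sentence
error rate (SER).
"""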


import collections
import sys
import time

import numpy as np

def ngram_counts(tokens, max_n=4):
    """Count all n-grams of length 1..max_n in a sequence of tokens."""
    counts = collections.defaultdict(int)
    for n in range(1, max_n + 1):
        for i in range(len(tokens) - n + 1):
            counts[tuple(tokens[i:i + n])] += 1
    return counts

def gleu_score(reference, hypothesis, max_n=4):
    """Sentence-level GLEU: the minimum of n-gram precision and recall."""
    ref_counts = ngram_counts(reference.split(), max_n)
    hyp_counts = ngram_counts(hypothesis.split(), max_n)
    overlap = sum(min(count, hyp_counts.get(gram, 0)) for gram, count in ref_counts.items())
    hyp_count_sum = sum(hyp_counts.values())
    ref_count_sum = sum(ref_counts.values())
    precision = overlap / hyp_count_sum if hyp_count_sum > 0 else 0.0
    recall = overlap / ref_count_sum if ref_count_sum > 0 else 0.0
    return min(precision, recall)
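
# Hand-worked example: for reference "a b c" and hypothesis "a b d", the matching
# n-grams are "a", "b", and "a b", so precision = recall = 3/6 and the returned
# GLEU is 0.5.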

def fbeta_score(reference, hypothesis, beta=0.5, max_n=4):
    """N-gram F-beta score; beta=0.5 weights precision more heavily than recall."""
    ref_counts = ngram_counts(reference.split(), max_n)
    hyp_counts = ngram_counts(hypothesis.split(), max_n)
    overlap = sum(min(count, hyp_counts.get(gram, 0)) for gram, count in ref_counts.items())
    hyp_count_sum = sum(hyp_counts.values())
    ref_count_sum = sum(ref_counts.values())
    precision = overlap / hyp_count_sum if hyp_count_sum > 0 else 0.0
    recall = overlap / ref_count_sum if ref_count_sum > 0 else 0.0
    if precision + recall == 0:
        return 0.0
    return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

def edit_distance(ref, hyp):
    """Levenshtein distance between two sequences, computed by dynamic programming."""
    d = np.zeros((len(ref) + 1, len(hyp) + 1), dtype=np.int32)
    for i in range(len(ref) + 1):
        for j in range(len(hyp) + 1):
            if i == 0:
                d[i][j] = j
            elif j == 0:
                d[i][j] = i
            elif ref[i - 1] == hyp[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                d[i][j] = 1 + min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1])
    return d[len(ref)][len(hyp)]

def wer(reference, hypothesis):
    """Word error rate: word-level edit distance divided by the reference length."""
    ref_words = reference.split()
    if len(ref_words) == 0:
        return 1.0
    hyp_words = hypothesis.split()
    return edit_distance(ref_words, hyp_words) / len(ref_words)

def cer(reference, hypothesis):
    """Character error rate: character-level edit distance divided by the reference length."""
    ref_chars = list(reference)
    if len(ref_chars) == 0:
        return 1.0
    hyp_chars = list(hypothesis)
    return edit_distance(ref_chars, hyp_chars) / len(ref_chars)

def accuracy(refs, preds):
    """Exact-match accuracy: fraction of predictions identical to their reference."""
    exact_matches = sum(1 for ref, pred in zip(refs, preds) if ref == pred)
    return exact_matches / len(refs) if len(refs) > 0 else 0.0

def ser(refs, preds):
    """Sentence error rate: fraction of predictions that differ from their reference."""
    sentence_errors = sum(1 for ref, pred in zip(refs, preds) if ref != pred)
    return sentence_errors / len(refs) if len(refs) > 0 else 0.0

def main(target_test, target_pred):
    """Read reference and prediction files (one sentence per line) and print all metrics."""
    start_time = time.time()
    with open(target_test) as test:
        refs = [line.strip() for line in test]
    with open(target_pred) as pred:
        preds = [line.strip() for line in pred]
    if len(refs) != len(preds):
        sys.exit("Error: reference and prediction files have different numbers of lines.")
    gleu_scores = [gleu_score(ref, pred) for ref, pred in zip(refs, preds)]
    average_gleu = np.mean(gleu_scores)
    print("Average GLEU: {:.2f}%".format(average_gleu * 100))
    fbeta_scores = [fbeta_score(ref, pred) for ref, pred in zip(refs, preds)]
    average_fbeta = np.mean(fbeta_scores)
    print("Average F0.5 Score: {:.2f}%".format(average_fbeta * 100))
    wer_scores = [wer(ref, pred) for ref, pred in zip(refs, preds)]
    average_wer = np.mean(wer_scores)
    print("Average WER: {:.2f}%".format(average_wer * 100))
    cer_scores = [cer(ref, pred) for ref, pred in zip(refs, preds)]
    average_cer = np.mean(cer_scores)
    print("Average CER: {:.2f}%".format(average_cer * 100))
    accuracy_score = accuracy(refs, preds)
    print("Accuracy: {:.2f}%".format(accuracy_score * 100))
    ser_score = ser(refs, preds)
    print("SER: {:.2f}%".format(ser_score * 100))
    end_time = time.time()
    print(f"Execution Time: {end_time - start_time:.2f} seconds")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python metric.py target_test target_pred")
    else:
        main(sys.argv[1], sys.argv[2])