"""Evaluate predicted sentences against references with GLEU, F0.5, WER, CER,
exact-match accuracy, and SER. Both input files must contain one sentence per line."""

import sys
import collections
import time

import numpy as np


def ngram_counts(tokens, max_n=4):
    """Count all n-grams of order 1..max_n in a token sequence."""
    counts = collections.defaultdict(int)
    for n in range(1, max_n + 1):
        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i + n])
            counts[ngram] += 1
    return counts


def gleu_score(reference, hypothesis, max_n=4):
    """Sentence-level GLEU: the minimum of n-gram precision and recall."""
    ref_counts = ngram_counts(reference.split(), max_n)
    hyp_counts = ngram_counts(hypothesis.split(), max_n)
    overlap = sum(min(count, hyp_counts[gram]) for gram, count in ref_counts.items())
    hyp_count_sum = sum(hyp_counts.values())
    ref_count_sum = sum(ref_counts.values())
    precision = overlap / hyp_count_sum if hyp_count_sum > 0 else 0
    recall = overlap / ref_count_sum if ref_count_sum > 0 else 0
    return min(precision, recall)


def fbeta_score(reference, hypothesis, beta=0.5, max_n=4):
    """N-gram F-beta score; beta=0.5 weights precision more heavily than recall."""
    ref_counts = ngram_counts(reference.split(), max_n)
    hyp_counts = ngram_counts(hypothesis.split(), max_n)
    overlap = sum(min(count, hyp_counts[gram]) for gram, count in ref_counts.items())
    hyp_count_sum = sum(hyp_counts.values())
    ref_count_sum = sum(ref_counts.values())
    precision = overlap / hyp_count_sum if hyp_count_sum > 0 else 0
    recall = overlap / ref_count_sum if ref_count_sum > 0 else 0
    if precision + recall == 0:
        return 0.0
    return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)


def edit_distance(ref, hyp):
    """Levenshtein distance between two sequences via dynamic programming."""
    d = np.zeros((len(ref) + 1, len(hyp) + 1), dtype=np.int32)
    for i in range(len(ref) + 1):
        for j in range(len(hyp) + 1):
            if i == 0:
                d[i][j] = j
            elif j == 0:
                d[i][j] = i
            elif ref[i - 1] == hyp[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                d[i][j] = 1 + min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1])
    return d[len(ref)][len(hyp)]


def wer(reference, hypothesis):
    """Word error rate: word-level edit distance normalized by reference length."""
    ref_words = reference.split()
    if len(ref_words) == 0:
        return 1.0
    hyp_words = hypothesis.split()
    return edit_distance(ref_words, hyp_words) / len(ref_words)


def cer(reference, hypothesis):
    """Character error rate: character-level edit distance normalized by reference length."""
    ref_chars = list(reference)
    if len(ref_chars) == 0:
        return 1.0
    hyp_chars = list(hypothesis)
    return edit_distance(ref_chars, hyp_chars) / len(ref_chars)


def accuracy(refs, preds):
    """Fraction of predictions that match their reference exactly."""
    exact_matches = sum(1 for ref, pred in zip(refs, preds) if ref == pred)
    return exact_matches / len(refs) if len(refs) > 0 else 0


def ser(refs, preds):
    """Sentence error rate: fraction of predictions that differ from their reference."""
    sentence_errors = sum(1 for ref, pred in zip(refs, preds) if ref != pred)
    return sentence_errors / len(refs) if len(refs) > 0 else 0


def main(target_test, target_pred):
    start_time = time.time()

    with open(target_test) as test:
        refs = [line.strip() for line in test]
    with open(target_pred) as pred:
        preds = [line.strip() for line in pred]

    if len(refs) != len(preds):
        sys.exit("Error: reference and prediction files have different numbers of lines.")

    gleu_scores = [gleu_score(ref, pred) for ref, pred in zip(refs, preds)]
    print("Average GLEU: {:.2f}%".format(np.mean(gleu_scores) * 100))

    fbeta_scores = [fbeta_score(ref, pred) for ref, pred in zip(refs, preds)]
    print("Average F0.5 Score: {:.2f}%".format(np.mean(fbeta_scores) * 100))

    wer_scores = [wer(ref, pred) for ref, pred in zip(refs, preds)]
    print("Average WER: {:.2f}%".format(np.mean(wer_scores) * 100))

    cer_scores = [cer(ref, pred) for ref, pred in zip(refs, preds)]
    print("Average CER: {:.2f}%".format(np.mean(cer_scores) * 100))

    print("Accuracy: {:.2f}%".format(accuracy(refs, preds) * 100))
    print("SER: {:.2f}%".format(ser(refs, preds) * 100))

    end_time = time.time()
    print(f"Execution Time: {end_time - start_time:.2f} seconds")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py target_test target_pred")
    else:
        main(sys.argv[1], sys.argv[2])
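# Example usage (a sketch; the file names below are assumptions, not part of the
# script): the reference and prediction files must be aligned line by line.
#
#   python script.py data/target_test.txt data/target_pred.txt
#
# The metric functions can also be sanity-checked directly, e.g. a hypothesis
# that differs from a six-word reference by one substitution gives WER = 1/6:
#
#   >>> round(wer("the cat sat on the mat", "the cat sat on a mat"), 4)
#   0.1667
#   >>> accuracy(["a", "b"], ["a", "c"])
#   0.5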