From 029f804fa2ffd44d9edd70dc94a049b5d1730d31 Mon Sep 17 00:00:00 2001
From: Andrii Pervashov
Date: Tue, 8 Apr 2025 17:57:52 +0000
Subject: [PATCH] Upload files to "/"

---
 load.py   |  49 +++++++++++++++++++
 metric.py | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 train.py  |  77 +++++++++++++++++++++++++++
 3 files changed, 267 insertions(+)
 create mode 100644 load.py
 create mode 100644 metric.py
 create mode 100644 train.py

diff --git a/load.py b/load.py
new file mode 100644
index 0000000..9e68745
--- /dev/null
+++ b/load.py
@@ -0,0 +1,49 @@
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+os.environ['WANDB_DISABLED'] = 'true'
+import torch
+from tqdm import tqdm
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import ByT5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
+from sklearn.model_selection import train_test_split
+
+
+def load_model(model_path):
+    tokenizer = ByT5Tokenizer.from_pretrained(model_path)
+    model = T5ForConditionalGeneration.from_pretrained(model_path)
+    if torch.cuda.is_available():
+        model = model.cuda()
+    return tokenizer, model
+
+def correct_sentence(tokenizer, model, sentence):
+    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=4096)
+    if torch.cuda.is_available():
+        inputs = {k: v.cuda() for k, v in inputs.items()}
+    output_sequences = model.generate(
+        input_ids=inputs["input_ids"],
+        attention_mask=inputs["attention_mask"],
+        max_length=4096,
+    )
+    corrected = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
+    return corrected
+
+def process_and_save_corrections(input_file_path, output_file_path, tokenizer, model):
+    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
+         open(output_file_path, 'w', encoding='utf-8') as output_file:
+        sentences = input_file.readlines()
+        for sentence in tqdm(sentences, desc="Processing sentences"):
+            sentence = sentence.strip()
+            if sentence:
+                corrected = correct_sentence(tokenizer, model, sentence)
+                output_file.write(corrected + "\n")
+                output_file.flush()
+
+if __name__ == "__main__":
+    model_path = "./fine_tuned_model"
+    input_file_path = "./test_incorrect.txt"
+    output_file_path = "./test_correct_model.txt"
+
+    tokenizer, model = load_model(model_path)
+    process_and_save_corrections(input_file_path, output_file_path, tokenizer, model)
+    print("Correction process completed. Corrected sentences saved to", output_file_path)
diff --git a/metric.py b/metric.py
new file mode 100644
index 0000000..8f2d991
--- /dev/null
+++ b/metric.py
@@ -0,0 +1,141 @@
+
+import sys
+import sacrebleu
+import collections
+import numpy as np
+import time
+
+
+def ngram_counts(text, max_n=4):
+
+    counts = collections.defaultdict(int)
+    for n in range(1, max_n + 1):
+        for i in range(len(text) - n + 1):
+            ngram = tuple(text[i:i+n])
+            counts[ngram] += 1
+    return counts
+
+def gleu_score(reference, hypothesis, max_n=4):
+
+    ref_counts = ngram_counts(reference.split(), max_n)
+    hyp_counts = ngram_counts(hypothesis.split(), max_n)
+
+    overlap = sum(min(count, hyp_counts[gram]) for gram, count in ref_counts.items())
+
+    hyp_count_sum = sum(hyp_counts.values())
+    ref_count_sum = sum(ref_counts.values())
+
+    precision = overlap / hyp_count_sum if hyp_count_sum > 0 else 0
+    recall = overlap / ref_count_sum if ref_count_sum > 0 else 0
+
+    return min(precision, recall)
+
+def fbeta_score(reference, hypothesis, beta=0.5, max_n=4):
+
+    ref_counts = ngram_counts(reference.split(), max_n)
+    hyp_counts = ngram_counts(hypothesis.split(), max_n)
+
+    overlap = sum(min(count, hyp_counts[gram]) for gram, count in ref_counts.items())
+
+    hyp_count_sum = sum(hyp_counts.values())
+    ref_count_sum = sum(ref_counts.values())
+
+    precision = overlap / hyp_count_sum if hyp_count_sum > 0 else 0
+    recall = overlap / ref_count_sum if ref_count_sum > 0 else 0
+
+    if precision + recall == 0:
+        return 0.0
+    else:
+        return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
+
+def edit_distance(ref, hyp):
+
+    d = np.zeros((len(ref) + 1) * (len(hyp) + 1), dtype=np.int32)
+    d = d.reshape((len(ref) + 1, len(hyp) + 1))
+    for i in range(len(ref) + 1):
+        for j in range(len(hyp) + 1):
+            if i == 0:
+                d[i][j] = j
+            elif j == 0:
+                d[i][j] = i
+            elif ref[i - 1] == hyp[j - 1]:
+                d[i][j] = d[i - 1][j - 1]
+            else:
+                d[i][j] = 1 + min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1])
+    return d[len(ref)][len(hyp)]
+
+def wer(reference, hypothesis):
+
+    ref_words = reference.split()
+    if len(ref_words) == 0:
+        return 1.0
+    hyp_words = hypothesis.split()
+    distance = edit_distance(ref_words, hyp_words)
+    return distance / len(ref_words)
+
+def cer(reference, hypothesis):
+
+    ref_chars = list(reference)
+    if len(ref_chars) == 0:
+        return 1.0
+    hyp_chars = list(hypothesis)
+    distance = edit_distance(ref_chars, hyp_chars)
+    return distance / len(ref_chars)
+
+def accuracy(refs, preds):
+
+    exact_matches = sum(1 for ref, pred in zip(refs, preds) if ref == pred)
+    return exact_matches / len(refs) if len(refs) > 0 else 0
+
+def ser(refs, preds):
+
+    sentence_errors = sum(1 for ref, pred in zip(refs, preds) if ref != pred)
+    return sentence_errors / len(refs) if len(refs) > 0 else 0
+
+def main(target_test, target_pred):
+    start_time = time.time()
+
+    refs = []
+    preds = []
+
+    with open(target_test) as test:
+        for line in test:
+            line = line.strip()
+            refs.append(line)
+
+    with open(target_pred) as pred:
+        for line in pred:
+            line = line.strip()
+            preds.append(line)
+
+
+    gleu_scores = [gleu_score(refs[i], preds[i]) for i in range(len(refs))]
+    average_gleu = np.mean(gleu_scores)
+    print("Average GLEU: {:.2f}%".format(average_gleu * 100))
+
+    fbeta_scores = [fbeta_score(refs[i], preds[i]) for i in range(len(refs))]
+    average_fbeta = np.mean(fbeta_scores)
+    print("Average F0.5 Score: {:.2f}%".format(average_fbeta * 100))
+
+    wer_scores = [wer(refs[i], preds[i]) for i in range(len(refs))]
+    average_wer = np.mean(wer_scores)
print("Average WER: {:.2f}%".format(average_wer * 100)) + + cer_scores = [cer(refs[i], preds[i]) for i in range(len(refs))] + average_cer = np.mean(cer_scores) + print("Average CER: {:.2f}%".format(average_cer * 100)) + + accuracy_score = accuracy(refs, preds) + print("Accuracy: {:.2f}%".format(accuracy_score * 100)) + + ser_score = ser(refs, preds) + print("SER: {:.2f}%".format(ser_score * 100)) + + end_time = time.time() + print(f"Execution Time: {end_time - start_time:.2f} seconds") + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python script.py target_test target_pred") + else: + main(sys.argv[1], sys.argv[2]) diff --git a/train.py b/train.py new file mode 100644 index 0000000..461cb2c --- /dev/null +++ b/train.py @@ -0,0 +1,77 @@ +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' +os.environ['WANDB_DISABLED'] = 'true' +import pandas as pd +from datasets import Dataset +from transformers import ByT5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments +from sklearn.model_selection import train_test_split + +df = pd.read_csv("dataset_file_name.csv", sep=";") + +df.dropna(subset=['incorrect', 'correct'], inplace=True) +df['incorrect'] = df['incorrect'].astype(str) +df['correct'] = df['correct'].astype(str) + +train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) +val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42) + +train_dataset = Dataset.from_pandas(train_df) +val_dataset = Dataset.from_pandas(val_df) +test_dataset = Dataset.from_pandas(test_df) + +tokenizer = ByT5Tokenizer.from_pretrained("your-model/name") +model = T5ForConditionalGeneration.from_pretrained("your-model/name") + +def preprocess_function(examples): + input_texts = examples["incorrect"] + target_texts = examples["correct"] + model_inputs = tokenizer(input_texts, max_length=128, truncation=True, padding="max_length") + with tokenizer.as_target_tokenizer(): + labels = tokenizer(target_texts, max_length=128, truncation=True, padding="max_length") + model_inputs["labels"] = labels["input_ids"] + return model_inputs + +tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True) +tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True) +tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True) + +tokenized_train_dataset.save_to_disk("./tokenized_train_dataset") +tokenized_val_dataset.save_to_disk("./tokenized_val_dataset") +tokenized_test_dataset.save_to_disk("./tokenized_test_dataset") + +def save_sentences_to_separate_files(df, incorrect_file_path, correct_file_path): + with open(incorrect_file_path, "w", encoding="utf-8") as incorrect_file, \ + open(correct_file_path, "w", encoding="utf-8") as correct_file: + for index, row in df.iterrows(): + incorrect_file.write(row["incorrect"] + "\n") + correct_file.write(row["correct"] + "\n") + +save_sentences_to_separate_files(train_df, "./train_incorrect.txt", "./train_correct.txt") +save_sentences_to_separate_files(val_df, "./val_incorrect.txt", "./val_correct.txt") +save_sentences_to_separate_files(test_df, "./test_incorrect.txt", "./test_correct.txt") + +training_args = TrainingArguments( + output_dir="./results", + num_train_epochs=3, + per_device_train_batch_size=12, + warmup_steps=500, + weight_decay=0.01, + logging_dir="./logs", + logging_steps=10, + evaluation_strategy="epoch", +) + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_train_dataset, + eval_dataset=tokenized_val_dataset, ) + +trainer.train() + 
+print("Evaluation on the test set:") +trainer.evaluate(tokenized_test_dataset) + +model_path = "./fine_tuned_model" +model.save_pretrained(model_path) +tokenizer.save_pretrained(model_path)