From 029f804fa2ffd44d9edd70dc94a049b5d1730d31 Mon Sep 17 00:00:00 2001
From: Andrii Pervashov
Date: Tue, 8 Apr 2025 17:57:52 +0000
Subject: [PATCH] Upload files to "/"

---
 load.py   |  49 +++++++++++++++++++
 metric.py | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 train.py  |  77 +++++++++++++++++++++++++++
 3 files changed, 267 insertions(+)
 create mode 100644 load.py
 create mode 100644 metric.py
 create mode 100644 train.py

diff --git a/load.py b/load.py
new file mode 100644
index 0000000..9e68745
--- /dev/null
+++ b/load.py
@@ -0,0 +1,49 @@
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+os.environ['WANDB_DISABLED'] = 'true'
+import torch
+from tqdm import tqdm
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import ByT5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
+from sklearn.model_selection import train_test_split
+
+
+def load_model(model_path):
+    tokenizer = ByT5Tokenizer.from_pretrained(model_path)
+    model = T5ForConditionalGeneration.from_pretrained(model_path)
+    if torch.cuda.is_available():
+        model = model.cuda()
+    return tokenizer, model
+
+def correct_sentence(tokenizer, model, sentence):
+    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=4096)
+    if torch.cuda.is_available():
+        inputs = {k: v.cuda() for k, v in inputs.items()}
+    output_sequences = model.generate(
+        input_ids=inputs["input_ids"],
+        attention_mask=inputs["attention_mask"],
+        max_length=4096,
+    )
+    corrected = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
+    return corrected
+
+def process_and_save_corrections(input_file_path, output_file_path, tokenizer, model):
+    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
+         open(output_file_path, 'w', encoding='utf-8') as output_file:
+        sentences = input_file.readlines()
+        for sentence in tqdm(sentences, desc="Processing sentences"):
+            sentence = sentence.strip()
+            if sentence:
+                corrected = correct_sentence(tokenizer, model, sentence)
+                output_file.write(corrected + "\n")
+                output_file.flush()
+
+if __name__ == "__main__":
+    model_path = "./fine_tuned_model"
+    input_file_path = "./test_incorrect.txt"
+    output_file_path = "./test_correct_model.txt"
+
+    tokenizer, model = load_model(model_path)
+    process_and_save_corrections(input_file_path, output_file_path, tokenizer, model)
+    print("Correction process completed. Corrected sentences saved to", output_file_path)
diff --git a/metric.py b/metric.py
new file mode 100644
index 0000000..8f2d991
--- /dev/null
+++ b/metric.py
@@ -0,0 +1,141 @@
+
+import sys
+import sacrebleu
+import collections
+import numpy as np
+import time
+
+
+def ngram_counts(text, max_n=4):
+
+    counts = collections.defaultdict(int)
+    for n in range(1, max_n + 1):
+        for i in range(len(text) - n + 1):
+            ngram = tuple(text[i:i+n])
+            counts[ngram] += 1
+    return counts
+
+def gleu_score(reference, hypothesis, max_n=4):
+
+    ref_counts = ngram_counts(reference.split(), max_n)
+    hyp_counts = ngram_counts(hypothesis.split(), max_n)
+
+    overlap = sum(min(count, hyp_counts[gram]) for gram, count in ref_counts.items())
+
+    hyp_count_sum = sum(hyp_counts.values())
+    ref_count_sum = sum(ref_counts.values())
+
+    precision = overlap / hyp_count_sum if hyp_count_sum > 0 else 0
+    recall = overlap / ref_count_sum if ref_count_sum > 0 else 0
+
+    return min(precision, recall)
+
+def fbeta_score(reference, hypothesis, beta=0.5, max_n=4):
+
+    ref_counts = ngram_counts(reference.split(), max_n)
+    hyp_counts = ngram_counts(hypothesis.split(), max_n)
+
+    overlap = sum(min(count, hyp_counts[gram]) for gram, count in ref_counts.items())
+
+    hyp_count_sum = sum(hyp_counts.values())
+    ref_count_sum = sum(ref_counts.values())
+
+    precision = overlap / hyp_count_sum if hyp_count_sum > 0 else 0
+    recall = overlap / ref_count_sum if ref_count_sum > 0 else 0
+
+    if precision + recall == 0:
+        return 0.0
+    else:
+        return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
+
+def edit_distance(ref, hyp):
+
+    d = np.zeros((len(ref) + 1) * (len(hyp) + 1), dtype=np.int32)
+    d = d.reshape((len(ref) + 1, len(hyp) + 1))
+    for i in range(len(ref) + 1):
+        for j in range(len(hyp) + 1):
+            if i == 0:
+                d[i][j] = j
+            elif j == 0:
+                d[i][j] = i
+            elif ref[i - 1] == hyp[j - 1]:
+                d[i][j] = d[i - 1][j - 1]
+            else:
+                d[i][j] = 1 + min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1])
+    return d[len(ref)][len(hyp)]
+
+def wer(reference, hypothesis):
+
+    ref_words = reference.split()
+    if len(ref_words) == 0:
+        return 1.0
+    hyp_words = hypothesis.split()
+    distance = edit_distance(ref_words, hyp_words)
+    return distance / len(ref_words)
+
+def cer(reference, hypothesis):
+
+    ref_chars = list(reference)
+    if len(ref_chars) == 0:
+        return 1.0
+    hyp_chars = list(hypothesis)
+    distance = edit_distance(ref_chars, hyp_chars)
+    return distance / len(ref_chars)
+
+def accuracy(refs, preds):
+
+    exact_matches = sum(1 for ref, pred in zip(refs, preds) if ref == pred)
+    return exact_matches / len(refs) if len(refs) > 0 else 0
+
+def ser(refs, preds):
+
+    sentence_errors = sum(1 for ref, pred in zip(refs, preds) if ref != pred)
+    return sentence_errors / len(refs) if len(refs) > 0 else 0
+
+def main(target_test, target_pred):
+    start_time = time.time()
+
+    refs = []
+    preds = []
+
+    with open(target_test) as test:
+        for line in test:
+            line = line.strip()
+            refs.append(line)
+
+    with open(target_pred) as pred:
+        for line in pred:
+            line = line.strip()
+            preds.append(line)
+
+
+    gleu_scores = [gleu_score(refs[i], preds[i]) for i in range(len(refs))]
+    average_gleu = np.mean(gleu_scores)
+    print("Average GLEU: {:.2f}%".format(average_gleu * 100))
+
+    fbeta_scores = [fbeta_score(refs[i], preds[i]) for i in range(len(refs))]
+    average_fbeta = np.mean(fbeta_scores)
+    print("Average F0.5 Score: {:.2f}%".format(average_fbeta * 100))
+
+    wer_scores = [wer(refs[i], preds[i]) for i in range(len(refs))]
+    average_wer = np.mean(wer_scores)
print("Average WER: {:.2f}%".format(average_wer * 100)) + + cer_scores = [cer(refs[i], preds[i]) for i in range(len(refs))] + average_cer = np.mean(cer_scores) + print("Average CER: {:.2f}%".format(average_cer * 100)) + + accuracy_score = accuracy(refs, preds) + print("Accuracy: {:.2f}%".format(accuracy_score * 100)) + + ser_score = ser(refs, preds) + print("SER: {:.2f}%".format(ser_score * 100)) + + end_time = time.time() + print(f"Execution Time: {end_time - start_time:.2f} seconds") + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python script.py target_test target_pred") + else: + main(sys.argv[1], sys.argv[2]) diff --git a/train.py b/train.py new file mode 100644 index 0000000..461cb2c --- /dev/null +++ b/train.py @@ -0,0 +1,77 @@ +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' +os.environ['WANDB_DISABLED'] = 'true' +import pandas as pd +from datasets import Dataset +from transformers import ByT5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments +from sklearn.model_selection import train_test_split + +df = pd.read_csv("dataset_file_name.csv", sep=";") + +df.dropna(subset=['incorrect', 'correct'], inplace=True) +df['incorrect'] = df['incorrect'].astype(str) +df['correct'] = df['correct'].astype(str) + +train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) +val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42) + +train_dataset = Dataset.from_pandas(train_df) +val_dataset = Dataset.from_pandas(val_df) +test_dataset = Dataset.from_pandas(test_df) + +tokenizer = ByT5Tokenizer.from_pretrained("your-model/name") +model = T5ForConditionalGeneration.from_pretrained("your-model/name") + +def preprocess_function(examples): + input_texts = examples["incorrect"] + target_texts = examples["correct"] + model_inputs = tokenizer(input_texts, max_length=128, truncation=True, padding="max_length") + with tokenizer.as_target_tokenizer(): + labels = tokenizer(target_texts, max_length=128, truncation=True, padding="max_length") + model_inputs["labels"] = labels["input_ids"] + return model_inputs + +tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True) +tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True) +tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True) + +tokenized_train_dataset.save_to_disk("./tokenized_train_dataset") +tokenized_val_dataset.save_to_disk("./tokenized_val_dataset") +tokenized_test_dataset.save_to_disk("./tokenized_test_dataset") + +def save_sentences_to_separate_files(df, incorrect_file_path, correct_file_path): + with open(incorrect_file_path, "w", encoding="utf-8") as incorrect_file, \ + open(correct_file_path, "w", encoding="utf-8") as correct_file: + for index, row in df.iterrows(): + incorrect_file.write(row["incorrect"] + "\n") + correct_file.write(row["correct"] + "\n") + +save_sentences_to_separate_files(train_df, "./train_incorrect.txt", "./train_correct.txt") +save_sentences_to_separate_files(val_df, "./val_incorrect.txt", "./val_correct.txt") +save_sentences_to_separate_files(test_df, "./test_incorrect.txt", "./test_correct.txt") + +training_args = TrainingArguments( + output_dir="./results", + num_train_epochs=3, + per_device_train_batch_size=12, + warmup_steps=500, + weight_decay=0.01, + logging_dir="./logs", + logging_steps=10, + evaluation_strategy="epoch", +) + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_train_dataset, + eval_dataset=tokenized_val_dataset, ) + +trainer.train() + 
+print("Evaluation on the test set:") +trainer.evaluate(tokenized_test_dataset) + +model_path = "./fine_tuned_model" +model.save_pretrained(model_path) +tokenizer.save_pretrained(model_path)