Upload files to "/"
commit 029f804fa2
load.py (new file, 49 lines)
@@ -0,0 +1,49 @@
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['WANDB_DISABLED'] = 'true'

import torch
from tqdm import tqdm
from transformers import ByT5Tokenizer, T5ForConditionalGeneration


def load_model(model_path):
    # Load the fine-tuned ByT5 tokenizer and model from disk, moving the model to GPU if available.
    tokenizer = ByT5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    if torch.cuda.is_available():
        model = model.cuda()
    return tokenizer, model


def correct_sentence(tokenizer, model, sentence):
    # Tokenize a single sentence and generate its corrected version.
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=4096)
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    output_sequences = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=4096,
    )
    corrected = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    return corrected


def process_and_save_corrections(input_file_path, output_file_path, tokenizer, model):
    # Correct every non-empty line of the input file and write the result line by line.
    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
         open(output_file_path, 'w', encoding='utf-8') as output_file:
        sentences = input_file.readlines()
        for sentence in tqdm(sentences, desc="Processing sentences"):
            sentence = sentence.strip()
            if sentence:
                corrected = correct_sentence(tokenizer, model, sentence)
                output_file.write(corrected + "\n")
                output_file.flush()


if __name__ == "__main__":
    model_path = "./fine_tuned_model"
    input_file_path = "./test_incorrect.txt"
    output_file_path = "./test_correct_model.txt"

    tokenizer, model = load_model(model_path)
    process_and_save_corrections(input_file_path, output_file_path, tokenizer, model)
    print("Correction process completed. Corrected sentences saved to", output_file_path)
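load.py runs generation one sentence at a time. For longer test files the same model can be driven in batches; the following is a minimal sketch of such a variant (the function name and the batch_size argument are assumptions for illustration, not part of this commit):

# Hypothetical batched variant of correct_sentence from load.py.
def correct_sentences_batched(tokenizer, model, sentences, batch_size=16):
    corrected = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=4096)
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=4096,
        )
        corrected.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return corrected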
metric.py (new file, 141 lines)
@@ -0,0 +1,141 @@
import sys
import collections
import time

import numpy as np


def ngram_counts(text, max_n=4):
    # Count all n-grams of order 1..max_n in a token sequence.
    counts = collections.defaultdict(int)
    for n in range(1, max_n + 1):
        for i in range(len(text) - n + 1):
            ngram = tuple(text[i:i+n])
            counts[ngram] += 1
    return counts


def gleu_score(reference, hypothesis, max_n=4):
    # Sentence-level GLEU: the minimum of n-gram precision and recall.
    ref_counts = ngram_counts(reference.split(), max_n)
    hyp_counts = ngram_counts(hypothesis.split(), max_n)

    overlap = sum(min(count, hyp_counts[gram]) for gram, count in ref_counts.items())

    hyp_count_sum = sum(hyp_counts.values())
    ref_count_sum = sum(ref_counts.values())

    precision = overlap / hyp_count_sum if hyp_count_sum > 0 else 0
    recall = overlap / ref_count_sum if ref_count_sum > 0 else 0

    return min(precision, recall)


def fbeta_score(reference, hypothesis, beta=0.5, max_n=4):
    # F-beta over n-gram overlap; beta=0.5 emphasizes precision over recall.
    ref_counts = ngram_counts(reference.split(), max_n)
    hyp_counts = ngram_counts(hypothesis.split(), max_n)

    overlap = sum(min(count, hyp_counts[gram]) for gram, count in ref_counts.items())

    hyp_count_sum = sum(hyp_counts.values())
    ref_count_sum = sum(ref_counts.values())

    precision = overlap / hyp_count_sum if hyp_count_sum > 0 else 0
    recall = overlap / ref_count_sum if ref_count_sum > 0 else 0

    if precision + recall == 0:
        return 0.0
    else:
        return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)


def edit_distance(ref, hyp):
    # Levenshtein distance between two sequences via dynamic programming.
    d = np.zeros((len(ref) + 1) * (len(hyp) + 1), dtype=np.int32)
    d = d.reshape((len(ref) + 1, len(hyp) + 1))
    for i in range(len(ref) + 1):
        for j in range(len(hyp) + 1):
            if i == 0:
                d[i][j] = j
            elif j == 0:
                d[i][j] = i
            elif ref[i - 1] == hyp[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                d[i][j] = 1 + min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1])
    return d[len(ref)][len(hyp)]


def wer(reference, hypothesis):
    # Word error rate: word-level edit distance normalized by reference length.
    ref_words = reference.split()
    if len(ref_words) == 0:
        return 1.0
    hyp_words = hypothesis.split()
    distance = edit_distance(ref_words, hyp_words)
    return distance / len(ref_words)


def cer(reference, hypothesis):
    # Character error rate: character-level edit distance normalized by reference length.
    ref_chars = list(reference)
    if len(ref_chars) == 0:
        return 1.0
    hyp_chars = list(hypothesis)
    distance = edit_distance(ref_chars, hyp_chars)
    return distance / len(ref_chars)


def accuracy(refs, preds):
    # Fraction of sentences that match the reference exactly.
    exact_matches = sum(1 for ref, pred in zip(refs, preds) if ref == pred)
    return exact_matches / len(refs) if len(refs) > 0 else 0


def ser(refs, preds):
    # Sentence error rate: fraction of sentences that differ from the reference.
    sentence_errors = sum(1 for ref, pred in zip(refs, preds) if ref != pred)
    return sentence_errors / len(refs) if len(refs) > 0 else 0


def main(target_test, target_pred):
    start_time = time.time()

    refs = []
    preds = []

    with open(target_test) as test:
        for line in test:
            refs.append(line.strip())

    with open(target_pred) as pred:
        for line in pred:
            preds.append(line.strip())

    gleu_scores = [gleu_score(refs[i], preds[i]) for i in range(len(refs))]
    average_gleu = np.mean(gleu_scores)
    print("Average GLEU: {:.2f}%".format(average_gleu * 100))

    fbeta_scores = [fbeta_score(refs[i], preds[i]) for i in range(len(refs))]
    average_fbeta = np.mean(fbeta_scores)
    print("Average F0.5 Score: {:.2f}%".format(average_fbeta * 100))

    wer_scores = [wer(refs[i], preds[i]) for i in range(len(refs))]
    average_wer = np.mean(wer_scores)
    print("Average WER: {:.2f}%".format(average_wer * 100))

    cer_scores = [cer(refs[i], preds[i]) for i in range(len(refs))]
    average_cer = np.mean(cer_scores)
    print("Average CER: {:.2f}%".format(average_cer * 100))

    accuracy_score = accuracy(refs, preds)
    print("Accuracy: {:.2f}%".format(accuracy_score * 100))

    ser_score = ser(refs, preds)
    print("SER: {:.2f}%".format(ser_score * 100))

    end_time = time.time()
    print(f"Execution Time: {end_time - start_time:.2f} seconds")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python metric.py target_test target_pred")
    else:
        main(sys.argv[1], sys.argv[2])
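metric.py expects two line-aligned text files, references first and predictions second. With the file names written by train.py and load.py, the invocation would be:

    python metric.py ./test_correct.txt ./test_correct_model.txt

The individual functions can also be checked on a toy pair (an illustrative sketch, assuming metric.py is importable from the working directory):

    from metric import gleu_score, fbeta_score, wer, cer

    ref = "the cat sat on the mat"
    hyp = "the cat sat on mat"
    print(gleu_score(ref, hyp))   # min of n-gram precision and recall for this pair
    print(fbeta_score(ref, hyp))  # F0.5 over the same n-gram overlap
    print(wer(ref, hyp))          # one deleted word over six reference words, about 0.17
    print(cer(ref, hyp))          # character-level edit distance over 22 reference characters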
train.py (new file, 77 lines)
@@ -0,0 +1,77 @@
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['WANDB_DISABLED'] = 'true'

import pandas as pd
from datasets import Dataset
from transformers import ByT5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Load the parallel corpus: a column of incorrect sentences and a column of corrections.
df = pd.read_csv("dataset_file_name.csv", sep=";")

df.dropna(subset=['incorrect', 'correct'], inplace=True)
df['incorrect'] = df['incorrect'].astype(str)
df['correct'] = df['correct'].astype(str)

# 80/10/10 train/validation/test split.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

tokenizer = ByT5Tokenizer.from_pretrained("your-model/name")
model = T5ForConditionalGeneration.from_pretrained("your-model/name")


def preprocess_function(examples):
    # Tokenize inputs and targets; the target token ids become the labels.
    input_texts = examples["incorrect"]
    target_texts = examples["correct"]
    model_inputs = tokenizer(input_texts, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_texts, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

tokenized_train_dataset.save_to_disk("./tokenized_train_dataset")
tokenized_val_dataset.save_to_disk("./tokenized_val_dataset")
tokenized_test_dataset.save_to_disk("./tokenized_test_dataset")


def save_sentences_to_separate_files(df, incorrect_file_path, correct_file_path):
    # Write the raw sentence pairs as parallel text files for later evaluation.
    with open(incorrect_file_path, "w", encoding="utf-8") as incorrect_file, \
         open(correct_file_path, "w", encoding="utf-8") as correct_file:
        for index, row in df.iterrows():
            incorrect_file.write(row["incorrect"] + "\n")
            correct_file.write(row["correct"] + "\n")


save_sentences_to_separate_files(train_df, "./train_incorrect.txt", "./train_correct.txt")
save_sentences_to_separate_files(val_df, "./val_incorrect.txt", "./val_correct.txt")
save_sentences_to_separate_files(test_df, "./test_incorrect.txt", "./test_correct.txt")

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=12,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

trainer.train()

print("Evaluation on the test set:")
trainer.evaluate(tokenized_test_dataset)

model_path = "./fine_tuned_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
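One caveat about preprocess_function in train.py: the labels are padded to max_length with the tokenizer's pad token, so padding positions also contribute to the training loss. A common alternative, sketched below and not part of this commit (it assumes a reasonably recent transformers version), is to skip padding at tokenization time and let DataCollatorForSeq2Seq pad each batch, replacing label padding with -100 so it is ignored by the loss:

    from transformers import DataCollatorForSeq2Seq

    def preprocess_without_padding(examples):
        # Tokenize without padding; the collator pads each batch dynamically.
        model_inputs = tokenizer(examples["incorrect"], max_length=128, truncation=True)
        labels = tokenizer(text_target=examples["correct"], max_length=128, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)
    # The collator would then be passed to Trainer via data_collator=data_collator.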