# Provenance: extracted from git patch 54e835669cb1768d158c5b42806c3dc94e7b7171
# (subject "Upload files to ''", author Adrián Remiaš,
# dated Tue, 28 Nov 2023 13:31:11 +0000), which creates slovak_punction2.py.
"""Restore punctuation in Slovak text with a masked language model.

For every word of every sentence a mask token is placed right after the
word and a fill-mask model is asked what belongs there.  When the model's
first candidate is one of the known punctuation marks, that mark is attached
to the word.
"""

# Punctuation marks that are stripped from the input and may be restored.
PUNCTUATION_LABELS = ('.', '!', ',', ':', '?', '-', ';')

# Hugging Face model identifier used for the fill-mask pipeline.
MODEL_NAME = 'gerulata/slovakbert'

# Used when the unmasker does not expose ``tokenizer.mask_token``.
# NOTE(review): '<mask>' is the RoBERTa-style mask literal; confirm it matches
# the tokenizer of MODEL_NAME.
_FALLBACK_MASK_TOKEN = '<mask>'


def _load_unmasker():
    """Create the fill-mask pipeline for ``MODEL_NAME``."""
    # Imported lazily so that importing this module does not pull in the
    # transformers stack until a model is actually needed.
    from transformers import pipeline

    return pipeline('fill-mask', model=MODEL_NAME)


def _default_sentence_tokenizer(text):
    """Split *text* into sentences with NLTK's ``sent_tokenize``."""
    import nltk
    from nltk.tokenize import sent_tokenize

    # Download the 'punkt' data only when it is not installed yet, instead of
    # contacting the download server on every run.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    return sent_tokenize(text)


def _punctuate_sentence(sentence, unmasker, mask_token):
    """Return *sentence* with its punctuation re-predicted by *unmasker*."""
    # Drop existing punctuation so the model decides every mark from scratch.
    stripped = ''.join(ch for ch in sentence if ch not in PUNCTUATION_LABELS)
    words = stripped.split()

    for index in range(len(words)):
        # Probe the gap right after words[index].  Marks restored on earlier
        # words stay in the context, exactly as in the original loop.
        probe = words[:index + 1] + [mask_token] + words[index + 1:]
        candidates = unmasker(' '.join(probe))

        # Only the first candidate is consulted.  RoBERTa-style tokenizers may
        # decode a token with a leading space, so strip before comparing.
        best = candidates[0]['token_str'].strip()
        if best in PUNCTUATION_LABELS:
            words[index] += best

    return ' '.join(words)


def restore_punctuation(text, *, unmasker=None, sentence_tokenizer=None):
    """Strip and re-predict the punctuation of *text*.

    Args:
        text: Input text; existing marks from ``PUNCTUATION_LABELS`` are
            removed before prediction.
        unmasker: Optional callable taking a string with one mask token and
            returning a list of dicts with a ``'token_str'`` key.  Defaults
            to a fill-mask pipeline for ``MODEL_NAME``, built once per call.
            Pass a preloaded pipeline to avoid reloading the model.
        sentence_tokenizer: Optional callable mapping text to a list of
            sentences.  Defaults to NLTK's ``sent_tokenize``.

    Returns:
        The re-punctuated sentences joined by single spaces, or ``""`` when
        *text* is empty or whitespace only.
    """
    if not text or not text.strip():
        return ""

    if sentence_tokenizer is None:
        sentence_tokenizer = _default_sentence_tokenizer
    sentences = sentence_tokenizer(text)
    if not sentences:
        return ""

    # Build the model once per call rather than once per sentence.
    if unmasker is None:
        unmasker = _load_unmasker()

    # The original inserted an empty string as the mask (presumably a
    # '<mask>' literal lost in transit), which gives the model nothing to
    # fill.  Prefer the tokenizer's own mask token when it is available.
    tokenizer = getattr(unmasker, 'tokenizer', None)
    mask_token = getattr(tokenizer, 'mask_token', None) or _FALLBACK_MASK_TOKEN

    restored = (
        _punctuate_sentence(sentence, unmasker, mask_token)
        for sentence in sentences
    )
    # Separate sentences with a space; the original glued them together.
    return ' '.join(part for part in restored if part)


def main():
    """Prompt for a text, restore its punctuation and print both versions."""
    # Read the text whose punctuation should be corrected.
    input_text = input("Zadajte text na opravu interpunkcie: ")
    output_text = restore_punctuation(input_text)

    # Print the original and the corrected text.
    print("Pôvodný text:", input_text)
    print("Opravený text:", output_text)


if __name__ == "__main__":
    main()