DP2024/slovak_punction2.py

43 lines
1.1 KiB
Python
Raw Normal View History

2023-11-28 13:31:11 +00:00
# Module imports
from transformers import pipeline
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
# Function that restores punctuation
def restore_punctuation(text, *, unmasker=None, sentence_splitter=None):
    """Re-insert punctuation into *text* using a fill-mask language model.

    The text is split into sentences, every character listed in ``labels``
    is removed, and then a ``<mask>`` token is placed after each word in
    turn.  When the first prediction returned for that mask is one of the
    punctuation labels, it is appended to the preceding word.

    Args:
        text: The input text to process.
        unmasker: Optional callable that takes the masked sentence (a
            string containing exactly one ``<mask>``) and returns a sequence
            of prediction dicts; the ``'token_str'`` of the first entry is
            used.  When omitted, a ``fill-mask`` pipeline for
            ``gerulata/slovakbert`` is created once per call, and only if
            there is something to process.
        sentence_splitter: Optional callable mapping the text to a list of
            sentence strings.  Defaults to ``sent_tokenize``.

    Returns:
        The processed sentences joined by single spaces; an empty string
        when the text contains no words.
    """
    labels = ('.', '!', ',', ':', '?', '-', ';')
    split_sentences = sent_tokenize if sentence_splitter is None else sentence_splitter

    restored_sentences = []
    for sent in split_sentences(text):
        # Drop any punctuation already present so the model decides from scratch.
        stripped = ''.join(ch for ch in sent if ch not in labels)
        words = stripped.split()
        if not words:
            # Nothing left to punctuate (e.g. the sentence was only punctuation).
            continue

        if unmasker is None:
            # Loading the model is expensive: do it once, not once per sentence.
            unmasker = pipeline('fill-mask', model='gerulata/slovakbert')

        for position in range(1, len(words) + 1):
            # Words that already received punctuation keep it in the context
            # shown to the model for the following positions.
            masked = words[:position] + ['<mask>'] + words[position:]
            predictions = unmasker(" ".join(masked))
            best_token = predictions[0]['token_str']
            if best_token in labels:
                words[position - 1] += best_token

        restored_sentences.append(" ".join(words))

    # Separate sentences with a space so they are not glued together.
    return " ".join(restored_sentences)
# Run the interactive prompt only when executed as a script, so that importing
# this module (e.g. to reuse restore_punctuation) does not block on input().
if __name__ == "__main__":
    # Read the text whose punctuation should be restored.
    input_text = input("Zadajte text na opravu interpunkcie: ")
    output_text = restore_punctuation(input_text)
    # Print the original and the corrected text.
    print("Pôvodný text:", input_text)
    print("Opravený text:", output_text)