Upload files to ''
This commit is contained in:
parent
de517dbe62
commit
54e835669c
42
slovak_punction2.py
Normal file
42
slovak_punction2.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
# Module imports
|
||||||
|
from transformers import pipeline
|
||||||
|
from nltk.tokenize import sent_tokenize
|
||||||
|
import nltk
|
||||||
|
|
||||||
|
# Download the 'punkt' resource through NLTK every time this module is loaded.
# NOTE(review): presumably this is the data sent_tokenize needs — confirm
# against the installed NLTK version.
nltk.download('punkt')
|
||||||
|
|
||||||
|
# Punctuation restoration function
def restore_punctuation(text):
    """Re-insert punctuation into ``text`` using a fill-mask language model.

    The text is split into sentences with ``sent_tokenize``. Every character
    listed in ``labels`` is removed from each sentence, and then a ``<mask>``
    token is placed after each word in turn. When the top prediction of the
    ``gerulata/slovakbert`` fill-mask pipeline for that slot is one of the
    punctuation labels, it is appended to the preceding word.

    Args:
        text: Input text whose punctuation should be restored.

    Returns:
        The processed sentences joined by single spaces, or an empty string
        when ``text`` yields no sentences.
    """
    labels = ('.', '!', ',', ':', '?', '-', ';')
    # One translation table removes all punctuation labels in a single pass.
    strip_table = str.maketrans('', '', ''.join(labels))

    sents = sent_tokenize(text)
    if not sents:
        # Nothing to process; skip loading the model for empty input.
        return ""

    # Building the pipeline is expensive, so do it once per call rather than
    # once per sentence.
    unmasker = pipeline('fill-mask', model='gerulata/slovakbert')

    restored = []
    for sent in sents:
        words = sent.translate(strip_table).split()
        if not words:
            # The sentence consisted only of punctuation/whitespace.
            continue

        for i in range(len(words)):
            # Mask the slot right after word i. Punctuation restored for
            # earlier slots stays in `words`, so the model sees it as context.
            masked = " ".join(words[:i + 1] + ['<mask>'] + words[i + 1:])
            predictions = unmasker(masked)

            # NOTE(review): token_str may carry surrounding whitespace
            # depending on the tokenizer; strip defensively before matching.
            token = predictions[0]['token_str'].strip()
            if token in labels:
                words[i] += token

        restored.append(" ".join(words))

    # Join sentences with a space so they are not glued together.
    return " ".join(restored)
|
||||||
|
|
||||||
|
def main():
    """Prompt for a text, restore its punctuation and print both versions."""
    # Ask for the text whose punctuation should be corrected
    input_text = input("Zadajte text na opravu interpunkcie: ")
    output_text = restore_punctuation(input_text)

    # Print the original and the corrected text
    print("Pôvodný text:", input_text)
    print("Opravený text:", output_text)


# Run the interactive prompt only when executed as a script, so importing
# this module to reuse restore_punctuation does not block on input().
if __name__ == "__main__":
    main()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user