"""Split the raw text in ``dataminer.csv`` into sentences.

The text is segmented with NLTK's Punkt tokenizer, primed with a list of
Slovak abbreviations so that a period after e.g. ``napr.`` is not treated
as a sentence boundary.  Every sentence is stripped of semicolons, written
as a single-column row to ``textfile.csv`` and echoed to stdout.
"""
import csv

INPUT_FILENAME = 'dataminer.csv'
OUTPUT_FILENAME = 'textfile.csv'

# Slovak abbreviations (written without the trailing period) after which
# a period must not end the sentence.
SLOVAK_ABBREVIATIONS = frozenset({
    'rokov', 'sgt', 'storočia', 't.j', 'Kr', 'resp',
    'poľ', 'tzv', 'pod', 'napr', 'prof', 'angl',
})


def load_text(filename=INPUT_FILENAME):
    """Return the whole content of *filename* decoded as UTF-8."""
    # The context manager closes the handle; the original script leaked it.
    with open(filename, 'rt', encoding='utf-8') as source:
        return source.read()


def build_sentence_splitter(abbreviations=SLOVAK_ABBREVIATIONS):
    """Return a Punkt sentence tokenizer that knows *abbreviations*.

    Punkt lowercases a token before looking it up in ``abbrev_types``, so
    the abbreviations are lowercased here; a capitalised entry such as
    ``'Kr'`` would otherwise never match.
    """
    # Imported lazily so the CSV helpers stay usable without NLTK installed.
    from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = {abbr.lower() for abbr in abbreviations}
    return PunktSentenceTokenizer(punkt_param)


def write_sentences(sentences, filename=OUTPUT_FILENAME):
    """Write each sentence as one single-column CSV row and print it.

    Semicolons are removed from every sentence before it is written.

    Args:
        sentences: Iterable of sentence strings.
        filename: Path of the CSV file to create or overwrite.
    """
    # newline='' is what the csv module asks for; the writer emits its own
    # line terminators and no further translation must happen.
    with open(filename, mode='w', encoding='utf-8', newline='') as textfile:
        row_writer = csv.writer(textfile)  # one writer for the whole file
        for sentence in sentences:
            cleaned = sentence.replace(';', '')
            row_writer.writerow([cleaned])
            print(cleaned)


def main():
    """Read the input text, split it into sentences and export them."""
    text = load_text()
    sentence_splitter = build_sentence_splitter()
    # Tokenize once; the original script ran the tokenizer twice.
    write_sentences(sentence_splitter.tokenize(text))


if __name__ == '__main__':
    main()