From 90f286cd2a74bbed6625b94b29353daf89f799b0 Mon Sep 17 00:00:00 2001 From: Jakub Maruniak Date: Fri, 22 May 2020 19:17:52 +0000 Subject: [PATCH] =?UTF-8?q?Smazat=20=E2=80=9Epages/students/2016/jakub=5Fm?= =?UTF-8?q?aruniak/dp2021/annotation/data/text=5Fto=5Fsent.py=E2=80=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../dp2021/annotation/data/text_to_sent.py | 24 ------------------- 1 file changed, 24 deletions(-) delete mode 100644 pages/students/2016/jakub_maruniak/dp2021/annotation/data/text_to_sent.py diff --git a/pages/students/2016/jakub_maruniak/dp2021/annotation/data/text_to_sent.py b/pages/students/2016/jakub_maruniak/dp2021/annotation/data/text_to_sent.py deleted file mode 100644 index 98027c40..00000000 --- a/pages/students/2016/jakub_maruniak/dp2021/annotation/data/text_to_sent.py +++ /dev/null @@ -1,24 +0,0 @@ -import csv - -# load data -filename = 'dataminer.csv' -file = open(filename, 'rt', encoding='utf-8') -text = file.read() - -# split text to sentences, with list of slovak abbreviations -from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters -punkt_param = PunktParameters() -punkt_param.abbrev_types = set(['rokov', 'sgt', 'storočia', 't.j', 'Kr', 'resp', 'poľ', 'tzv', 'pod', 'napr', 'prof', 'angl']) -sentence_splitter = PunktSentenceTokenizer(punkt_param) -sentences = sentence_splitter.tokenize(text) - -# write data -with open("textfile.csv", mode='w', encoding='utf-8', newline='\n') as textfile: - for sentences in sentence_splitter.tokenize(text): - sentence1 = sentences.replace(";", "") - row_writer=csv.writer(textfile) - row_writer.writerow([sentence1]) - print(sentence1) - - - \ No newline at end of file