forked from KEMT/zpwiki
Delete „pages/students/2016/jakub_maruniak/dp2021/annotation/data/text_to_sent.py“
parent 2c24867494
commit 90f286cd2a
@@ -1,24 +0,0 @@
import csv

# load data
filename = 'dataminer.csv'
file = open(filename, 'rt', encoding='utf-8')
text = file.read()

# split text to sentences, with list of slovak abbreviations
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['rokov', 'sgt', 'storočia', 't.j', 'Kr', 'resp', 'poľ', 'tzv', 'pod', 'napr', 'prof', 'angl'])
sentence_splitter = PunktSentenceTokenizer(punkt_param)
sentences = sentence_splitter.tokenize(text)

# write data
with open("textfile.csv", mode='w', encoding='utf-8', newline='\n') as textfile:
    for sentences in sentence_splitter.tokenize(text):
        sentence1 = sentences.replace(";", "")
        row_writer=csv.writer(textfile)
        row_writer.writerow([sentence1])
        print(sentence1)
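For reference, a minimal cleaned-up sketch of what the deleted script did: read the raw text, split it into sentences with NLTK's Punkt tokenizer configured with Slovak abbreviations, and write one sentence per CSV row with semicolons stripped. The file names dataminer.csv and textfile.csv and the abbreviation list are taken from the deleted file; opening both files with `with` blocks, creating the csv.writer once, and tokenizing only once are my own restructuring, not part of the original commit.

import csv

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# read the raw text (input file name taken from the deleted script)
with open('dataminer.csv', 'rt', encoding='utf-8') as infile:
    text = infile.read()

# configure Punkt with Slovak abbreviations so they do not end sentences
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['rokov', 'sgt', 'storočia', 't.j', 'Kr', 'resp',
                                'poľ', 'tzv', 'pod', 'napr', 'prof', 'angl'])
sentence_splitter = PunktSentenceTokenizer(punkt_param)

# tokenize once, strip semicolons, write one sentence per CSV row
# (newline='' is what the csv module recommends for writer files)
with open('textfile.csv', mode='w', encoding='utf-8', newline='') as textfile:
    row_writer = csv.writer(textfile)
    for sentence in sentence_splitter.tokenize(text):
        cleaned = sentence.replace(';', '')
        row_writer.writerow([cleaned])
        print(cleaned)

This keeps the behaviour of the original (semicolons removed, each sentence printed as it is written) while avoiding the repeated tokenization, the reuse of the `sentences` name as the loop variable, and the per-row csv.writer creation in the deleted version.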