forked from KEMT/zpwiki
		
	Smazat „pages/students/2016/jakub_maruniak/dp2021/annotation/data/text_to_sent.py“
This commit is contained in:
		
							parent
							
								
									2c24867494
								
							
						
					
					
						commit
						90f286cd2a
					
				@ -1,24 +0,0 @@
 | 
			
		||||
# Split the raw text in 'dataminer.csv' into sentences and write them to
# 'textfile.csv', one sentence per CSV row. Each sentence is also echoed to
# stdout.
import csv

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# load data
filename = 'dataminer.csv'
# Use a context manager so the input handle is closed as soon as it is read.
with open(filename, 'rt', encoding='utf-8') as file:
    text = file.read()

# split text to sentences, with list of slovak abbreviations
punkt_param = PunktParameters()
# Punkt lowercases a token before looking it up in abbrev_types, so every
# entry is normalized to lower case here (a capitalized entry such as 'Kr'
# would otherwise never match).
punkt_param.abbrev_types = {
    abbreviation.lower()
    for abbreviation in (
        'rokov', 'sgt', 'storočia', 't.j', 'Kr', 'resp',
        'poľ', 'tzv', 'pod', 'napr', 'prof', 'angl',
    )
}
sentence_splitter = PunktSentenceTokenizer(punkt_param)
# Tokenize once and reuse the result for writing.
sentences = sentence_splitter.tokenize(text)

# write data
with open("textfile.csv", mode='w', encoding='utf-8', newline='\n') as textfile:
    # One writer is enough for the whole file; no need to rebuild it per row.
    row_writer = csv.writer(textfile)
    for sentence in sentences:
        # Semicolons are stripped from every sentence before it is written
        # (presumably so they cannot be mistaken for a field separator by
        # downstream tools -- TODO confirm).
        sentence1 = sentence.replace(";", "")
        row_writer.writerow([sentence1])
        print(sentence1)
 | 
			
		||||
        
 | 
			
		||||
        
 | 
			
		||||
        
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user