forked from KEMT/zpwiki
		
	Smazat „pages/students/2016/jakub_maruniak/dp2021/annotation/data/text_to_sent.py“
This commit is contained in:
		
							parent
							
								
									2c24867494
								
							
						
					
					
						commit
						90f286cd2a
					
				| @ -1,24 +0,0 @@ | ||||
# Split the text stored in 'dataminer.csv' into sentences and write them to
# 'textfile.csv', one sentence per CSV row. Each written sentence is also
# echoed to stdout. Semicolons are stripped from every sentence before it is
# written.
import csv

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

# load data
filename = 'dataminer.csv'
# The context manager guarantees the input handle is closed after reading.
with open(filename, 'rt', encoding='utf-8') as file:
    text = file.read()

# split text to sentences, with list of slovak abbreviations
punkt_param = PunktParameters()
# Punkt lower-cases a token before looking it up in abbrev_types, so every
# entry must be lower case ('kr', not 'Kr') or it can never match.
punkt_param.abbrev_types = {
    'rokov', 'sgt', 'storočia', 't.j', 'kr', 'resp',
    'poľ', 'tzv', 'pod', 'napr', 'prof', 'angl',
}
sentence_splitter = PunktSentenceTokenizer(punkt_param)
sentences = sentence_splitter.tokenize(text)

# write data
# newline='' is the setting the csv module documents for files handed to
# csv.writer; the writer emits its own line terminators.
with open("textfile.csv", mode='w', encoding='utf-8', newline='') as textfile:
    # One writer for the whole file instead of a new one per row.
    row_writer = csv.writer(textfile)
    # Reuse the sentence list computed above rather than tokenizing again.
    for sentence in sentences:
        sentence1 = sentence.replace(";", "")
        row_writer.writerow([sentence1])
        print(sentence1)
|          | ||||
|          | ||||
|          | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user