from tokenizers import Tokenizer

# Load a BPE tokenizer previously serialized with tokenizer.save().
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

file_name = "eceuropa.test.raw"

# Tokenize the raw corpus line by line and append the space-joined
# subword tokens to tokenized/bpe-tok_<file_name>, one output line
# per input line.
with open("raw/" + file_name) as read_file, \
        open("tokenized/bpe-tok_" + file_name, "a") as out_file:
    for line in read_file:
        encoding = tokenizer.encode(line.rstrip())
        stringified = " ".join(encoding.tokens)
        print(stringified)
        out_file.write(stringified)
        out_file.write("\n")
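For context, data/tokenizer-wiki.json is expected to be a tokenizer serialized with the Hugging Face tokenizers library. Below is a minimal sketch of how such a file could be trained and saved; the wikitext-103 file paths and the special-token list follow the library's quicktour and are assumptions, not part of this script.

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Assumed training setup: a BPE model with an [UNK] token, whitespace
# pre-tokenization, and the wikitext-103 raw files used in the
# tokenizers quicktour. Adjust the file list to your own corpus.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["train", "valid", "test"]]
tokenizer.train(files, trainer=trainer)

# Serialize to the path the tokenization script expects.
tokenizer.save("data/tokenizer-wiki.json")

With that file in place, running the script above writes one line of space-joined subword tokens to tokenized/bpe-tok_eceuropa.test.raw for each line of raw/eceuropa.test.raw.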