dp2022/BPETokenizer.py

19 lines
603 B
Python
Raw Normal View History

2022-01-11 23:01:48 +00:00
from tokenizers import Tokenizer
from tokenizers.models import BPE
# Name of the file to process; the script reads it from raw/ and writes the
# tokenized result under tokenized/ with a "bpe-tok_" prefix.
fileName = "eceuropa.test.raw"

# Placeholder value; rebound to the encoding of each input line while the
# file is being processed.
tokenizedLine = ""

# Load the tokenizer from its serialized JSON definition.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
def listToString(s):
    """Join the strings in ``s`` into one space-separated string.

    Args:
        s: An iterable of strings.

    Returns:
        The items of ``s`` joined by a single space; the empty string when
        ``s`` is empty.

    Raises:
        TypeError: If any item of ``s`` is not a string.
    """
    return " ".join(s)
# Tokenize the raw file line by line and append the space-separated tokens to
# the output file, one output line per input line.
#
# Both files are opened once, before the loop, rather than re-opening the
# output file for every line. The encoding is given explicitly so that
# non-ASCII tokens do not depend on the platform's default encoding.
# NOTE(review): assumes the raw corpus is UTF-8 -- confirm.
with open('raw/' + fileName, encoding='utf-8') as read_file, \
        open('tokenized/bpe-tok_' + fileName, 'a', encoding='utf-8') as output_file:
    for line in read_file:
        # Drop trailing whitespace (including the newline) before encoding.
        tokenizedLine = tokenizer.encode(line.rstrip())
        stringified = listToString(tokenizedLine.tokens)
        print(stringified)
        output_file.write(stringified + "\n")