from tokenizers import Tokenizer
from tokenizers.models import BPE

# Load the pre-trained BPE tokenizer serialized by the tokenizers library.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

fileName = "eceuropa.test.raw"


def listToString(s):
    # Join a list of tokens into a single space-separated string.
    str1 = " "
    return str1.join(s)


# Tokenize the raw corpus line by line and append one space-separated
# token sequence per line to the output file.
with open('raw/' + fileName) as read_file:
    with open('tokenized/bpe-tok_' + fileName, 'a') as input_file:
        for line in read_file:
            tokenizedLine = tokenizer.encode(line.rstrip())
            stringified = listToString(tokenizedLine.tokens)
            print(stringified)
            input_file.write(stringified)
            input_file.write("\n")
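For context, a minimal sketch of how a tokenizer file such as data/tokenizer-wiki.json could be produced with the same library, roughly following the tokenizers quicktour; the training file paths and special tokens below are assumptions, not taken from this script:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Build an empty BPE model and train it on raw text files.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Hypothetical training corpus; replace with the actual raw files.
files = ["data/wikitext-103-raw/wiki.train.raw", "data/wikitext-103-raw/wiki.valid.raw"]
tokenizer.train(files, trainer)

# Serialize the trained tokenizer so it can be reloaded with Tokenizer.from_file(...).
tokenizer.save("data/tokenizer-wiki.json")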