# Tokenize raw corpus splits with a pretrained WordPiece tokenizer and write
# the space-joined token sequences to per-split output files.
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
# Load the pretrained WordPiece tokenizer serialized by the `tokenizers` library.
tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")

# Corpus identifier used to build the input and output file names below.
fileName = "eceuropa.sk"

# Dataset splits to tokenize; one raw input and one tokenized output per split.
files = ["train", "valid", "test"]
|
def listToString(s):
    """Join an iterable of token strings into a single space-separated string.

    Args:
        s: iterable of strings (e.g. the ``tokens`` attribute of an encoding).

    Returns:
        The elements of *s* joined with a single space; "" for an empty input.
    """
    return " ".join(s)
|
# Tokenize every split: read each raw line, encode it, and append the
# space-joined tokens to the corresponding output file (one line per input line).
for split in files:
    with open('raw/'+fileName+'.'+split+'.raw') as read_file:
        # Open the output file ONCE per split (still append mode, preserving
        # the original accumulate-across-runs behavior) instead of re-opening
        # it for every single input line, which was O(lines) file opens.
        with open('tokenized/wordpiece-tok_'+fileName+'.'+split+'.en', 'a') as input_file:
            for line in read_file:
                # Encode the line without its trailing newline/whitespace.
                tokenizedLine = tokenizer.encode(line.rstrip())
                stringified = listToString(tokenizedLine.tokens)
                # Echo to stdout for progress inspection, then persist.
                print(stringified)
                input_file.write(stringified)
                input_file.write("\n")