21 lines
739 B
Python
21 lines
739 B
Python
|
from tokenizers import Tokenizer
|
||
|
from tokenizers.models import WordPiece
|
||
|
|
||
|
# WordPiece tokenizer loaded from a pretrained file — presumably trained on
# the Slovak EU-journal corpus (judging by the filename); TODO confirm.
tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")

# Reused below to hold the Encoding of the line currently being processed.
tokenizedLine = ""

# Corpus base name; raw inputs are read from raw/<fileName>.<split>.raw.
fileName = "eceuropa.sk"

# Dataset splits to tokenize.
files = ["train", "valid", "test"]
|
def listToString(s):
    """Join the items of *s* into one string, separated by single spaces."""
    return " ".join(s)
|
# Tokenize every raw corpus split and append the space-joined WordPiece
# tokens, one line of tokens per input line, to the matching output file.
#
# Fixes over the original:
#   * the output file was re-opened in append mode once per input line;
#     it is now opened once per split,
#   * the loop variable `file` shadowed the builtin of the same name,
#   * both opens now pin encoding="utf-8" instead of relying on the
#     platform default.
for split in files:
    with open('raw/' + fileName + '.' + split + '.raw', encoding='utf-8') as read_file, \
            open('tokenized/wordpiece-tok_' + fileName + '.' + split + '.en', 'a',
                 encoding='utf-8') as input_file:
        for line in read_file:
            # encode() returns an Encoding whose .tokens is the list
            # of WordPiece subword strings for this line.
            tokenizedLine = tokenizer.encode(line.rstrip())
            stringified = listToString(tokenizedLine.tokens)
            print(stringified)  # progress/debug echo to stdout
            input_file.write(stringified)
            input_file.write("\n")