from tokenizers import Tokenizer

# Load a pre-trained WordPiece tokenizer from disk.
tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")

fileName = "eceuropa.sk"
splits = ["train", "valid", "test"]

for split in splits:
    # Open the output file once per split in write mode; the original opened it
    # in append mode for every line, which is slow and duplicates output on reruns.
    with open('raw/' + fileName + '.' + split + '.raw') as read_file, \
         open('tokenized/wordpiece-tok_' + fileName + '.' + split + '.en', 'w') as out_file:
        for line in read_file:
            # Encode the line and join the resulting subword tokens with spaces.
            encoding = tokenizer.encode(line.rstrip())
            stringified = " ".join(encoding.tokens)
            print(stringified)
            out_file.write(stringified + "\n")
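
# A quick sanity check of the tokenizer before running the full loop may be
# useful. This is only a sketch: the sample sentence is made up, and the
# subword split shown in the comment is illustrative; the actual pieces depend
# on the vocabulary trained into wordpiece-tokenizer-eujournal-sk.json.
sample = tokenizer.encode("Európska komisia zverejnila správu.")
print(sample.tokens)  # e.g. ['Európska', 'komisia', 'zverej', '##nila', 'správu', '.']
print(sample.ids)     # the corresponding vocabulary ids for each subword token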