diff --git a/BPETokenizer.py b/BPETokenizer.py
new file mode 100644
index 0000000..9db172b
--- /dev/null
+++ b/BPETokenizer.py
@@ -0,0 +1,19 @@
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+
+tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
+tokenizedLine = ""
+fileName = "eceuropa.test.raw"
+
+def listToString(s):
+    str1 = " "
+    return (str1.join(s))
+
+with open('raw/'+fileName) as read_file:
+    for line in read_file:
+        tokenizedLine = tokenizer.encode(line.rstrip())
+        with open('tokenized/bpe-tok_'+fileName, 'a') as input_file:
+            stringified = listToString(tokenizedLine.tokens)
+            print(stringified)
+            input_file.write(stringified)
+            input_file.write("\n")
\ No newline at end of file
diff --git a/BPETokenizerTrainer.py b/BPETokenizerTrainer.py
new file mode 100644
index 0000000..1f4de60
--- /dev/null
+++ b/BPETokenizerTrainer.py
@@ -0,0 +1,13 @@
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.pre_tokenizers import Whitespace
+
+# training the tokenizer
+tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+tokenizer.pre_tokenizer = Whitespace()
+files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
+tokenizer.train(files, trainer)
+tokenizer.save("data/bpe-tokenizer-wiki.json")
+
diff --git a/Trenovania.ods b/Trenovania.ods
new file mode 100644
index 0000000..23dbe07
Binary files /dev/null and b/Trenovania.ods differ
diff --git a/WordPieceTokenizer.py b/WordPieceTokenizer.py
new file mode 100644
index 0000000..7a9dc14
--- /dev/null
+++ b/WordPieceTokenizer.py
@@ -0,0 +1,21 @@
+from tokenizers import Tokenizer
+from tokenizers.models import WordPiece
+
+tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")
+tokenizedLine = ""
+fileName = "eceuropa.sk"
+files = ["train", "valid", "test"]
+
+def listToString(s):
+    str1 = " "
+    return (str1.join(s))
+
+for file in files:
+    with open('raw/'+fileName+'.'+file+'.raw') as read_file:
+        for line in read_file:
+            tokenizedLine = tokenizer.encode(line.rstrip())
+            with open('tokenized/wordpiece-tok_'+fileName+'.'+file+'.en', 'a') as input_file:
+                stringified = listToString(tokenizedLine.tokens)
+                print(stringified)
+                input_file.write(stringified)
+                input_file.write("\n")
\ No newline at end of file
diff --git a/WordPieceTokenizerTrainer.py b/WordPieceTokenizerTrainer.py
new file mode 100644
index 0000000..e46a80c
--- /dev/null
+++ b/WordPieceTokenizerTrainer.py
@@ -0,0 +1,13 @@
+from tokenizers import Tokenizer
+from tokenizers.models import WordPiece
+from tokenizers.trainers import WordPieceTrainer
+from tokenizers.pre_tokenizers import Whitespace
+
+# training the tokenizer
+tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
+trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+tokenizer.pre_tokenizer = Whitespace()
+# files = [f"raw/eceuropa.{split}.raw" for split in ["test", "train", "valid"]]
+files = [f"raw/eujournal.sk.raw"]
+tokenizer.train(files, trainer)
+tokenizer.save("wordpiece-tokenizer-eujournal-sk.json")
diff --git a/all-in-one.sh b/all-in-one.sh
new file mode 100644
index 0000000..424c11a
--- /dev/null
+++ b/all-in-one.sh
@@ -0,0 +1,24 @@
+NUMBER=1
+CORPUSNAME=eceuropa
+
+wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0006-AAE0-A/corpus-en-sk-plaintext.tar.gz
+tar -xvzf corpus-en-sk-plaintext.tar.gz
+gzip -d corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.en.gz
+gzip -d corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.sk.gz
+mkdir -p $NUMBER-$CORPUSNAME/corpus/en-sk/
+mv corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.en $NUMBER-$CORPUSNAME/corpus/en-sk/
+mv corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.sk $NUMBER-$CORPUSNAME/corpus/en-sk/
+cp $NUMBER-$CORPUSNAME/corpus/en-sk/eceuropa.en-sk.sk $NUMBER-$CORPUSNAME/corpus/en-sk/train.sk
+rm -r corpus-en-sk-plaintext
+cd $NUMBER-$CORPUSNAME
+bash ../pre-process.sh $CORPUSNAME
+TEXT=$CORPUSNAME.en-sk
+fairseq-preprocess --source-lang sk --target-lang en --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test --destdir data-bin/$CORPUSNAME.en-sk
+mkdir -p checkpoints/fconv
+CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/$CORPUSNAME.en-sk --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 --arch fconv_iwslt_de_en --max-epoch 1 --save-dir checkpoints/fconv
+fairseq-generate data-bin/$CORPUSNAME.en-sk --path checkpoints/fconv/checkpoint_best.pt --batch-size 128 --beam 5
+cp data-bin/$CORPUSNAME.en-sk/dict.en.txt checkpoints/fconv/
+cp data-bin/$CORPUSNAME.en-sk/dict.sk.txt checkpoints/fconv/
+cp $CORPUSNAME.en-sk/code checkpoints/fconv/bpecodes
+MODEL_DIR=checkpoints/fconv
+fairseq-interactive --path $MODEL_DIR/checkpoint_best.pt $MODEL_DIR --beam 5 --source-lang sk --target-lang en --tokenizer moses --bpe subword_nmt --bpe-codes $MODEL_DIR/bpecodes
\ No newline at end of file
diff --git a/idoc_install.txt b/idoc_install.txt
new file mode 100644
index 0000000..f641d67
--- /dev/null
+++ b/idoc_install.txt
@@ -0,0 +1,13 @@
+conda install python=3.7 numpy=1.19
+conda install cudatoolkit=11.0 nccl -c nvidia
+conda install pytorch -c pytorch
+# https://github.com/pytorch/fairseq
+git clone https://github.com/pytorch/fairseq
+cd fairseq
+pip install .
+cd ..
+git clone https://github.com/NVIDIA/apex
+cd apex
+# https://github.com/NVIDIA/apex/issues/1043
+git reset --hard 3fe10b5597ba14a748ebb271a6ab97c09c5701ac
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
\ No newline at end of file
diff --git a/pre-process.sh b/pre-process.sh
new file mode 100644
index 0000000..04cb968
--- /dev/null
+++ b/pre-process.sh
@@ -0,0 +1,86 @@
+CORPUSNAME=$1
+
+echo 'Cloning Moses github repository (for tokenization scripts)...'
+git clone https://github.com/moses-smt/mosesdecoder.git
+
+echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+git clone https://github.com/rsennrich/subword-nmt.git
+
+SCRIPTS=mosesdecoder/scripts
+TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
+LC=$SCRIPTS/tokenizer/lowercase.perl
+CLEAN=$SCRIPTS/training/clean-corpus-n.perl
+BPEROOT=subword-nmt/subword_nmt
+BPE_TOKENS=10000
+
+
+if [ ! -d "$SCRIPTS" ]; then
+    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
+    exit
+fi
+
+src=en
+tgt=sk
+lang=en-sk
+prep=$CORPUSNAME.en-sk
+tmp=$prep/tmp
+orig=corpus
+
+mkdir -p $orig $tmp $prep
+
+echo "pre-processing train data..."
+for l in $src $tgt; do
+    f=$CORPUSNAME.$lang.$l
+    tok=$CORPUSNAME.$lang.tok.$l
+    cat $orig/$lang/$f | \
+    perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
+    echo ""
+done
+perl $CLEAN -ratio 1.5 $tmp/$CORPUSNAME.$lang.tok $src $tgt $tmp/$CORPUSNAME.$lang.clean 1 175
+for l in $src $tgt; do
+    perl $LC < $tmp/$CORPUSNAME.$lang.clean.$l > $tmp/$CORPUSNAME.$lang.$l
+done
+
+xd=$CORPUSNAME.$lang.$l
+
+echo "pre-processing valid/test data..."
+for l in $src $tgt; do
+    for o in `ls $orig/$lang/$xd`; do
+        fname=${o##*/}
+        f=$tmp/${fname%.*}
+        cat $o | \
+        # grep '<seg id' $o | \
+        # sed -e 's/<seg id="[0-9]*">\s*//g' | \
+        # sed -e 's/\s*<\/seg>\s*//g' | \
+        # sed -e "s/\’/\'/g" | \
+        perl $TOKENIZER -threads 8 -l $l | \
+        perl $LC > $f
+        echo ""
+    done
+done
+
+echo "creating train, valid, test..."
+for l in $src $tgt; do
+    awk '{if (NR%23 == 0) print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/valid.$l
+    awk '{if (NR%23 != 0) print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/train.$l
+
+    cat $tmp/$CORPUSNAME.en-sk.$l \
+        > $tmp/test.$l
+done
+
+TRAIN=$tmp/train.en-sk
+BPE_CODE=$prep/code
+rm -f $TRAIN
+for l in $src $tgt; do
+    cat $tmp/train.$l >> $TRAIN
+done
+
+echo "learn_bpe.py on ${TRAIN}..."
+python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
+
+for L in $src $tgt; do
+    for f in train.$L valid.$L test.$L; do
+        echo "apply_bpe.py to ${f}..."
+        python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
+    done
+done