Upload files to ''

Dominik Nagy 2022-01-11 23:01:48 +00:00
parent e757160ae5
commit 2fcdc9edbb
8 changed files with 189 additions and 0 deletions

BPETokenizer.py Normal file

@@ -0,0 +1,19 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Load the previously trained BPE tokenizer.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

tokenizedLine = ""
fileName = "eceuropa.test.raw"

def listToString(s):
    # Join a list of token strings into one space-separated line.
    return " ".join(s)

# Tokenize the raw corpus line by line and append the result to the output file.
with open('raw/' + fileName) as read_file:
    for line in read_file:
        tokenizedLine = tokenizer.encode(line.rstrip())
        with open('tokenized/bpe-tok_' + fileName, 'a') as input_file:
            stringified = listToString(tokenizedLine.tokens)
            print(stringified)
            input_file.write(stringified)
            input_file.write("\n")

BPETokenizerTrainer.py Normal file

@@ -0,0 +1,13 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
# training the tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
tokenizer.train(files, trainer)
tokenizer.save("data/bpe-tokenizer-wiki.json")
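
A quick way to check the result (a sketch only; the sample sentence below is an assumption, not part of any corpus) is to load the saved tokenizer back and inspect the subword tokens it produces, which is what BPETokenizer.py then does file by file:

from tokenizers import Tokenizer

# Load the tokenizer trained above.
tokenizer = Tokenizer.from_file("data/bpe-tokenizer-wiki.json")

# Illustrative sentence only.
encoding = tokenizer.encode("The European Commission publishes its journal daily.")
print(encoding.tokens)  # BPE subword strings
print(encoding.ids)     # corresponding vocabulary ids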

Trenovania.ods Normal file

Binary file not shown.

WordPieceTokenizer.py Normal file

@@ -0,0 +1,21 @@
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

# Load the previously trained WordPiece tokenizer.
tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")

tokenizedLine = ""
fileName = "eceuropa.sk"
files = ["train", "valid", "test"]

def listToString(s):
    # Join a list of token strings into one space-separated line.
    return " ".join(s)

# Tokenize each split line by line and append the result to the output file.
for file in files:
    with open('raw/' + fileName + '.' + file + '.raw') as read_file:
        for line in read_file:
            tokenizedLine = tokenizer.encode(line.rstrip())
            with open('tokenized/wordpiece-tok_' + fileName + '.' + file + '.en', 'a') as input_file:
                stringified = listToString(tokenizedLine.tokens)
                print(stringified)
                input_file.write(stringified)
                input_file.write("\n")


@@ -0,0 +1,13 @@
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
# training the tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()
# files = [f"raw/eceuropa.{split}.raw" for split in ["test", "train", "valid"]]
files = ["raw/eujournal.sk.raw"]
tokenizer.train(files, trainer)
tokenizer.save("wordpiece-tokenizer-eujournal-sk.json")
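
For reference, WordPiece marks word-internal pieces with a "##" prefix and maps unknown words to "[UNK]". A minimal sketch of inspecting the tokenizer saved above (the Slovak sample sentence is an assumed illustration):

from tokenizers import Tokenizer

# Load the WordPiece tokenizer trained above.
tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")

# Illustrative sentence; continuation pieces appear as "##..." tokens.
encoding = tokenizer.encode("Európska komisia zverejnila nové nariadenie.")
print(encoding.tokens)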

all-in-one.sh Normal file

@@ -0,0 +1,24 @@
#!/bin/bash
NUMBER=1
CORPUSNAME=eceuropa

# Download and unpack the en-sk plaintext corpus.
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0006-AAE0-A/corpus-en-sk-plaintext.tar.gz
tar -xvzf corpus-en-sk-plaintext.tar.gz
gzip -d corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.en.gz
gzip -d corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.sk.gz

# Move the extracted corpus into the working directory.
mkdir -p $NUMBER-$CORPUSNAME/corpus/en-sk/
mv corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.en $NUMBER-$CORPUSNAME/corpus/en-sk/
mv corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.sk $NUMBER-$CORPUSNAME/corpus/en-sk/
cp $NUMBER-$CORPUSNAME/corpus/en-sk/eceuropa.en-sk.sk $NUMBER-$CORPUSNAME/corpus/en-sk/train.sk
rm -r corpus-en-sk-plaintext

# Tokenize, clean, split and BPE-encode the corpus.
cd $NUMBER-$CORPUSNAME
bash ../pre-process.sh $CORPUSNAME

# Binarize the data for fairseq.
TEXT=$CORPUSNAME.en-sk
fairseq-preprocess --source-lang sk --target-lang en --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test --destdir data-bin/$CORPUSNAME.en-sk

# Train a convolutional translation model for one epoch, then translate the test set.
mkdir -p checkpoints/fconv
CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/$CORPUSNAME.en-sk --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 --arch fconv_iwslt_de_en --max-epoch 1 --save-dir checkpoints/fconv
fairseq-generate data-bin/$CORPUSNAME.en-sk --path checkpoints/fconv/checkpoint_best.pt --batch-size 128 --beam 5

# Copy the dictionaries and BPE codes next to the checkpoint so the model can be used interactively.
cp data-bin/$CORPUSNAME.en-sk/dict.en.txt checkpoints/fconv/
cp data-bin/$CORPUSNAME.en-sk/dict.sk.txt checkpoints/fconv/
cp $CORPUSNAME.en-sk/code checkpoints/fconv/bpecodes
MODEL_DIR=checkpoints/fconv
fairseq-interactive --path $MODEL_DIR/checkpoint_best.pt $MODEL_DIR --beam 5 --source-lang sk --target-lang en --tokenizer moses --bpe subword_nmt --bpe-codes $MODEL_DIR/bpecodes
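
Once all-in-one.sh has finished, the same checkpoint could also be queried from Python through fairseq's hub interface. This is a hedged sketch, not part of the script: it assumes the directory layout produced above, and the Slovak input sentence is only an illustration.

from fairseq.models.fconv import FConvModel

# Load the checkpoint together with the dictionaries and BPE codes copied into checkpoints/fconv.
model = FConvModel.from_pretrained(
    "checkpoints/fconv",
    checkpoint_file="checkpoint_best.pt",
    data_name_or_path="data-bin/eceuropa.en-sk",
    tokenizer="moses",
    bpe="subword_nmt",
    bpe_codes="checkpoints/fconv/bpecodes",
)

# Illustrative Slovak source sentence.
print(model.translate("Európska komisia dnes zverejnila nové nariadenie."))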

idoc_install.txt Normal file

@@ -0,0 +1,13 @@
conda install python=3.7 numpy=1.19
conda install cudatoolkit=11.0 nccl -c nvidia
conda install pytorch -c pytorch
# https://github.com/pytorch/fairseq
git clone https://github.com/pytorch/fairseq
cd fairseq
pip install .
cd ..
git clone https://github.com/NVIDIA/apex
cd apex
# https://github.com/NVIDIA/apex/issues/1043
git reset --hard 3fe10b5597ba14a748ebb271a6ab97c09c5701ac
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
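
A short sanity check (a sketch, assuming the installation above completed) to confirm that the GPU stack, fairseq and apex are importable before starting training:

import torch
import fairseq

print("torch", torch.__version__, "CUDA available:", torch.cuda.is_available())
print("fairseq", fairseq.__version__)

# apex is optional but enables the fused kernels built above.
try:
    import apex  # noqa: F401
    print("apex import OK")
except ImportError as exc:
    print("apex not available:", exc)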

pre-process.sh Normal file

@@ -0,0 +1,86 @@
CORPUSNAME=$1
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=10000
if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    exit
fi
src=en
tgt=sk
lang=en-sk
prep=$CORPUSNAME.en-sk
tmp=$prep/tmp
orig=corpus
mkdir -p $orig $tmp $prep
echo "pre-processing train data..."
for l in $src $tgt; do
    f=$CORPUSNAME.$lang.$l
    tok=$CORPUSNAME.$lang.tok.$l
    cat $orig/$lang/$f | \
        perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
    echo ""
done

perl $CLEAN -ratio 1.5 $tmp/$CORPUSNAME.$lang.tok $src $tgt $tmp/$CORPUSNAME.$lang.clean 1 175

for l in $src $tgt; do
    perl $LC < $tmp/$CORPUSNAME.$lang.clean.$l > $tmp/$CORPUSNAME.$lang.$l
done
echo "pre-processing valid/test data..."
for l in $src $tgt; do
    xd=$CORPUSNAME.$lang.$l
    for o in `ls $orig/$lang/$xd`; do
        fname=${o##*/}
        f=$tmp/${fname%.*}
        echo $o $f
        # XML segment extraction (not needed for this plaintext corpus):
        # grep '<seg id' $o | \
        # sed -e 's/<seg id="[0-9]*">\s*//g' | \
        # sed -e 's/\s*<\/seg>\s*//g' | \
        # sed -e "s/\’/\'/g" | \
        cat $o | \
            perl $TOKENIZER -threads 8 -l $l | \
            perl $LC > $f
        echo ""
    done
done
echo "creating train, valid, test..."
for l in $src $tgt; do
    # Every 23rd sentence goes to the validation set, the rest to training;
    # the full corpus is reused as the test set.
    awk '{if (NR%23 == 0) print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/valid.$l
    awk '{if (NR%23 != 0) print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/train.$l
    cat $tmp/$CORPUSNAME.en-sk.$l > $tmp/test.$l
done
TRAIN=$tmp/train.en-sk
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
    cat $tmp/train.$l >> $TRAIN
done

echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE

for L in $src $tgt; do
    for f in train.$L valid.$L test.$L; do
        echo "apply_bpe.py to ${f}..."
        python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
    done
done
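
For completeness, the last step can also be reproduced from Python with subword-nmt's API. A rough sketch, assuming the codes file written by this script (eceuropa.en-sk/code); the lower-cased sample sentence is illustrative only:

from subword_nmt.apply_bpe import BPE

# Load the merge operations learned by learn_bpe.py.
with open("eceuropa.en-sk/code", encoding="utf-8") as codes:
    bpe = BPE(codes)

# Rare words are split into subword units marked with the default "@@" separator.
print(bpe.process_line("the european commission adopted the regulation"))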