dp2022/all-in-one.sh
2022-01-11 23:01:48 +00:00

24 lines
1.6 KiB
Bash

# End-to-end Slovak -> English pipeline for one corpus:
#   1. download and unpack the en-sk plaintext corpus archive,
#   2. place the eceuropa files under <NUMBER>-<CORPUSNAME>/corpus/en-sk/,
#   3. run ../pre-process.sh for the corpus,
#   4. binarize with fairseq-preprocess, train an fconv model for one epoch,
#      run fairseq-generate on the binarized data,
#   5. bundle dictionaries and BPE codes with the checkpoint and start
#      fairseq-interactive.
#
# Must be started from the directory that contains pre-process.sh, because
# that script is invoked as ../pre-process.sh after changing into the work dir.

# Abort on the first failing command, on unset variables and on pipeline
# failures, so that later steps never run against a half-prepared directory.
set -euo pipefail

NUMBER=1
CORPUSNAME=eceuropa

ARCHIVE_URL=https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0006-AAE0-A/corpus-en-sk-plaintext.tar.gz
ARCHIVE=corpus-en-sk-plaintext.tar.gz
EXTRACT_DIR=corpus-en-sk-plaintext
WORK_DIR="$NUMBER-$CORPUSNAME"
CORPUS_DIR="$WORK_DIR/corpus/en-sk"

# --- Download and unpack -----------------------------------------------------
wget "$ARCHIVE_URL"
tar -xvzf "$ARCHIVE"
gzip -d "$EXTRACT_DIR/ec-europa/eceuropa.en-sk.en.gz"
gzip -d "$EXTRACT_DIR/ec-europa/eceuropa.en-sk.sk.gz"

# --- Lay out the corpus directory --------------------------------------------
mkdir -p "$CORPUS_DIR/"
mv "$EXTRACT_DIR/ec-europa/eceuropa.en-sk.en" "$CORPUS_DIR/"
mv "$EXTRACT_DIR/ec-europa/eceuropa.en-sk.sk" "$CORPUS_DIR/"
# The destination uses $CORPUSNAME (not a hard-coded corpus name) so it always
# matches the directory created above.
# NOTE(review): only the Slovak side is copied to train.sk here; presumably
# pre-process.sh takes care of the English side -- confirm.
cp "$CORPUS_DIR/eceuropa.en-sk.sk" "$CORPUS_DIR/train.sk"
rm -r "$EXTRACT_DIR"

# --- Pre-process -------------------------------------------------------------
cd "$WORK_DIR" || exit 1
bash ../pre-process.sh "$CORPUSNAME"

# Paths below are relative to $WORK_DIR.
# NOTE(review): $TEXT/{train,valid,test} and $TEXT/code are presumably produced
# by pre-process.sh -- verify against that script.
TEXT="$CORPUSNAME.en-sk"
DATA_BIN="data-bin/$CORPUSNAME.en-sk"
MODEL_DIR=checkpoints/fconv

# --- Binarize, train, evaluate -----------------------------------------------
fairseq-preprocess --source-lang sk --target-lang en \
  --trainpref "$TEXT/train" --validpref "$TEXT/valid" --testpref "$TEXT/test" \
  --destdir "$DATA_BIN"

mkdir -p "$MODEL_DIR"
CUDA_VISIBLE_DEVICES=0 fairseq-train "$DATA_BIN" \
  --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
  --arch fconv_iwslt_de_en --max-epoch 1 --save-dir "$MODEL_DIR"

fairseq-generate "$DATA_BIN" \
  --path "$MODEL_DIR/checkpoint_best.pt" --batch-size 128 --beam 5

# --- Interactive translation -------------------------------------------------
# fairseq-interactive is pointed at $MODEL_DIR as its data directory, so the
# dictionaries and the BPE codes are copied next to the checkpoint first.
cp "$DATA_BIN/dict.en.txt" "$MODEL_DIR/"
cp "$DATA_BIN/dict.sk.txt" "$MODEL_DIR/"
cp "$TEXT/code" "$MODEL_DIR/bpecodes"

fairseq-interactive --path "$MODEL_DIR/checkpoint_best.pt" "$MODEL_DIR" \
  --beam 5 --source-lang sk --target-lang en \
  --tokenizer moses --bpe subword_nmt --bpe-codes "$MODEL_DIR/bpecodes"