#!/usr/bin/env bash
#
# Download the en-sk plaintext corpus archive, extract the ec-europa part,
# and run a fairseq pipeline (preprocess -> train -> generate -> interactive)
# for Slovak -> English translation with the fconv_iwslt_de_en architecture.
#
# Requirements visible from the commands below:
#   - wget, tar, gzip
#   - fairseq command-line tools (fairseq-preprocess, fairseq-train,
#     fairseq-generate, fairseq-interactive)
#   - a pre-process.sh script located next to the directory from which this
#     script is started (it is invoked as ../pre-process.sh from the work dir)
#
# Everything is created relative to the current working directory.

# Abort on the first failing command, on unset variables, and on failures
# inside pipelines, so that later stages never run on partial data.
set -euo pipefail

NUMBER=1
CORPUSNAME=eceuropa

ARCHIVE_URL=https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0006-AAE0-A/corpus-en-sk-plaintext.tar.gz
ARCHIVE=corpus-en-sk-plaintext.tar.gz
EXTRACT_DIR=corpus-en-sk-plaintext

WORK_DIR="$NUMBER-$CORPUSNAME"
CORPUS_DIR="$WORK_DIR/corpus/en-sk"

# --- Fetch and unpack the raw corpus -----------------------------------------
wget "$ARCHIVE_URL"
tar -xvzf "$ARCHIVE"

# File names inside the archive are fixed by the upstream tarball, so they are
# spelled out literally instead of being derived from CORPUSNAME.
gzip -d "$EXTRACT_DIR/ec-europa/eceuropa.en-sk.en.gz"
gzip -d "$EXTRACT_DIR/ec-europa/eceuropa.en-sk.sk.gz"

# --- Arrange the corpus in the working directory -----------------------------
mkdir -p "$CORPUS_DIR/"
mv "$EXTRACT_DIR/ec-europa/eceuropa.en-sk.en" "$CORPUS_DIR/"
mv "$EXTRACT_DIR/ec-europa/eceuropa.en-sk.sk" "$CORPUS_DIR/"

# Destination uses CORPUS_DIR (i.e. $NUMBER-$CORPUSNAME) like every other path;
# the original hard-coded "eceuropa" here, which would break as soon as
# CORPUSNAME is changed.
# NOTE(review): only the Slovak side is copied to train.sk; no train.en is
# created by this script. Presumably pre-process.sh takes care of the English
# side -- confirm.
cp "$CORPUS_DIR/eceuropa.en-sk.sk" "$CORPUS_DIR/train.sk"

rm -r -- "$EXTRACT_DIR"

# --- Preprocess --------------------------------------------------------------
# All remaining paths are relative to the working directory.
cd "$WORK_DIR" || {
  printf 'cannot cd to %s\n' "$WORK_DIR" >&2
  exit 1
}

# pre-process.sh is expected to leave "$CORPUSNAME.en-sk/" behind with the
# train/valid/test prefixes and the BPE "code" file that are read below.
bash ../pre-process.sh "$CORPUSNAME"

TEXT="$CORPUSNAME.en-sk"
DATA_BIN="data-bin/$CORPUSNAME.en-sk"
MODEL_DIR=checkpoints/fconv

fairseq-preprocess \
  --source-lang sk --target-lang en \
  --trainpref "$TEXT/train" \
  --validpref "$TEXT/valid" \
  --testpref "$TEXT/test" \
  --destdir "$DATA_BIN"

# --- Train (single epoch, first GPU only) ------------------------------------
mkdir -p "$MODEL_DIR"

CUDA_VISIBLE_DEVICES=0 fairseq-train "$DATA_BIN" \
  --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 \
  --max-tokens 4000 \
  --arch fconv_iwslt_de_en \
  --max-epoch 1 \
  --save-dir "$MODEL_DIR"

# --- Evaluate on the binarized data ------------------------------------------
fairseq-generate "$DATA_BIN" \
  --path "$MODEL_DIR/checkpoint_best.pt" \
  --batch-size 128 --beam 5

# --- Bundle dictionaries and BPE codes next to the checkpoint ----------------
# fairseq-interactive below is pointed at MODEL_DIR as its data directory, so
# the dictionaries and BPE codes have to live there.
cp "$DATA_BIN/dict.en.txt" "$MODEL_DIR/"
cp "$DATA_BIN/dict.sk.txt" "$MODEL_DIR/"
cp "$TEXT/code" "$MODEL_DIR/bpecodes"

# --- Interactive translation (reads source sentences from stdin) -------------
fairseq-interactive \
  --path "$MODEL_DIR/checkpoint_best.pt" "$MODEL_DIR" \
  --beam 5 \
  --source-lang sk --target-lang en \
  --tokenizer moses \
  --bpe subword_nmt --bpe-codes "$MODEL_DIR/bpecodes"