#!/usr/bin/env bash
#
# Prepare an English-Slovak parallel corpus: tokenize, clean, lowercase,
# split into train/valid/test and apply BPE.
#
# Usage: bash <this-script> CORPUSNAME
#
# Reads:  corpus/en-sk/CORPUSNAME.en-sk.en
#         corpus/en-sk/CORPUSNAME.en-sk.sk
# Writes: CORPUSNAME.en-sk/{train,valid,test}.{en,sk}  (BPE-encoded)
#         CORPUSNAME.en-sk/code                        (learned BPE codes)
#         CORPUSNAME.en-sk/tmp/                        (intermediate files)
#
# Requires git, perl and python on PATH. The Moses and subword-nmt
# repositories are cloned into the current directory when absent.

set -euo pipefail

# Print a message to stderr and abort with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

CORPUSNAME=${1:-}
[ -n "$CORPUSNAME" ] || die "Usage: ${0##*/} CORPUSNAME"

# Clone only when the checkout is missing so the script can be re-run;
# `git clone` fails when the destination directory already exists.
if [ ! -d mosesdecoder ]; then
  echo 'Cloning Moses github repository (for tokenization scripts)...'
  git clone https://github.com/moses-smt/mosesdecoder.git
fi

if [ ! -d subword-nmt ]; then
  echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
  git clone https://github.com/rsennrich/subword-nmt.git
fi

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=10000

if [ ! -d "$SCRIPTS" ]; then
  die "Please set SCRIPTS variable correctly to point to Moses scripts."
fi

src=en
tgt=sk
lang=en-sk
prep=$CORPUSNAME.$lang
tmp=$prep/tmp
orig=corpus

mkdir -p -- "$orig" "$tmp" "$prep"

# Fail early with a clear message instead of producing empty outputs.
for l in "$src" "$tgt"; do
  [ -r "$orig/$lang/$CORPUSNAME.$lang.$l" ] \
    || die "Missing input file: $orig/$lang/$CORPUSNAME.$lang.$l"
done

echo "pre-processing train data..."
for l in "$src" "$tgt"; do
  f=$CORPUSNAME.$lang.$l
  tok=$CORPUSNAME.$lang.tok.$l

  perl "$TOKENIZER" -threads 8 -l "$l" < "$orig/$lang/$f" > "$tmp/$tok"
  echo ""
done

# Filter the tokenized pair; "-ratio 1.5" and the "1 175" bounds are passed
# straight to clean-corpus-n.perl, which writes <out>.$src and <out>.$tgt.
perl "$CLEAN" -ratio 1.5 \
  "$tmp/$CORPUSNAME.$lang.tok" "$src" "$tgt" \
  "$tmp/$CORPUSNAME.$lang.clean" 1 175

for l in "$src" "$tgt"; do
  perl "$LC" < "$tmp/$CORPUSNAME.$lang.clean.$l" > "$tmp/$CORPUSNAME.$lang.$l"
done

# NOTE(review): this step is kept behaviourally as it was, but looks
# vestigial and should be confirmed or removed:
#   * the source file name was built from the loop variable left over from
#     the previous loop, i.e. it always names the $tgt file;
#   * the tokenizer receives the echoed "<input> <output>" path names, not
#     the contents of the input file (the former grep/sed extraction stage
#     had been commented out);
#   * both iterations write the same $tmp/${fname%.*} file, and nothing
#     below reads it.
xd=$CORPUSNAME.$lang.$tgt

echo "pre-processing valid/test data..."
for l in "$src" "$tgt"; do
  o=$orig/$lang/$xd
  [ -e "$o" ] || continue
  fname=${o##*/}
  f=$tmp/${fname%.*}

  echo "$o" "$f" \
    | perl "$TOKENIZER" -threads 8 -l "$l" \
    | perl "$LC" > "$f"
  echo ""
done

echo "creating train, valid, test..."
for l in "$src" "$tgt"; do
  cleaned=$tmp/$CORPUSNAME.$lang.$l

  # Every 23rd line goes to valid, all remaining lines to train.
  awk 'NR % 23 == 0' "$cleaned" > "$tmp/valid.$l"
  awk 'NR % 23 != 0' "$cleaned" > "$tmp/train.$l"

  # NOTE(review): test is the complete cleaned corpus, so it overlaps with
  # both train and valid - confirm this is intended.
  cat -- "$cleaned" > "$tmp/test.$l"
done

TRAIN=$tmp/train.$lang
BPE_CODE=$prep/code

# BPE codes are learned on both languages' training text concatenated.
cat -- "$tmp/train.$src" "$tmp/train.$tgt" > "$TRAIN"

echo "learn_bpe.py on ${TRAIN}..."
python "$BPEROOT/learn_bpe.py" -s "$BPE_TOKENS" < "$TRAIN" > "$BPE_CODE"

for L in "$src" "$tgt"; do
  for f in "train.$L" "valid.$L" "test.$L"; do
    echo "apply_bpe.py to ${f}..."
    python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" < "$tmp/$f" > "$prep/$f"
  done
done