#!/usr/bin/env bash
# Source listing metadata: Bash script, originally 87 lines, 2.2 KiB.
# Pre-process a parallel en-sk corpus: tokenize, clean, lowercase, split into
# train/valid/test, then learn and apply BPE.
#
# Usage: bash <this-script> CORPUSNAME
#
# Reads:   corpus/en-sk/CORPUSNAME.en-sk.en and corpus/en-sk/CORPUSNAME.en-sk.sk
# Writes:  CORPUSNAME.en-sk/{train,valid,test}.{en,sk}  (BPE-encoded)
#          CORPUSNAME.en-sk/code                        (learned BPE codes)
#          CORPUSNAME.en-sk/tmp/                        (intermediate files)
# Needs:   git, perl, awk and a Python interpreter (override with PYTHON=...).

# Abort on the first failing command, on unset variables, and on failures in
# any stage of a pipeline, instead of carrying on with empty/partial files.
set -euo pipefail

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s CORPUSNAME\n' "${0##*/}" >&2
  exit 2
fi
CORPUSNAME=$1

# Only clone when the checkout is missing, so the script can be re-run.
if [[ ! -d mosesdecoder ]]; then
  echo 'Cloning Moses github repository (for tokenization scripts)...'
  git clone https://github.com/moses-smt/mosesdecoder.git
fi

if [[ ! -d subword-nmt ]]; then
  echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
  git clone https://github.com/rsennrich/subword-nmt.git
fi

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=10000
PYTHON=${PYTHON:-python}

if [[ ! -d "$SCRIPTS" ]]; then
  echo "Please set SCRIPTS variable correctly to point to Moses scripts." >&2
  exit 1
fi

src=en
tgt=sk
lang=$src-$tgt
prep=$CORPUSNAME.$lang
tmp=$prep/tmp
orig=corpus

mkdir -p -- "$orig" "$tmp" "$prep"

echo "pre-processing train data..."
for l in "$src" "$tgt"; do
  f=$CORPUSNAME.$lang.$l
  tok=$CORPUSNAME.$lang.tok.$l

  perl "$TOKENIZER" -threads 8 -l "$l" < "$orig/$lang/$f" > "$tmp/$tok"
  echo ""
done

# Filter the tokenized sentence pairs (arguments: -ratio 1.5, then 1 and 175).
perl "$CLEAN" -ratio 1.5 "$tmp/$CORPUSNAME.$lang.tok" "$src" "$tgt" \
  "$tmp/$CORPUSNAME.$lang.clean" 1 175

for l in "$src" "$tgt"; do
  perl "$LC" < "$tmp/$CORPUSNAME.$lang.clean.$l" > "$tmp/$CORPUSNAME.$lang.$l"
done

# The original built this name from the loop variable left over from the loop
# above, i.e. always the target-language file; that is now spelled out.
xd=$CORPUSNAME.$lang.$tgt

echo "pre-processing valid/test data..."
# NOTE(review): this step pipes the *path names* (not the file contents)
# through the tokenizer and writes the result to $tmp/$CORPUSNAME.$lang, which
# nothing below reads. Behaviour is kept as-is; it looks like a leftover from
# the commented-out <seg> extraction and should be fixed or removed once the
# intent is confirmed.
for l in "$src" "$tgt"; do
  o=$orig/$lang/$xd
  [[ -e "$o" ]] || continue

  fname=${o##*/}
  f=$tmp/${fname%.*}
  printf '%s %s\n' "$o" "$f" \
    | perl "$TOKENIZER" -threads 8 -l "$l" \
    | perl "$LC" > "$f"
  echo ""
done

echo "creating train, valid, test..."
for l in "$src" "$tgt"; do
  corpus_file=$tmp/$CORPUSNAME.$lang.$l

  # Every 23rd line goes to valid, all other lines to train.
  awk 'NR % 23 == 0' < "$corpus_file" > "$tmp/valid.$l"
  awk 'NR % 23 != 0' < "$corpus_file" > "$tmp/train.$l"

  # NOTE(review): test is a copy of the whole cleaned corpus, so it overlaps
  # with both train and valid. Kept as in the original; confirm it is intended.
  cat "$corpus_file" > "$tmp/test.$l"
done

TRAIN=$tmp/train.$lang
BPE_CODE=$prep/code

# BPE codes are learned on source and target training text concatenated.
cat "$tmp/train.$src" "$tmp/train.$tgt" > "$TRAIN"

echo "learn_bpe.py on ${TRAIN}..."
"$PYTHON" "$BPEROOT/learn_bpe.py" -s "$BPE_TOKENS" < "$TRAIN" > "$BPE_CODE"

for l in "$src" "$tgt"; do
  for f in "train.$l" "valid.$l" "test.$l"; do
    echo "apply_bpe.py to ${f}..."
    "$PYTHON" "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" < "$tmp/$f" > "$prep/$f"
  done
done