dp2022/pre-process.sh
2022-01-11 23:01:48 +00:00

87 lines
2.2 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Pre-process a parallel en-sk corpus: tokenize, clean, lowercase, split into
# train/valid/test and apply BPE.
#
# Usage: pre-process.sh CORPUSNAME
#   input : corpus/en-sk/CORPUSNAME.en-sk.en and corpus/en-sk/CORPUSNAME.en-sk.sk
#   output: CORPUSNAME.en-sk/{train,valid,test}.{en,sk} and the BPE codes in
#           CORPUSNAME.en-sk/code (intermediate files go to CORPUSNAME.en-sk/tmp)
CORPUSNAME=${1:-}
if [ -z "$CORPUSNAME" ]; then
  # Without a corpus name every path below would degenerate to ".en-sk...".
  echo "Usage: $0 CORPUSNAME" >&2
  exit 1
fi

# Only clone when the checkout is missing, so the script can be re-run without
# git failing on an already existing destination directory.
if [ ! -d mosesdecoder ]; then
  echo 'Cloning Moses github repository (for tokenization scripts)...'
  git clone https://github.com/moses-smt/mosesdecoder.git
fi
if [ ! -d subword-nmt ]; then
  echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
  git clone https://github.com/rsennrich/subword-nmt.git
fi

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=10000

# Abort with a failing status (a bare `exit` would return the status of the
# preceding echo, i.e. success) when a required checkout is not in place.
if [ ! -d "$SCRIPTS" ]; then
  echo "Please set SCRIPTS variable correctly to point to Moses scripts." >&2
  exit 1
fi
if [ ! -d "$BPEROOT" ]; then
  echo "Please set BPEROOT variable correctly to point to the subword-nmt scripts." >&2
  exit 1
fi

src=en
tgt=sk
lang=en-sk
prep=$CORPUSNAME.en-sk   # output directory
tmp=$prep/tmp            # intermediate files
orig=corpus              # directory holding the raw corpus

mkdir -p "$orig" "$tmp" "$prep" || exit 1
echo "pre-processing train data..."

# Step 1: tokenize each side of the raw corpus with the Moses tokenizer.
# `l` is deliberately a plain top-level loop variable: the statement that
# follows this section (xd=...) reads the value it is left with.
for l in "$src" "$tgt"; do
  f=$CORPUSNAME.$lang.$l
  tok=$CORPUSNAME.$lang.tok.$l
  if [ ! -r "$orig/$lang/$f" ]; then
    # Fail early instead of tokenizing nothing and producing empty outputs.
    echo "Missing input file: $orig/$lang/$f" >&2
    exit 1
  fi
  if ! perl "$TOKENIZER" -threads 8 -l "$l" < "$orig/$lang/$f" > "$tmp/$tok"; then
    echo "Tokenization failed for $orig/$lang/$f" >&2
    exit 1
  fi
  echo ""
done

# Step 2: run clean-corpus-n.perl on the tokenized pair with ratio 1.5 and
# length bounds 1 and 175; it reads <prefix>.tok.{src,tgt} and writes
# <prefix>.clean.{src,tgt}.
if ! perl "$CLEAN" -ratio 1.5 "$tmp/$CORPUSNAME.$lang.tok" "$src" "$tgt" \
  "$tmp/$CORPUSNAME.$lang.clean" 1 175; then
  echo "Corpus cleaning failed" >&2
  exit 1
fi

# Step 3: lowercase the cleaned files into $tmp/CORPUSNAME.<lang>.<l>.
for l in "$src" "$tgt"; do
  if ! perl "$LC" < "$tmp/$CORPUSNAME.$lang.clean.$l" > "$tmp/$CORPUSNAME.$lang.$l"; then
    echo "Lowercasing failed for $tmp/$CORPUSNAME.$lang.clean.$l" >&2
    exit 1
  fi
done
# NOTE(review): $l is read here outside of any loop, so it holds whatever the
# preceding `for l in $src $tgt` loop left in it (its last item, $tgt). xd is
# therefore the same file name for both iterations of the outer loop below.
xd=$CORPUSNAME.$lang.$l
echo "pre-processing valid/test data..."
# For each language, list $orig/$lang/$xd and, for every path returned, run the
# tokenizer and the lowercaser and write the result to
# $tmp/<file name with its last extension removed>.
# NOTE(review): the pipeline's input is `echo $o $f` (the two path strings),
# not the contents of $o; the only stages that reference the contents of $o are
# the commented-out grep/sed lines. Confirm whether the file contents were
# meant to be processed here.
# NOTE(review): in the visible part of this script nothing reads the file
# written to $f afterwards — verify whether this section is still needed.
for l in $src $tgt; do
for o in `ls $orig/$lang/$xd`; do
# fname: $o without its directory part; f: fname minus its last extension, under $tmp.
fname=${o##*/}
f=$tmp/${fname%.*}
echo $o $f | \
# grep '<seg id' $o | \
# sed -e 's/<seg id="[0-9]*">\s*//g' | \
# sed -e 's/\s*<\/seg>\s*//g' | \
# sed -e "s/\/\'/g" | \
perl $TOKENIZER -threads 8 -l $l | \
perl $LC > $f
echo ""
done
done
echo "creating train, valid, test..."

# Every VALID_EVERY-th line of the corpus goes to the validation set; all other
# lines go to the training set. Defaults to 23, overridable from the environment.
VALID_EVERY=${VALID_EVERY:-23}

#######################################
# Split a pre-processed corpus into train/valid/test files, one set per language.
# Globals:
#   VALID_EVERY (read) - validation sampling interval, positive integer
# Arguments:
#   $1    - input prefix; the input for language L is "$1.L"
#   $2    - output directory; receives train.L, valid.L and test.L
#   $3... - language suffixes to process
# Returns:
#   0 on success; 1 if VALID_EVERY is invalid, an input file is not readable,
#   or one of the output files cannot be written.
#######################################
split_corpus() {
  local prefix=$1 outdir=$2 l input
  shift 2

  case $VALID_EVERY in
    '' | *[!0-9]* | 0)
      echo "VALID_EVERY must be a positive integer, got '$VALID_EVERY'" >&2
      return 1
      ;;
  esac

  for l in "$@"; do
    input=$prefix.$l
    if [ ! -r "$input" ]; then
      # Fail loudly instead of silently producing empty train/valid/test files.
      echo "Missing input file: $input" >&2
      return 1
    fi
    awk -v n="$VALID_EVERY" 'NR % n == 0' "$input" > "$outdir/valid.$l" || return 1
    awk -v n="$VALID_EVERY" 'NR % n != 0' "$input" > "$outdir/train.$l" || return 1
    # The test set is a copy of the whole input (train and valid lines included).
    cat "$input" > "$outdir/test.$l" || return 1
  done
}

# $src and $tgt are bare language codes; they are left unquoted like in the
# other `for l in $src $tgt` loops of this script.
# shellcheck disable=SC2086
split_corpus "$tmp/$CORPUSNAME.en-sk" "$tmp" $src $tgt || exit 1
TRAIN=$tmp/train.en-sk
BPE_CODE=$prep/code

# Build the BPE training text: both training sides concatenated into one file,
# so a single set of codes is learned and then applied to both languages.
# `>` truncates any file left over from a previous run.
if ! cat "$tmp/train.$src" "$tmp/train.$tgt" > "$TRAIN"; then
  echo "Could not assemble BPE training data in $TRAIN" >&2
  exit 1
fi

echo "learn_bpe.py on ${TRAIN}..."
# Stop here on failure: otherwise an empty or partial code file would be
# applied to every split below without any visible error.
if ! python "$BPEROOT/learn_bpe.py" -s "$BPE_TOKENS" < "$TRAIN" > "$BPE_CODE"; then
  echo "learn_bpe.py failed on $TRAIN" >&2
  exit 1
fi

# Apply the learned codes to every split of every language; the encoded files
# are written to $prep under the same names as their inputs in $tmp.
for l in "$src" "$tgt"; do
  for f in "train.$l" "valid.$l" "test.$l"; do
    echo "apply_bpe.py to ${f}..."
    if ! python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" < "$tmp/$f" > "$prep/$f"; then
      echo "apply_bpe.py failed on $tmp/$f" >&2
      exit 1
    fi
  done
done