Upload files to ''
This commit is contained in: parent e757160ae5, commit 2fcdc9edbb
BPETokenizer.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
tokenizedLine = ""
fileName = "eceuropa.test.raw"


def listToString(s):
    str1 = " "
    return (str1.join(s))


with open('raw/'+fileName) as read_file:
    for line in read_file:
        tokenizedLine = tokenizer.encode(line.rstrip())
        with open('tokenized/bpe-tok_'+fileName, 'a') as input_file:
            stringified = listToString(tokenizedLine.tokens)
            print(stringified)
            input_file.write(stringified)
            input_file.write("\n")
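For reference, tokenizer.encode(...) returns an Encoding object whose .tokens attribute is a list of subword strings, and listToString simply space-joins that list. Below is a minimal sketch of the same loop, assuming the tokenizer JSON and input file named above; it opens the output file once in write mode instead of re-opening it in append mode per line, which is a tidying suggestion rather than the committed behavior:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")  # path taken from the script above
file_name = "eceuropa.test.raw"                              # input file assumed from the script above

with open("raw/" + file_name) as read_file, \
     open("tokenized/bpe-tok_" + file_name, "w") as out_file:  # "w" here, not "a" as in the script
    for line in read_file:
        tokens = tokenizer.encode(line.rstrip()).tokens  # list of subword strings for this line
        out_file.write(" ".join(tokens) + "\n")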
BPETokenizerTrainer.py (new file, 13 lines)
@@ -0,0 +1,13 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# training the tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
tokenizer.train(files, trainer)
tokenizer.save("data/bpe-tokenizer-wiki.json")
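The trained model is fully described by the saved JSON, so it can be reloaded and sanity-checked without retraining. A minimal check, assuming data/bpe-tokenizer-wiki.json was produced by the script above (the sample sentence is only illustrative):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/bpe-tokenizer-wiki.json")
enc = tokenizer.encode("Hello, how are you?")
print(enc.tokens)                  # subword strings produced by the learned merges
print(enc.ids)                     # corresponding vocabulary ids
print(tokenizer.get_vocab_size())  # size of the learned vocabulary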
Trenovania.ods (new binary file, not shown)
WordPieceTokenizer.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")
tokenizedLine = ""
fileName = "eceuropa.sk"
files = ["train", "valid", "test"]


def listToString(s):
    str1 = " "
    return (str1.join(s))


for file in files:
    with open('raw/'+fileName+'.'+file+'.raw') as read_file:
        for line in read_file:
            tokenizedLine = tokenizer.encode(line.rstrip())
            with open('tokenized/wordpiece-tok_'+fileName+'.'+file+'.en', 'a') as input_file:
                stringified = listToString(tokenizedLine.tokens)
                print(stringified)
                input_file.write(stringified)
                input_file.write("\n")
WordPieceTokenizerTrainer.py (new file, 13 lines)
@@ -0,0 +1,13 @@
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

# training the tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()
# files = [f"raw/eceuropa.{split}.raw" for split in ["test", "train", "valid"]]
files = ["raw/eujournal.sk.raw"]
tokenizer.train(files, trainer)
tokenizer.save("wordpiece-tokenizer-eujournal-sk.json")
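WordPiece marks word-internal pieces with a "##" prefix, so the space-joined output written by WordPieceTokenizer.py is not directly readable text. If the surface form is needed again, the same library can reverse the encoding; a minimal sketch, assuming the JSON saved above and an illustrative Slovak sentence:

from tokenizers import Tokenizer, decoders

tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")
tokenizer.decoder = decoders.WordPiece()  # rejoins "##" continuation pieces into whole words

enc = tokenizer.encode("Toto je jednoduchá veta.")
print(enc.tokens)                 # pieces such as "jedno", "##duchá" (exact split depends on the learned vocab)
print(tokenizer.decode(enc.ids))  # detokenized text with the "##" pieces merged back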
all-in-one.sh (new file, 24 lines)
@@ -0,0 +1,24 @@
NUMBER=1
CORPUSNAME=eceuropa

wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0006-AAE0-A/corpus-en-sk-plaintext.tar.gz
tar -xvzf corpus-en-sk-plaintext.tar.gz
gzip -d corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.en.gz
gzip -d corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.sk.gz
mkdir -p $NUMBER-$CORPUSNAME/corpus/en-sk/
mv corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.en $NUMBER-$CORPUSNAME/corpus/en-sk/
mv corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.sk $NUMBER-$CORPUSNAME/corpus/en-sk/
cp $NUMBER-$CORPUSNAME/corpus/en-sk/eceuropa.en-sk.sk $NUMBER-eceuropa/corpus/en-sk/train.sk
rm -r corpus-en-sk-plaintext
cd $NUMBER-$CORPUSNAME
bash ../pre-process.sh $CORPUSNAME
TEXT=$CORPUSNAME.en-sk
fairseq-preprocess --source-lang sk --target-lang en --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test --destdir data-bin/$CORPUSNAME.en-sk
mkdir -p checkpoints/fconv
CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/$CORPUSNAME.en-sk --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 --arch fconv_iwslt_de_en --max-epoch 1 --save-dir checkpoints/fconv
fairseq-generate data-bin/$CORPUSNAME.en-sk --path checkpoints/fconv/checkpoint_best.pt --batch-size 128 --beam 5
cp data-bin/$CORPUSNAME.en-sk/dict.en.txt checkpoints/fconv/
cp data-bin/$CORPUSNAME.en-sk/dict.sk.txt checkpoints/fconv/
cp $CORPUSNAME.en-sk/code checkpoints/fconv/bpecodes
MODEL_DIR=checkpoints/fconv
fairseq-interactive --path $MODEL_DIR/checkpoint_best.pt $MODEL_DIR --beam 5 --source-lang sk --target-lang en --tokenizer moses --bpe subword_nmt --bpe-codes $MODEL_DIR/bpecodes
idoc_install.txt (new file, 13 lines)
@@ -0,0 +1,13 @@
conda install python=3.7 numpy=1.19
conda install cudatoolkit=11.0 nccl -c nvidia
conda install pytorch -c pytorch
# https://github.com/pytorch/fairseq
git clone https://github.com/pytorch/fairseq
cd fairseq
pip install .
cd ..
git clone https://github.com/NVIDIA/apex
cd apex
# https://github.com/NVIDIA/apex/issues/1043
git reset --hard 3fe10b5597ba14a748ebb271a6ab97c09c5701ac
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
pre-process.sh (new file, 86 lines)
@@ -0,0 +1,86 @@
CORPUSNAME=$1

echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git

echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=10000


if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    exit
fi

src=en
tgt=sk
lang=en-sk
prep=$CORPUSNAME.en-sk
tmp=$prep/tmp
orig=corpus

mkdir -p $orig $tmp $prep

echo "pre-processing train data..."
for l in $src $tgt; do
    f=$CORPUSNAME.$lang.$l
    tok=$CORPUSNAME.$lang.tok.$l
    cat $orig/$lang/$f | \
        perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
    echo ""
done
perl $CLEAN -ratio 1.5 $tmp/$CORPUSNAME.$lang.tok $src $tgt $tmp/$CORPUSNAME.$lang.clean 1 175
for l in $src $tgt; do
    perl $LC < $tmp/$CORPUSNAME.$lang.clean.$l > $tmp/$CORPUSNAME.$lang.$l
done

xd=$CORPUSNAME.$lang.$l

echo "pre-processing valid/test data..."
for l in $src $tgt; do
    for o in `ls $orig/$lang/$xd`; do
        fname=${o##*/}
        f=$tmp/${fname%.*}
        echo $o $f | \
        # grep '<seg id' $o | \
        # sed -e 's/<seg id="[0-9]*">\s*//g' | \
        # sed -e 's/\s*<\/seg>\s*//g' | \
        # sed -e "s/\’/\'/g" | \
        perl $TOKENIZER -threads 8 -l $l | \
        perl $LC > $f
        echo ""
    done
done

echo "creating train, valid, test..."
for l in $src $tgt; do
    awk '{if (NR%23 == 0)  print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/valid.$l
    awk '{if (NR%23 != 0)  print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/train.$l

    cat $tmp/$CORPUSNAME.en-sk.$l \
        > $tmp/test.$l
done

TRAIN=$tmp/train.en-sk
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
    cat $tmp/train.$l >> $TRAIN
done

echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE

for L in $src $tgt; do
    for f in train.$L valid.$L test.$L; do
        echo "apply_bpe.py to ${f}..."
        python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
    done
done