Upload files to ''

Dominik Nagy 2022-01-11 23:01:48 +00:00
parent e757160ae5
commit 2fcdc9edbb
8 changed files with 189 additions and 0 deletions

BPETokenizer.py Normal file

@@ -0,0 +1,19 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Load the previously trained BPE tokenizer.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

tokenizedLine = ""
fileName = "eceuropa.test.raw"

def listToString(s):
    # Join a list of token strings into one space-separated line.
    return " ".join(s)

# Tokenize the raw corpus line by line and append the result to the output file.
with open('raw/' + fileName) as read_file:
    for line in read_file:
        tokenizedLine = tokenizer.encode(line.rstrip())
        with open('tokenized/bpe-tok_' + fileName, 'a') as input_file:
            stringified = listToString(tokenizedLine.tokens)
            print(stringified)
            input_file.write(stringified)
            input_file.write("\n")

BPETokenizerTrainer.py Normal file

@@ -0,0 +1,13 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
# training the tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
tokenizer.train(files, trainer)
tokenizer.save("data/bpe-tokenizer-wiki.json")
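
A quick way to check the result (a sketch only; the sample sentence below is an assumption, not part of any corpus) is to load the saved tokenizer back and inspect the subword tokens it produces, which is what BPETokenizer.py then does file by file:

from tokenizers import Tokenizer

# Load the tokenizer trained above.
tokenizer = Tokenizer.from_file("data/bpe-tokenizer-wiki.json")

# Illustrative sentence only.
encoding = tokenizer.encode("The European Commission publishes its journal daily.")
print(encoding.tokens)  # BPE subword strings
print(encoding.ids)     # corresponding vocabulary ids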

Trenovania.ods Normal file

Binary file not shown.

WordPieceTokenizer.py Normal file

@@ -0,0 +1,21 @@
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

# Load the previously trained WordPiece tokenizer.
tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")

tokenizedLine = ""
fileName = "eceuropa.sk"
files = ["train", "valid", "test"]

def listToString(s):
    # Join a list of token strings into one space-separated line.
    return " ".join(s)

# Tokenize each split line by line and append the result to the output file.
for file in files:
    with open('raw/' + fileName + '.' + file + '.raw') as read_file:
        for line in read_file:
            tokenizedLine = tokenizer.encode(line.rstrip())
            with open('tokenized/wordpiece-tok_' + fileName + '.' + file + '.en', 'a') as input_file:
                stringified = listToString(tokenizedLine.tokens)
                print(stringified)
                input_file.write(stringified)
                input_file.write("\n")


@@ -0,0 +1,13 @@
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
# training the tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()
# files = [f"raw/eceuropa.{split}.raw" for split in ["test", "train", "valid"]]
files = ["raw/eujournal.sk.raw"]
tokenizer.train(files, trainer)
tokenizer.save("wordpiece-tokenizer-eujournal-sk.json")
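
For reference, WordPiece marks word-internal pieces with a "##" prefix and maps unknown words to "[UNK]". A minimal sketch of inspecting the tokenizer saved above (the Slovak sample sentence is an assumed illustration):

from tokenizers import Tokenizer

# Load the WordPiece tokenizer trained above.
tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")

# Illustrative sentence; continuation pieces appear as "##..." tokens.
encoding = tokenizer.encode("Európska komisia zverejnila nové nariadenie.")
print(encoding.tokens)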

all-in-one.sh Normal file

@@ -0,0 +1,24 @@
#!/bin/bash
NUMBER=1
CORPUSNAME=eceuropa

# Download and unpack the en-sk plaintext corpus.
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0006-AAE0-A/corpus-en-sk-plaintext.tar.gz
tar -xvzf corpus-en-sk-plaintext.tar.gz
gzip -d corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.en.gz
gzip -d corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.sk.gz

# Move the extracted corpus into the working directory.
mkdir -p $NUMBER-$CORPUSNAME/corpus/en-sk/
mv corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.en $NUMBER-$CORPUSNAME/corpus/en-sk/
mv corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.sk $NUMBER-$CORPUSNAME/corpus/en-sk/
cp $NUMBER-$CORPUSNAME/corpus/en-sk/eceuropa.en-sk.sk $NUMBER-$CORPUSNAME/corpus/en-sk/train.sk
rm -r corpus-en-sk-plaintext

# Tokenize, clean, split and BPE-encode the corpus.
cd $NUMBER-$CORPUSNAME
bash ../pre-process.sh $CORPUSNAME

# Binarize the data for fairseq.
TEXT=$CORPUSNAME.en-sk
fairseq-preprocess --source-lang sk --target-lang en --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test --destdir data-bin/$CORPUSNAME.en-sk

# Train a convolutional translation model for one epoch, then translate the test set.
mkdir -p checkpoints/fconv
CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/$CORPUSNAME.en-sk --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 --arch fconv_iwslt_de_en --max-epoch 1 --save-dir checkpoints/fconv
fairseq-generate data-bin/$CORPUSNAME.en-sk --path checkpoints/fconv/checkpoint_best.pt --batch-size 128 --beam 5

# Copy the dictionaries and BPE codes next to the checkpoint so the model can be used interactively.
cp data-bin/$CORPUSNAME.en-sk/dict.en.txt checkpoints/fconv/
cp data-bin/$CORPUSNAME.en-sk/dict.sk.txt checkpoints/fconv/
cp $CORPUSNAME.en-sk/code checkpoints/fconv/bpecodes
MODEL_DIR=checkpoints/fconv
fairseq-interactive --path $MODEL_DIR/checkpoint_best.pt $MODEL_DIR --beam 5 --source-lang sk --target-lang en --tokenizer moses --bpe subword_nmt --bpe-codes $MODEL_DIR/bpecodes
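
Once all-in-one.sh has finished, the same checkpoint could also be queried from Python through fairseq's hub interface. This is a hedged sketch, not part of the script: it assumes the directory layout produced above, and the Slovak input sentence is only an illustration.

from fairseq.models.fconv import FConvModel

# Load the checkpoint together with the dictionaries and BPE codes copied into checkpoints/fconv.
model = FConvModel.from_pretrained(
    "checkpoints/fconv",
    checkpoint_file="checkpoint_best.pt",
    data_name_or_path="data-bin/eceuropa.en-sk",
    tokenizer="moses",
    bpe="subword_nmt",
    bpe_codes="checkpoints/fconv/bpecodes",
)

# Illustrative Slovak source sentence.
print(model.translate("Európska komisia dnes zverejnila nové nariadenie."))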

idoc_install.txt Normal file

@@ -0,0 +1,13 @@
conda install python=3.7 numpy=1.19
conda install cudatoolkit=11.0 nccl -c nvidia
conda install pytorch -c pytorch
# https://github.com/pytorch/fairseq
git clone https://github.com/pytorch/fairseq
cd fairseq
pip install .
cd ..
git clone https://github.com/NVIDIA/apex
cd apex
# https://github.com/NVIDIA/apex/issues/1043
git reset --hard 3fe10b5597ba14a748ebb271a6ab97c09c5701ac
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
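
A short sanity check (a sketch, assuming the installation above completed) to confirm that the GPU stack, fairseq and apex are importable before starting training:

import torch
import fairseq

print("torch", torch.__version__, "CUDA available:", torch.cuda.is_available())
print("fairseq", fairseq.__version__)

# apex is optional but enables the fused kernels built above.
try:
    import apex  # noqa: F401
    print("apex import OK")
except ImportError as exc:
    print("apex not available:", exc)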

pre-process.sh Normal file

@@ -0,0 +1,86 @@
CORPUSNAME=$1
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=10000
if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    exit
fi
src=en
tgt=sk
lang=en-sk
prep=$CORPUSNAME.en-sk
tmp=$prep/tmp
orig=corpus
mkdir -p $orig $tmp $prep
echo "pre-processing train data..."
for l in $src $tgt; do
    f=$CORPUSNAME.$lang.$l
    tok=$CORPUSNAME.$lang.tok.$l
    cat $orig/$lang/$f | \
        perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
    echo ""
done

perl $CLEAN -ratio 1.5 $tmp/$CORPUSNAME.$lang.tok $src $tgt $tmp/$CORPUSNAME.$lang.clean 1 175

for l in $src $tgt; do
    perl $LC < $tmp/$CORPUSNAME.$lang.clean.$l > $tmp/$CORPUSNAME.$lang.$l
done
echo "pre-processing valid/test data..."
for l in $src $tgt; do
    xd=$CORPUSNAME.$lang.$l
    for o in `ls $orig/$lang/$xd`; do
        fname=${o##*/}
        f=$tmp/${fname%.*}
        echo $o $f
        # XML segment extraction (not needed for this plaintext corpus):
        # grep '<seg id' $o | \
        # sed -e 's/<seg id="[0-9]*">\s*//g' | \
        # sed -e 's/\s*<\/seg>\s*//g' | \
        # sed -e "s/\’/\'/g" | \
        cat $o | \
            perl $TOKENIZER -threads 8 -l $l | \
            perl $LC > $f
        echo ""
    done
done
echo "creating train, valid, test..."
for l in $src $tgt; do
    # Every 23rd sentence goes to the validation set, the rest to training;
    # the full corpus is reused as the test set.
    awk '{if (NR%23 == 0) print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/valid.$l
    awk '{if (NR%23 != 0) print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/train.$l
    cat $tmp/$CORPUSNAME.en-sk.$l > $tmp/test.$l
done
TRAIN=$tmp/train.en-sk
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
    cat $tmp/train.$l >> $TRAIN
done

echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE

for L in $src $tgt; do
    for f in train.$L valid.$L test.$L; do
        echo "apply_bpe.py to ${f}..."
        python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
    done
done
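
For completeness, the last step can also be reproduced from Python with subword-nmt's API. A rough sketch, assuming the codes file written by this script (eceuropa.en-sk/code); the lower-cased sample sentence is illustrative only:

from subword_nmt.apply_bpe import BPE

# Load the merge operations learned by learn_bpe.py.
with open("eceuropa.en-sk/code", encoding="utf-8") as codes:
    bpe = BPE(codes)

# Rare words are split into subword units marked with the default "@@" separator.
print(bpe.process_line("the european commission adopted the regulation"))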