Upload files to ''
This commit is contained in:
parent e757160ae5
commit 2fcdc9edbb

19  BPETokenizer.py  Normal file
@@ -0,0 +1,19 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
tokenizedLine = ""
fileName = "eceuropa.test.raw"

def listToString(s):
    str1 = " "
    return (str1.join(s))

with open('raw/'+fileName) as read_file:
    for line in read_file:
        tokenizedLine = tokenizer.encode(line.rstrip())
        with open('tokenized/bpe-tok_'+fileName, 'a') as input_file:
            stringified = listToString(tokenizedLine.tokens)
            print(stringified)
            input_file.write(stringified)
            input_file.write("\n")
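Note: the script above reopens the output file in append mode for every input line. A minimal alternative sketch, using the same assumed paths and the library's encode_batch (which returns one Encoding per input string), writes the file in a single pass:

from tokenizers import Tokenizer

# Assumed paths, taken from the script above.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
fileName = "eceuropa.test.raw"

# Read all lines once, then encode them in one batch call.
with open('raw/' + fileName) as read_file:
    lines = [line.rstrip() for line in read_file]
encodings = tokenizer.encode_batch(lines)

# One output line per input sentence: tokens joined by spaces.
with open('tokenized/bpe-tok_' + fileName, 'w') as out_file:
    for enc in encodings:
        out_file.write(" ".join(enc.tokens) + "\n")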
							
								
								
									
13  BPETokenizerTrainer.py  Normal file
@@ -0,0 +1,13 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# training the tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
tokenizer.train(files, trainer)
tokenizer.save("data/bpe-tokenizer-wiki.json")

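The trained tokenizer is serialized to a single JSON file and can be reloaded anywhere with Tokenizer.from_file. A quick sanity-check sketch, assuming the save path above (note that BPETokenizer.py loads data/tokenizer-wiki.json, a different filename):

from tokenizers import Tokenizer

# Reload the tokenizer trained by the script above.
tokenizer = Tokenizer.from_file("data/bpe-tokenizer-wiki.json")

# Encode a throwaway sample sentence, purely for illustration.
output = tokenizer.encode("Hello, y'all! How are you?")
print(output.tokens)  # learned subword strings
print(output.ids)     # their vocabulary ids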
							
								
								
									
										
BIN  Trenovania.ods  Normal file
Binary file not shown.

21  WordPieceTokenizer.py  Normal file
@@ -0,0 +1,21 @@
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")
tokenizedLine = ""
fileName = "eceuropa.sk"
files = ["train", "valid", "test"]

def listToString(s):
    str1 = " "
    return (str1.join(s))

for file in files:
    with open('raw/'+fileName+'.'+file+'.raw') as read_file:
        for line in read_file:
            tokenizedLine = tokenizer.encode(line.rstrip())
            with open('tokenized/wordpiece-tok_'+fileName+'.'+file+'.en', 'a') as input_file:
                stringified = listToString(tokenizedLine.tokens)
                print(stringified)
                input_file.write(stringified)
                input_file.write("\n")
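WordPiece marks word-internal pieces with a leading "##". If the tokenized text ever needs to be detokenized again, the tokenizers library ships a matching decoder; a small sketch, assuming the same tokenizer file as above (the Slovak sentence is only a hypothetical sample):

from tokenizers import Tokenizer, decoders

# Reload the WordPiece tokenizer used above and attach its decoder.
tokenizer = Tokenizer.from_file("wordpiece-tokenizer-eujournal-sk.json")
tokenizer.decoder = decoders.WordPiece()

enc = tokenizer.encode("Toto je testovacia veta.")
print(enc.tokens)                 # pieces, continuations prefixed with '##'
print(tokenizer.decode(enc.ids))  # pieces merged back into whole words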
							
								
								
									
13  WordPieceTokenizerTrainer.py  Normal file
@@ -0,0 +1,13 @@
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

# training the tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()
# files = [f"raw/eceuropa.{split}.raw" for split in ["test", "train", "valid"]]
files = [f"raw/eujournal.sk.raw"]
tokenizer.train(files, trainer)
tokenizer.save("wordpiece-tokenizer-eujournal-sk.json")
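Neither trainer script passes a vocabulary size, so the trainer's default is used. If the subword vocabulary should be sized to the corpus, vocab_size can be set explicitly; a sketch (16000 is only an assumed example value):

from tokenizers.trainers import WordPieceTrainer

# Same special tokens as above; vocab_size is an assumed example value.
trainer = WordPieceTrainer(
    vocab_size=16000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)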
							
								
								
									
24  all-in-one.sh  Normal file
@@ -0,0 +1,24 @@
NUMBER=1
CORPUSNAME=eceuropa

wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0006-AAE0-A/corpus-en-sk-plaintext.tar.gz
tar -xvzf corpus-en-sk-plaintext.tar.gz
gzip -d corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.en.gz
gzip -d corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.sk.gz
mkdir -p $NUMBER-$CORPUSNAME/corpus/en-sk/
mv corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.en $NUMBER-$CORPUSNAME/corpus/en-sk/
mv corpus-en-sk-plaintext/ec-europa/eceuropa.en-sk.sk $NUMBER-$CORPUSNAME/corpus/en-sk/
cp $NUMBER-$CORPUSNAME/corpus/en-sk/eceuropa.en-sk.sk $NUMBER-eceuropa/corpus/en-sk/train.sk
rm -r corpus-en-sk-plaintext
cd $NUMBER-$CORPUSNAME
bash ../pre-process.sh $CORPUSNAME
TEXT=$CORPUSNAME.en-sk
fairseq-preprocess --source-lang sk --target-lang en --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test --destdir data-bin/$CORPUSNAME.en-sk
mkdir -p checkpoints/fconv
CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/$CORPUSNAME.en-sk --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 --arch fconv_iwslt_de_en --max-epoch 1 --save-dir checkpoints/fconv
fairseq-generate data-bin/$CORPUSNAME.en-sk --path checkpoints/fconv/checkpoint_best.pt --batch-size 128 --beam 5
cp data-bin/$CORPUSNAME.en-sk/dict.en.txt checkpoints/fconv/
cp data-bin/$CORPUSNAME.en-sk/dict.sk.txt checkpoints/fconv/
cp $CORPUSNAME.en-sk/code checkpoints/fconv/bpecodes
MODEL_DIR=checkpoints/fconv
fairseq-interactive --path $MODEL_DIR/checkpoint_best.pt $MODEL_DIR --beam 5 --source-lang sk --target-lang en --tokenizer moses --bpe subword_nmt --bpe-codes $MODEL_DIR/bpecodes
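fairseq-generate prints each test sentence as S- (source), T- (reference) and H- (hypothesis) lines. If that output is redirected to a file, the translations can be scored roughly as sketched below; gen.out is an assumed filename, sacrebleu is an extra dependency not installed by idoc_install.txt, and the text still carries BPE joins unless --remove-bpe is passed to fairseq-generate.

import sacrebleu  # assumed extra dependency, used only for scoring

refs, hyps = {}, {}
with open("gen.out") as f:  # assumed: fairseq-generate output redirected here
    for line in f:
        if line.startswith("T-"):
            idx, text = line.rstrip("\n").split("\t", 1)
            refs[idx[2:]] = text
        elif line.startswith("H-"):
            idx, _score, text = line.rstrip("\n").split("\t", 2)
            hyps[idx[2:]] = text

# Align hypotheses with references by sentence id and compute corpus BLEU.
order = sorted(refs, key=int)
bleu = sacrebleu.corpus_bleu([hyps[i] for i in order], [[refs[i] for i in order]])
print(bleu.score)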
							
								
								
									
13  idoc_install.txt  Normal file
@@ -0,0 +1,13 @@
conda install python=3.7 numpy=1.19
conda install cudatoolkit=11.0 nccl -c nvidia
conda install pytorch -c pytorch
# https://github.com/pytorch/fairseq
git clone https://github.com/pytorch/fairseq
cd fairseq
pip install .
cd ..
git clone https://github.com/NVIDIA/apex
cd apex
# https://github.com/NVIDIA/apex/issues/1043
git reset --hard 3fe10b5597ba14a748ebb271a6ab97c09c5701ac
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
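A minimal post-install check (Python, run from outside the fairseq source tree) that the CUDA build is visible and the fairseq package imports:

# Quick sanity check after the installation steps above.
import torch
import fairseq  # fails if the pip install did not succeed

print(torch.__version__)
print(torch.cuda.is_available())  # should print True on a GPU machine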
							
								
								
									
86  pre-process.sh  Normal file
@@ -0,0 +1,86 @@
CORPUSNAME=$1

echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git

echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=10000


if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    exit
fi

src=en
tgt=sk
lang=en-sk
prep=$CORPUSNAME.en-sk
tmp=$prep/tmp
orig=corpus

mkdir -p $orig $tmp $prep

echo "pre-processing train data..."
for l in $src $tgt; do
    f=$CORPUSNAME.$lang.$l
    tok=$CORPUSNAME.$lang.tok.$l
    cat $orig/$lang/$f | \
    perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
    echo ""
done
perl $CLEAN -ratio 1.5 $tmp/$CORPUSNAME.$lang.tok $src $tgt $tmp/$CORPUSNAME.$lang.clean 1 175
for l in $src $tgt; do
    perl $LC < $tmp/$CORPUSNAME.$lang.clean.$l > $tmp/$CORPUSNAME.$lang.$l
done

xd=$CORPUSNAME.$lang.$l

echo "pre-processing valid/test data..."
for l in $src $tgt; do
    for o in `ls $orig/$lang/$xd`; do
    fname=${o##*/}
    f=$tmp/${fname%.*}
    echo $o $f | \
    # grep '<seg id' $o | \
    # sed -e 's/<seg id="[0-9]*">\s*//g' | \
    # sed -e 's/\s*<\/seg>\s*//g' | \
    # sed -e "s/\’/\'/g" | \
    perl $TOKENIZER -threads 8 -l $l | \
    perl $LC > $f
    echo ""
    done
done

echo "creating train, valid, test..."
for l in $src $tgt; do
    awk '{if (NR%23 == 0)  print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/valid.$l
    awk '{if (NR%23 != 0)  print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/train.$l

    cat $tmp/$CORPUSNAME.en-sk.$l \
        > $tmp/test.$l
done

TRAIN=$tmp/train.en-sk
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
    cat $tmp/train.$l >> $TRAIN
done

echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE

for L in $src $tgt; do
    for f in train.$L valid.$L test.$L; do
        echo "apply_bpe.py to ${f}..."
        python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
    done
done
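The awk split above sends every 23rd cleaned sentence pair to valid and the rest to train, while test.$l, as written, is a copy of the whole cleaned corpus. The same selection expressed in Python, with paths assumed to match the script's $tmp layout for CORPUSNAME=eceuropa:

# Mirror of the awk split above: every 23rd line to valid, the rest to train.
corpus = "eceuropa"  # assumed, matches CORPUSNAME in all-in-one.sh

for lang in ("en", "sk"):
    base = f"{corpus}.en-sk/tmp/{corpus}.en-sk.{lang}"
    with open(base) as src, \
         open(f"{corpus}.en-sk/tmp/valid.{lang}", "w") as valid, \
         open(f"{corpus}.en-sk/tmp/train.{lang}", "w") as train:
        for i, line in enumerate(src, start=1):
            (valid if i % 23 == 0 else train).write(line)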