From 2fcdc9edbb24be27404cb2b796ba6e57b804c899 Mon Sep 17 00:00:00 2001 From: Dominik Nagy Date: Tue, 11 Jan 2022 23:01:48 +0000 Subject: [PATCH] Upload files to '' --- BPETokenizer.py | 19 ++++++++ BPETokenizerTrainer.py | 13 ++++++ Trenovania.ods | Bin 0 -> 10650 bytes WordPieceTokenizer.py | 21 +++++++++ WordPieceTokenizerTrainer.py | 13 ++++++ all-in-one.sh | 24 ++++++++++ idoc_install.txt | 13 ++++++ pre-process.sh | 86 +++++++++++++++++++++++++++++++++++ 8 files changed, 189 insertions(+) create mode 100644 BPETokenizer.py create mode 100644 BPETokenizerTrainer.py create mode 100644 Trenovania.ods create mode 100644 WordPieceTokenizer.py create mode 100644 WordPieceTokenizerTrainer.py create mode 100644 all-in-one.sh create mode 100644 idoc_install.txt create mode 100644 pre-process.sh diff --git a/BPETokenizer.py b/BPETokenizer.py new file mode 100644 index 0000000..9db172b --- /dev/null +++ b/BPETokenizer.py @@ -0,0 +1,19 @@ +from tokenizers import Tokenizer +from tokenizers.models import BPE + +tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json") +tokenizedLine = "" +fileName = "eceuropa.test.raw" + +def listToString(s): + str1 = " " + return (str1.join(s)) + +with open('raw/'+fileName) as read_file: + for line in read_file: + tokenizedLine = tokenizer.encode(line.rstrip()) + with open('tokenized/bpe-tok_'+fileName, 'a') as input_file: + stringified = listToString(tokenizedLine.tokens) + print(stringified) + input_file.write(stringified) + input_file.write("\n") \ No newline at end of file diff --git a/BPETokenizerTrainer.py b/BPETokenizerTrainer.py new file mode 100644 index 0000000..1f4de60 --- /dev/null +++ b/BPETokenizerTrainer.py @@ -0,0 +1,13 @@ +from tokenizers import Tokenizer +from tokenizers.models import BPE +from tokenizers.trainers import BpeTrainer +from tokenizers.pre_tokenizers import Whitespace + +# training the tokenizer +tokenizer = Tokenizer(BPE(unk_token="[UNK]")) +trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) +tokenizer.pre_tokenizer = Whitespace() +files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]] +tokenizer.train(files, trainer) +tokenizer.save("data/bpe-tokenizer-wiki.json") + diff --git a/Trenovania.ods b/Trenovania.ods new file mode 100644 index 0000000000000000000000000000000000000000..23dbe079a68d941318f6537af79c5019ba043545 GIT binary patch literal 10650 zcmdUVWmsI<(r)9D#$AHDy9IX-PVhkE(hW2YL4qX&4GuwrySpX06WrZ3K?4N1WX?Ay zGxN=yd(Y2Xu%EqlKfB&lYcH*;S4{~H9v1*W1^~k9n6-lK1;f|?0Km`d@h5hfOwFOz78WjQf5Lo( z`L_^0mZTtHOB*X!=iktvT@6M}3i(fV{#}hfzlozM*uwt5tkKaKV&!ZBh5q+#gu0lzxc*=C z$p5B|HV&p%7EpFc8y5#tN9g~Ui;Rr?+i-e(|K9?9EIF8hZ7eOIE^N-`mcy~b;6M({ zpcCO>i;-Dzb_4`xhJi-gj1JzA7YOX9DcH+xk+C}JIYs*!lN)8S+$)hasYj-yeebFY zK56lQ7-XX}{BQ=Wda4b078YH|lN37RAXv+@J&N2{|LJW|et zpSMtbni6o52lmuNBBPK+NZMR%`Vb3_tfN@!s=FLyMtfdSh`d`8D$J}xHLcTV4XvYy z2@sD!Q630#2_$V&RqEzWI8zt=ly$V;`?7T+FkATLSLVpk+2uFDye;F^){0}pz!jf( zYp-LFm+M6Lw-xHRmmj!}pYJz6^L*I3XB&MWS}(70leQNh`Dk?$y;E~at}R!tri6rC zb7eh_00RI7!T|vPngf6L`F@*?7A~f29uD>qu}V>?9Js+(cX)kitlzOHK2e2hX=@W` z)@o=LHouv5FEHqh3?~}*Gcg`f7SbC+H)(^8ip;1jXG95k!Ok$*)xDvEmw6c!Kfq&f z4!UAo&b-|>vLPpr2e6gyM!xj%fj5y0oUCba(%5Ic*}8iRTD*ESUXFc0JcO*bhB!Ds z^DXxEy@)>UM~`FQ_h;?SzE@A-mo|<-?<0#E4R<|n9p_+NLa&QpF`l7x`Us& zsy>gyCFBXeGf>6}gR?7b?byJDE@M)6pIu8n&##w#;l>~QBHe`LLl%?0`;dOTrrBEN zB^(yULSPtHkC`Os{Soq7&}lf0eJuU~%b@&Q2!kTy_xQN+rv~4`on&KQ@_->WXFaE`Zop|0vi$|n2CXE)=x%vdPO9R3Aw(2hw?2SHbJ%hwcOVnJ>x-V&VsHrA1d>Obde8*_tX@FPNf6wG-BqRf? zil;N%bolhsTz*@U*Sl`m-8s}Id`6;>qB(5hiNec8r}Zuu3SXY|qL3bqk@9si9m{PJ z*_s%2WK0RgAUMdjwoOU}!~TV0hzEKODehQfxAkpyRoHk;jQ~vd9UIclmvhn&auKfs zX%;Q;l!}O>_UPV)-LRZzHIs6d->evwzAHj!Rh$m+R)H`cDoLvr=&FTros|P#qv_-G^I4`h_MVRO9 zi6421Q<1t9qb}El_tu6MLf8lCt(3PkxkD(Osc^B|gxSrE(G0p5DIu1`8cr0S)L%W4P)9wY@iO_UKz z!tqVz$pRa8U~`{WuBM;3i7=**pP~zCC7B#{} zwuIozcvu;V0|R{W5Sclj6oqV{jji46lHd4Mt!TFahbhB=-SE|()>!4*_iC-xjLmF5 zokKeAIXlWZGzgaJbk$55!&_T*4}aW^8M&3a-FUV>(ovKacVgthg~6-$^{ub2M#$Eh ziB@{O{T96+l3T7YcJI7xhGVh4a*WMmCB65iWwm{@v00~eIDW)&)IM=mgavs9ud#)3 zWD(JfgbBQ61anO$<2W~^XzDY*o4ovz*7_dL%1P1Dyj1`?HR_mOj^I>Qu8O$?hxjc_ zBjy!RRp=cTuJX4hgHIXI+r@2`m7axNO~47|*`jC3A(Z54=4I4j#S^A+!bP)2PWF(1 z;h%~FiE7z{`C-G*#I%ARWNm5^!JuhTUb2v;p8UA%gEpVJiRT!4VKdg$vurYvGGR78 zn(}%qk*WiuUw8Ggca!Ah2`@!mF%+czXx%s1K1CEaMvde_19qH`AkeXM{pE#Uy3_a{U53NZ&rZ{SjuJXRY&y!WQztWlC z{#Y{{rDBj%z8Bdd<@4iwZFVVhv5?$K%7EXt<-~tlFTGLwAV+O=jFDck4STcCC6naK z>a9b%mm5Zbw|u_p`4-Ncf(^|UGPKK97tQV(bw$*G3smWI@!l^bKrK|;ZX@H_89_eO zkn|`wn$|fo^TjX^Qt~H$IbAJeSHSVeV)vYa(=2jGDc81eXzuInR!cq=*t8eQZ?Yc5 z-kIKg_>_^x;0K(y;t8RTiEvUjO#Q z_s@}(YY|b=xcpQ5e#|CA5nRmz)h1qBUbU5xD7i(5dmrii(W5ZbZ%2F zYOif#mZE5z2?2vAVGk%Q@=x^_{Z=S*D$PLhx|)Mw$YJBUcdnr~)9dT_v-8dAkggK^O}%py zJ6h_!hzU3W3(ghhPGZ)%vfv3W2x-7#Y(+%;A+bB*0rux^Cg8?WlYj^S*wg)|bIOy) z-3$Z)yZm$s9=Ee2!zJ+JcGk9{iA+}aOh4hr2`ytC4?-QC&6k?lV_5xc^RhUGXsyFT z+mj0Z#n|x442#~=0y?fk9eyI|bm;dqa$n)N{j^qngf0*9a1e32dkMH4lsT5ghJJT8 zk{<=>$T{0zZ|2_joR#2C6ms&HgdrtHy*81u4fVn2?ApS?eH)V$%#A9zU5$zOW;BTV zBNi%jX-+duuL>-5?6^#@lNOBmWr!zn(87K0L}NRZTr9~J9k~YElI@6lhPFB9$haJG zllKGx9)pmzeUw+{3D)6cMczbV@(_MDscur;kQlh^Nw9Ro zp{U@sw?2zQejVx#lrj4*jeaqP3cvIhbX}QPX(RX~rQ^-m$BidIGX_n8^>JPeW4( zC^+bkpu+t4#opm1Xhmwhg!%0?!i)zG`cfR7TavDAHQJK}5>tqBNqV6j0S-LJR zyw3t{Zfml0Zh6L~dF}f)f9Ty&402*^r$7>QUfRq4xaV$3F3`sM%mm;da$Hwh>pGJR z$;c7=Y+*9;7IBcTM1Z+EtKMuTCIdbm3kN4;)BJR2yUFmf32Uj$@tA|uM$>|+Regx3 zKsa)eObR0m1NF7$;2A2yf^z;0ecJ5*of}doO!uIZ8JJq9pa; zt4Oc0p>32{rfYpAV&WmOr(9Q3-kq359a<^#aH}y-aD|aRvEuPK%{2TtDs^(#BT93S zNc(D?#a-;264CiWtue6z4*Qu*Rct#+o4_O!{$K@ts$qHx4fq>`34oe5Bbeg1e-<2FMO{+12hxgHs)(XfP?$G+gc3FjaA!=nNp4UM$o( z@t^IPZlG%le2U@bK1q(Tp9#{oBKSDSYVMo^WKPovd=hjQtX5gsgVXrkw~JHGUltZA z+P|vqCFgiA>neEh`X$9Pzb2y{wr#9{D5CVClqQ_jF(eq* zT-wk6Niz%pt$06ac~KmRJYz{(3~hglkL}t73MW-VumbsAOV(cFC=|EOggVcQi$x@g zl`}(hwIgIy%29LkyW>58D3?+lHbaxG6P?A20#aD7yS<+YBA~Eeq&Yk6auc%Fnbq-m zEyw7_<>#T#iTDH|ejC6?D~2_^p+hMzE8Wc1f5 zGaPw8Z%pDv#D0XS(zIfKlGCVC-N?L`SEteplX{0}k4{`6Xy3720F%0bXs`PC;Lwq` zFJQd9OK?Kt8GT$)fPPilo!e@pR7}m}hX<<=?(Y%4DL|lns@2=KB6XY;dpV`-;-sL5 zd&bS}EzDg%BSQGzy{NgvkJ9qT?Ufjhp8%ry404rPBq4dGIOuz41%_ky!b=}H zZdKx7B$i+))VpOcGl^Ov=x)6TQ?pG-|aomP9xF&Ph~-M*Lg3A*%)r?vKTRThH@p)c-^SsR+} z6ipcC?~bjD5GZ4WP223t`&douf;{(w1gDg)YqG+lyq40J36i8wgZ8Q%*bVCJ=T0}- zH_&PmpW77d3$>+odoNWkyUEjFq(rVTLRUcje!M2B33AR_Og!N0e97F57RV1_Msj7p zjyN%g2S^Ot{cql(OEHW!0TKY9tMH%p)1P4zsD+D*4cO{8|L{oPz&%?C*H^4PdvK;& zi0Wwe%aqWQ(!}KwSmOZ$|AOR@Acfa`KW=c;tn`sANrTHq^z$+B|^^OjZWWk zti~nfEZ!lX1{>*-2EaINa@?N`HGfueU*#@>Rp(pv`HiqiJig@j3ui%KC&thTtME!t62#^=Icb$~(4Z_xq^^)Dv{H;SBzhVc34`vYjw8do zpT!4&65dXmWb0zU^W*aza@8EvIVf_%=S*U<_#=VQ*X2QyGeOMX7$(Aq^43H?z|jx9 z=Y>nMLRRwp&c$QFOV4S(^6GlZ zvkqz@G;hp?Dj<`z{JMNjJ|o~RL<~_fA#&`EaH<)&W9fVPyXc^ABFQ4F0)f%Xl`3Yv z$qM<@d~<00$X`{8;Mi8#3QsXB5EG#KGy1^_W0Z-T11u~|``rP{U;@$FCIS3qWP1Wu ztWhm7`mZiq7`Y?F#s-vNX)Zj|xJ<|#%U$!?2Nms%i4O6hJTbuz8*je2 zk(Dz}!+v3;>uhRhI_#n$mHY;B^m_NA5_d~LlHBW$SfR)47Mr_4&X?E_Y!B26d;~f+ z$om(#Nv!y()j)GkJa%T>Psf!3XaXha{8%t7OuX{5xAi3rvzI+sNy0;Ntp+P}LT=mh z%|A#O!M$B7pdYtoI4a&_pUJNrR165el`}sO8KdnjMV^{l7vtLuH2P_k?5U{e_6G2_ zUSwo%k9~7ob?a3hEbhOoY&m9lsqqAHV|$#^>zwD4;@|;$8e7i@zXhJj!qU{|qe$C? znYBovrfT}MJ`JaruFp)CY)mRaAM9vb+NZxuF#~5nwlPyrL&5IG-ru=epkA-QDy;T{ zz8dvbgAtQ4JiwvjbQCZ-ZUAP$K_)Hf&>Si%cA8W`V?EQ%WC9ZfMY zY8_Nop`(N?3Evc!2(3Qnx?69}tCUG>PC22cj5dOUS*GyVOzv$nD)M|DP&`N|Y0;+E zL1h<^$?Bg5fQwPcrq)c$)*yvr$!bIvP6Y%#g0$sR6&SLs>?<1t004&i_a*xGL`kykXA9-W$lcHD zXHrAU+SS1fY-(cFO-)Vx`0-#KOYD#>U3M!NJ+t+5P?f&q^O2 zde2zH005LM1!)N_kNKn2x8Q|7Y#C~G?E2-a)`tgnlb9;2_b6!|bRL=5xG==;8>jY; z+>Gkl&R6_yPoK+PaPuxk#ZHfJjj+=F6-WKD7($uJy_Cxjy2(cGmZda{I4 zc4xUM=50w_TM#V-1MMp;`zOTL1r)A5yyJ+;2bL3>8@?LoCz25D`1YT@Iy~S?zWq*? zJO4TB44y#K&u`4;!#=M`Kj!(X)=zqgi@LB#*D~ybi_3(k9NXoDWnXjA1kolfH8pNF})q7CYa7z6;-U}nlEm$P&F4jZpys75KzXXT7bktqXV$mpCqmV zuS@qLx?He=*02-qYnDLy)_en@1LzXRj*dbzO;>!tC2 z;c*x0`kEn4(K6&$?d-0`WZBCJjpz=SDf3fC^zSoSkAU8<9Qa#UK}2&RI2#FkHI#;# zGBGxFe%!kUccydj!|%ktPX=4p3&)XgBt4sh4dzO<60ioNmigEz*|pV2_0VRE&Qx06 zCRZ<>GgIfqJKwN_ilZ`G{O%`b?finXY>XZrZoYaEyHkiC@q?_9@Y>tV`?M7q7OBi@ zKjiN;&}i{4;V;C?=V|B1b7s>~-Ufu=Q`^U9Yncu!09g=WxRVay3y<_Ll=vuySu`Bx zB`GdB%fBck=86=A)?Cttbi6>dni?M`ncFe--P+W{_^3ffh#l;u7Z4DrT*Hu@z5Pbw zT-!-zRo05LWK{0SlRMstG}8#aA)Bztw>4blT_Y6R%3DZ4n*De|D1IQ1{_Th>6AMEv zH_wK51Ibj>AjBE_0ghHZkSQoQ%6P73$UY{ZNala zdc;>qT+(+{oCfuFRRYtL95Kv6e?{}6t^^|-_v=U`)nB$4m8izAY^JbDG z`s^e^v#i@IAfmppOy*!d%38H!__3#~|`g&UIA; zI>?_$zZ60lCogrPy}t)awA%IsU^18}=_I*VoqKaR5*kSCRmE?%lF3TUNZCr=#q?p1)K($3bYk5ca9#6y6V_baeng$>B-J^VRP8t~tzG;`dKcgt z&K!*@pl9l)B(#=Lfxmoww{C0+Vw9~}C-1&_#){Un)b}_@cCrUpKM0EigvNod!ui`> z5km5H`w_1DAiT;g;%rPR<$l+t{lSgOG5X26#22Yn!n$7P_ueym<~sO`dce;_cugF+ zr#V|AYPCnk-W1MFxOk6|Rf)*bDkr&Rpv%*E<>jH%*qfSw<0pI7=Q4AU9@^^@;Yx99 zkN5yoUUZ`Svo2RiH`}@O(Zr4&+oc)vA#Ea=^H6_$oZ-=Z=JWd9Q@6}UADz#vwXfR- zBgVQyo4Cg)E{LsZW8_zKRYVltt)_c`+D0kINv$9yn6eqAPSYil!bD^rxW|q;PzH=l z!mIPRv2WG|^Fwb|O+G_vvwQKEe@rL9+A!g$p=eqj!3FiJ7)2n841eQk=S5I& z3V54Q5^w^1J80Q>l8`!3ds%3=ma`dM;CXM9u4zBxDBd;9B4$-A$uk9S#pnD*lq_*` zREUL#=ZtA%5(y~guBmjyKH^Ut3KUzQ%8CvUYNA6doE>>7IQ*f11cAQ|M~W54(5Z-8 zBq?QETTNgUF2<_VmBImfCA|3<)U^@`4=Ziy6)1iyIA*hZi zRiQr`_Bb`7wQa0f{&gw3;)|MKl-_#5FQSlF6=Wgs%_p%m^X~jKh2UDC4VxU$^9{5u zU2XN~+t!mp_sc*h#<&cWwm5cb=K>7&9>>l4w(Xngyn3}%uHh%w=LW@9!>%Sq z^+gTFfxIKdI&p$#1;*E9+71I5wW|KN4k{Q?`tLO|peMD&wchbMQ#&`)4PM2{F~@y7 z2K%Hwn?HmzP8=MpXTWPUEj!tlLU!p%;!3Y$wGcA_PkzuDKE$%$O-L+W-{)`YP$ABZ(fzSp#43KYgB4LRZPBkWobC%TCQ+I_G!$NJCG=vohsbcqwT#R8 z7H6#aM=Pcs)Yl!KGi_?-lfVA51lbDm?%Qx{$2)DHCA}%lbXo{FyVDspC`cEBs$;^9 zUtHcIISkrA4o%T=UZ2f+M9D!LU*_SrGi}4k=eNX5eKNd@Zk?_c`d-oErn#$>j=gPz z(7Fs!PXWwX8(+J%mVq9k>hy2(exD$C>lXOiejRFl`}N!7CLdrPXIV7N|Ihzx80O!} zLEFfFIxU1)kHj3b>O3Q}#+Q zxVw(4Mo*hT6+g!7^*LY?h>lH$2~>B!HDgva)Q&M@eC8W+C4yKvm*%aTMnQK_X-fVh zP20b82ERGv6?xnK^SXsHzE(*)ikJ_G+Jd2bd=4)n8Cn(j6tqNW6iUxw)Nc8%{AnJh z2RZlnQc+6g@Q#E((1Q;*;kiUV>Sz^}OE{uq9#!OiIPaOo-wfxVbI_f-L+ueNQ1Wb9 zL{N4sLl28Y!@+1hRslF@t0V%@G${oZA|=;@Qldmw&x+4*sTk5C0bO0#@vU;8W>PsX z{de?u8nVVh9o1V$A1YXU^f%N~>0c{N8K4*?!Q>!LA^9+!(Bn(t7?I{Af|QtTJcbhw zVAZc3$_YeCp$~xrO*GcTr~XTUmaSYA-uRcd@atLYvG~t~s6W^JOY5&Fk7xS-h4PCa_5X^}@i!>H z2vh%z@{_>yOD-Rae{Inp2~_`#^OHjLOV}QJ;IBA;CRF`1(w{A|{u`t}60H6i=g$^p z{teEb30MD&^k<9a{|4!g1gw9?`ANb0CD^e4`y2f)Le{^)e&-bZk!|!dw)jh)KZ5;x z?RX?;{XU3)*Zd=!tDQ=zv}%lI{GuG z_)GX6&Hh7%@vn-1^x6O1z!022hf;sn{3oCOSCx!J|Lo!aj`I6w`s3>Hb6x%=#g9>h bzg?x(ln@^CssI4i $tmp/$tok + echo "" +done +perl $CLEAN -ratio 1.5 $tmp/$CORPUSNAME.$lang.tok $src $tgt $tmp/$CORPUSNAME.$lang.clean 1 175 +for l in $src $tgt; do + perl $LC < $tmp/$CORPUSNAME.$lang.clean.$l > $tmp/$CORPUSNAME.$lang.$l +done + +xd=$CORPUSNAME.$lang.$l + +echo "pre-processing valid/test data..." +for l in $src $tgt; do + for o in `ls $orig/$lang/$xd`; do + fname=${o##*/} + f=$tmp/${fname%.*} + echo $o $f | \ + # grep '\s*//g' | \ + # sed -e 's/\s*<\/seg>\s*//g' | \ + # sed -e "s/\’/\'/g" | \ + perl $TOKENIZER -threads 8 -l $l | \ + perl $LC > $f + echo "" + done +done + +echo "creating train, valid, test..." +for l in $src $tgt; do + awk '{if (NR%23 == 0) print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/valid.$l + awk '{if (NR%23 != 0) print $0; }' $tmp/$CORPUSNAME.en-sk.$l > $tmp/train.$l + + cat $tmp/$CORPUSNAME.en-sk.$l \ + > $tmp/test.$l +done + +TRAIN=$tmp/train.en-sk +BPE_CODE=$prep/code +rm -f $TRAIN +for l in $src $tgt; do + cat $tmp/train.$l >> $TRAIN +done + +echo "learn_bpe.py on ${TRAIN}..." +python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE + +for L in $src $tgt; do + for f in train.$L valid.$L test.$L; do + echo "apply_bpe.py to ${f}..." + python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f + done +done