From a158fecca6c798089bcf9992565039a03465ec6a Mon Sep 17 00:00:00 2001
From: demdecuong
Date: Wed, 30 Sep 2020 22:12:19 +0700
Subject: [PATCH] upload multiheadattn decoder

---
 src/__pycache__/config.cpython-38.pyc         | Bin 0 -> 457 bytes
 src/__pycache__/loader.cpython-38.pyc         | Bin 0 -> 2723 bytes
 src/config.py                                 |   2 +-
 src/loader.py                                 |   8 +-
 ...71269.minh-B365-M-AORUS-ELITE.8517.9893.v2 | Bin 0 -> 40 bytes
 ...71269.minh-B365-M-AORUS-ELITE.8517.9885.v2 | Bin 0 -> 40 bytes
 ...71297.minh-B365-M-AORUS-ELITE.8610.9893.v2 | Bin 0 -> 40 bytes
 ...71297.minh-B365-M-AORUS-ELITE.8610.9885.v2 | Bin 0 -> 40 bytes
 ...71395.minh-B365-M-AORUS-ELITE.8771.9893.v2 | Bin 0 -> 40 bytes
 ...71395.minh-B365-M-AORUS-ELITE.8771.9885.v2 | Bin 0 -> 40 bytes
 ...78719.minh-B365-M-AORUS-ELITE.5694.9893.v2 | Bin 0 -> 40 bytes
 ...78719.minh-B365-M-AORUS-ELITE.5694.9885.v2 | Bin 0 -> 40 bytes
 ...68968.minh-B365-M-AORUS-ELITE.6210.9893.v2 | Bin 0 -> 40 bytes
 ...68968.minh-B365-M-AORUS-ELITE.6210.9885.v2 | Bin 0 -> 40 bytes
 ...69044.minh-B365-M-AORUS-ELITE.6429.9893.v2 | Bin 0 -> 40 bytes
 ...69044.minh-B365-M-AORUS-ELITE.6429.9885.v2 | Bin 0 -> 40 bytes
 ...69060.minh-B365-M-AORUS-ELITE.6495.9893.v2 | Bin 0 -> 40 bytes
 ...69060.minh-B365-M-AORUS-ELITE.6495.9885.v2 | Bin 0 -> 40 bytes
 ...69083.minh-B365-M-AORUS-ELITE.6566.9893.v2 | Bin 0 -> 40 bytes
 ...69083.minh-B365-M-AORUS-ELITE.6566.9885.v2 | Bin 0 -> 40 bytes
 ...69618.minh-B365-M-AORUS-ELITE.7018.9893.v2 | Bin 0 -> 40 bytes
 ...69618.minh-B365-M-AORUS-ELITE.7018.9885.v2 | Bin 0 -> 40 bytes
 ...69633.minh-B365-M-AORUS-ELITE.7084.9893.v2 | Bin 0 -> 40 bytes
 ...69633.minh-B365-M-AORUS-ELITE.7084.9885.v2 | Bin 0 -> 40 bytes
 ...69656.minh-B365-M-AORUS-ELITE.7162.9893.v2 | Bin 0 -> 40 bytes
 ...69656.minh-B365-M-AORUS-ELITE.7162.9885.v2 | Bin 0 -> 40 bytes
 ...69760.minh-B365-M-AORUS-ELITE.7294.9893.v2 | Bin 0 -> 40 bytes
 ...69760.minh-B365-M-AORUS-ELITE.7294.9885.v2 | Bin 0 -> 40 bytes
 ...7062.minh-B365-M-AORUS-ELITE.11214.9893.v2 | Bin 0 -> 40 bytes
 ...7062.minh-B365-M-AORUS-ELITE.11214.9885.v2 | Bin 0 -> 40 bytes
 ...7097.minh-B365-M-AORUS-ELITE.11292.9893.v2 | Bin 0 -> 40 bytes
 ...7097.minh-B365-M-AORUS-ELITE.11292.9885.v2 | Bin 0 -> 40 bytes
 ...7165.minh-B365-M-AORUS-ELITE.11366.9893.v2 | Bin 0 -> 40 bytes
 ...7165.minh-B365-M-AORUS-ELITE.11366.9885.v2 | Bin 0 -> 40 bytes
 ...70046.minh-B365-M-AORUS-ELITE.7607.9893.v2 | Bin 0 -> 40 bytes
 ...70046.minh-B365-M-AORUS-ELITE.7607.9885.v2 | Bin 0 -> 40 bytes
 ...77441.minh-B365-M-AORUS-ELITE.2383.9893.v2 | Bin 0 -> 40 bytes
 ...77441.minh-B365-M-AORUS-ELITE.2383.9885.v2 | Bin 0 -> 40 bytes
 .../__pycache__/__init__.cpython-38.pyc       | Bin 0 -> 143 bytes
 src/models/__pycache__/decoder.cpython-38.pyc | Bin 0 -> 3230 bytes
 src/models/__pycache__/encoder.cpython-38.pyc | Bin 0 -> 1023 bytes
 .../__pycache__/multiheadattn.cpython-38.pyc  | Bin 0 -> 3030 bytes
 .../__pycache__/transformer.cpython-38.pyc    | Bin 0 -> 5167 bytes
 src/models/decoder.py                         |  51 ++++++-
 src/models/encoder.py                         |   2 +-
 src/models/multihead_attn.py                  |  49 ------
 src/models/transformer.py                     | 141 ++++++++++++++++++
 src/train.py                                  |  56 +++++--
 48 files changed, 241 insertions(+), 68 deletions(-)
 create mode 100644 src/__pycache__/config.cpython-38.pyc
 create mode 100644 src/__pycache__/loader.cpython-38.pyc
 create mode 100644 src/logs/gradient_tape/20200930-H0749/test/events.out.tfevents.1601471269.minh-B365-M-AORUS-ELITE.8517.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H0749/train/events.out.tfevents.1601471269.minh-B365-M-AORUS-ELITE.8517.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H0817/test/events.out.tfevents.1601471297.minh-B365-M-AORUS-ELITE.8610.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H0817/train/events.out.tfevents.1601471297.minh-B365-M-AORUS-ELITE.8610.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H0955/test/events.out.tfevents.1601471395.minh-B365-M-AORUS-ELITE.8771.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H0955/train/events.out.tfevents.1601471395.minh-B365-M-AORUS-ELITE.8771.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H1159/test/events.out.tfevents.1601478719.minh-B365-M-AORUS-ELITE.5694.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H1159/train/events.out.tfevents.1601478719.minh-B365-M-AORUS-ELITE.5694.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H2928/test/events.out.tfevents.1601468968.minh-B365-M-AORUS-ELITE.6210.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H2928/train/events.out.tfevents.1601468968.minh-B365-M-AORUS-ELITE.6210.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H3044/test/events.out.tfevents.1601469044.minh-B365-M-AORUS-ELITE.6429.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H3044/train/events.out.tfevents.1601469044.minh-B365-M-AORUS-ELITE.6429.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H3100/test/events.out.tfevents.1601469060.minh-B365-M-AORUS-ELITE.6495.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H3100/train/events.out.tfevents.1601469060.minh-B365-M-AORUS-ELITE.6495.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H3123/test/events.out.tfevents.1601469083.minh-B365-M-AORUS-ELITE.6566.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H3123/train/events.out.tfevents.1601469083.minh-B365-M-AORUS-ELITE.6566.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4018/test/events.out.tfevents.1601469618.minh-B365-M-AORUS-ELITE.7018.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4018/train/events.out.tfevents.1601469618.minh-B365-M-AORUS-ELITE.7018.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4033/test/events.out.tfevents.1601469633.minh-B365-M-AORUS-ELITE.7084.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4033/train/events.out.tfevents.1601469633.minh-B365-M-AORUS-ELITE.7084.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4056/test/events.out.tfevents.1601469656.minh-B365-M-AORUS-ELITE.7162.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4056/train/events.out.tfevents.1601469656.minh-B365-M-AORUS-ELITE.7162.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4240/test/events.out.tfevents.1601469760.minh-B365-M-AORUS-ELITE.7294.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4240/train/events.out.tfevents.1601469760.minh-B365-M-AORUS-ELITE.7294.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4422/test/events.out.tfevents.1601477062.minh-B365-M-AORUS-ELITE.11214.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4422/train/events.out.tfevents.1601477062.minh-B365-M-AORUS-ELITE.11214.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4457/test/events.out.tfevents.1601477097.minh-B365-M-AORUS-ELITE.11292.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4457/train/events.out.tfevents.1601477097.minh-B365-M-AORUS-ELITE.11292.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4605/test/events.out.tfevents.1601477165.minh-B365-M-AORUS-ELITE.11366.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4605/train/events.out.tfevents.1601477165.minh-B365-M-AORUS-ELITE.11366.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4726/test/events.out.tfevents.1601470046.minh-B365-M-AORUS-ELITE.7607.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H4726/train/events.out.tfevents.1601470046.minh-B365-M-AORUS-ELITE.7607.9885.v2
 create mode 100644 src/logs/gradient_tape/20200930-H5041/test/events.out.tfevents.1601477441.minh-B365-M-AORUS-ELITE.2383.9893.v2
 create mode 100644 src/logs/gradient_tape/20200930-H5041/train/events.out.tfevents.1601477441.minh-B365-M-AORUS-ELITE.2383.9885.v2
 create mode 100644 src/models/__pycache__/__init__.cpython-38.pyc
 create mode 100644 src/models/__pycache__/decoder.cpython-38.pyc
 create mode 100644 src/models/__pycache__/encoder.cpython-38.pyc
 create mode 100644 src/models/__pycache__/multiheadattn.cpython-38.pyc
 create mode 100644 src/models/__pycache__/transformer.cpython-38.pyc
 delete mode 100644 src/models/multihead_attn.py
 create mode 100644 src/models/transformer.py

diff --git a/src/__pycache__/config.cpython-38.pyc b/src/__pycache__/config.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb6304595bbcfb51c479b54454bf4a3cb28fe8d4
GIT binary patch
literal 457
zcmY+A&rZTX5XQGap|q65U?NAaM5Ql43_>7yQ6qBM%Vy~;bOZg9b`jwTdA3P52jpe!lUpeA8_T>HDhW=J?=3_mM6~lh
zR{K`B>*0M%`3|_^ULpB8Iu9LJJ2K=;!L|}qpc_XBgj@!SHjm^7Jc`6R)Ar!*WjJBe
zdpgq2U^X6)o{!*)a4rC3v}7PdZEhl|GHt3jVJmIF#y+33Ozs73E(BMbRAej*cp_+<
zDU(}1E#$JB?6lPjW7w?4ZSgNz8>M2L9_x>P0X%ei&;S4c

literal 0
HcmV?d00001

diff --git a/src/__pycache__/loader.cpython-38.pyc b/src/__pycache__/loader.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..593fa7c3a073f4aabed607f293c53e81466cd0e5
GIT binary patch
literal 2723
zcmZuzTW=&s74E9O_l&*9MA;++FbO;i5Sz;rXk-Z4h|7a{ST9mq8?}0;YG!)e(>|`FEPyTF*
&r{F)Crw^{I3~Zws|6b2308Y~-?N;%xt|BhGa&SRZ|$L
O`%tHWhRWa-#!WY2(OPFv-=v-2aS@XRFM`LxI#`G6XdQP!7j{t>cYGSchV0}j5v3^)
C;B2!rd1=VW~N_JmDDEZJ|tC#XawuL`*7rah(-#L130Z!MyeyNCP#H!3aJiP&ulWT%jFYM9`E4%
zwmlj@6c)i9S_HKJ1J{@j6uK2LhD*Kkvo3iC6ETG2Vi0I61fTZhUCYPp;mo
mgP+9qrNMeRxQiAx{_k#Ti$9LtV%Pt!J@4iMVG;>l)P1_wl%$ZF~o|Zxl028)0=(%
zcY6Nut1o|u*@)+9j{)3khpKEuyDZ=^BS)F#D`68Qd)w52t{@L9vfTR1a%H1Zn`bL&
zeQlIYOj|VP%hfYQxEUqt9_>peZDV9(v`xyo2I4MFp3$f`%MN=aZf6%TQEb-+Ta;en
z_@G~+3v_RtGm1>K$uI|N5bgS}Vzq6m+C4q8&%&GQH8y1X>`j(%
z^<&7shGPo0^Wr`R1_&1Xxz~9QylZw%MC6~ma)cD7Bbr|NNauaJyJp7gu>DUS^FHH(
zQkM~R`N*9SxBT!Ymfb?yrL9vo9nWMVFIVdLfbOD&09&^=e1|8qP93aUzYysL_1dFiq$uq!G
zU?PW_5Nq)w*`%G<1;7(qM_shY%)rTqiyh%(H)NY#b8oSCyZ()@R)ZI;3mxjh6L3MR
ziPz&*DTpL7+(#}Jt5PA?WL>}>0Fv3klk@3Rs)BGe+PC{IoeE^GmYUEHFsyCpw702g
z&twCGRG#&W0mXElArFcRIWeuW*&Q4#7q|Rz2c_8YhtTiitGD!#fj|DTSVETEw&QKjwGo)-m%dc4%xmp>YT!rKF=IrZJd(6p=K%nqlODO&n7K{TDw=kPy6$V>1Jbxr~2LiGVI11t7ripWa6L*MuoHSbdM
zZED`8=C^22QrzDZsK0j?z`_Y_($|C6)|~KlT1)%z(mx5@s^3E6v3

diff --git a/src/loader.py b/src/loader.py
--- a/src/loader.py
+++ b/src/loader.py
 '<start> ' + x + ' <end>').astype(str).to_numpy()
-    all_images = contents['filename'].map(lambda x: os.path.join(data_root, 'images', x)).astype(str).to_numpy()
+    all_images = contents['filename'].map(lambda x: os.path.join(data_root, 'images/images_normalized', x)).astype(str).to_numpy()
 
     train_images, valid_images, train_texts, valid_texts = train_test_split(all_images, all_text, test_size=0.2, random_state=42)
 
@@ -47,7 +47,9 @@ def load_data(data_path):
     train_images, valid_images, train_texts, valid_texts, all_text = load_csv(data_path)
 
     tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=config.top_k,
-                                                      oov_token="<unk>")
+                                                      oov_token="<unk>",
+                                                      filters='!"#$\t\n',
+                                                      lower=True)
tokenizer.fit_on_texts(all_text) tokenizer.word_index[''] = 0 diff --git a/src/logs/gradient_tape/20200930-H0749/test/events.out.tfevents.1601471269.minh-B365-M-AORUS-ELITE.8517.9893.v2 b/src/logs/gradient_tape/20200930-H0749/test/events.out.tfevents.1601471269.minh-B365-M-AORUS-ELITE.8517.9893.v2 new file mode 100644 index 0000000000000000000000000000000000000000..55853008a0f4333a719f69680d9d0d002b7cdcbd GIT binary patch literal 40 rcmb1OfPlsI-b$Pd3=St1?p}A4;!P?_%*@ksElbTSu`;Unkx2mnwkZnf literal 0 HcmV?d00001 diff --git a/src/logs/gradient_tape/20200930-H0749/train/events.out.tfevents.1601471269.minh-B365-M-AORUS-ELITE.8517.9885.v2 b/src/logs/gradient_tape/20200930-H0749/train/events.out.tfevents.1601471269.minh-B365-M-AORUS-ELITE.8517.9885.v2 new file mode 100644 index 0000000000000000000000000000000000000000..55853008a0f4333a719f69680d9d0d002b7cdcbd GIT binary patch literal 40 rcmb1OfPlsI-b$Pd3=St1?p}A4;!P?_%*@ksElbTSu`;Unkx2mnwkZnf literal 0 HcmV?d00001 diff --git a/src/logs/gradient_tape/20200930-H0817/test/events.out.tfevents.1601471297.minh-B365-M-AORUS-ELITE.8610.9893.v2 b/src/logs/gradient_tape/20200930-H0817/test/events.out.tfevents.1601471297.minh-B365-M-AORUS-ELITE.8610.9893.v2 new file mode 100644 index 0000000000000000000000000000000000000000..158763aa76cbd400e067b0d0d3cd1a840d73c59f GIT binary patch literal 40 rcmb1OfPlsI-b$Pd3=S6*?p}A4;!P?_%*@ksElbTSu`*hbe^VL&xep7l literal 0 HcmV?d00001 diff --git a/src/logs/gradient_tape/20200930-H0817/train/events.out.tfevents.1601471297.minh-B365-M-AORUS-ELITE.8610.9885.v2 b/src/logs/gradient_tape/20200930-H0817/train/events.out.tfevents.1601471297.minh-B365-M-AORUS-ELITE.8610.9885.v2 new file mode 100644 index 0000000000000000000000000000000000000000..158763aa76cbd400e067b0d0d3cd1a840d73c59f GIT binary patch literal 40 rcmb1OfPlsI-b$Pd3=S6*?p}A4;!P?_%*@ksElbTSu`*hbe^VL&xep7l literal 0 HcmV?d00001 diff --git a/src/logs/gradient_tape/20200930-H0955/test/events.out.tfevents.1601471395.minh-B365-M-AORUS-ELITE.8771.9893.v2 b/src/logs/gradient_tape/20200930-H0955/test/events.out.tfevents.1601471395.minh-B365-M-AORUS-ELITE.8771.9893.v2 new file mode 100644 index 0000000000000000000000000000000000000000..999cad9d3ce3d424b98cb0fd044786d4a54d5d56 GIT binary patch literal 40 rcmb1OfPlsI-b$Pd3NT9%quVr5iuKNT9%quVr5iuKyA>qNkxg7d3vs8sd*(yA>qNkxg7d3vs8sd*(NT9%quVrBGF{ud_z!LNT9%quVrBGF{ud_z!LyA>qNkxg7d3vs8sd*(yA>qNkxg7d3vs8sd*(yA>qNkxg7d3vs8sd*(yA>qNkxg7d3vs8sd*(g`kf?X9Q@gVv!h(HF6K#l_t7qb9~6oz01O-8?!3`HPe1o6v6KO;XkRX;Z~ zFGIgPzbIS3qA0OawM literal 0 HcmV?d00001 diff --git a/src/models/__pycache__/decoder.cpython-38.pyc b/src/models/__pycache__/decoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5d80e849f42fd4b0e64622afd98f63f49cd9502 GIT binary patch literal 3230 zcmcguUvC>l5Z^!d&yM4`X&Ty6L4bl}L}S`YNI)s26pDlzBxpgh5Xbdy9ox0<(%p5N zM&}n=seS23Kyn`amG+gVd<7&_X7+5yP6`MfIP1o{v%9miGxPi1-dSEQ6L?mB)#0;M zLVm-}{Nq698ocT@h=e4xN17D=S&#L3lT&h&Bs_7Rki-$R>vY}6yy=0?O+3(fGo24Q zKM6n=%ydCQza#b10azwmb!t{`4SKrsm57qhv=*uEWND0J^ADeN)OlzK>r0b8B`uoJ zggptGyzr*!JS7fkx?tZmC$2~BBub;fp;wUN){8>G zL!?R=_QOSZFB$rMof$`o9!wm-A~mL_6ozpxQYs9`Cx zE=aMHNiAeU$+*$a646tQM8wdLn>&xB2X46VssIF`Ra%4J-q~XT+MIeCPXmtgaS>Yh z9A5P^2u(&bXK->hY`~^_&g89OAl0 zhCb+n+@EOu+|RuhgE;ani4_QEccR^ns+Xh>U9%*1ccL^2lTKe5mZrwhQQDSEFeWP~ zmT{J^vX<^gyQVBfGKfW}2K{;XXCBoS%zRPi4^?C5r=JAD!2j?MLG zT8rUig3TA?HO{0kCk}@niq3XhD^rcLREu35J`l0aqy$)zLY%~%ojB_C&{Ks2#I5iu z4PJO|nb&wgt88!mcs6iJ-NX7xBXBG<7vmPw2isUk zXjnY}u?V@G0noSx*mb>;pK~ZE)C7g5Tgpk7L#aRsCHy%QS6H6RUBVFy)5=U!^p|zj 
zu5uFhIoeu5n2x=z+ornR%VefQ09h-O{W>=_DWEjcP>K5kk;YIXm8nKYT9{K(!~-dT zp)8y`QLnQn0iPMf84&=1={Vya$~h&A~(3Oue{L4g%4ak4CdFr_d0cSVwP(rr@` zvnId!?H$9nvVKJw`5N?Y@ z3f+Ps4qm_`*T9bhj8MI710^p&yPymYP#H&B;(=uY#rq(P4;0Xjb_C!V|KvqbPC0_^ z%2#pXITWv>cnyT{vVq#`7dNrvo{0;*FUn#FR;b`&nmixEb|yI3MY_ZOMwcF<-Jcl2=eML%pHl zA@JoVICd4qApmBW{|WTP7o7upE~5U_vC#f|_{&dm<7+6cqd4LMf^!$%DY(xa+j0)& z@{2_ahbYDp-9j^%VnFdqa8{zOUNU94lB-O%df7wELe16Udbz1(lq&pm!ZjDE#eUn2 mdkZh0MP(v4STl28TmLb7Cb&{uWv=3l*&1uw literal 0 HcmV?d00001 diff --git a/src/models/__pycache__/encoder.cpython-38.pyc b/src/models/__pycache__/encoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b2c127bd620c8bf506afb2275306ae475fa7c1f GIT binary patch literal 1023 zcmYjQJ8u**5VpO~T<$_ZAd!NUNLz3x5lRFINT|35(QIQaYwz7&e7lS7Q*zOzN~Bc$ z0H`iy{tpdqsnSu?z|3A=BhBo1Y|niDX7+Qh*C8N(=9d3tg#5ADEoKRvtOo@QMc?-gQ3xFrlGG5SgIK|flLEu9dm(y${D{1#eKfa^itlt6a^554r;wWIMmtvnU+nht<(df^T8BIQf44mZ1tPDUIQ4|eG`Hqbc1cuqs=Qe z``2KH`E^_H5c(15J5aL+WlU>sH! zMr0YT?k(CO8EOjIF8_gCVi6dZ80Z1Qs!V!Z literal 0 HcmV?d00001 diff --git a/src/models/__pycache__/multiheadattn.cpython-38.pyc b/src/models/__pycache__/multiheadattn.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c7047610d0fb3960c677a9c013ab14f5eeb3945 GIT binary patch literal 3030 zcmZ`*TXP)66`r2k&aNa|7_hCFK&QgR3t~lK3KA7g;sTO9fF7_@l@e4cp~q-q@+jxjQjxpK)>>&7k&UQ<~u#Rk|Nu^>Z87$)2GknJM)i~m5zh< z(+@iSc?JJza?ZtWPTPrcCtXQkvH;3!6R=w%)+dN z-B!l#JKg9Me9l%%au%(PL4f`lx|)J$XT)l!W*H=Z?;ZKtId;*#nw_xBTcdsd#61dX z?2LlruntbxE$1k#z04oC<(D;$f>)eRu!a+@QB;SSmxbTEM=i)buaY zm3-ym{v=PUJkf&df#8YOqR_*#;QL~@JJ71v;e2nMkJtH(Pb#JPAejk1Npw0G7P~wb zNk;#(;Y1V)e3s!wr>CN*ayS+gd2UY28?44`P2L)uQIajbdEj^I18OtLgl zI@!qu@0XGjCs}~5#@{`;t2Xf;X4vF=m5>L`&HV0Ne!JJ}t)D3iGeH?W=-SicO+FSc zJPQk5I6K?qvm~#?izLtHr{h^8o2;?P|C*PH=J1z-*84*}n6v-?ldLkNKIgi8R^v0P zcYFME(NEwXn6TtqWg*A{)&^e-UCCl@hO*MrO0S=XN<0r0oPN+{W=YGWP}5S0c<1yf z+SV?j+Yvwh_&@CW$-|-QwvDU%CPW}is@(Wwe-l`TnpU5lb>~ClV|QSDwI{W4i^8;2 z+1HcgA(+{oVPnH)bViN0+l`!fZ*1IbynREZNiMSZESBTE+(l%Vm(Ew_Dx0u*%@aqy z3LSs>gjx7;Nm9a=h_SCI|0JF)gJQazn{04}g z9Bq5ZYai(1XB~?(Y&*!Jtc{iMJI4p=AiI~$wXaCWzeC4+i>yu;l1Vy^7hTnuXh#Eq z9=XTviNl=Qt(_R?1L_kUdDnq_V^^)~;9sm+`Hh3!Py=g5?c)w173NEq=F6AnE0^Y1 zGUvYY#NCn%I&`lh5UQz=@+M{`isNB1)NyRuMKy`Z;L5bJ*m}WSxu=wnl&oJ!SxOVw zoRS2qCKwBuC==$%fso1spCeC%argJo5hwdII*p~ShZqxiU;Y+c@->hr%(UY;%@d{K z*v5stPJ6dNy6EZ}h{LY1kcI9wIB}KX=N{gA*518^`p+1?1reW-upWqoXI8rznHgpu zd3}OqhX4fY$C*VH(&;4l*5F^Bfn))0}`LLiYKeN>WW|~xfu*giyDTP9S za8;!P%n3>$&L$xrDt>#1Fg{kpL$S^;$eRPkLa>DhNnvE&cZ!PQoRT9?X$_|I;mb)k zXq?i*odny~&hibMFzqwA=z8)UT44`O=lt@<0hFEA3uZ~t)0;^616GwSs{T!O`1*6~ zf8p>}N^!AhRPaM|>x2JdV13}B)Uq`PMHa=k#_Y($$V1^|NASxrynzwCbR5(k3b1SU z1EdO#=|q6UpfS@Z$5>~z|H%K=JqoD^k6I|nbAQ5{0ZKDU^)U){9b_T=-tK$gjUcB} zN4Tk7w-?2_Ne}rhRF&@$Sts&75wdi5$y!)OwD5Z(ZxVTn$Q>dCH1Y!?8$@ihzl~A1 zW34U;o#a&_SBShsgg$q+S1-e* z%M3r?Wv*j?Pb2TT8?w$*-|M)CH=g52l9N(!EgFr<_11x;{5*+}Kz+CiX~MP;Q4JOx z3RwaJDnN>ahKIyLVOAx<7+^Re_~(R8=oIq7mm_Z5n)x4Oa!p16B9sHIgOX& zc&|$GX2)Vk?A*u@<~dM=bOZ7Sc?Sdr>>CMy<}8MaYzLZP93a z7SwbB*#u$dB7}k6C-|PU-;h9Ue8oEdrg=cUnf3iOAhBEnV>fKViC{`nu9!g3GbDYs znpPq9Z5?OE10Gzq;>)QNFkn3Y2#cvme*?ZIRoP8nQbfRFo#x{O6@S0E&IT#Rru-d< h3BDxow(qno7cM#vvFOvr)_|tpjDoS11>ZQ7> zm02y>iqZv{kS7Qe!4VFGeDsYA2aa60%^%1s2f_!=L~y`-FRM2#u691y_f!U zZmweBd+#s{&c9|D|D?wBXQOcwMP?w85wX~)v&f9B$41@!(1`5Fd2B=uXC13!cbt8z z?nZ7@!e|MjUR1_td1}-{ZzY;R@61$h8NIX79D3)bdMgpTZ&c@B;ACUHL7a@fGAa(r zZ4}uD$&3!mjhsc`?WwhAXU3t4V&&`^i>zfDx1X7NPL7d|bLi&IGj_??b8{=QyJhil zPBrJe@hqtx+oH?ku4skv9eJ}fWgCf0komzSC*xGIJBOq?IajX@k{+|OFv53`IX 
zSvyVq9p2t-WwKWB{jC+hyW(&Ay@Aa9R=CamUYIpn?PSxBc^J{(sNLg<#5%p;RkQm%
z8N^u|tNCQbir=pB+L|x)Ukv1Mt-gWfwGYhl_UW=
8vkyIO}U1@5W||kHQ?e=akBsZ`~Kxxt+p~|7GeT3S*xmNch~$bf8|_AXlm~En!g>!
z1OBR=w}-QX?P70|#+rXOPQ%QHz9hKbX=knB@}pPM%nn2W
zslT2koFt%S@CnZbA{mMy9c2AMwlXo5U}!3s{h-R!OeVra_EX7&jTbNRt#87*4gUG(
zf5)gDK5WZsS(#Z=xiEy@AXYZXUpZQ#s??-wUAwAmjJA|5w?wAQBvB=qHnU#%5R2KC
zVqL|yX^!mH%f&vyR#%x(w|+r3!k9oH`>DvdSe2r&+DjuI%T;|zm`jg~TK|!HV|ZqiWJF#Rc>=XkhxI>Ga9(fN;Vp;M9T9)X4DIt6T7D8|Y*M
zg$Sou-6>G2?j9I6Pz%02qZaRxt9=Bszcupd#%O2So$8|^b2Ks|1ODurhvqYb8M&Do
z0k8+uCfc!%K}L7=LfE*GIx
zZX7=1LMrDDyo4)rXA1>Zva?OI(N(ozVLWb%FJP5;1LQeVrE
zid+OS*gSKYYc4{G3k*MV_tHsu*GBB05PS(++@{THAR3%mZbl?#h`nPq3FsLCKEeS4
z6@{oxo1mOvd-wPE`!OJl@EUAp6DalD3HkU~RwIxp9Qnb>FZD{nB@D!ugGLK&0upSe
zUC?h!|MCVQd?4Gqe8r#IZwMEojWs|BG9#_N7lVqh4=(1OJ3Z;~ACV{q=
zv-l!rsPY&rs+RaV^{|JkGC90Z0QsiYf|`+Z^}-U~L$A~!)xOAf-#kVBQ)_9*XdPGxv~1Zxh(+kl(b}_6vk*4f9`teuZJ-7%9Xh#%@N4RE2d+YOn&H4P
zsLnD3WAwA!-nV~c?zt3#J0%3fc%rx}x
rDrXfE~~t>t%|vO;S@#EJ`FOi(MNSsy;dMGKXf8EP!s9`qOf&XRDlJ%T%^E&0iZ-y
z&Empzx#R54z&?2vAQ#B^&oM6biKm;^)YK}D@(8Pvu*ZQ_m4Et8QUV1XF^1MLC?VAt-58rG)S3D9)4x
zJUA}Ja}B{Ez+jky1r(xxiFJjmvWvF3iB<#U$VX;;6GgrWGKzXpN$2SdF;8ddh0JgUll2!*m`2|FA?b?ozf+Q$*!r7hNoe2)kPM4|0?;`L+K7q>9FfugnUv5KR6
=&nb;(Thj}#6;^l+o3F{49x=^DcjMxjia@{phH=aeSJ(3{$GFkBYl7S;BJGIKKiIm
zGNPhie@AnijPe_EpE4a70P2>m4>Ot$-fl3Fshno2XOz{<<+e7O`C(QwYM;pwM1qB(
zDQ!Oe_g@eH^N+vWfHjqPU$$eI(pm93*b*NR(Xu2rS5~hTYEfzJO{`5@m|E1gT}Rp(
z#_e5glB%L}oCJMWrs`K!>5fSINN<%F6_qRF+Eq12?}dOrL{^)GG5nok?<5Hl9}+LN
zfFFxbXyh`HiFS$K9!inb0@s1(<|139WXRlIKCYgL1&{QNwFZI7*hTAT6c!9UKoid>
zY}yox@hY~-T@fyA%+Nvgf`RaekO^dSQ7d7VP48Ph3oQg1aU)6{8G2Z~lxn
u@jz0K5>(#(W0^EITO5v)Lb<)#h>QsgnyAnwSUoO2gU(AFb|NxlJ7kBG9c1EyJPH=
7qO{P>*T2R6?^?iNF$J=AVfKbBzM__k^Fj*a^ok)6Y~jsV(r_1Ft==g9$6>Aimw0M
RA%iR@hMg9!m7!ji@ftb)yAf*D%az-%o=I0f8#TZXsNl(e7Sph>gr?ItABN2A0XgpmO3|{FKPgi2Q=cm`_0t@;j6t-xqJRJDNs-#
1FhPp+NWlELi7oAuLBevYdog!iC1c+`0oq6`F9Ge(}}8&i3Q*}8t`MS6ZqpUHU%-HoPGK

literal 0
HcmV?d00001

diff --git a/src/models/decoder.py b/src/models/decoder.py
index 07df82e..08762e3 100644
--- a/src/models/decoder.py
+++ b/src/models/decoder.py
@@ -1,5 +1,5 @@
 import tensorflow as tf
-
+from models.transformer import MultiHeadAttention
 
 class BahdanauAttention(tf.keras.Model):
     def __init__(self, units):
@@ -46,11 +46,14 @@ def __init__(self, embedding_dim, units, vocab_size):
         self.fc1 = tf.keras.layers.Dense(self.units)
         self.fc2 = tf.keras.layers.Dense(vocab_size)
 
-        self.attention = BahdanauAttention(self.units)
+        self.attention = MultiHeadAttention(self.units, num_heads=8)
 
     def call(self, x, features, hidden):
         # defining attention as a separate model
-        context_vector, attention_weights = self.attention(features, hidden)
+        # MultiHeadAttention expects (v, k, q); self-attend over the encoder features
+        context_vector, attention_weights = self.attention(features, features, features)
+        # pool the attended positions into a single context vector per example
+        context_vector = tf.reduce_mean(context_vector, axis=1)
 
         # x shape after passing through embedding == (batch_size, 1, embedding_dim)
         x = self.embedding(x)
@@ -74,3 +74,48 @@ def call(self, x, features, hidden):
 
     def reset_state(self, batch_size):
         return tf.zeros((batch_size, self.units))
+
+class MultiheadDecoder(tf.keras.Model):
+    def __init__(self, embedding_dim, units, vocab_size):
+        super(MultiheadDecoder, self).__init__()
+        self.units = units
+
+        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
+        self.gru = tf.keras.layers.GRU(self.units,
+                                       return_sequences=True,
+                                       return_state=True,
+                                       recurrent_initializer='glorot_uniform')
+        self.fc1 = tf.keras.layers.Dense(self.units)
+        self.fc2 = tf.keras.layers.Dense(vocab_size)
+
+        self.multiheadattention = MultiHeadAttention(self.units, num_heads=8)
+        self.attention = BahdanauAttention(self.units)
+
+    def call(self, x, features, hidden):
+        # refine the encoder features with multi-head self-attention,
+        # then attend over them with Bahdanau attention
+        features, _ = self.multiheadattention(features, features, features)
+        context_vector, attention_weights = self.attention(features, hidden)
+
+        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
+        x = self.embedding(x)
+
+        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
+        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
+
+        # passing the concatenated vector to the GRU
+        output, state = self.gru(x)
+
+        # shape == (batch_size, max_length, hidden_size)
+        x = self.fc1(output)
+
+        # x shape == (batch_size * max_length, hidden_size)
+        x = tf.reshape(x, (-1, x.shape[2]))
+
+        # output shape == (batch_size * max_length, vocab)
+        x = self.fc2(x)
+
+        return x, state, attention_weights
+
+    def reset_state(self, batch_size):
+        return tf.zeros((batch_size, self.units))
\ No newline at end of file
diff --git a/src/models/encoder.py b/src/models/encoder.py
index 3743d0f..7c8cfe3 100644
--- a/src/models/encoder.py
+++ b/src/models/encoder.py
@@ -1,7 +1,7 @@
 import tensorflow as tf
 from tensorflow.keras.applications import EfficientNetB3
 
-from src import config
+import config
 
 
 class Encoder(tf.keras.Model):
diff --git a/src/models/multihead_attn.py b/src/models/multihead_attn.py
deleted file mode 100644
index 11b3690..0000000
--- a/src/models/multihead_attn.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from keras
-
-class MultiHeadAttention(tf.keras.layers.Layer):
-    def __init__(self, d_model, num_heads):
-        super(MultiHeadAttention, self).__init__()
-        self.num_heads = num_heads
-        self.d_model = d_model
-
-        assert d_model % self.num_heads == 0
-
-        self.depth = d_model // self.num_heads
-
-        self.wq = tf.keras.layers.Dense(d_model)
-        self.wk = tf.keras.layers.Dense(d_model)
-        self.wv = tf.keras.layers.Dense(d_model)
-
-        self.dense = tf.keras.layers.Dense(d_model)
-
-    def split_heads(self, x, batch_size):
-        """Split the last dimension into (num_heads, depth).
-        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
-        """
-        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
-    def call(self, v, k, q, mask):
-        batch_size = tf.shape(q)[0]
-
-        q = self.wq(q)  # (batch_size, seq_len, d_model)
-        k = self.wk(k)  # (batch_size, seq_len, d_model)
-        v = self.wv(v)  # (batch_size, seq_len, d_model)
-
-        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
-        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
-        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
-
-        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
-        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
-        scaled_attention, attention_weights = scaled_dot_product_attention(
-            q, k, v, mask)
-
-        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
-
-        concat_attention = tf.reshape(scaled_attention,
-                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
-
-        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
-
-        return output, attention_weights
\ No newline at end of file
diff --git a/src/models/transformer.py b/src/models/transformer.py
new file mode 100644
index 0000000..95b0e69
--- /dev/null
+++ b/src/models/transformer.py
@@ -0,0 +1,141 @@
+import tensorflow as tf
+
+def scaled_dot_product_attention(q, k, v, mask):
+    """Calculate the attention weights.
+    q, k, v must have matching leading dimensions.
+    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
+    The mask has different shapes depending on its type (padding or look-ahead),
+    but it must be broadcastable for addition.
+
+    Args:
+        q: query shape == (..., seq_len_q, depth)
+        k: key shape == (..., seq_len_k, depth)
+        v: value shape == (..., seq_len_v, depth_v)
+        mask: Float tensor with shape broadcastable
+              to (..., seq_len_q, seq_len_k). Defaults to None.
+
+    Returns:
+        output, attention_weights
+    """
+
+    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
+
+    # scale matmul_qk
+    dk = tf.cast(tf.shape(k)[-1], tf.float32)
+    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
+
+    # add the mask to the scaled tensor.
+    if mask is not None:
+        scaled_attention_logits += (mask * -1e9)
+
+    # softmax is normalized on the last axis (seq_len_k) so that the scores
+    # add up to 1.
+    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
+
+    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
+
+    return output, attention_weights
+
+
+class MultiHeadAttention(tf.keras.layers.Layer):
+    def __init__(self, d_model, num_heads):
+        super(MultiHeadAttention, self).__init__()
+        self.num_heads = num_heads
+        self.d_model = d_model
+
+        assert d_model % self.num_heads == 0
+
+        self.depth = d_model // self.num_heads
+
+        self.wq = tf.keras.layers.Dense(d_model)
+        self.wk = tf.keras.layers.Dense(d_model)
+        self.wv = tf.keras.layers.Dense(d_model)
+
+        self.dense = tf.keras.layers.Dense(d_model)
+
+    def split_heads(self, x, batch_size):
+        """Split the last dimension into (num_heads, depth).
+        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
+        """
+        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
+        return tf.transpose(x, perm=[0, 2, 1, 3])
+
+    def call(self, v, k, q, mask=None):
+        batch_size = tf.shape(q)[0]
+
+        q = self.wq(q)  # (batch_size, seq_len, d_model)
+        k = self.wk(k)  # (batch_size, seq_len, d_model)
+        v = self.wv(v)  # (batch_size, seq_len, d_model)
+
+        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
+        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
+        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
+
+        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
+        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
+        scaled_attention, attention_weights = scaled_dot_product_attention(
+            q, k, v, mask)
+
+        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
+
+        concat_attention = tf.reshape(scaled_attention,
+                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
+
+        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
+
+        return output, attention_weights
+
+    def reset_state(self, batch_size):
+        # kept for interface parity with the decoders
+        return tf.zeros((batch_size, self.d_model))
+
+def create_padding_mask(seq):
+    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
+
+    # add extra dimensions to add the padding
+    # to the attention logits.
+    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
+
+class FullyConnected(tf.keras.layers.Layer):
+    def __init__(self, d_model, dff=2048):
+        super(FullyConnected, self).__init__()
+        self.ffn = tf.keras.Sequential([
+            tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
+            tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
+        ])
+
+    def call(self, x):
+        return self.ffn(x)
+
+
+class TransformerLayer(tf.keras.layers.Layer):
+    def __init__(self, d_model, num_heads, dff, rate=0.1, with_external=False):
+        super(TransformerLayer, self).__init__()
+
+        self.mha = MultiHeadAttention(d_model, num_heads)
+        self.ffn = FullyConnected(d_model, dff)
+
+        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+
+        self.dropout1 = tf.keras.layers.Dropout(rate)
+        self.dropout2 = tf.keras.layers.Dropout(rate)
+
+        self.with_external = with_external
+        # if self.with_external: attend over external states here (not implemented yet)
+
+    def call(self, x, training, mask):
+        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
+        attn_output = self.dropout1(attn_output, training=training)
+        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
+
+        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
+        ffn_output = self.dropout2(ffn_output, training=training)
+        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
+
+        return out2
+
+class Transformer(tf.keras.layers.Layer):
+    def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1):
+        super(Transformer, self).__init__()
+        self.enc_layers = [TransformerLayer(d_model, num_heads, dff, rate)
+                           for _ in range(num_layers)]
+
+    def call(self, x, training, mask=None):
+        for layer in self.enc_layers:
+            x = layer(x, training, mask)
+        return x
diff --git a/src/train.py b/src/train.py
index 6a8f6df..b9b74f1 100644
--- a/src/train.py
+++ b/src/train.py
@@ -4,18 +4,18 @@
 import tensorflow as tf
 from tensorflow.keras.utils import Progbar
 
-from src import config
-from src.loader import load_data
-from src.models.encoder import Encoder
-from src.models.decoder import Decoder
-
+import config
+from loader import load_data
+from models.encoder import Encoder
+from models.decoder import Decoder, MultiheadDecoder
 
 if __name__ == '__main__':
     train_ds, valid_ds, max_length_train, max_length_valid, tokenizer = load_data(config.data_path)
 
     encoder = Encoder(config.embedding_dim)
-    decoder = Decoder(config.embedding_dim, config.units, config.vocab_size)
+    # decoder = Decoder(config.embedding_dim, config.units, config.vocab_size)
+    decoder = MultiheadDecoder(config.embedding_dim, config.units, config.vocab_size)
 
     optimizer = tf.keras.optimizers.Adam()
     loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
@@ -47,14 +47,16 @@ def train_step(img_tensor, target):
         # because the captions are not related from image to image
         hidden = decoder.reset_state(batch_size=target.shape[0])
         dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)
-
+        # hidden: (batch_size, units)
         with tf.GradientTape() as tape:
+            # features: (batch_size, 81, embedding_dim), e.g. 81 x 256
            features = encoder(img_tensor)
+            print('encode shape :', features.shape)
 
            for i in range(1, target.shape[1]):
                # passing the features through the decoder
                predictions, hidden, _ = decoder(dec_input, features, hidden)
-
+
                loss += loss_function(target[:, i], predictions)
 
                # using teacher forcing
@@ -70,20 +72,50 @@ def train_step(img_tensor, target):
 
         return loss, total_loss
 
+    def evaluate_step(img_tensor, target):
+        loss = 0
+
+        # initializing the hidden state for each batch
+        # because the captions are not related from image to image
+        hidden = decoder.reset_state(batch_size=target.shape[0])
+        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)
+
+        features = encoder(img_tensor)
 
-    EPOCHS = 20
+        for i in range(1, target.shape[1]):
+            # passing the features through the decoder
+            predictions, hidden, _ = decoder(dec_input, features, hidden)
+
+            loss += loss_function(target[:, i], predictions)
+
+            # using teacher forcing
+            dec_input = tf.expand_dims(target[:, i], 1)
+
+        total_loss = (loss / int(target.shape[1]))
+
+        return loss, total_loss
+
+    EPOCHS = config.EPOCHS
 
     for epoch in range(0, EPOCHS):
         start = time.time()
         total_loss = 0
 
         pb_i = Progbar(max_length_train, stateful_metrics=['loss'])
 
+        # Training
+        print('[TRAIN]')
         for (batch, (img_tensor, target)) in enumerate(train_ds):
             batch_loss, t_loss = train_step(img_tensor, target)
             total_loss += t_loss
-            pb_i.add(config.BATCH_SIZE, values=[('loss', total_loss)])
-
+            # advance the progress bar once per batch, reporting both losses
+            pb_i.add(config.BATCH_SIZE, values=[('total loss', total_loss), ('batch loss', batch_loss)])
+
+        # Evaluate (teacher-forced loss on the validation set, no gradient updates)
+        print('[EVALUATE]')
+        for (batch, (img_tensor, target)) in enumerate(valid_ds):
+            batch_loss, t_loss = evaluate_step(img_tensor, target)
+            total_loss += t_loss
+            pb_i.add(config.BATCH_SIZE, values=[('total loss', total_loss), ('batch loss', batch_loss)])
 
     ckpt_manager.save()
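
A quick smoke test for the shapes flowing through the new MultiHeadAttention layer
(a minimal sketch, assuming src/ is on PYTHONPATH so the patched module imports as
below; batch_size=4 and d_model=512 are illustrative stand-ins for the values in
config.py, and 81 matches the encoder feature positions noted in train.py):

    import tensorflow as tf
    from models.transformer import MultiHeadAttention

    batch_size, seq_len, d_model = 4, 81, 512  # d_model must be divisible by num_heads
    mha = MultiHeadAttention(d_model, num_heads=8)
    features = tf.random.uniform((batch_size, seq_len, d_model))

    # self-attention: the same tensor serves as values, keys and queries (v, k, q)
    output, attention_weights = mha(features, features, features)

    assert output.shape == (batch_size, seq_len, d_model)
    assert attention_weights.shape == (batch_size, 8, seq_len, seq_len)

Because MultiheadDecoder builds this layer with d_model = config.units and
num_heads=8, the assert in MultiHeadAttention.__init__ means config.units must be
divisible by 8.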
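
And one teacher-forced decoding step through MultiheadDecoder, mirroring what
train_step does (again a hedged sketch: the sizes are made up, the real ones come
from config.py, and the token id 1 stands in for tokenizer.word_index['<start>']):

    import tensorflow as tf
    from models.decoder import MultiheadDecoder

    batch_size, vocab_size, embedding_dim, units = 4, 5000, 256, 512
    decoder = MultiheadDecoder(embedding_dim, units, vocab_size)

    features = tf.random.uniform((batch_size, 81, embedding_dim))  # encoder output
    hidden = decoder.reset_state(batch_size=batch_size)            # (batch_size, units)
    dec_input = tf.expand_dims([1] * batch_size, 1)                # (batch_size, 1)

    # self-attend over the features, Bahdanau-attend with the hidden state,
    # then predict next-token logits
    predictions, hidden, attn_weights = decoder(dec_input, features, hidden)
    assert predictions.shape == (batch_size, vocab_size)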