From 35029a0c84e7c9f2c56e2b2256746654a5084432 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Mon, 7 Oct 2024 10:40:43 -0700 Subject: [PATCH 01/18] long context perf Signed-off-by: Youngeun Kwon --- .../source/performance/performance_summary.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/source/performance/performance_summary.md b/docs/source/performance/performance_summary.md index 98dae2dc0a78..0234e1d7ed42 100644 --- a/docs/source/performance/performance_summary.md +++ b/docs/source/performance/performance_summary.md @@ -40,3 +40,22 @@ | LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 24824 | 663 | ***0.8*** | | LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14629 | 757 | ***1.4*** | | LLAMA2-70B | LoRA | 8 | 32 | 1 | 4096 | 2 | 4 | 2621 | 722 | ***7.9*** | + + +### Long Input Sequences + +- The results in the table below show the pre-training performance of the LLAMA2-7B model with various input sequence lengths at FP8 precision. + - Container: [NeMo24.03.01.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) + - System: DGX-H100 + +| Sequence Length (K)| #-GPUs | GBS | MBS | TP | PP | CP | VP | DP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. time to train in days (10T tokens, 1K GPUs)*** | +| -------------------| ------ | --- | --- | -- | -- | -- | -- | -- | ------------------ | ----------------------- | ------------------------------------------------------ | +| 4 | 4 | 1024 | 1 | 1 | 1 | 1 | 1 | 4 | 16671 | 768 | ***7*** | +| 8 | 8 | 512 | 1 | 1 | 2 | 1 | 1 | 4 | 13907 | 730 | ***8*** | +| 16 | 16 | 256 | 1 | 2 | 1 | 1 | 1 | 8 | 10082 | 660 | ***11*** | +| 32 | 32 | 128 | 1 | 2 | 1 | 2 | 1 | 8 | 6687 | 610 | ***17*** | +| 64 | 64 | 64 | 1 | 4 | 1 | 2 | 1 | 8 | 4021 | 574 | ***28*** | +| 128 | 128 | 32 | 1 | 4 | 1 | 4 | 1 | 8 | 2260 | 555 | ***50*** | +| 256 | 256 | 16 | 1 | 4 | 1 | 8 | 1 | 8 | 1214 | 549 | ***93*** | +| 512 | 512 | 8 | 1 | 8 | 1 | 16 | 1 | 4 | 635 | 549 | ***178*** | +| 1024 | 1024 | 4 | 1 | 8 | 1 | 32 | 1 | 4 | 318 | 536 | ***356*** | \ No newline at end of file From cd3a808656283d38a415d7917af2e6f2ed519dc6 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Mon, 7 Oct 2024 13:19:13 -0700 Subject: [PATCH 02/18] update the long context perf Signed-off-by: Youngeun Kwon --- .../performance/performance_long_sequence.md | 155 ++++++++++++++++++ .../source/performance/performance_summary.md | 19 --- docs/source/performance/speedup_figure.png | Bin 0 -> 20611 bytes 3 files changed, 155 insertions(+), 19 deletions(-) create mode 100644 docs/source/performance/performance_long_sequence.md create mode 100644 docs/source/performance/speedup_figure.png diff --git a/docs/source/performance/performance_long_sequence.md b/docs/source/performance/performance_long_sequence.md new file mode 100644 index 000000000000..c2816485b54d --- /dev/null +++ b/docs/source/performance/performance_long_sequence.md @@ -0,0 +1,155 @@ +# Long Sequence Performance + +## LLAMA2-7B (FP8) + +- The results in the table below show the pre-training performance of the LLAMA2-7B model with-CP (context parallelism) and without-CP for various input sequence lengths at FP8 precision. Detailed configurations and the achievable performance are provided for the with-CP configurations. For the without-CP configurations, the best achievable performance is reported within the given memory capacity constraint. 
+  - Container: [NeMo24.03.01.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags)
+  - System: DGX-H100
+
+| SeqLen (K) | # of GPUs | Without-CP TFLOPS / GPU | TP (with-CP) | PP (with-CP) | DP (with-CP) | CP | With-CP TFLOPS / GPU | Speedup (with-CP / without-CP) |
+| ---------- | --------- | ----------------------- | ------------ | ------------ | ------------ | -- | -------------------- | ------------------------------ |
+| 4    | 4    | 768  | 1 | 1 | 4 | 1  | 768 | 1.00  |
+| 8    | 8    | 730  | 1 | 2 | 4 | 1  | 730 | 1.00  |
+| 16   | 16   | 660  | 2 | 1 | 8 | 1  | 660 | 1.00  |
+| 32   | 32   | 595  | 2 | 1 | 8 | 2  | 610 | 1.03  |
+| 64   | 64   | 534  | 4 | 1 | 8 | 2  | 574 | 1.07  |
+| 128  | 128  | 424  | 4 | 1 | 8 | 4  | 555 | 1.31  |
+| 256  | 256  | 392  | 4 | 1 | 8 | 8  | 549 | 1.40  |
+| 512  | 512  | 104  | 8 | 1 | 4 | 16 | 549 | 5.28  |
+| 1024 | 1024 | 26.5 | 8 | 1 | 4 | 32 | 536 | 20.23 |
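The TP, PP, DP, and CP columns of the with-CP configurations map directly onto NeMo 2.0 trainer arguments. The snippet below sketches the 32K-sequence row (32 GPUs with TP=2, PP=1, CP=2, DP=8) using the `nl.MegatronStrategy`, `nl.Trainer`, and `llm` APIs that appear in the recipes and tests elsewhere in this patch series; the `GPTConfig` values and the `bf16-mixed` precision plugin are illustrative stand-ins rather than the exact LLAMA2-7B FP8 setup used to collect these numbers.

```python
# Sketch of the 32K-row configuration (32 GPUs: TP=2, PP=1, CP=2, DP=8).
# Class and argument names follow the NeMo 2.0 APIs used elsewhere in this
# patch series; the GPTConfig below stands in for LLAMA2-7B and bf16-mixed
# stands in for the FP8 recipe used in the benchmark.
import nemo.lightning as nl
from nemo.collections import llm

seq_length = 32 * 1024  # 32K tokens per sample

strategy = nl.MegatronStrategy(
    tensor_model_parallel_size=2,    # TP column
    pipeline_model_parallel_size=1,  # PP column
    context_parallel_size=2,         # CP column
    sequence_parallel=True,          # assumption: SP enabled alongside TP > 1
)

trainer = nl.Trainer(
    accelerator="gpu",
    devices=8,    # GPUs per node
    num_nodes=4,  # 4 nodes x 8 GPUs = 32 GPUs; DP = 32 / (TP * PP * CP) = 8
    max_steps=10,
    strategy=strategy,
    plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
)

data = llm.MockDataModule(seq_length=seq_length, global_batch_size=128, micro_batch_size=1)
config = llm.GPTConfig(
    num_layers=32,
    hidden_size=4096,
    ffn_hidden_size=11008,
    num_attention_heads=32,
    seq_length=seq_length,
)
model = llm.GPTModel(config, tokenizer=data.tokenizer)

trainer.fit(model, data)
```

The global batch size of 128 and micro batch size of 1 for the 32K case follow the per-sequence-length settings listed in the earlier summary table; scaling the GPU count only changes the data-parallel degree, since DP = #GPUs / (TP × PP × CP).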
+
+
+### Speedup enabled by the CP
+![Speedup Graph](speedup_figure.png)
\ No newline at end of file
diff --git a/docs/source/performance/performance_summary.md b/docs/source/performance/performance_summary.md
index 0234e1d7ed42..98dae2dc0a78 100644
--- a/docs/source/performance/performance_summary.md
+++ b/docs/source/performance/performance_summary.md
@@ -40,22 +40,3 @@
 | LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 24824 | 663 | ***0.8*** |
 | LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14629 | 757 | ***1.4*** |
 | LLAMA2-70B | LoRA | 8 | 32 | 1 | 4096 | 2 | 4 | 2621 | 722 | ***7.9*** |
-
-
-### Long Input Sequences
-
-- The results in the table below show the pre-training performance of the LLAMA2-7B model with various input sequence lengths at FP8 precision.
-  - Container: [NeMo24.03.01.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags)
-  - System: DGX-H100
-
-| Sequence Length (K)| #-GPUs | GBS | MBS | TP | PP | CP | VP | DP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. time to train in days (10T tokens, 1K GPUs)*** |
-| -------------------| ------ | --- | --- | -- | -- | -- | -- | -- | ------------------ | ----------------------- | ------------------------------------------------------ |
-| 4 | 4 | 1024 | 1 | 1 | 1 | 1 | 1 | 4 | 16671 | 768 | ***7*** |
-| 8 | 8 | 512 | 1 | 1 | 2 | 1 | 1 | 4 | 13907 | 730 | ***8*** |
-| 16 | 16 | 256 | 1 | 2 | 1 | 1 | 1 | 8 | 10082 | 660 | ***11*** |
-| 32 | 32 | 128 | 1 | 2 | 1 | 2 | 1 | 8 | 6687 | 610 | ***17*** |
-| 64 | 64 | 64 | 1 | 4 | 1 | 2 | 1 | 8 | 4021 | 574 | ***28*** |
-| 128 | 128 | 32 | 1 | 4 | 1 | 4 | 1 | 8 | 2260 | 555 | ***50*** |
-| 256 | 256 | 16 | 1 | 4 | 1 | 8 | 1 | 8 | 1214 | 549 | ***93*** |
-| 512 | 512 | 8 | 1 | 8 | 1 | 16 | 1 | 4 | 635 | 549 | ***178*** |
-| 1024 | 1024 | 4 | 1 | 8 | 1 | 32 | 1 | 4 | 318 | 536 | ***356*** |
\ No newline at end of file
diff --git a/docs/source/performance/speedup_figure.png b/docs/source/performance/speedup_figure.png
new file mode 100644
index 0000000000000000000000000000000000000000..af73e6f5375b85f789d10cfa59d40aa2e5f104d2
GIT binary patch
literal 20611
[base85-encoded PNG image data]

literal 0
HcmV?d00001

From 8922a84be7d5048050a50020f94e8c7a3aad326a Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
Date: Mon, 7 Oct 2024 10:06:29 -0700
Subject: [PATCH 03/18] Akoumparouli/mcore microbatch calculator fix (#10780)

* move tests/lightning/{,_}io

Signed-off-by: Alexandros Koumparoulis

* add microbatch calculator context manager

Signed-off-by: Alexandros Koumparoulis

* use microbatch calculator context manager

Signed-off-by: Alexandros
Koumparoulis * add on_load_checkpoint test to ValidateModelRestoration; use ctx manager to reconfigure microbatch calculator; update save/restore path; add cleanup step at the end Signed-off-by: Alexandros Koumparoulis * remove unused var Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon --- tests/lightning/{io => _io}/__init__.py | 0 tests/lightning/{io => _io}/test_api.py | 0 tests/lightning/{io => _io}/test_mixin.py | 0 tests/lightning/{io => _io}/test_state.py | 0 tests/lightning/mcore_microbatch_utils.py | 27 +++ tests/lightning/test_dist_ckpt.py | 137 ++++++------- tests/lightning/test_nemo_resume_from_ckpt.py | 55 ++--- tests/lightning/test_state_restoration.py | 190 +++++++++++------- 8 files changed, 246 insertions(+), 163 deletions(-) rename tests/lightning/{io => _io}/__init__.py (100%) rename tests/lightning/{io => _io}/test_api.py (100%) rename tests/lightning/{io => _io}/test_mixin.py (100%) rename tests/lightning/{io => _io}/test_state.py (100%) create mode 100644 tests/lightning/mcore_microbatch_utils.py diff --git a/tests/lightning/io/__init__.py b/tests/lightning/_io/__init__.py similarity index 100% rename from tests/lightning/io/__init__.py rename to tests/lightning/_io/__init__.py diff --git a/tests/lightning/io/test_api.py b/tests/lightning/_io/test_api.py similarity index 100% rename from tests/lightning/io/test_api.py rename to tests/lightning/_io/test_api.py diff --git a/tests/lightning/io/test_mixin.py b/tests/lightning/_io/test_mixin.py similarity index 100% rename from tests/lightning/io/test_mixin.py rename to tests/lightning/_io/test_mixin.py diff --git a/tests/lightning/io/test_state.py b/tests/lightning/_io/test_state.py similarity index 100% rename from tests/lightning/io/test_state.py rename to tests/lightning/_io/test_state.py diff --git a/tests/lightning/mcore_microbatch_utils.py b/tests/lightning/mcore_microbatch_utils.py new file mode 100644 index 000000000000..39b3baee446c --- /dev/null +++ b/tests/lightning/mcore_microbatch_utils.py @@ -0,0 +1,27 @@ +import contextlib + + +# @akoumparouli: use a context manager that saves/restores gbs/mbs when using +# reconfigure_num_microbatches_calculator to avoid interference between tests. 
+@contextlib.contextmanager +def reconfigure_num_microbatches_calculator_manager(*args, **kwargs): + import megatron.core.num_microbatches_calculator as mb_calc + + # Store current mbs, gbs values + if not mb_calc._GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + _mbs = mb_calc.get_micro_batch_size() + _gbs = mb_calc.get_current_global_batch_size() + + # use user's settings + mb_calc.reconfigure_num_microbatches_calculator(*args, **kwargs) + else: + _mbs, _gbs = 1, 1 + + try: + # run user's code + yield + # @akoumparouli: no catch + finally: + # restore old mbs, gbs + if not mb_calc._GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + mb_calc.reconfigure_num_microbatches_calculator(0, None, _gbs, _mbs, data_parallel_size=1) diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py index e6ea381fdf0b..d5037f0aa573 100644 --- a/tests/lightning/test_dist_ckpt.py +++ b/tests/lightning/test_dist_ckpt.py @@ -24,7 +24,6 @@ def set_env(): import pytest import pytorch_lightning as pl import torch -from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator import nemo.lightning as nl from nemo.collections import llm @@ -43,13 +42,9 @@ def _get_last_checkpoint_dir(model: pl.LightningModule, suffix: str = '') -> Pat return f'epoch={model.trainer.current_epoch - 1}-step={model.trainer.max_steps - 1}{suffix}' -def get_model_and_data(): - micro_batch_size = 2 - global_batch_size = 2 +def get_model_and_data(mbs=2, gbs=2): seq_length = 128 - data = llm.MockDataModule( - seq_length=seq_length, micro_batch_size=micro_batch_size, global_batch_size=global_batch_size - ) + data = llm.MockDataModule(seq_length=seq_length, micro_batch_size=mbs, global_batch_size=gbs) config = llm.GPTConfig( num_layers=2, @@ -59,13 +54,6 @@ def get_model_and_data(): seq_length=seq_length, apply_query_key_layer_scaling=1, ) - reconfigure_num_microbatches_calculator( - 0, - None, - global_batch_size, - micro_batch_size, - data_parallel_size=1, - ) return llm.GPTModel(config, tokenizer=data.tokenizer), data @@ -76,21 +64,25 @@ def test_dist_ckpt_io_called_for_mcore_models(self, tmp_path): set_env() assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '1' - model, data = get_model_and_data() + gbs, mbs = 2, 2 + model, data = get_model_and_data(mbs, gbs) + from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager - strategy = _get_strategy() + with reconfigure_num_microbatches_calculator_manager(0, None, gbs, mbs, data_parallel_size=1): - trainer = nl.Trainer( - devices=1, - accelerator="gpu", - strategy=strategy, - enable_checkpointing=True, - max_steps=2, - default_root_dir=str(tmp_path), - logger=False, - ) + strategy = _get_strategy() + + trainer = nl.Trainer( + devices=1, + accelerator="gpu", + strategy=strategy, + enable_checkpointing=True, + max_steps=2, + default_root_dir=str(tmp_path), + logger=False, + ) - trainer.fit(model, data) + trainer.fit(model, data) assert isinstance(trainer.strategy.checkpoint_io, MegatronCheckpointIO) # Ckpt path doesn't contain the .ckpt suffix @@ -104,51 +96,54 @@ def test_dist_ckpt_io_called_for_mcore_models(self, tmp_path): def test_async_save_produces_same_checkpoints_as_sync(self, tmp_path): set_env() assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '1' - model, data = get_model_and_data() - - sync_ckpt_dir = tmp_path / 'sync_checkpoints' - async_ckpt_dir = tmp_path / 'async_checkpoints' - - sync_checkpoint_io = MegatronCheckpointIO('torch_dist') - async_checkpoint_io = 
AsyncFinalizableCheckpointIO(MegatronCheckpointIO('torch_dist', async_save=True)) - - # dummy_trainer just to initialize NCCL - dummy_trainer = pl.Trainer( - devices=1, - logger=False, - max_steps=2, - strategy=_get_strategy(), - ) - dummy_trainer.fit(model, data) - strategy = _get_strategy() - tmp_path = strategy.broadcast(tmp_path) - - ## reset the model and data and train with sync checkpointing - model, data = get_model_and_data() - sync_test_trainer = pl.Trainer( - devices=1, - enable_checkpointing=True, - logger=False, - max_steps=2, - strategy=_get_strategy(), - plugins=[sync_checkpoint_io], - default_root_dir=str(sync_ckpt_dir), - ) - sync_test_trainer.fit(model, data) - - ## reset the model and data and train with sync checkpointing - model, data = get_model_and_data() - async_test_trainer = pl.Trainer( - devices=1, - enable_checkpointing=True, - logger=False, - max_steps=2, - strategy=_get_strategy(), - plugins=[async_checkpoint_io], - callbacks=AsyncFinalizerCallback(), - default_root_dir=str(async_ckpt_dir), - ) - async_test_trainer.fit(model, data) + gbs, mbs = 2, 2 + model, data = get_model_and_data(mbs, gbs) + from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager + + with reconfigure_num_microbatches_calculator_manager(0, None, gbs, mbs, data_parallel_size=1): + + sync_ckpt_dir = tmp_path / 'sync_checkpoints' + async_ckpt_dir = tmp_path / 'async_checkpoints' + + sync_checkpoint_io = MegatronCheckpointIO('torch_dist') + async_checkpoint_io = AsyncFinalizableCheckpointIO(MegatronCheckpointIO('torch_dist', async_save=True)) + + # dummy_trainer just to initialize NCCL + dummy_trainer = pl.Trainer( + devices=1, + logger=False, + max_steps=2, + strategy=_get_strategy(), + ) + dummy_trainer.fit(model, data) + strategy = _get_strategy() + + ## reset the model and data and train with sync checkpointing + model, data = get_model_and_data(mbs, gbs) + sync_test_trainer = pl.Trainer( + devices=1, + enable_checkpointing=True, + logger=False, + max_steps=2, + strategy=_get_strategy(), + plugins=[sync_checkpoint_io], + default_root_dir=str(sync_ckpt_dir), + ) + sync_test_trainer.fit(model, data) + + ## reset the model and data and train with sync checkpointing + model, data = get_model_and_data(mbs, gbs) + async_test_trainer = pl.Trainer( + devices=1, + enable_checkpointing=True, + logger=False, + max_steps=2, + strategy=_get_strategy(), + plugins=[async_checkpoint_io], + callbacks=AsyncFinalizerCallback(), + default_root_dir=str(async_ckpt_dir), + ) + async_test_trainer.fit(model, data) checkpoint = {'sharded_state_dict': model.sharded_state_dict()} diff --git a/tests/lightning/test_nemo_resume_from_ckpt.py b/tests/lightning/test_nemo_resume_from_ckpt.py index 31ab88546cb3..e876e6965000 100644 --- a/tests/lightning/test_nemo_resume_from_ckpt.py +++ b/tests/lightning/test_nemo_resume_from_ckpt.py @@ -27,7 +27,6 @@ def set_env(): import pytest import torch -from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator from megatron.core.optimizer import OptimizerConfig import nemo.lightning as nl @@ -90,7 +89,7 @@ def compare_ckpts(a, b, path=[]): raise ValueError("Unexpected value type " + str(type(a))) -def setup_data_model_optim(log_dir, n_steps, data_path, gbs=2, mbs=1): +def setup_data(log_dir, n_steps, data_path, gbs=2, mbs=1): seq_length = 2048 tokenizer = get_nmt_tokenizer( "megatron", @@ -108,14 +107,11 @@ def setup_data_model_optim(log_dir, n_steps, data_path, gbs=2, mbs=1): tokenizer=tokenizer, 
split='9999,1,1', ) - # Other tests might have different configs, so need to configure explicitly. - reconfigure_num_microbatches_calculator( - 0, - None, - gbs, - mbs, - data_parallel_size=1, - ) + return data + + +def setup_model_optim(log_dir, n_steps, tokenizer, gbs=2, mbs=1): + seq_length = 2048 gpt_config = llm.GPTConfig( num_layers=2, hidden_size=128, @@ -131,7 +127,7 @@ def setup_data_model_optim(log_dir, n_steps, data_path, gbs=2, mbs=1): masked_softmax_fusion=False, ) - model = llm.GPTModel(gpt_config, tokenizer=data.tokenizer) + model = llm.GPTModel(gpt_config, tokenizer=tokenizer) opt_config = OptimizerConfig( optimizer='adam', @@ -148,7 +144,7 @@ def setup_data_model_optim(log_dir, n_steps, data_path, gbs=2, mbs=1): ) optim = MegatronOptimizerModule(config=opt_config) - return gpt_config, data, model, optim + return gpt_config, model, optim def setup_trainer_and_logger(log_dir): @@ -248,18 +244,29 @@ def train(n_steps, resume): log_dir = f'/tmp/mcore_logs_{n_steps}steps' os.makedirs(log_dir, exist_ok=True) data_path = [DATA_PATH] - gpt_config, data, model, optim = setup_data_model_optim(log_dir, n_steps, data_path) - trainer, nemo_logger = setup_trainer_and_logger(log_dir) - llm.train( - model=model, - data=data, - trainer=trainer, - log=nemo_logger, - resume=resume, - tokenizer='data', - optim=optim, - ) - trainer._teardown() + data = setup_data(log_dir, n_steps, data_path, gbs=2, mbs=1) + # Other tests might have different configs, so need to configure explicitly. + from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager + + with reconfigure_num_microbatches_calculator_manager( + 0, + None, + 2, # gbs + 1, # mbs + data_parallel_size=1, + ): + gpt_config, model, optim = setup_model_optim(log_dir, n_steps, data.tokenizer) + trainer, nemo_logger = setup_trainer_and_logger(log_dir) + llm.train( + model=model, + data=data, + trainer=trainer, + log=nemo_logger, + resume=resume, + tokenizer='data', + optim=optim, + ) + trainer._teardown() set_env() assert os.environ['NVTE_FLASH_ATTN'] == '0' diff --git a/tests/lightning/test_state_restoration.py b/tests/lightning/test_state_restoration.py index 2f4c60395725..076a2f931f57 100644 --- a/tests/lightning/test_state_restoration.py +++ b/tests/lightning/test_state_restoration.py @@ -11,9 +11,10 @@ from nemo.collections.llm.api import train from nemo.collections.llm.gpt.data import PreTrainingDataModule from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.lightning import NeMoLogger +from nemo.lightning import AutoResume, NeMoLogger from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule +from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager VOCAB_PATH = "/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json" MERGES_PATH = "/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt" @@ -21,6 +22,12 @@ EXP_DIR = '/tmp/nemo_exp/' +def teardown(exp_dir=EXP_DIR): + import shutil + + shutil.rmtree(exp_dir) + + class ValidateOptStateRestoration(Callback): def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: # PTL has no on_load_checkpoint_start event to be triggered before @@ -59,7 +66,7 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: for p in pl_module.parameters(): - assert 
torch.all(p == 0), "Expected params to be zero" + assert torch.all(p == 0), "Expected params (scratch) to be zero" with torch.no_grad(): for p in pl_module.parameters(): p.fill_(random.uniform(0, 1)) @@ -69,14 +76,19 @@ class ValidateModelRestoration(Callback): def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: for p in pl_module.parameters(): p.detach().zero_() + self.called_on_load_checkpoint = False + + def on_load_checkpoint(self, trainer, pl_module, checkpoint) -> None: + self.called_on_load_checkpoint = True def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: for p in pl_module.parameters(): - assert not torch.all(p == 0), "Expected params to be non-zero" + assert not torch.all(p == 0), "Expected params (resume) to be non-zero" + assert hasattr(self, 'called_on_load_checkpoint') + assert self.called_on_load_checkpoint == True, "Expected to have called on_load_checkpoint" -def make_model_optim_data(): - seq_length = 2048 +def setup_data(mbs=1, gbs=2, seq_length=2048): tokenizer = get_nmt_tokenizer( "megatron", "GPT2BPETokenizer", @@ -87,16 +99,19 @@ def make_model_optim_data(): data = PreTrainingDataModule( paths=DATA_PATH, seq_length=2048, - micro_batch_size=1, - global_batch_size=2, + micro_batch_size=mbs, + global_batch_size=gbs, seed=1234, tokenizer=tokenizer, ) + return data + +def make_model_optim(tokenizer, mbs=1, gbs=2, seq_length=2048): gpt_config = llm.GPTConfig( - num_layers=12, - hidden_size=768, - ffn_hidden_size=3072, + num_layers=2, + hidden_size=128, + ffn_hidden_size=256, num_attention_heads=12, seq_length=seq_length, init_method_std=0.023, @@ -106,7 +121,7 @@ def make_model_optim_data(): make_vocab_size_divisible_by=128, masked_softmax_fusion=False, ) - model = llm.GPTModel(gpt_config, tokenizer=data.tokenizer) + model = llm.GPTModel(gpt_config, tokenizer=tokenizer) opt = MegatronOptimizerModule( config=OptimizerConfig( @@ -125,64 +140,103 @@ def make_model_optim_data(): ), ) - return model, opt, data - - -def run_train_from_scratch(): - model, opt, data = make_model_optim_data() - trainer = nl.Trainer( - devices=2, - max_steps=10, - accelerator="gpu", - strategy=nl.MegatronStrategy(), - callbacks=[ValidateOptStateScratchInit(), ValidateModelScratchInit()], - log_every_n_steps=1, - limit_val_batches=2, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), - ) - - train( - model=model, - data=data, - trainer=trainer, - log=NeMoLogger( - log_dir=EXP_DIR, - ), - tokenizer='data', - optim=opt, - ) - - -def run_resume_train(): - model, opt, data = make_model_optim_data() - trainer = nl.Trainer( - devices=2, - max_steps=1, - accelerator="gpu", - strategy=nl.MegatronStrategy(), - callbacks=[ValidateOptStateRestoration(), ValidateModelRestoration()], - log_every_n_steps=1, - limit_val_batches=2, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), - ) - - train( - model=model, - data=data, - trainer=trainer, - log=NeMoLogger( - log_dir=EXP_DIR, - ), - tokenizer='data', - optim=opt, - resume=nl.AutoResume( - resume_if_exists=True, - resume_ignore_no_checkpoint=True, - ), - ) + return model, opt + + +def run_train_from_scratch(mbs, gbs, num_dev): + data = setup_data(mbs, gbs) + model, opt = make_model_optim(data.tokenizer, mbs, gbs) + # Other tests might have different configs, so need to configure explicitly. 
+ with reconfigure_num_microbatches_calculator_manager( + 0, + None, + gbs, + mbs, + data_parallel_size=num_dev, + ): + trainer = nl.Trainer( + devices=num_dev, + max_steps=10, + accelerator="gpu", + strategy=nl.MegatronStrategy(), + callbacks=[ValidateOptStateScratchInit(), ValidateModelScratchInit()], + log_every_n_steps=1, + limit_val_batches=2, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + train( + model=model, + data=data, + trainer=trainer, + log=NeMoLogger( + log_dir=EXP_DIR, + version='v1', + use_datetime_version=True, + update_logger_directory=True, + wandb=None, + ), + resume=AutoResume( + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + ), + tokenizer='data', + optim=opt, + ) + trainer._teardown() + + +def run_resume_train(mbs, gbs, num_dev): + data = setup_data(mbs, gbs) + model, opt = make_model_optim(data.tokenizer, mbs, gbs) + # Other tests might have different configs, so need to configure explicitly. + with reconfigure_num_microbatches_calculator_manager( + 0, + None, + gbs, + mbs, + data_parallel_size=num_dev, + ): + trainer = nl.Trainer( + devices=num_dev, + max_steps=1, + accelerator="gpu", + strategy=nl.MegatronStrategy(), + callbacks=[ValidateOptStateRestoration(), ValidateModelRestoration()], + log_every_n_steps=1, + limit_val_batches=2, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + from nemo.lightning.pytorch.strategies.utils import RestoreConfig + + train( + model=model, + data=data, + trainer=trainer, + tokenizer='data', + optim=opt, + log=NeMoLogger( + log_dir=EXP_DIR, + version='v1', + use_datetime_version=True, + update_logger_directory=True, + wandb=None, + ), + resume=AutoResume( + resume_if_exists=True, + resume_ignore_no_checkpoint=False, + resume_from_path=f'{EXP_DIR}default/v1/checkpoints/default--None=0.0000-epoch=0/', + ), + ) + trainer._teardown() @pytest.mark.run_only_on('GPU') def test_optim_state_restoration(): - run_train_from_scratch() - run_resume_train() + mbs, gbs = 1, 2 + num_devices = 1 + try: + run_train_from_scratch(mbs, gbs, num_devices) + run_resume_train(mbs, gbs, num_devices) + finally: + teardown() From 4ba92b3608363ee9787c882be0fbf31e001f007c Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 7 Oct 2024 10:24:13 -0700 Subject: [PATCH 04/18] remove 8x3b recipes (#10764) * remove 8x3b recipes Signed-off-by: Alexandros Koumparoulis * remove 8x3b from test_nemo_run Signed-off-by: Alexandros Koumparoulis * rm from __init__ Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon --- nemo/collections/llm/recipes/__init__.py | 6 - nemo/collections/llm/recipes/mixtral_8x3b.py | 290 ------------------ .../llm/recipes/mixtral_8x3b_16k.py | 132 -------- .../llm/recipes/mixtral_8x3b_64k.py | 133 -------- .../llm/recipes/test_mixtral_8x3b.py | 110 ------- .../llm/recipes/test_mixtral_8x3b_16k.py | 84 ----- .../llm/recipes/test_mixtral_8x3b_64k.py | 84 ----- tests/lightning/test_nemo_run.py | 4 - 8 files changed, 843 deletions(-) delete mode 100644 nemo/collections/llm/recipes/mixtral_8x3b.py delete mode 100644 nemo/collections/llm/recipes/mixtral_8x3b_16k.py delete mode 100644 nemo/collections/llm/recipes/mixtral_8x3b_64k.py delete mode 100644 tests/collections/llm/recipes/test_mixtral_8x3b.py delete mode 100644 tests/collections/llm/recipes/test_mixtral_8x3b_16k.py delete mode 100644 tests/collections/llm/recipes/test_mixtral_8x3b_64k.py diff --git 
a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 43c881110603..6bee8c882ffd 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -22,9 +22,6 @@ llama3_70b_64k, llama31_405b, mistral, - mixtral_8x3b, - mixtral_8x3b_16k, - mixtral_8x3b_64k, mixtral_8x7b, mixtral_8x7b_16k, mixtral_8x7b_64k, @@ -52,9 +49,6 @@ "llama3_70b_64k", "llama31_405b", "mistral", - "mixtral_8x3b", - "mixtral_8x3b_16k", - "mixtral_8x3b_64k", "mixtral_8x7b", "mixtral_8x7b_16k", "mixtral_8x7b_64k", diff --git a/nemo/collections/llm/recipes/mixtral_8x3b.py b/nemo/collections/llm/recipes/mixtral_8x3b.py deleted file mode 100644 index ca5b4e35039f..000000000000 --- a/nemo/collections/llm/recipes/mixtral_8x3b.py +++ /dev/null @@ -1,290 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Callable, Optional - -import nemo_run as run -import pytorch_lightning as pl -import torch -from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback - -from nemo import lightning as nl -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel -from nemo.collections.llm.peft.lora import LoRA -from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger -from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback -from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback -from nemo.utils.exp_manager import TimingCallback - -NAME = "mixtral_8x3b" - - -@run.cli.factory(name=NAME) -def model() -> run.Config[pl.LightningModule]: - """ - Factory function to create a Mixtral 8x3B model configuration. - - Returns: - run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model. - - Examples: - CLI usage: - $ nemo llm pretrain model=mixtral_8x3b ... 
- - Python API usage: - >>> model_config = model() - >>> print(model_config) - """ - return run.Config(MixtralModel, config=run.Config(MixtralConfig8x3B)) - - -def trainer( - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_type: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - expert_parallelism: int = 4, - num_nodes: int = 2, - num_gpus_per_node: int = 8, - max_steps: int = 1168251, - callbacks: Optional[list[run.Config[Callback]]] = None, -) -> run.Config[nl.Trainer]: - """ - Configure the NeMo Lightning Trainer for Mixtral 8x3B model. - - This function sets up the distributed training strategy optimized for the Mixtral 8x3B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - expert_parallelism (int): Degree of expert parallelism. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - max_steps (int): Maximum number of training steps. - callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. - - Returns: - run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. - - Examples: - CLI usage: - $ nemo llm pretrain trainer=mixtral_8x3b ... - - Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) - >>> print(trainer_config) - """ - strategy = run.Config( - nl.MegatronStrategy, - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_type, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - expert_model_parallel_size=expert_parallelism, - gradient_as_bucket_view=True, - ckpt_async_save=True, - ckpt_parallel_load=True, - ddp=run.Config( - DistributedDataParallelConfig, - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - ), - ) - - trainer = run.Config( - nl.Trainer, - accelerator="gpu", - accumulate_grad_batches=1, - callbacks=callbacks, - devices=num_gpus_per_node, - limit_test_batches=50, - limit_val_batches=32, - log_every_n_steps=10, - max_steps=max_steps, - num_nodes=num_nodes, - plugins=bf16_mixed(), - strategy=strategy, - use_distributed_sampler=False, - val_check_interval=2000, - ) - - return trainer - - -@run.cli.factory(target=pretrain, name=NAME) -def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: - """ - Create a pre-training recipe for Mixtral 8x3B model. - - This function sets up a complete configuration for pre-training, including - model, trainer, and data settings. - - Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): Function to use for pre-training (default: nemo.collections.llm.api.pretrain). 
- - Returns: - run.Partial: Partial configuration for pre-training. - - Examples: - CLI usage: - $ nemo llm pretrain --factory mixtral_8x3b - $ nemo llm pretrain --factory "mixtral_8x3b(num_nodes=2, name='my_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe(name="mixtral_8x3b_pretrain", num_nodes=2) - >>> print(recipe) - """ - return run.Partial( - fn, - model=model(), - trainer=trainer( - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[run.Config(TimingCallback)], - ), - data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), - optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), - resume=default_resume(), - ) - - -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: - """ - Create a performance-optimized pre-training recipe for Mixtral 8x3B model. - - This recipe enables performance optimizations that may not be suitable for all use cases. - It builds upon the standard pre-training recipe and adds additional performance enhancements. - - Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. - - Returns: - run.Partial: Partial configuration for performance-optimized pre-training. - - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x3b.pretrain_recipe_performance(num_nodes=2, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x3b", num_nodes=4) - >>> print(recipe) - - Note: - Use this recipe with caution and only when you need maximum performance. - It may not be suitable for all hardware configurations or use cases. - """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - - recipe.trainer.callbacks.extend( - [ - run.Config(MegatronTokenDropCallback), - run.Config(MegatronCommOverlapCallback), - ] - ) - - return recipe - - -def hf_resume() -> run.Config[nl.AutoResume]: - """ - Configure the Hugging Face model resuming for Mixtral 8x3B model. - - This function sets up the configuration for resuming training from a Hugging Face model. - - Returns: - run.Config[nl.AutoResume]: Configuration for resuming from a Hugging Face model. - - Examples: - CLI usage: - $ nemo llm finetune --factory "mixtral_8x3b(resume=hf_resume())" - - Python API usage: - >>> recipe = finetune_recipe(name="mixtral_8x3b_finetune", num_nodes=2) - >>> recipe.resume = hf_resume() - >>> print(recipe) - """ - return run.Config( - nl.AutoResume, - restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), - ) - - -@run.cli.factory(target=finetune, name=NAME) -def finetune_recipe( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, -) -> run.Partial: - """ - Create a fine-tuning recipe for Mixtral 8x3B model. - - This function sets up a complete configuration for fine-tuning, including - model, trainer, and data settings. - - Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the fine-tuning run. 
- num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - - Returns: - run.Partial: Partial configuration for fine-tuning. - - Examples: - CLI usage: - $ nemo llm finetune --factory mixtral_8x3b - $ nemo llm finetune --factory "mixtral_8x3b(num_nodes=2, name='my_finetune')" - - Python API usage: - >>> recipe = finetune_recipe(name="mixtral_8x3b_finetune", num_nodes=2) - >>> print(recipe) - """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) - - recipe.resume = hf_resume() - recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) - return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py deleted file mode 100644 index 13ca1c2d4537..000000000000 --- a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional - -import nemo_run as run -import pytorch_lightning as pl -import torch - -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.recipes import mixtral_8x3b - -NAME = "mixtral_8x3b_16k" - - -@run.cli.factory(name=NAME) -def model() -> run.Config[pl.LightningModule]: - """ - Factory function to create a Mixtral 8x3B model configuration with 16k sequence length. - - Returns: - run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model with 16k sequence length. - - Examples: - CLI usage: - $ nemo llm pretrain model=mixtral_8x3b_16k ... - - Python API usage: - >>> model_config = model() - >>> print(model_config) - """ - model_config = mixtral_8x3b.model() - model_config.config.seq_length = 16384 - model_config.config.max_position_embeddings = 16384 - return model_config - - -def trainer( - num_nodes: int = 1, - num_gpus_per_node: int = 8, -) -> run.Config: - """ - Configure the NeMo Lightning Trainer for Mixtral 8x3B model with 16k sequence length. - - This function sets up the distributed training strategy optimized for longer sequences. - - Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - - Returns: - run.Config: Configuration for the NeMo Lightning Trainer. - - Examples: - CLI usage: - $ nemo llm pretrain trainer=mixtral_8x3b_16k ... - - Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) - >>> print(trainer_config) - - Note: - This configuration uses increased parallelism to handle the longer sequence length efficiently. 
- """ - return mixtral_8x3b.trainer( - tensor_parallelism=2, - pipeline_parallelism=2, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=2, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - ) - - -@run.cli.factory(target=pretrain, name=NAME) -def pretrain_recipe( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, -) -> run.Partial: - """ - Create a pre-training recipe for Mixtral 8x3B model with 16k sequence length. - - This function sets up a complete configuration for pre-training, including - model, trainer, and data settings optimized for 16k sequence length. - - Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - - Returns: - run.Partial: Partial configuration for pre-training. - - Examples: - CLI usage: - $ nemo llm pretrain --factory mixtral_8x3b_16k - $ nemo llm pretrain --factory "mixtral_8x3b_16k(num_nodes=2, name='my_16k_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe(name="mixtral_8x3b_16k_pretrain", num_nodes=2) - >>> print(recipe) - - Note: - This recipe is optimized for handling longer sequences (16k) compared to the standard version. - """ - recipe = mixtral_8x3b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - - recipe.model = model() - recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - - return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py deleted file mode 100644 index e21d85a13dcd..000000000000 --- a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional - -import nemo_run as run -import pytorch_lightning as pl -import torch - -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.utils.exp_manager import TimingCallback - -NAME = "mixtral_8x3b_64k" - - -@run.cli.factory(name=NAME) -def model() -> run.Config[pl.LightningModule]: - """ - Factory function to create a Mixtral 8x3B model configuration with 64k sequence length. - - Returns: - run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model with 64k sequence length. - - Examples: - CLI usage: - $ nemo llm pretrain model=mixtral_8x3b_64k ... 
- - Python API usage: - >>> model_config = model() - >>> print(model_config) - """ - model_config = mixtral_8x3b.model() - model_config.config.seq_length = 65536 - return model_config - - -def trainer( - num_nodes: int = 8, - num_gpus_per_node: int = 8, -) -> run.Config: - """ - Configure the NeMo Lightning Trainer for Mixtral 8x3B model with 64k sequence length. - - This function sets up the distributed training strategy optimized for long sequences. - - Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - - Returns: - run.Config: Configuration for the NeMo Lightning Trainer. - - Examples: - CLI usage: - $ nemo llm pretrain trainer=mixtral_8x3b_64k ... - - Python API usage: - >>> trainer_config = trainer(num_nodes=8, num_gpus_per_node=8) - >>> print(trainer_config) - - Note: - This configuration uses significantly increased parallelism to handle the long sequence length efficiently. - """ - return mixtral_8x3b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[run.Config(TimingCallback)], - ) - - -@run.cli.factory(target=pretrain, name=NAME) -def pretrain_recipe( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 8, - num_gpus_per_node: int = 8, -) -> run.Partial: - """ - Create a pre-training recipe for Mixtral 8x3B model with 64k sequence length. - - This function sets up a complete configuration for pre-training, including - model, trainer, and data settings optimized for 64k sequence length. - - Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - - Returns: - run.Partial: Partial configuration for pre-training. - - Examples: - CLI usage: - $ nemo llm pretrain --factory mixtral_8x3b_64k - $ nemo llm pretrain --factory "mixtral_8x3b_64k(num_nodes=8, name='my_64k_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe(name="mixtral_8x3b_64k_pretrain", num_nodes=8) - >>> print(recipe) - - Note: - This recipe is optimized for handling long sequences (64k) compared to the standard version. - It requires significant computational resources due to the extended sequence length. 
- """ - recipe = mixtral_8x3b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - - recipe.model = model() - recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - return recipe diff --git a/tests/collections/llm/recipes/test_mixtral_8x3b.py b/tests/collections/llm/recipes/test_mixtral_8x3b.py deleted file mode 100644 index 238fec74e0e1..000000000000 --- a/tests/collections/llm/recipes/test_mixtral_8x3b.py +++ /dev/null @@ -1,110 +0,0 @@ -import nemo_run as run -import pytest - -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel -from nemo.collections.llm.peft.lora import LoRA -from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.lightning import AutoResume, Trainer - - -class TestMixtral8x3B: - @pytest.fixture(scope="class") - def recipe_module(self): - return mixtral_8x3b - - def test_model(self, recipe_module): - model_config = recipe_module.model() - assert isinstance(model_config, run.Config) - assert model_config.__fn_or_cls__ == MixtralModel - assert isinstance(model_config.config, run.Config) - assert model_config.config.__fn_or_cls__ == MixtralConfig8x3B - - def test_trainer(self, recipe_module): - trainer_config = recipe_module.trainer() - assert isinstance(trainer_config, run.Config) - assert trainer_config.__fn_or_cls__ == Trainer - assert trainer_config.accelerator == "gpu" - assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 2 - - # Check strategy configuration - assert isinstance(trainer_config.strategy, run.Config) - assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 1 - assert trainer_config.strategy.pipeline_model_parallel_size == 1 - assert trainer_config.strategy.pipeline_dtype is None - assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None - assert trainer_config.strategy.context_parallel_size == 1 - assert trainer_config.strategy.sequence_parallel is False - assert trainer_config.strategy.expert_model_parallel_size == 4 - - def test_pretrain_recipe(self, recipe_module): - recipe = recipe_module.pretrain_recipe() - assert isinstance(recipe, run.Partial) - assert recipe.__fn_or_cls__ == pretrain - assert isinstance(recipe.model, run.Config) - assert recipe.model.__fn_or_cls__ == MixtralModel - assert isinstance(recipe.trainer, run.Config) - assert recipe.trainer.__fn_or_cls__ == Trainer - assert isinstance(recipe.data, run.Config) - assert recipe.data.__fn_or_cls__ == MockDataModule - assert recipe.data.seq_length == 8192 - assert recipe.data.global_batch_size == 512 - assert recipe.data.micro_batch_size == 1 - - def test_finetune_recipe(self, recipe_module): - recipe = recipe_module.finetune_recipe() - assert isinstance(recipe, run.Partial) - assert recipe.__fn_or_cls__ == finetune - assert isinstance(recipe.model, run.Config) - assert recipe.model.__fn_or_cls__ == MixtralModel - assert isinstance(recipe.trainer, run.Config) - assert recipe.trainer.__fn_or_cls__ == Trainer - assert isinstance(recipe.data, run.Config) - assert recipe.data.__fn_or_cls__ == SquadDataModule - assert recipe.data.seq_length == 8192 - assert 
recipe.data.global_batch_size == 512 - assert recipe.data.micro_batch_size == 1 - assert isinstance(recipe.peft, run.Config) - assert recipe.peft.__fn_or_cls__ == LoRA - assert recipe.peft.target_modules == ['linear_qkv', 'linear_proj'] - assert recipe.peft.dim == 32 - - @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) - def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): - recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - assert recipe.trainer.num_nodes == num_nodes - assert recipe.trainer.devices == num_gpus_per_node - - def test_hf_resume(self, recipe_module): - resume_config = recipe_module.hf_resume() - assert isinstance(resume_config, run.Config) - assert resume_config.__fn_or_cls__ == AutoResume - assert isinstance(resume_config.restore_config, run.Config) - assert resume_config.restore_config.path == "hf://mistralai/Mixtral-8x7B-v0.1" - - def test_trainer_parallelism_options(self, recipe_module): - trainer_config = recipe_module.trainer( - tensor_parallelism=8, - pipeline_parallelism=2, - context_parallelism=4, - sequence_parallelism=False, - expert_parallelism=2, - ) - assert trainer_config.strategy.tensor_model_parallel_size == 8 - assert trainer_config.strategy.pipeline_model_parallel_size == 2 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is False - assert trainer_config.strategy.expert_model_parallel_size == 2 - - def test_model_config_parameters(self, recipe_module): - model_config = recipe_module.model() - mixtral_config = model_config.config - assert mixtral_config.num_layers == 32 - assert mixtral_config.hidden_size == 2560 - assert mixtral_config.num_attention_heads == 32 - assert mixtral_config.seq_length == 4096 - assert mixtral_config.num_moe_experts == 8 diff --git a/tests/collections/llm/recipes/test_mixtral_8x3b_16k.py b/tests/collections/llm/recipes/test_mixtral_8x3b_16k.py deleted file mode 100644 index 1f1b041584d8..000000000000 --- a/tests/collections/llm/recipes/test_mixtral_8x3b_16k.py +++ /dev/null @@ -1,84 +0,0 @@ -import nemo_run as run -import pytest -import torch - -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel -from nemo.collections.llm.recipes import mixtral_8x3b_16k -from nemo.lightning import Trainer - - -class TestMixtral8x3B_16k: - @pytest.fixture(scope="class") - def recipe_module(self): - return mixtral_8x3b_16k - - def test_model(self, recipe_module): - model_config = recipe_module.model() - assert isinstance(model_config, run.Config) - assert model_config.__fn_or_cls__ == MixtralModel - assert isinstance(model_config.config, run.Config) - assert model_config.config.__fn_or_cls__ == MixtralConfig8x3B - assert model_config.config.seq_length == 16384 - assert model_config.config.max_position_embeddings == 16384 - - def test_trainer(self, recipe_module): - trainer_config = recipe_module.trainer() - assert isinstance(trainer_config, run.Config) - assert trainer_config.__fn_or_cls__ == Trainer - assert trainer_config.accelerator == "gpu" - assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 - - # Check strategy configuration - assert isinstance(trainer_config.strategy, run.Config) - assert 
trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 2 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 - - def test_pretrain_recipe(self, recipe_module): - recipe = recipe_module.pretrain_recipe() - assert isinstance(recipe, run.Partial) - assert recipe.__fn_or_cls__ == pretrain - assert isinstance(recipe.model, run.Config) - assert recipe.model.__fn_or_cls__ == MixtralModel - assert isinstance(recipe.trainer, run.Config) - assert recipe.trainer.__fn_or_cls__ == Trainer - assert isinstance(recipe.data, run.Config) - assert recipe.data.__fn_or_cls__ == MockDataModule - assert recipe.data.seq_length == 16384 - assert recipe.data.global_batch_size == 512 - assert recipe.data.micro_batch_size == 1 - - @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) - def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): - recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - assert recipe.trainer.num_nodes == num_nodes - assert recipe.trainer.devices == num_gpus_per_node - - def test_trainer_parallelism_options(self, recipe_module): - trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 2 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 - - def test_model_config_parameters(self, recipe_module): - model_config = recipe_module.model() - mixtral_config = model_config.config - assert mixtral_config.num_layers == 32 - assert mixtral_config.hidden_size == 2560 - assert mixtral_config.num_attention_heads == 32 - assert mixtral_config.seq_length == 16384 - assert mixtral_config.max_position_embeddings == 16384 - assert mixtral_config.num_moe_experts == 8 diff --git a/tests/collections/llm/recipes/test_mixtral_8x3b_64k.py b/tests/collections/llm/recipes/test_mixtral_8x3b_64k.py deleted file mode 100644 index d71017649b1b..000000000000 --- a/tests/collections/llm/recipes/test_mixtral_8x3b_64k.py +++ /dev/null @@ -1,84 +0,0 @@ -import nemo_run as run -import pytest -import torch - -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel -from nemo.collections.llm.recipes import mixtral_8x3b_64k -from nemo.lightning import Trainer - - -class TestMixtral8x3B_64k: - @pytest.fixture(scope="class") - def recipe_module(self): - return mixtral_8x3b_64k - - def test_model(self, recipe_module): - model_config = recipe_module.model() - assert isinstance(model_config, run.Config) - assert model_config.__fn_or_cls__ == MixtralModel - assert isinstance(model_config.config, run.Config) - assert 
model_config.config.__fn_or_cls__ == MixtralConfig8x3B - assert model_config.config.seq_length == 65536 - assert model_config.config.max_position_embeddings == 4096 - - def test_trainer(self, recipe_module): - trainer_config = recipe_module.trainer() - assert isinstance(trainer_config, run.Config) - assert trainer_config.__fn_or_cls__ == Trainer - assert trainer_config.accelerator == "gpu" - assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 8 - - # Check strategy configuration - assert isinstance(trainer_config.strategy, run.Config) - assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 - - def test_pretrain_recipe(self, recipe_module): - recipe = recipe_module.pretrain_recipe() - assert isinstance(recipe, run.Partial) - assert recipe.__fn_or_cls__ == pretrain - assert isinstance(recipe.model, run.Config) - assert recipe.model.__fn_or_cls__ == MixtralModel - assert isinstance(recipe.trainer, run.Config) - assert recipe.trainer.__fn_or_cls__ == Trainer - assert isinstance(recipe.data, run.Config) - assert recipe.data.__fn_or_cls__ == MockDataModule - assert recipe.data.seq_length == 65536 - assert recipe.data.global_batch_size == 512 - assert recipe.data.micro_batch_size == 1 - - @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(32, 8), (64, 4), (128, 2)]) - def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): - recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - assert recipe.trainer.num_nodes == num_nodes - assert recipe.trainer.devices == num_gpus_per_node - - def test_trainer_parallelism_options(self, recipe_module): - trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 - - def test_model_config_parameters(self, recipe_module): - model_config = recipe_module.model() - mixtral_config = model_config.config - assert mixtral_config.num_layers == 32 - assert mixtral_config.hidden_size == 2560 - assert mixtral_config.num_attention_heads == 32 - assert mixtral_config.seq_length == 65536 - assert mixtral_config.max_position_embeddings == 4096 - assert mixtral_config.num_moe_experts == 8 diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py index d651890b5fd3..8d7814bfe530 100644 --- a/tests/lightning/test_nemo_run.py +++ b/tests/lightning/test_nemo_run.py @@ -19,10 +19,6 @@ ("llama31_405b", "pretrain_recipe", "llama31_405b_pretrain"), ("mistral", "pretrain_recipe", "mistral_pretrain"), ("mistral", "finetune_recipe", "mistral_finetune"), - ("mixtral_8x3b", "pretrain_recipe", "mixtral_8x3b_pretrain"), - ("mixtral_8x3b", 
"finetune_recipe", "mixtral_8x3b_finetune"), - ("mixtral_8x3b_16k", "pretrain_recipe", "mixtral_8x3b_16k_pretrain"), - ("mixtral_8x3b_64k", "pretrain_recipe", "mixtral_8x3b_64k_pretrain"), ("mixtral_8x7b", "pretrain_recipe", "mixtral_8x7b_pretrain"), ("mixtral_8x7b", "finetune_recipe", "mixtral_8x7b_finetune"), ("mixtral_8x7b_16k", "pretrain_recipe", "mixtral_8x7b_16k_pretrain"), From 0aa267117ae5f1ff9f8d8308ed3d1ba1ce939f82 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Mon, 7 Oct 2024 13:29:23 -0700 Subject: [PATCH 05/18] change the figure file name Signed-off-by: Youngeun Kwon --- .../{speedup_figure.png => cp_speedup_figure.png} | Bin .../source/performance/performance_long_sequence.md | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename docs/source/performance/{speedup_figure.png => cp_speedup_figure.png} (100%) diff --git a/docs/source/performance/speedup_figure.png b/docs/source/performance/cp_speedup_figure.png similarity index 100% rename from docs/source/performance/speedup_figure.png rename to docs/source/performance/cp_speedup_figure.png diff --git a/docs/source/performance/performance_long_sequence.md b/docs/source/performance/performance_long_sequence.md index c2816485b54d..d73392e6c78b 100644 --- a/docs/source/performance/performance_long_sequence.md +++ b/docs/source/performance/performance_long_sequence.md @@ -152,4 +152,4 @@ ### Speedup enabled by the CP -![Speedup Graph](speedup_figure.png) \ No newline at end of file +![Speedup Graph](cp_speedup_figure.png) \ No newline at end of file From d3e071217cac844fad0206578f967c7491cff6d5 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Mon, 7 Oct 2024 13:51:27 -0700 Subject: [PATCH 06/18] Accommodating the reviewer's comment Signed-off-by: Youngeun Kwon --- docs/source/performance/performance_long_sequence.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/performance/performance_long_sequence.md b/docs/source/performance/performance_long_sequence.md index d73392e6c78b..77e7c9f46e1a 100644 --- a/docs/source/performance/performance_long_sequence.md +++ b/docs/source/performance/performance_long_sequence.md @@ -2,7 +2,7 @@ ## LLAMA2-7B (FP8) -- The results in the table below show the pre-training performance of the LLAMA2-7B model with-CP (context parallelism) and without-CP for various input sequence lengths at FP8 precision. Detailed configurations and the achievable performance are provided for the with-CP configurations. For the without-CP configurations, the best achievable performance is reported within the given memory capacity constraint. +- The table below shows the pre-training performance of the LLAMA2-7B with CP (context parallelism) and compares it against the results without CP at various input sequence lengths. The detailed model-parallel configurations and the achieved performance are shown in the training results with CP. In non-CP training runs, we use the most performant model- and data-parallel configurations without CP given the memory capacity constraint of the H100 GPU system. 
  - Container: [NeMo24.03.01.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags)
   - System: DGX-H100
 
@@ -151,5 +151,5 @@
 
 
 
-### Speedup enabled by the CP
+### Speedup of LLAMA2 7B training with CP over without CP
 ![Speedup Graph](cp_speedup_figure.png)
\ No newline at end of file

From ae18787b2b604e74630a05b853116f3439563e14 Mon Sep 17 00:00:00 2001
From: Youngeun Kwon 
Date: Mon, 7 Oct 2024 14:21:35 -0700
Subject: [PATCH 07/18] update the y-axis title

Signed-off-by: Youngeun Kwon 
---
 docs/source/performance/cp_speedup_figure.png   | Bin 20611 -> 20359 bytes
 .../performance/performance_long_sequence.md    |   6 +++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/performance/cp_speedup_figure.png b/docs/source/performance/cp_speedup_figure.png
index af73e6f5375b85f789d10cfa59d40aa2e5f104d2..ba4eab5d65a8208d55db0db37ccc9686e4e9b088 100644
GIT binary patch
[base85-encoded binary image data omitted: literal 20359 (new) / literal 20611 (old) for cp_speedup_figure.png]

diff --git a/docs/source/performance/performance_long_sequence.md b/docs/source/performance/performance_long_sequence.md
index 77e7c9f46e1a..b6e15236dfca 100644
--- a/docs/source/performance/performance_long_sequence.md
+++ b/docs/source/performance/performance_long_sequence.md
@@ -34,9 +34,9 @@
         SeqLen (K)
         # of GPUs
-        Without-CP
-        With-CP
-        Speedup with-CP/without-CP
+        Without CP
+        With CP
+        Speedup with CP/without CP
         TFLOPS / GPU
 
From 10701e951e6ec782412883dea26668bd64bb7e3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 8 Oct 2024 12:57:39 +0200 Subject: [PATCH 08/18] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let's?= =?UTF-8?q?=20bump=20`Dockerfile.ci`=20to=203f90b98=20!=20(#10789)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Signed-off-by: Youngeun Kwon --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index cf084d91982f..f6132bc6cc49 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -59,7 +59,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.17.0 -ARG MCORE_TAG=73e7b58e79df9da521ff31d74053579b7a060c7e +ARG MCORE_TAG=3f90b989c477ba9be5d6011866641eda9d91f588 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ From 9c4c13d847fded4685e7c125170845fb02b56e75 Mon Sep 17 00:00:00 2001 From: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Date: Tue, 8 Oct 2024 05:06:59 -0700 Subject: [PATCH 09/18] Add ModelOpt transformer model pruning example for Llama models, default to llama3.1-8b-base (#10294) * Add ModelOpt transformer model pruning example for Llama3 model Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * examples code is at wrong dir, move them Signed-off-by: Shengliang Xu * changes as suggested in comment remove some logging and unused config code, update example model to llama3.1 Signed-off-by: Shengliang Xu * Add pruning of hidden_size into example Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * Update examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Add pruning test to cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --------- Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Youngeun Kwon --- .github/workflows/cicd-main.yml | 24 ++++ .../conf/megatron_gpt_prune.yaml | 41 ++++++ .../language_modeling/megatron_gpt_prune.py | 127 ++++++++++++++++++ 3 files changed, 192 insertions(+) create mode 100644 examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml create mode 100644 examples/nlp/language_modeling/megatron_gpt_prune.py diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 96d54dbc8324..7aa6cdbfa00a 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml 
@@ -641,6 +641,29 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/megatron_llama_distill + L2_Prune_Width_Llama2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Prune_Width_Llama2') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_prune.py \ + trainer.devices=2 \ + trainer.num_nodes=1 \ + trainer.precision=bf16 \ + model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=2 \ + prune.num_calib_size=8 \ + prune.ffn_hidden_size=192 \ + prune.num_attention_heads=2 \ + prune.num_query_groups=2 \ + prune.hidden_size=null \ + export.save_path=examples/nlp/language_modeling/ci_prune_width.nemo + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/ci_prune_width.nemo + # L2: ASR dev run ASR_dev_run_Speech_to_Text: needs: [cicd-test-container-setup] @@ -5350,6 +5373,7 @@ jobs: - L2_Community_LLM_Checkpoints_tests_Llama3 - L2_PTQ_Llama2_Export_Only - L2_Distill_Llama2 + - L2_Prune_Width_Llama2 - L2_Speech_to_Text_AED - L2_Speech_Estimate_Duration_Bins - L2_Speech_Batch_Size_OOMptimizer diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml new file mode 100644 index 000000000000..cb26d5744b5b --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml @@ -0,0 +1,41 @@ +inference: + greedy: false # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: true # add the bos token at the begining of the prompt + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + all_probs: false # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
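+  # The settings in this inference block only configure the calibration forward passes
+  # run by megatron_gpt_prune.py (batch size, context length, sampling); no training or
+  # generation-quality tuning happens during pruning.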
+ compute_logprob: false # a flag used to compute logprob of all the input text, a very special case of running inference, default False + batch_size: 64 # batch size for inference + max_context_length: 512 # max length of the context, input sequence will be truncated if it is longer than this + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: false # logger provided by exp_manager + precision: bf16 # 16, 32, or bf16 + enable_checkpointing: false + +model: + tensor_model_parallel_size: 1 # Pruning currently only supports tensor_model_parallel_size=1 + pipeline_model_parallel_size: 1 + restore_from_path: llama3.1-8b-base.nemo # Nemo file path + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + +prune: + calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset + num_calib_size: 512 # number of samples used for calibration + ffn_hidden_size: 3584 # ffn_hidden_size in the pruned model, ffn_hidden_size // 4 + num_attention_heads: 8 # num_attention_heads in the pruned model, num_attention_heads // 4 + num_query_groups: 4 # num_query_groups in the pruned model, num_query_groups // 2 + hidden_size: 2048 # hidden_size in the pruned model, hidden_size // 2 + +export: + save_path: llama3.1-8b-base-pruned.nemo # Path where the pruned model will be saved diff --git a/examples/nlp/language_modeling/megatron_gpt_prune.py b/examples/nlp/language_modeling/megatron_gpt_prune.py new file mode 100644 index 000000000000..b9bf8edbfb1a --- /dev/null +++ b/examples/nlp/language_modeling/megatron_gpt_prune.py @@ -0,0 +1,127 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import modelopt.torch.prune as mtp +import torch +import torch.multiprocessing as mp +from datasets import load_dataset +from omegaconf import OmegaConf +from pytorch_lightning.trainer.trainer import Trainer +from tqdm import tqdm + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.core.config import hydra_runner +from nemo.utils.model_utils import load_config + +mp.set_start_method("spawn", force=True) + +""" +Nemo pruning example script. + +Please consult examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml config on available pruning arguments, +models supported as well as how to set up data and inference for calibration (with defaults recommended). 
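+
+Roughly, the mcore_gpt_minitron mode runs the calibration batches through the model to
+collect activation statistics, then drops the least important FFN channels, attention
+heads, query groups, and hidden dimensions until the sizes requested under `prune` are
+reached (axes left as null are not pruned).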
+ +Example usage: +``` +python examples/nlp/language_modeling/megatron_gpt_prune.py \ + model.restore_from_path=llama3.1-8b-base.nemo \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=8 \ + trainer.num_nodes=1 \ + trainer.precision=bf16 \ + trainer.devices=8 \ + prune.ffn_hidden_size=3584 \ + prune.num_attention_heads=8 \ + prune.num_query_groups=4 \ + prune.hidden_size=2048 \ + export.save_path=llama3.1-8b-base-pruned.nemo +``` +where tensor_model_parallel_size must be 1 because of the current prune API limitation +""" + + +def get_calib_data_iter(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512): + if data == "wikitext": + dataset = load_dataset("wikitext", "wikitext-103-v1", split="train") + text_column = "text" + elif data == "cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + text_column = "article" + else: + # Assume a local JSON dataset with a column named "text" + dataset = load_dataset("json", data_files=data, split="train") + text_column = "text" + calib_size = max(min(len(dataset), calib_size), batch_size) + for i in range(calib_size // batch_size): + batch = dataset[i * batch_size : (i + 1) * batch_size][text_column] + for j in range(len(batch)): + batch[j] = batch[j][:max_sequence_length] + yield batch + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_prune") +def main(cfg) -> None: + if not torch.cuda.is_available(): + raise EnvironmentError("GPU is required for the pruning.") + + # Overwrite model config with the one from the model checkpoint and apply pruning modifications + model_cfg = load_config(cfg.model.restore_from_path) + model_cfg.update(cfg.model) + model_cfg.name = "modelopt" # Use modelopt transformer spec for pruning + + assert cfg.model.tensor_model_parallel_size == 1, "Pruning currently only supports tensor_model_parallel_size=1" + assert ( + not hasattr(cfg.model, "sequence_parallel") or not cfg.model.sequence_parallel + ), "Pruning currently does not support sequence parallelism" + + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) + model = MegatronGPTModel.restore_from( + restore_path=cfg.model.restore_from_path, override_config_path=model_cfg, trainer=trainer + ) + + data_iter = get_calib_data_iter( + cfg.prune.calib_dataset, + cfg.inference.batch_size, + cfg.prune.num_calib_size, + cfg.inference.max_context_length, + ) + dataloader = [data for data in data_iter] + + def forward_loop(model): + # NOTE: Alternatively you can also use `model.forward_bwd_step(data_iter, forward_only=True)` + # if your model is setup for training. 
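+        # ModelOpt calls this function with the wrapped model so the calibration
+        # batches defined above are pushed through the network before it decides
+        # which channels and heads to drop.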
+ model.set_inference_config(OmegaConf.to_container(cfg.inference)) + for i, batch in enumerate(tqdm(dataloader, desc="Calibrating")): + model.predict_step(batch, i) + + model_pruned, _ = mtp.prune( + model, + mode="mcore_gpt_minitron", + constraints={ + "export_config": { + k: cfg.prune.get(k) + for k in ["ffn_hidden_size", "num_attention_heads", "num_query_groups", "hidden_size"] + if cfg.prune.get(k) is not None + }, + }, + dummy_input=None, # Not used + config={"forward_loop": forward_loop}, + ) + + model_pruned.save_to(cfg.export.save_path) + + +if __name__ == '__main__': + main() From 0264eb2689b76c0a3e64dacfe720570856d98cbb Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Tue, 8 Oct 2024 18:12:04 +0300 Subject: [PATCH 10/18] Update mamba.rst after dist ckpt addition (#10800) Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: Youngeun Kwon --- tutorials/llm/mamba/mamba.rst | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/tutorials/llm/mamba/mamba.rst b/tutorials/llm/mamba/mamba.rst index 2704c15aa05b..197825c27d58 100644 --- a/tutorials/llm/mamba/mamba.rst +++ b/tutorials/llm/mamba/mamba.rst @@ -80,27 +80,6 @@ Convert the Pytorch Checkpoint to a NeMo Checkpoint * Note: the ``mamba_ssm_ngroups`` parameter should be 1 for the Mamba2 models from the `Transformers are SSMs paper `__ (130m, 370m, 780m, 1.3b, and 2.7b) and 8 for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__ (both 8b). -Model (Tensor) Parallelism for the 8b Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -* Note: Distributed checkpointing for the Mamba2 and Mamba2-Hybrid models will be implemented in the near future. For now, you should use the method below for converting to Tensor Parallel (TP) of different sizes. - -The HuggingFace checkpoint for the 8b model is for TP of size 1, and so is the ``.nemo`` checkpoint obtained for the previous step. To shard the model weights for a larger TP size, use the script from