diff --git a/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/README.md b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/README.md new file mode 100644 index 0000000000..b85b05593d --- /dev/null +++ b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/README.md @@ -0,0 +1,38 @@ +# Record: SP8192 + Muon 0.97 + 3-Layer Recurrence + Parallel Residuals + TTT — val_bpb 1.0802 (3-seed mean) + +**val_bpb = 1.0802** (3-seed mean, std 0.0007) | 8xH100 SXM + +## 3-Seed Results + +| Seed | **TTT BPB** | +|------|-------------| +| 42 | **1.0795** | +| 314 | **1.0808** | +| 999 | **1.0804** | +| **Mean** | **1.0802** | + +Merged SOTA (PR #1493): **1.0810 BPB**. Delta: **-0.0008 BPB**. + +## Key Change: Muon Momentum 0.97 + +Single hyperparameter change on the merged #1 stack (PR #1493): Muon momentum from 0.99 to 0.97. Validated by PR #1514 (@dexhunter) which showed 0.97 improves over 0.99 on the SP8192 base. + +## Full Stack + +PR #1493 base: SP8192, MLP 4x, 3-layer depth recurrence (L3-5), parallel residuals (L7+), QK-Gain 5.25, MuonEq-R, WD=0.095, EMA=0.9965, warmdown=0.72, SDClip, GPTQ embeddings, score-first TTT (3 epochs), brotli. Plus: **Muon momentum 0.97**. + +## Compliance (Track B) + +Score-first TTT (PR #461 framework). No SLOT, no pre-quant TTT, no n-gram cache. All four conditions from Issue #1017 satisfied. + +## Reproduction + +```bash +pip install brotli +MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf python3 data/cached_challenge_fineweb.py --variant sp8192 --skip-manifest +SEED=42 TTT_ENABLED=1 MUON_MOMENTUM=0.97 QK_GAIN_INIT=5.25 torchrun --standalone --nproc_per_node=8 train_gpt.py +``` + +## Credits + +PR #1493 @bigbag (merged #1 base), PR #1514 @dexhunter (Muon 0.97), PR #1394 @clarkkev (SP8192), PR #1204 @msisovic (parallel residuals) diff --git a/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/submission.json b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/submission.json new file mode 100644 index 0000000000..b52442b176 --- /dev/null +++ b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/submission.json @@ -0,0 +1 @@ +{"author":"aryanbhosale","github_id":"aryanbhosale","name":"SP8192 + Muon 0.97 + 3-Layer Recurrence + Parallel Residuals + Score-First TTT","date":"2026-04-10","track":"10min_16mb","val_bpb":1.08020003,"val_bpb_std":0.00066585,"seeds":[42,314,999],"seed_results":{"42":{"val_bpb":1.07946978},"314":{"val_bpb":1.08077350},"999":{"val_bpb":1.08035680}},"hardware":"8xH100 80GB SXM","pytorch_version":"2.9.1+cu128","technique_summary":"SP8192 + Muon momentum 0.97 + 3-Layer Recurrence (L3-5) + Parallel Residuals (L7+) + QK-Gain 5.25 + Score-First TTT + MuonEq-R + SDClip + Brotli"} diff --git a/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_gpt.py b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_gpt.py new file mode 100644 index 0000000000..bc965bee09 --- /dev/null +++ b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_gpt.py @@ -0,0 +1,2 @@ +import lzma as L,base64 as B +exec(L.decompress(B.b85decode(";JwB(bzJ~7n@VT6Qap3bt~@<3h>ok~)Km^%c^ys%R{D_%yAk9-_tV7^coUOo3$w>`(`ci)t`2F7>r>Ltx>>S2CRw|7ov>Wn1e~_!RLQ=%V9g?)G3yPsu%SBy!lj1PaC-x%dDmCDOZ^r^!)+WWz}ejKXTJ#^U6Ra!};QocHHXQC+4UM!QQ!-N5Xd|%~a(9)bTYIO+>B~8~@lqmri%^qEkQUy074Rh6w7V_#^s9J-3BNA`G;qyR$LYcI?e+loZVWi~B$n=TKFp{%SeHYp{oNWh;U@Ahk8M2$OU%K8B$lb*dRQXd-GR_@*KAZdRdwSd#v=LSq1v@Puul=a7WXDmh1^kBj}Y2XlER!D2E{&{%lV(hz$#n5%+%sk&Q}>{y0xpRgiQQBJeVV0hy8UD3ntyo@(Pv+K7^zVRDt4bah(r8kfsZThb+H1)~K-lIr4`|V#-2R>G7pP*N!fwWd&Dq8C)y=NrG_U_Oz6Q?+@ok1?(VJ5?ZT~&}C4Ks38WRB>3i=I!}H-8qq=&yKJ;tbpwwn~lAseD^q1C*u5T;lKQtF;?zv@u0f36%6SXU~txi3v5iSPK*`fNE9531KaQDL`zTPF$MX4U(-3sY-&?>QJe)giBQzpor7H)AZ#4=Hn#`AoAL7tT){&bw(fgz|eQRt`#6-<>;m*+&$!nf|od6&lVKYYHuOoNgZU_L>E@!O%__mlt=);Hwdc43+CM?sh5y+my3XSVYMO8F1pXuq$fvTU<$mpDjr>Lm){DeV)>4AKAhA?jxjH<-3yYQ#5qz+4c`Utifny+Ydmr4?c_z60#9@FU+U1&O$Lfg$WrX7gCj50O1t`1A`k04LVr;^*~{|@(TS5>#TAjL(B`umc8bVA$bS|F?^2A7E}z7IIgZlY(8Ex#K+nLh0vzlKK=74U!g+sX4T?e3_^_7XB1A(HB{pYd{vHYcak_P3DZ2LAB20wAP+C_9p7R|0}wA=p~JFi&xD8H}n(LxCc5rcmwF`!s(tSf_l_TXk(cPZJ_z`)iV4#r^gzawYQ%HE1iaUF=(KAcKXE%%6Hx0i;?;p1w#dN7!-y!(2GUw()t4|BXt%+05bu$yea+{f!deuk%(g-o}&XEEWm!lO+1^On#i#4rhP{bDYb9ZnbGd5n{*P->hZxI<{=3c-92I#g*mTey8O>cuw%hdwjB!#=GH_0?hY|Lf@L$0Qp+k03PROh)o-cMOQ!b>qfvPNJLTVvCBX24BGgI=_|35Bd%&Vq&!LWECF4!1&J?@uRfe=N2mi{l-S0aW101I*cY_A&2R~zfij0IZST;@;xJYti>{)weL@A2ZOGrq(U-ibnWz0BL;s!=S;`!K6@M501z-dU(OqY0?!!!Tk6Z{9!iH*tDZsjYG`J8UEqJz~7cNEPh1A#iz_$2*Nxo;S{UehRA^{Mf3GV^9hBEKSL(iD!=aKpjYCTqmhYA|h4zASL-v?9UWx8tzm#N5eHo2h3w*`(kHM2e}vDviz3$~a(Y#jlJL?*}m9&(Oqs1+CNUy7g~Z#lRN#>g9{7u~tot;|0qTu4G4xwdXk3eT+l1l$)Vq%}j^^1b(jIvF|OcNb1Jz&)>b)qGiC5P7yS}AvC}VDKORD$#^Ydjg!zDuM#$J+k<}O|o#9dvGrp)*yShv3->joMiF%~orV4^0cl9F!@VqwD`5fjekV@3E`STlX!=JDxbOQiv?Jp)$Xy|Z>g)@q_QYKopPeu&ghhPNw0y&{j?$GHwDQoztHvVU)a0ca6}7{#3^KK1uTcBkMSF$IDQp#Nhy>JTHPK2w+%N#FZ(D)=sw?BkfduBK{Owa(SkBq^_S*|NP@DI(VEWNqETjYsZ@bch5-dlWjP>|)xv+AknhsqQ!j3!=TT%CYvl>o#XU4AApoVBJ;db=W0m0#FH7by8-Z*V~$!QptJuPqLkH-bA#`L?*g#-60qO9x7)rWh{~YY77{NX_v_!Mc-#`(n{>OHy|HxotTLyFAdqCe^bQsveKyNdxf%^ECJDw1jQaV{3doP1nC-IuYoJBS)BjwI+@*WRZpBQq--|WAHx2WWVue@lE`*T9AY1=3wKyIT}9Ss;d##=nZ>!%19!lx_0W91se3dXzq5oIE}=)Lf4xkby*McKg=z&Qh>gK5l~kV7t^lQK_s8TXQqiwiAz+UT7qOmRWvI~~2yt~5_%swZRW#&SB~-uWRFRPsi5WZAzJp1&o@T%9?d1EUEpyP1v5zSN9`Nzg4<9J>%D?ZP~-T(dwGEKqZMPuhgN<|KigW>x{H0%t_&ya;8v^0F=-)sK*R85LA|5>|ZYqA#XmVbFc92&H9WjeO5C!FX%LsiXg6}0#l(Pg(=6hjd7H_7$6NLIA+rCa;GE_3R75D#&J(7=>z|LN$87?M}UpavJo5jeYlJy>3UxeQ{duojamIjZmWv|*!Tjr5rT-K7C#w~_vZ!oIz>O;(D%nYcBK)`IjO`=SDZo*4vJ4V2bFcFg(2@0lt|4uUCPbO&N6^dv4y}sPBwT(0$|M|*?y;Jv@#8^JCr!hxD0=c#R8ALJkOUZ5;?_TS5GI^kyB>q;{eo<-=N|;JU?~G80$0+y}Bn>nRaoX5bq_lK8&2G2D0K(N6U_xX}HirikYywzHoCpo)+j^d}t`9sXluV$o6?ewHe5Ui+m5Y9oyhGHXI2OTu~#~ow24E&_|NZmvkjEEo{?lrj>I+3}kwNN$<|WFHD7&hT*J`96C?gGpoC>Df4aU8P&s$90m&Ugy{5AY@?hVDTc{$QAjsoHqS{ck6snhl_)o^474tl{Idqaq1M4gTm}}RWDGq`oxuuW;}<=ge*lXX+3Fk7GOExz3(~A%nd`>nKrLxi-U((%)yeb9`|*{}XN04z>c7Ok#4FmP|M+baT*Fuu4_Vg~%D*h%0S&xmIOVhF)nXWYKj;F_kSpCA=E?|IY3TP731i4AmJ9{uIj}nvkZA~PCqrx=X%FiI(pX*UhQoq9-g$`cDVZp7ZcaGt$}M)fe&uhR9-a*yxklx#6Au8ICI}z@KWrk3Obx%(^tG4C1D@?bgq2jBnZ(O?j&jyR!8J6j%~%zfEtIAPDKu$5hd4V~`Wco06Vlpvp}HuyKK}fz6t3mD@K4unNqV2DlC&6KFhx<%Ai%h1TodO4gAJ51=G}q7(kO9N(i4X$8MWnd^UE!-v`^1{5*9tbIpjgUqjwfygmDW~97EWp8~n;>1xM9pw3FxHUd_tgGkMlT)SNNWx(w9cg>9zUZzX{Q^w*d#U194JF-wwlAz0yZ)tH+4r_JXO{=3ej6LQd~!naPgtZ;`)wmc&a4dppCSVTiQcc8UfugJge8Gerop`ck3K6;T*^kZp<9Hf-jlYfu+Ncv+`5i&JOwCg|NE;{ox`VY;Ub%E~4C%G@CLJi@0X_co$N5u2iD|I6taml4pwulJ)p!O69folAr>W$wc_@9v9VvU6N+O#a=gX;BimpVU*u)q2XscX=lx^Zn-BN-;_}UkjSaA)uW&+!Y^2T_W8ydgx(`~YE%eEmgcn-wY0id8}v=bSq#xRdSY89OVSts;`59NKaqnBJzq!px7xcxQkwC!%nzq;ACIq57xa9CNC~WPR=_N|sDZ*mo*d={3!J1&)iMAr6Ji^XPO(S$Grm|U4{YYnUtbS1j`5tgw6*^XZGHcn{g#{1)Vn2E4|r#ek2MWQ9#>rzQ3zDgm9txa-#PSX0d6dM`yebz%Twh(oT!pmA+0Mo^hZ(f*S=n@m;h!I4LZqoenOsP%;j#RNdY2pvZFx4)uHo@w&tRGaMg#!x08$i%Dckl;@(Bn`nq>l-xa+7FYu&~4Yq@Vdt+c}5e4r$M9h_8xqG>aHp&?q8)HWa0Sz{jdpt`RF2TQ#ak`F0Ku`0Z2b&|K}-j9!Ag!ByLopEz?X#Iz-hH-XI{L8;k+U1StEfZI9`UEgh8v|1fNGF!KlPDs)l0lp4YCF?m_QdcJRn{uyNTlYhoL&(q5rP2Z9SU>2tYy;UDp^wa%Fg~Hhk(2S`|te$Gx+Ii0m3i){{7ug{x+aM|ib=hj3X>-y&H^d1>Yk&CE~y^DBjF0EgaKtEaKI7Wky8h&Y@UJ7P7{NwNv$D(&at=BZzc4W-VudsUAV!}qh{Bx~xqASW>QiwS&Pp?nj~MhW!&p&ZK!3XYPt|0r=y$NiXqT5r|a3t~awh`ELbbKti#lGw4_Z}(|4CO9mS3kjLy9es(hZKys_D+DbXrMuaf)dQf+fd=>LNl`QmMe|5HAZu3&M|Qc$e&*iiiM1kpmZL3f%xon_T*7vFZe9~Mf+S+JioV)e9`9pu;^&wln&a4*Ffme0YR3eyDzaPl4eJOEcxo}+_<$hvv>BL)>?w7w?zq|h-nRo`f!5Z^Z0EYf3QZs73GD`>p#=LV@%sLW3~V7UBob6G;zP|6$TYwVbdcqN#-p-y330KXa?YLk+7|%B|s655R+FA)bcA;K2bp!effB5mS5M{$`4RUF)D-}S2zDLqE^vwi-W%}HG7E`-9#iZY98C2*wag^L8FGhTg>4~1t<927~iA%U4&gdRuPFnqSz#Pp^sW|GtL`42h!3vJLPs{Yk+FgK!+g~a3qqe(;LMP&qfS#hT%mKhNhcUPsWZ`K`DcL?sHamF=Hhu*!?d1u{s#h*t!VqsV3y_mGfSdb^JScFk8jUdO?g(o(2gC%_^uh1$<-K*o~`R)Y+;0K`JnLDY6s#;ZWB1vtnry0W&135=PkgRY7sTp#l@E7$0*w4Kr<2ke+_dq6&O6JW$*9x7%6WHy4`^yxw#Al;YVTb3CPy~Z;@}r_c~jG3g)%+iPOR2*zg^<$yYfn!U2MNM&RjuDjCBg+-)-Z5SNK$@YUY3LX2>lQFN54`UZRmO7GTUTn(e0S)DjFC0qZn#`W~1Tlo=z>)mCXdAK+UBFc44FEMl2e8`+5f&30d-K)8VmW}JZU`gXNq<|eO?7R^m97LpKwJUX;MC3lNB6M=aQZ4k|KAi4}mlh$J>ZI31_j^-*scF&{J^lIKauwG>6f7cviw1wPmS*>ozfDBu}n&hyDs>b^#XsvZ6btR8gNpO*h-+X7gLNr4n;Gb4JJKwCEXrC|9KHY`Ml?jmzn}65x`!Z9+@iql=}{4Sy>*h6-b&m)4X&g069dds355-{$xntPJ3I?LHZW}y2gLcFlfsp-|cUJQ&8%}s)J*SWRvBXz8D#>Iq4gVhCIUE!HeV@S_J2Dur1!X`9pd4?`0|?>oic&j~O{wG^t#stUrKuH6`vf2HSEUw6s0YDS2H)1kTD&9r^(5Qv(H~=!>`wBE$nA1xS@CN9|rYBQ*A$Qq0u0CJW$8-(-8+kascp$ekk)Q&nC3GRn=<2>E=P{Pyu$WKo(P}w#Ev_QQhD`4O6wVp{Mlj>5E?0R|`+6E(*x4_XsAG7WwH{+OyB?=ghZ@+HKNED%{1R->XEtpR39%RfO}y&~Gd4Za-_fyku%XTl!YIn_7xmt;p>Sa2&&w%UZM^kj0SYG(|W);}iWTOn3K+J$i-2zRX8sXx*y6dS@(}ufe>$Uz?NfWN(#!ifPmpZgXdw_7O>%j_22is^1IeqCfLH9t{@P8g87^ivbb@wKFj2^e}tN(KeUrP8j9I;T&Htn`L1$uT#j*m8QV%*=6l2#CLn_{d&x*L(SHHrfk(@zl+sA`8%&J6h%+LKk409)E83-HAcGBHdA=gP4(Ejt2;;%*Bsrr~L4C69Vu~evF{x7H+zM)QqA?I)_vIV*?PhK-EXVsG%&TJrwZL-kU5)Hb_A5!MFXh=aX|#Ch|gVNS*LWR|DtrBT=n03RP}e@3qiS>LeIKWv`n4>Lp5A3A|oZDQN8f4YLl0|Xo+)2kyWF_3tQB>g?YL69?$9rVe0*e|h_mYIQtQ@Nr`+n#Jpupgn(3qhzsS+S=bniZdRV%q>WR6We6Z3y_O>GitqnkrZ%n&Zv`5}ANvM`Z%Wqsdb4CiWJ80w_@d$0o*uEutYe_>#QH;E=Jt#$%`22Dm6la?)Q!AYPAJI6^-jd3}0vG`y4~t!iWU@C@WY_)6t{}K*Tw^!mrdMz0C-Qs+6KnYkgaQIpQinp4ybr13`soBJxjD*zfjwFjN(90&1Tb)%ssq%NDXFd&wyWQ!ff_Jx#i9g6mpp2UoHbF=#F!I9%(cTajqvE=9^5TEdMjoe-p4S?;>3urimS1<{3)vL2o2sD`WVjTtZk;B%azGTCVbpq(K=M=zpQ2pfR2VOOF8O`B=fvWO}LPJI2)YXhduxYeS58MA|e`z1l_C4af?0DM=hV&77&eK4hLNa!C-+*9el4sHKSW8mx6QI&P4^TCNr;na8jouqxDI_gg9+&PaR6uuy0kX+y>Z7ol}UA_$}a1hajh`6(Lkz!sHo%rRy9&bQqR!F`AUNI<&@w4=g#)AdbNWa=y;Shlf&Ub}?bii#;w*e=GGtsJ$(anmWG46}d)%8UN#04~LZ1@1AC7e#wB5oK5bSm=;v@l46vYr5<-t(d`rE2T5ild@WFcEH{-e)BkFo8cV$!`Dv!rxp)qCch+I8Y~4e#Ud#I|M}LLJgSM0NeUv}8FvrwtL4$X_O$wJybTIUyp7I3?tEiMkR|_U9UqDPU!YQF}Wsg^5%>ArjK{@-hYQ$VTD!_k~Q%3%EVJv;~S0Z1;=)mif+b?l>WC)YA$1vN+r3@>41_3`qR+DPKr=inwgPDlKcF?xsRl9Ty11f7|qm_%Q?BvSXuP(`vARP;^0jRy#I8_4-31Z8L?R7OqkHB9O5?y~}>$CFqEY*Sm+0``e?j6>#Ig-Gcg(uCnAPxl|RBOi<((`X1ZmMd(Hj^>|+AbcGqBzm&?Lu8ZX-L`JCdYK1o5{X|71cL!hX`7hf6X+G-P@iAE_x!+)y;mr=Ud_lc+MZ(T_%(c*IFfTP!+%RR6<2UAzCXCg^}^HN5qjG=qSib-uhE}oenz$u$>B^#;*a|$f8f*uqy365bw<|;d6a5AbU4gj&yu>)j4gF0C*(RgpD;Ynta?d+2uah7SDCB8$a~%b>0p*&W3)^lbt^S9`mu?)O<@xZ`0mn-ti3eCMq}Qs$=gQ^9Q0&BYYQ`wb;g(YP`9+d-mf5HBMKe%Y<6B^kz`PszHSL;thiEkqhAuq^hzqQT*@5`D=i+)kljv7m)5{KuyofVvbQ9S2fzy;%5zK+^go_Ei=;OY3p-!gL#o`5J0E>?1vZW9U=B~)+@08TbT7G*^cnCtd6+?$9((s?JH&L_Lyt!KLLYZmv}Wwp0WG*OlGoo8a9%Tr0ftKTK1Z&Y+*lV{~iQ*9Xe^OASRTtJgT)r;2m=%$l@G#wj?d2`N-|-#3_X;p_E5&>WvfnOO&s$zUT1Qe%#Ax)C9;)2uPXNF^P^9hp(vWD_73O{(@Caj_@mH1Gj>8EFvkF&EO-xiipj^p}c>;Go#yf3VRwYiX$^%$+R)owts?Hw6g>D(^-17Kgwh4m?~iK-r1!vfkJVq^J`L4|NbB!VIg&oix^}w|6<|>xvKMphlnx*YpoWHojB{=GKz?(&WrzR24)Ef{W~D5w!Ers${^fPo*5dUjGKqst|=SeIhxoKqCAVVNT5MnsNPsEL4AdpNN3D`&-?`JUvmj#?gTXJXc<>d&yiuY|C+O@kq2cJI!cb$dy*;0nkEwKbaQUhZk;h9c>Rzq_g^mh=JL234tKpzZ4OgW>FEpvo<;zkpo7lm(ZiW$WRWhRm1)s=#L{IK-3qh2-imsiW8T?S+y`b=uvIkJ;o8D(`cs7sJHa#+x)NgmM_%!a9H7SZ&imLw%)!5eLlR;Xzf7z>icyIA@2S9x`v(varTLZu9kDupp@Lgud{n+O789ynsG>m4!r^Ku6oV5DawC0L%L_xOtgHU|DJQAr@HOkUkSQ9eBbHQ*#s@VnU_!U%u^2!s5sXvKnQQhElE=iMW~jq!=VA&V<%rs>p<)X~MCKH-wD<7w)hANC8zJ#J;Non%*)jfJJWHp08#j`E_UgK-w5SzlSbptP)BQzcR)-*2d?kZ_WA$f2rKt7jZN80sc&09EAk8=oYdyr}TAAMu+E~P=Lf1!SD@T&T{p!mLlPO!JSSOlIG&8uXbsN~rv*JuOpD|_Qd2GWt$*3^hxq~0s7AwHx7o|a(0i;}+;9@eeirC0hGg=-$(*B3Hwx^ph#UJ5cJmFw0BM2gpD|v5eH7xQla3uWVWIN;R(-*fWG?cvYVrNs>iJn3ky4S_J_T{-)P;H!56U7PyPK23k88;@307qSNT|?=0^u=!y0uNOtx{;&7%lR-Mk+-olpQZXhW_o(wE=RKwgLUBKVumuuKW3>L?*1W@;g4eDr5@@vg=7Qg>t-zon|i)RZC*@@ycVvPzwD|}YC#yfq-}-)R-?!cyM9RW+3$O%opq0g^+}6G>nbEB2yp_lzC$cW@?b8%ST0d<g`L7998nQH6W_QbtPlwEb4ZIv@JA=&f3&3!H4rqg1r;G{#R6a+}<8V%f92?6^b(CfY)U^u<>e5r!F25t&EXeru>UzJha5BN1(V7^)`NG>j@d_{v~hQj;2R7gxgxInUj9rtI7l}`YqGHVe@I;Onw2-)}l0G@}i@vARM$U+zE@eJeTkonEAk9G)_6J}e0VD6bTB*!Ox&$>~zrYh#b8W;+|xZsr97Z;->_zujTjbKj0fjLhtA>iV112RswXbHv`UbM!7P}T|*jNkow6Nk2QCxCoS)7JMJWMvcUmbD?V!ZZo>uo84(Kof;eF+Qd=|dwA03p4+?(dRSZ&RD(>1XsqA{VaVjlAyF)a30Cb<%FDz?+|Ixx>i4h%wpXbF8RW5hi_>7ue%4SQHY7(fHzGB(?F3?`X{`a^p}j9NVQ;9?NGJxsO!Z?H8yDPxn%!@v4T$>PbP0y>I_wQQn24%lJTkKp6H+vJV2)M(dWf|iC+LErDy6C-de=uB&MoUjeL2XO0L4bZ`4vP3uVbytPNN6NDg3zLUHYX0^Imij2Fi`d4?$(7ZeUBYF)3&s@2KjOP#}RQZe4)MIFxaVQFK{L>46-9O=&!>KAT8xgqRcWNyl5AMK}a-gotG4Yw%ipxT>>fwY>M{_0HI24D0&89=IB>v=yT$EouK#X%GY1)R#2^jpx;T>jArIc}U_o!D=Lc5JG%h>x*#2(S_Nj|z+&GkRdR0=1{xCpa6q?iLhBx_p_rNll4IDEnS?txCSXARF_VWH&Stv~W#|GA|7hz==#>cl_M9Pf5P;ENrVnk{>6|VxH^Io)I@t4R;#|Q~F9wC!Njq=*t#XiqxluDz4oXZxJ7>d&jC4rMo9O(V^~NiI2C&WG*^$v2SJrm1&q`^f^8f-FtExv+~yOP%ztp7gVF#Y<1vIQ>Weq(Cu`8C7!_pfd^?*6~p3Sqh2Y;F3m^0T~_#91U*km55Yl1i5WS*j80m_v97fn4_021v!PoUb#Q4KyZ1F>W5)DsPJJGNI%F*O`r#kDje4y{AK2wiQ#&n2ZgoDQI({BL_tk(4UhEsjLJoy!BgCpjdkRgNUm^RGFx1)24FU8IipRaK0R22R+3g?hqUF6X(1e@kDQK{Kg#T#^=@mpxgviq*rYRp{+rsoPWV{vu4L)BrxS9R3Gwfi0xw_5dpRSTZKPb5_W`ZiSbHD*cGsY#00LLUDiQjkNdlpcsN|4PnStEUy)CGE~0gJbosWzUsfz~4cWVtIU}aR<=o`m*cj(CRpK6&tP?YOs1@_Iu5=vkpT&*jHk+w%={xIz7yZ`hUr&%HidU^Z1hV0S0nJ(qhgbS6sGduZ%(hy#Qkvl9wn*dLUEnjkRO=LcHBf@=Op*+^t%XWBkjy}LT8UgdT;{OZE^$e`0{itTiLxmgkjvhT8;Q;H+=u&r;uoa&-z*q8rF#mZjv1l+Ia{Y$sXMYuUbG;OHUJRLQVo1RnQw4T7`fQNp!8j4kyw>GXZKQ7K_@Je3UvVCvf%Pm#7jo0GSefG2kZE%1GmGHy0xJ96ine<;RH52z6dtU4TI0TajFmTtVOnwM9edIowW{vpoKit$&0;$HEwx4Jh>D^=c!z#iYzDy9KxT*9EP#s?=nbEc(tZRElIQUh!#K-LLI7Zu4U=pMG-T6_dkx>e6%G?(S5)>$LTdldWtd9ukX%KmG7Za&f&rjka&__V@X;pJld?TFyTfux3Bd82v_oTi@h%?{s06D!U&-R6V__BBu>$4+Tb*Q(Hu+yjnkAjU7B8F$%l5Bs#6o&%*tDRN5jGE_FgMn%F6pysWzvxmW(W-Ag67LA0+LNQwVp#iJFkwFpjXxf>wnGh+Wvjpgf?#%YQ?c=!g`%m(30L8=Y@GiP9ImTjUXkmmw7og|VY#2Kh_lUpQR#h@r5RxQ^>02z^UDEh>!3I~wWNJ%mqmX#j@w-iU4$iv4Iyr}t`yb69?Iq7TEH_duNVi1Z&6SN+q!mV3fcRKeLpLnuIjmx#Hj2MaHzd598UtC~}jw04N)JUc?Gg@yx{N_ma@9z8=PviXXTAdst0azc!s*{EI{Su%6yK#;93;L1`#OQG>n9XZr{+Z>yr`d6<}AvejxjLF{%iY~J5Ig30PcW07&yR?9M(*l#uLM&dIEwnZQfqlZ>6PH)AMdo@Ymn!F)kbVOe*tkmNaHmcn{xt307BAzpI;1{6+O}qhC0#`-p0Rm{yRX|j*`T+O0iQ_#I9Cc@dF1z6AIBvk8luW&SH6z*FwgHWe?tv5e3mn=md;e9DKE6HkR^Jz9;kJ_00"),format=L.FORMAT_RAW,filters=[{"id":L.FILTER_LZMA2}])) diff --git a/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_seed314.log b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_seed314.log new file mode 100644 index 0000000000..3feeee0375 --- /dev/null +++ b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_seed314.log @@ -0,0 +1,148 @@ +W0410 07:42:54.266000 47429 torch/distributed/run.py:803] +W0410 07:42:54.266000 47429 torch/distributed/run.py:803] ***************************************** +W0410 07:42:54.266000 47429 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0410 07:42:54.266000 47429 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/3914857d-5eae-40ed-b680-26a4f5c799d6.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.97 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 3914857d-5eae-40ed-b680-26a4f5c799d6 + scalar_lr: 0.02 + seed: 314 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0096 val_bpb: 3.4879 +1/20000 train_loss: 9.0109 train_time: 0.0m tok/s: 8319599 +2/20000 train_loss: 12.3533 train_time: 0.0m tok/s: 8168163 +3/20000 train_loss: 11.0251 train_time: 0.0m tok/s: 8080873 +4/20000 train_loss: 9.4763 train_time: 0.0m tok/s: 7993579 +5/20000 train_loss: 8.3404 train_time: 0.0m tok/s: 7965324 +500/20000 train_loss: 3.3822 train_time: 0.8m tok/s: 7741369 +1000/20000 train_loss: 3.2894 train_time: 1.7m tok/s: 7725356 +1500/20000 train_loss: 3.1920 train_time: 2.5m tok/s: 7723775 +2000/20000 train_loss: 3.0962 train_time: 3.4m tok/s: 7724704 +layer_loop:enabled step:2022 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1396 train_time: 4.6m tok/s: 7086551 +3000/20000 train_loss: 2.9144 train_time: 5.9m tok/s: 6697823 +3500/20000 train_loss: 2.9536 train_time: 7.1m tok/s: 6444831 +4000/20000 train_loss: 2.8282 train_time: 8.4m tok/s: 6264479 +4000/20000 val_loss: 2.8818 val_bpb: 1.1156 +4500/20000 train_loss: 2.8404 train_time: 9.6m tok/s: 6136013 +4576/20000 val_loss: 2.8113 val_bpb: 1.0883 +stopping_early: wallclock_cap train_time: 588103ms step: 4576/20000 +peak memory allocated: 39046 MiB reserved: 39070 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.81037577 val_bpb:1.08798404 eval_time:6858ms +Serialized model: 135431033 bytes +Code size: 16594 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.7s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15977140 bytes +Total submission size quantized+brotli: 15993734 bytes +quantized val_loss:2.83793427 val_bpb:1.09865279 eval_time:8688ms +quantized_sliding_window val_loss:2.79520986 val_bpb:1.08211285 eval_time:92040ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.79175020 val_bpb:1.08077350 eval_time:323169ms diff --git a/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_seed42.log b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_seed42.log new file mode 100644 index 0000000000..5f7aadd176 --- /dev/null +++ b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_seed42.log @@ -0,0 +1,148 @@ +W0410 07:12:52.334000 2537 torch/distributed/run.py:803] +W0410 07:12:52.334000 2537 torch/distributed/run.py:803] ***************************************** +W0410 07:12:52.334000 2537 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0410 07:12:52.334000 2537 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/61ebb95b-ceca-48f0-8655-72f4daaa2c36.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.97 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 61ebb95b-ceca-48f0-8655-72f4daaa2c36 + scalar_lr: 0.02 + seed: 42 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0090 val_bpb: 3.4877 +1/20000 train_loss: 9.0104 train_time: 0.0m tok/s: 8306409 +2/20000 train_loss: 12.3645 train_time: 0.0m tok/s: 8150384 +3/20000 train_loss: 11.0075 train_time: 0.0m tok/s: 8065549 +4/20000 train_loss: 9.4552 train_time: 0.0m tok/s: 8020440 +5/20000 train_loss: 8.3277 train_time: 0.0m tok/s: 7990627 +500/20000 train_loss: 3.3772 train_time: 0.8m tok/s: 7744259 +1000/20000 train_loss: 3.2900 train_time: 1.7m tok/s: 7727309 +1500/20000 train_loss: 3.1841 train_time: 2.5m tok/s: 7725884 +2000/20000 train_loss: 3.0882 train_time: 3.4m tok/s: 7728922 +layer_loop:enabled step:2023 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1332 train_time: 4.7m tok/s: 7043500 +3000/20000 train_loss: 2.9112 train_time: 5.9m tok/s: 6666843 +3500/20000 train_loss: 2.9473 train_time: 7.1m tok/s: 6422470 +4000/20000 train_loss: 2.8246 train_time: 8.4m tok/s: 6250696 +4000/20000 val_loss: 2.8779 val_bpb: 1.1141 +4500/20000 train_loss: 2.8399 train_time: 9.6m tok/s: 6124344 +4568/20000 val_loss: 2.8080 val_bpb: 1.0871 +stopping_early: wallclock_cap train_time: 588015ms step: 4568/20000 +peak memory allocated: 39045 MiB reserved: 39124 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.80706078 val_bpb:1.08670070 eval_time:7362ms +Serialized model: 135431033 bytes +Code size: 16594 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.7s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15975743 bytes +Total submission size quantized+brotli: 15992337 bytes +quantized val_loss:2.83436007 val_bpb:1.09726911 eval_time:25892ms +quantized_sliding_window val_loss:2.79172342 val_bpb:1.08076313 eval_time:121035ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.78838256 val_bpb:1.07946978 eval_time:370010ms diff --git a/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_seed999.log b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_seed999.log new file mode 100644 index 0000000000..785817ee82 --- /dev/null +++ b/records/track_10min_16mb/2026-04-10_SP8192_Muon97_3LayerRecur_ParResid_TTT/train_seed999.log @@ -0,0 +1,148 @@ +W0410 08:05:12.790000 48305 torch/distributed/run.py:803] +W0410 08:05:12.790000 48305 torch/distributed/run.py:803] ***************************************** +W0410 08:05:12.790000 48305 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0410 08:05:12.790000 48305 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/90164830-a59a-47fa-b3d1-ac3b4748cabf.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.97 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 90164830-a59a-47fa-b3d1-ac3b4748cabf + scalar_lr: 0.02 + seed: 999 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0076 val_bpb: 3.4871 +1/20000 train_loss: 9.0093 train_time: 0.0m tok/s: 8335580 +2/20000 train_loss: 12.2930 train_time: 0.0m tok/s: 8172020 +3/20000 train_loss: 11.0066 train_time: 0.0m tok/s: 8070717 +4/20000 train_loss: 9.5050 train_time: 0.0m tok/s: 8020609 +5/20000 train_loss: 8.3695 train_time: 0.0m tok/s: 7987898 +500/20000 train_loss: 3.3786 train_time: 0.8m tok/s: 7725470 +1000/20000 train_loss: 3.2858 train_time: 1.7m tok/s: 7712343 +1500/20000 train_loss: 3.1896 train_time: 2.5m tok/s: 7711351 +2000/20000 train_loss: 3.0908 train_time: 3.4m tok/s: 7713926 +layer_loop:enabled step:2019 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1368 train_time: 4.6m tok/s: 7073817 +3000/20000 train_loss: 2.9078 train_time: 5.9m tok/s: 6688297 +3500/20000 train_loss: 2.9497 train_time: 7.1m tok/s: 6438025 +4000/20000 train_loss: 2.8243 train_time: 8.4m tok/s: 6258206 +4000/20000 val_loss: 2.8803 val_bpb: 1.1151 +4500/20000 train_loss: 2.8417 train_time: 9.6m tok/s: 6129946 +4572/20000 val_loss: 2.8105 val_bpb: 1.0880 +stopping_early: wallclock_cap train_time: 588084ms step: 4572/20000 +peak memory allocated: 39046 MiB reserved: 39070 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.80952783 val_bpb:1.08765578 eval_time:6917ms +Serialized model: 135431033 bytes +Code size: 16594 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.7s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15974476 bytes +Total submission size quantized+brotli: 15991070 bytes +quantized val_loss:2.83691119 val_bpb:1.09825673 eval_time:8714ms +quantized_sliding_window val_loss:2.79408206 val_bpb:1.08167624 eval_time:92511ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.79067381 val_bpb:1.08035680 eval_time:319698ms