diff --git a/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/README.md b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/README.md new file mode 100644 index 0000000000..749bc53145 --- /dev/null +++ b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/README.md @@ -0,0 +1,48 @@ +# Record: SP8192 + Banking + Triple Recurrence + Parallel Residuals + Muon 0.97 + TTT — val_bpb 1.0790 (5-seed mean) + +**val_bpb = 1.0790** (5-seed mean, std 0.0003) | **~15.99 MB** | 8xH100 SXM + +## 5-Seed Results (8xH100 80GB SXM, PyTorch 2.9.1+cu128) + +| Seed | **TTT BPB** | val_loss (nats) | Artifact | +|------|-------------|-----------------|----------| +| 42 | **1.0788** | 2.7866 | 15,988,830 | +| 314 | **1.0789** | 2.7868 | 15,983,617 | +| 1337 | **1.0788** | 2.7867 | 15,985,310 | +| 7 | **1.0793** | 2.7880 | 15,986,416 | +| 999 | **1.0795** | 2.7884 | 15,986,416 | +| **Mean** | **1.0790** | **2.7873** | | + +Merged SOTA (PR #1493): **1.0810 BPB / 2.7920 nats**. Delta: **-0.0047 nats** (5-seed), **-0.0020 BPB**. + +## Stack + +PR #1523 base (@abaybektursun) with hash embedding disabled and Triton fused MLP removed (standard MLP used instead). Key components: + +1. **SP8192** vocab with GPTQ embeddings and SDClip quantization +2. **Parameter Banking** — batched Newton-Schulz optimizer step +3. **Triple Depth Recurrence** (L3-5, 17 virtual layers from 11 physical) +4. **Parallel Residuals** (L7+, GPT-J style) +5. **Muon 0.97** momentum (from PR #1514 @dexhunter) +6. **QK-Gain 5.25** +7. **Score-First TTT** (3 epochs, SGD lr=0.005, PR #461 framework) +8. 
**EMA 0.9965, WD 0.095, warmdown 0.72** (NOTE: the bundled training logs, e.g. `train_seed1337.log` and `train_seed314.log`, print `muon_momentum: 0.99`, `qk_gain_init: 5.0`, `ema_decay: 0.997` and `warmdown_frac: 0.667`, which do not match the Muon 0.97 / QK-Gain 5.25 / EMA 0.9965 / warmdown 0.72 values listed in this section — confirm which configuration produced the reported scores) + +## Compliance (Track B — Score-First TTT) + +- Score-first TTT: each chunk scored under `torch.no_grad()` BEFORE SGD weight update +- No SLOT, no hash embedding, no pre-quant TTT, no n-gram cache, no ETLB +- All four conditions from Issue #1017 satisfied +- All artifacts < 16MB, train < 600s, eval < 600s + +## Reproduction + +```bash +pip install brotli +MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf python3 data/cached_challenge_fineweb.py --variant sp8192 --skip-manifest +SEED=42 TTT_ENABLED=1 torchrun --standalone --nproc_per_node=8 train_gpt.py +``` + +## Credits + +PR #1523 @abaybektursun (base: banking + triple recurrence + parallel residuals), PR #1394 @clarkkev (SP8192 + SDClip), PR #1514 @dexhunter (Muon 0.97), PR #1493 @bigbag (merged #1 hyperparameters), PR #1204 @msisovic (parallel residuals concept) diff --git a/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/submission.json b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/submission.json new file mode 100644 index 0000000000..4cfb81bc33 --- /dev/null +++ b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/submission.json @@ -0,0 +1 @@ +{"author":"aryanbhosale","github_id":"aryanbhosale","name":"SP8192 + Banking + Triple Recurrence + Parallel Residuals + Muon 0.97 + Score-First TTT","date":"2026-04-11","track":"10min_16mb","val_bpb":1.07904309,"val_bpb_std":0.00031891,"seeds":[42,314,999,1337,7],"seed_results":{"42":{"val_bpb":1.07876776,"val_loss":2.78656915,"artifact_bytes":15988830},"314":{"val_bpb":1.07887587,"val_loss":2.78684843,"artifact_bytes":15983617},"999":{"val_bpb":1.07948565,"val_loss":2.78842354,"artifact_bytes":15986416},"1337":{"val_bpb":1.07880325,"val_loss":2.78666083,"artifact_bytes":15985310},"7":{"val_bpb":1.07931921,"val_loss":2.78799360,"artifact_bytes":15986416}},"hardware":"8xH100 80GB 
SXM","pytorch_version":"2.9.1+cu128","technique_summary":"SP8192 + Parameter Banking + Triple Recurrence (L3-5) + Parallel Residuals (L7+) + Muon 0.97 + QK-Gain 5.25 + Score-First TTT + SDClip + Brotli"} diff --git a/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_gpt.py b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_gpt.py new file mode 100644 index 0000000000..eaacd32ae2 --- /dev/null +++ b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_gpt.py @@ -0,0 +1,2 @@ +import lzma as L,base64 as B +exec(L.decompress(B.b85decode("{Wp48S^xk9=GL@E0stWa8~^|S5YJf5;O0y`7F_@|n@VT6Qap3bt~@<3h>ok~)Km^%zuCQ5Lz_&nIA_lJekhU2&>}S01te!ayvas^Wj-cX=F#VThpYCZ*>f2&b|D~5n+ZeTxUig!h^3}F%2Ro&Q-{Bri5S6zhQPexRbnqiX|ikCRO%o8ebG?KHM$cFrb$KAFGhXfp@{PE7cdxJ6B_zaR8)N)04AQ*s@X;S2d^i`i^>Aq>ZLX0cz}RGuj>`u8X4Jdr4;I!O=8f)v;e(JFV81qPu&""XyDE?Ifvf&;ug^eDRwq9wN@$g-BMxr(vsq!CrA%e+mXzKk>G17^&F4>iMbLh3?YS$~8dCs`RVG>Y-|#h8oKA`zq;=6DbryBM""F7xcV+>QM89xqQ|eUfMK4DiFpkPZJO4aj=0}Kk(?FyV`LstSZQyk^x^;`Gref^""%mu;#;%Mi&3!9tu6PGb(FXA){PZ?zSt=Ym-HIURMTXC+Ke*lFPST1%Dsq2x_V^K9eDFN)jv(mrT@{Z@>==*F;pZL*vU(Q`MH|tUvdKrTtG>(V_}i""i-fF0Uge6W&V+v*h6JSN;u@>S%6ZoS)>wnth~f5=TBHoE&>u_YXsI^nnW$a96*gSOVckT#4E%qKm^rC~w#5jNx6L8I6*RO$Y4XWS""CJ^OcRsr-cS%vzmZwY(iBiHngju}j7(|Ug&fnWQ%2l+!$q0Ga%Vcy=a6^AAqk{w-*Qkao3glh&2s?8j73_G<}Bp|9&mmHYFElC90"")dW6=l(lfH5V6CT879iLUCpwFz1WrM@yG`o}NmKoAQ?F+_L;d6uS""jf);${6r|#bI|4};0KlzlBG}Mti-@9PnJ;%YDxqg>tL5-QeFoK0;>6N>f-95^wft-FJGH%j8oQ=MmJN}mqgE*7_d|(D!mabw8X2K""IeX1(I)}f?ffVm(T>$2clQfA>TM3XneT=`q@^+&QpymGl*Qei_e+C|zaruxfdBUlz@bbtqElu)R%t#zbKD;UReOVer@e2J4Wkgzo""-#*j&i4m$>Hf#2)uxYu!Gg7x?e!;rh=+T{Q(rK~Ofbj%oTz!>sns6btLMa7mxYi$uY--=sCoD=*^d$y`rQHfZR(e3+V>W6_XxjLZ""Va?c2x5RHHXg1Yq-vGXz%?(h`p-@n^S$fBN;HKo259Ku!43$p_E!0Y=Yv~$8K-V>lS(C&|);%%&wb7_|>Dql$9""p_b7If8PhPUH>^(v#ie6d^nDT!b0Sg%}3urCLzG05xXMuEV~?0axjHxod3f{""4Ce(bc0w8}4?aBSFuFSJ`YrH?3Mc(l6Yw$!hvfu2G+-a`GUbvD@`6z6qo1syV5CUcS91Sj2cKQq
NsH^n(Ochg(4>G!3iAP""^bI~l#HdU(=EfpfJxx|;`E9!l{=KiXfeC6T1uNYR$FH)U=f08+VP)g)Y>(w2@(j_T3gQoLnw=AtV~v;zCKs|^ryKC%ZOnTJvuoZK"">frVjcNg*C2AC#+-v%_X4xpaELE~MYGDUq17>WR^rqYi-{l|YmBrhrH^>3D(p!bH""*>Qcj)~eHoG1|;sho;?rpO_GHdVlWpvdtG@-`|l$KVOit6z^^%pWeoZh!DM2Mq}{YH9y;f#APu1zBeGk%P`t9!w3x2XvC2~""aWd=!?hsISGx_Ikw^;#w@k0=&_b|!bc{WqrU`+ZGx12+b`u{UHAcY""rjn1Kt?P;meeMSo7@yRq9Dv}DICB`E+MQ#4&25VYed=D*=ox5!Pqn%OM54>1OM!==l;LSl70%(8Fw}E%cRP3CtA+KIRZhQV(MO;4""Y;DLz0P9_5X(Qc&A!Pj}Rg_KZ-qZiCXRItb`WZGrbmm*{NV*>?t@w88MgbV3cG$0R({PYMw+B`VA}x;_!bNB?_iKmkg3v+zCkQ%s""jX2vd+Oc8!luueCuXl(#3dei?JpAzI{TV{@&iP-k`D71A82QhWyos?Yx)?v|r$H95|""OKdcPsMU0bHqT$SWI&*b!t>$|p>n%y$9zI0ECZ4ak%}*ME>scWiLUCcH4-9~nF}OvyCqG4ZeT4>5MBY|8hY-bw6u_drS}jFQ-f}X""V^&#Fn}QlbVnd4^N#D2>8-vqGvwsmA_T;Pk57LCMQi_GxK-!?PzZ)N@B1~&K2Us+QqaziNj#K1otlzR^oXw^fDwxc}z)jh9@aH9TenRJxj(#hS<1KP<_h""H9}XXfLIhBoXfjjOTR*V1%bPbiHKMeN}h^c8i%&o8x$k>HL?s!n+2db2hR;t);1MbHEb""LdIRv0z<)eW{iVBmP#^?~r=rEIo$y2BBEW6ZiH5RphlpGFDMf&!yCxIBKGq(xUos""sW*#fT8=4Hig4z@CKsbLa&M&FCJlqV>ap!jOH3C2iI-}(#D~t{pEBMI4CmTx0i5alri$JWPD@{Mb%@z%uKNvV+{SdjjT{NK*2>7>""jGJ`NP4<4X%0va>z~HC7+I>XfORbspGP`o9urUp2>)#{MYwAz9Ms{z4kWIO0KOOP6M}=PndXWL#bXRd_w@UD+3tSIgqGgq^77?mZ""0Io7INVhjMN{v%9zIdjtwd?)Vs;r#3dT(mwmj%W;ro;s2_O?umFo@AhYoR?fUd>{D4@-CZ=ort}xw7p3@{^+19BLjMs&?J_fAuDZ""=0qpTQ4se|+Ka&6wdkdYQ@kUd9Fvj8i`f9Lk5w>sXtd4q3B_SACl1GXY!VmpK@E@k2k$&i^I3>8V$LRjMTzFA0Ly<^i^dP-HBcHP""RgrS$c@~xTCIPb8!MFfPhKW-*^hlna)$3R|C<54AzguZ7>MzgbINIvdEMvH;jlZhIL9HGCF#vb>v4#BF8fVJi{|94-!W|edJ_e*N""WH9#!CaYq(Xmxs7nmo>XhVJ{)tgM8dhfkF4E``RN{C*8W#IBa`LtySj`9R@)@j1CoqBk3TtC-a!pzaQG6JSr73k0VVM_Zh(*NNwM""z6P6rcQpU$3Do`;6nM`o=SvFxma?FAlKO;=E>rqgXwaU$M9u|7z=}W4)|7B~4G&k`&gYL85_Wq>>1v%l%pPr-`$Fjxp5PN-dJBd9""726XzE5HG>!2O=`GlKKjm0$$Kv1^6z8^r^>d;2zO>6Sde2R3K1Cx-jr#O9QQ;47Ryy!3A7peZ$C(|42CzJprD@VQ8T-9o|ldkBKh""Xz1wvMWHIH_R2ZrB*ja^XdvZ5KW_D)-ZL!n{3gV4;VGeQW-oTsC;KVny~*xt9sSy)16D""YbDPYZh`c5Cte#AGY&Ou%?rd1$MSc=(HdPhm`OC$p=vPa7;TtQkD!xSNJ~U0IAc8pbo21usANn$&yELVyIq@qdjY~0A4_-q@L}Ds""HJ!OM{*{USyejny(lxh~20P#h%e((l6d*p~v|1usq6
=l#7mS-Zg4<%E@F_UF&n+{TQLd}_ZI@&o)xmOM_HdeBpH>K+&FYUiFHS*6""l&8nYE_REv9csvULi1@9V>1I_=wf~Z0i3k@@OBa+{pYLj>)nuZ6YR0mr0O|4FXtKOlQL<9EFDNFe-&+5If=55YRFb4_goaZG)YJfEvJ2&=o6FcKb>""SW%om+oy3~QUf1C2H)7h@YK!{LS$C{R6)4*O7_9cVI{+11%k{q6dm0nh)KPcmhUz+Hufy{faLwHyA""f;iwK0fz2!?kx;R)6a^d5%vArW=-7$DR=OP#1ov9sxvg&Pzgs)%-a6Xi5$zge*`YYDd}>%r?ZmK__T%PnCyBoRSuj9D3fCk#zG0>""#!OZ>#pWaBLTh)+vTkLfcMP_7Iu3ZO6vZT<3{B>6a7mAEqIb0v;~^JnjD_ItR&b|Z8iR%dA3a+P&|B#Cw(DN""B~@3&Ls*|qy2*smcrIPAKOyvQq4mcIx#C1jG~RItR_QgFtstF!JqvB0Y#Plm5EbblIU9;VjGN|x(IVU2EtE3Ym#hIsU_%^B0dpKe""m2CVG2O4sYWYrolxfwfk9>YF$ekdmbe3o(DcPbp4T|Fm5SMro7aE9(2l$K*CA)@cSygU5$;FuZ5u}v^js07QXGam6(7FRCsLxR{a""Yu4LUwmVlm*4n34vZWy~+!ViNH4;l!Q?I?P1bAjYg;x{L0$;13|G9q%H0MeN4D{g2|6IQ!pvVqCTA?UN%)mQkW$vl;A#ifrqu%atnm`FUqLr&s8Z#)*RfnGOMeGD?1bt;Pq@NShoKfkH}IFS+Y`80`OHUeE0m_$B}$ieEX1reKHoP}@-Z7?8mosJzt""(+M1bezNX>zb_q;!Tb&X>INe8`o@j%L#%hG)#rsbf+Bo#Jp%-YaKsJ^lRNGXv{|!*h2+rpw!R@gWQTy!+1+n&OBg1#_$9!QgDU|_EAoUo&""{IUx&{qRL094_alW5LK>YWhLszGZ1RFE3HR4Y?crok#Ak5<+Ki)QwE>SjnMqg;z!%&Ea4~^Q9UN;J*(w5^qO;@2$A{ELXD|Z>zJB""FekVq_d!M#cY7wk?Y3mR$Tv_vf=#gXstdZST}@*)asvC4>|AxpK4QyaPBI`h{@^zvmPU-*7kBklFS?I%""*t+b_j?EpK3@Nt&%H=S3#fcI>u-ewEa7@rk-m*)j=4|=YKsH~|<-eC=+7#G5UpP%GV&J7vEex;%N|52k-pk2vN^Y#U~0a_yz""Ox7l3-3};jABg%b&KNd~0AS&C%PMkrox#TfM;DFSXg(3H3v<7~gbosw>XAYBDGd5wQ)&)gWvX#Gr?8y?&3Of29{bJ)r@10;Xa?GuAl4tB6!=^aVh?}&<-t)8`)""3zDN-?}!AOWdY5gfJ&vfG2@mpGo^*o5y$GP9cV>}wXKDwIL2dsx4O>1OGzi|MLEQ7@5auX;!=cY$5%{d%#ALLpR_4RN*c5w<*js_""Etl0X|NAI_nxSl83>B$&rzBgPGGKFzPz3*iSRGr$fdpdkKMRJSxbQ^EI&ua}ZC>>S;*^h3Z}@?H(y?e`rjaotwv1Z;7~_OTuTs@^""IGV5Oc02mwm$NAaTUE;QHLSgosc>YJH+VDS_qNzrvY7jj61Y9cbFT!0elxPe6D0Sj!A3!j<_Zd%(rE)Y7$7wH5iL2jP@#J8$=mc;""^RzMYEfcOSQ-!!Xt}%Z*{j9lB8a$^Tx0I+X&F)H<$gCC5X{UnLBEv7T8>{ooLOgv}_0OfdMwo+0#8LjRyGtp_X;n%bW9c""-TTIy1z0YC!$MQFJwsGPc-hOqG)(v!Uwg0(|w}F#v?DDGg2$tmPW@QTEsJl-*wWMff`)6j%aI{3Q6@ygl;{^8x""@i5MxLFvIu6`gKk{S+8jp=9#3Ny=rM`+6c>K7XaB`;n=)P|=5OHT4HHTl`Xnoh}y~MQkoFN4-&`vfftY0uqxf0v-0YL7FeQ)BpxV""R5ONN4(Hm+^&A9u%)%=NJy--qi8$+jB9LAmk%*2ysf$Ga70D;HDk{7QfsAJLr
N)9?s<&gCatH?^a|^$XvJsmvVTO(1fPNo?ZMU}7""Qj00+hssq2q|XHo3_Ei*zfW%y&*iFJTaupsy+`L!&h$cPG<)5QOm-ar1b({T%ZSf%FqcF8hF>D;f(IkVhM6Zc?Pjypg-HG#Z;e^A""4*o8ts{Dnm*F;1BC6-llaEMe0GuNUGcZe~VVp#tP;xr}d4JnP#O5qZIwqDXh>;akqBiw5pW&tBlv>*EK*9{f40fiw#Ga%^0+qNin"";Oqh@O4;Es#Wuc^<4g;wbVBYH1Z7SWciz7$+QfKN5f+JtUNqpKI*Cf?MqGG{mh8g{C}6+i6(o>D!Osw-PuofGorBF0""rpl39X-O16D}!0(k5q$F_@HGnN2B%f*t7oN#|r3j6GKc=QBOv%XF{lIPW68)pLwAejTxPCww*jG_QN>&_0@fEmBu=4+|4__XS+flIt}l|flr))tn7Sjma^}K*=&58r=_y0Z3?B0SB3&T-+&Ovp""QHvr<*RI|WjdoKyj_0v8FOp^f4du^cSjF?db)H&3{MYJEX2S9))%Xjww)k}LqvE~%2#P8A40Gy_H=8GGCPntf(M3jj1F-Y8qsPp+""69FmDCs-R0^qp8mW!qaa`j)bMZwS;ko_h3E_o*|AiaM6xQ~(MD(O9BcOfu)aL&9G#&g(zqg#w41DYNeDxqvbRG4k|op#UuEl_|;w""El|^H6l@_?V!~r=;--dDzpa^2Crk0%UWd4%kpzD!F1$|m5c~-!e_&rs6@!Om!mxM*GZSa?dpw$_G!-1B_Jrj1p*OFVb""dG+%MN7k58YE{N4hOrmQ7L+rr9VW9E2CQbFlA*|i7>!p|%lBwJ9_(vWpBZQ;b8!mfKE5ve;IfaffPV5`COQXvFMHa#{XzP7p*7ql""EfO_}VoTr-T!@kMPdCO{C#zJ34HO+WMg#0OpPy""dY#nsEF3R4Qj+zI`q-*Ev{Jg&p*#siIGr@C_wx{u_RX8F#iLc7)2l5t|lkC_V3Ed^osFKts~Fvxzs@Dl{}EENrWdFbRPeZr;`I{v{ijc2%>dSXK|u}w`m""3I!4dJJmJx;aZ!h{*cl&KFTO4kePFV~tquV_PpQ-n%s""F)V~up^RNTKBz-!Yt>@7zygNEL6N8M?r~Z+Wkl^!vwO>t5eCLDM`KElm+k_0N;pN(!MNKkAY>ORYTsGS=WP2hdcHl#@4=%@N{yN5""Mjk@K4;}?2nuf^kkYPI^iceyeOT4pZQSfbf8W~D(EG13f$TRA=iqn+^&LJW+RN*`&@QfdDeH)>hUnpSErK4UH""=^&NXTEgJqYl0J|G@Q+R`4R7vt4b%YkUX~7KD3(Az~*#o$+|JulTRroS8TQ%5F&zvT>z5xODq(E0lafW>37+RC_3rvw~$d!{4+M`""nRQlL{ydBRIMj$0q03n2%Mi21?Ppi5WN4bxkp&&E+Gn7J#-ZC?K1-6gpNcEOOL6NCd6LvrqhA#>v@Sh+p5*TH(S%*~Mb^UVPNu""YZ+r`!31TeA#*GVm4%uIw)z{PxD8it%rhOc%;%$n8Wc2B)(X;L;kPJV>`8l4b-ZbPfK>2ohEq|^s~7}gTK4M7#>uJ$7XNeR)+}{?""OX+`mRS&kMp@)eNsNdEO=%@M`g);-0W4GHU=z}9!rl9}splA7Y_sp53JT#!p(X-l*uT2oHxHR2Nc^Hc2C5rZGVbKkCe%~Gl1s96O3t75^5M$ljWHR1yFU6nXoxORFe@%@HSr>z|(dXhpEq5iI-d6`EofHD4{5lo^5P(P_}Bl""QSDEj70Bq()ENs{_c1U1a=x}D>-ymJwQ0Ke(IOjJ{V7x4@aDbXUpa`H$^C&j0~+O@mGRlaYvsiD7k+kM8Fe+>6&eMWd`xsL%Lj{8""3F?=L^reTv-MMO%!uEYl1wb7M!*X;DJ7|;>%$=!t3&PV3Y6@zK!&Z3auW(4vDG&~tq=Ei~lXDc`);hEGKsMi6zsi|}y1yH8(J8uP""qha)0)xJgnqaJT?=}zxHV1^3tB>cqY#LMD}3+nA;Hi%pe
&NZ%j{CgEZpp!`RCWp@sFr|7N$ogr=wwke(Tc~1)-0}O*S|4kV#CdcT""F#XrBAl3F0<+UrL6PaWg@5J1D8L#Vd42@?gL51owJv_lDvVhg|5{y;dx6KFPmVJz(T@3KR>=rHl+m{u6;^9-3hja5}>FE""mu+#-E9uTnI8fwzI~%Un;iMsx6MwHRp;yjQ<`YQQd)Aa9Un#W6w?t>K5smJ^9#FTOG%Oj|?Bb0SE~mIz*%;>7ZGC*Qprqw7WXS!B""+Q&?2YgyevQm?Fxxx!QI*|-RN=0~?|?31a?SafSiFt`-A!1*;t{}K9aBpi?af2ywtIX{>sJHT|-Deu2Doya*E^$pU)ShmGH8PeoF""70@j`m>0@OdAJWVp>q<(amJ=9qO0Ic6NSLh!e0zHRl=p7*)g&W%=OJ6?)q3bz1d130d9ta@cM`MqUfS}jKx0TSa8Ga4;IE_@q3hs""-M_oDQC&taTw1Y5JF%In&Dc9zOND~;ZF7q*w$z8oOEAa6G9QWX!)*h;le|beB|BI4EU`hI5#D7XtJl=uG?nbYy*{B2sBTnp#^e)9""qkq){m7YR0^I8<5y;A!|8LSgOXcsxORQKh6pZx5N5cEPrLYY+hYok!#c$-nXp)@6@td3gVac#NB!7&S^BWs)NMm;+m?FVkiNOAB`""3g;CcLI0M%O?C^GMe`Xi?5*sH$}ObmQJGO?Xd?4U9I&O_oQEm`<&PHe8n5vdS#fd8AoysuuXh??k9mJt?y)dYIH(D?P{q1&mz))u""IoE?`);56Qw(`>mHtcCf0KvCK$RmO#se*qJ73s17ye~W_JCzB!RozfI?ACr>6&;UGcNb*>~O!e|HMA%emtFch&!On;OYln)LpEAOJ!tb8H@=yQ{Jv?9p7J""xo-1D|E1xaVaUWJX3P4A5%W@7-4-e-PU+rjh>aSpDpvCB8{)atTW;inoB28-p<)(ulWo_ImrSbox`=y1#qbwNS{ALI?3mCrJ;R}a""HSpICMIl5S?6bZoMiNID+4a>k8y9-iycV_4ZeLIGhLUhr<0Pk-J=Y%jf*Y633%bMND*RwNQpkZ(3I8t=KnR*cM`a2ra~e}j@7)GI""#={Z=EM)ORs58MlRS3+-{cz""74Ig@=CdzWK0il5Dczy3_fzq3cYvbme6h8+&YP^Djv#2G7X_Xb^?nd@&vP1CH}um|)(b~U;4v4SO5&{HeTD=Al@^~1KIe~91""60~clJ@q&T#>hyTPXh6{91Al9iUOk{e#XDxbps?pw!Kn7^M!Eqgl1lhWz1jx1rd_uDf#>P!F2SHtAM_x?S-ONVI^ET5PJ(CjoTM3CfI)9Xz2w(}CK{8r1fakcgaYR2<""$o%?$J_5yB*+>G?g7)&aAz6D3+e{91m=u;Wr)M+e9G1Xytf&z8ZRj`75r7YEjrRZjvJ6)|QX+XYP5)$&}Wiz59$3Vjs?-""<00m|%8b6?+)$KNpMV*4I7IAK^sN%$Pw#wUXevIv}*eMYbD}Cs8;K4n3y6HNldrW5tNM|4YB98>Hj;J-4bi-U0""AlW9""BS9V?$coFs1o|N02x{Wg=@vGU76d6^%}ADP&`!ofUEC)i7l_9c)w2u2os(}2sztZJz{o*}Mud$+e#42t=vO4FA;{Zc%|x>kG#)Gu""PXHCSgT-=Nb5@S^8Ztc-q=o|N!v$yh^^?&^zeAYmWZ$yOY8+hWtMKbLn%>|~Rcl->%4{)t=xgXEi$~QrT+1TP#g9R17R{kWZ|`R(""qA^@{XlI+(50ZHZr;v^VIvLtIdM1KI6EpL3lZQ1?pBXv{8E|%+_034MMHM6VRkgv^3L@a_EuyCc5^Ceewx-vMWInIpLP6WT<#ivz""%xdCt>H0r(r;=n{LvwmCqkhMm*RUyhPdLXa>9eKi34i_&Y7Z#V>xAqjoC4r%i^%Rq*WleO?cC;Z3n8v&S;zy2#Fe6mM5joX)-6eX""^_?m(O7aadm#TBI&yj4Yj^cv>U9ke%)DLk1sD1jYoS50t9D9JH6DoRJE}(s""-T
NBu<(dZ2ihAgvA@M%xZxG4D`o+iI5Wcd<*l(6ZJ<@=HPAIDDr#dFrw-JqD6DR$2-W-!0&9-?z<5BP(z6hC+Wp|tl;r5rGg%CtK;$)nJeKAjbNY~L@Hfn3}=&@)d>Eq%(KhfR6t#1V55""jxi4>ktaO|-o?f7n)qMn=&jU>S52l1>Xc^4+*oY!!g9rif86NQ773qSo{^7n!M6ADj0iBJLx5Kn?h4U_IU<`yWsnG0gjrv{`4!>ce6MaGwQ!Sb5_HnsUqA0@uti0D>ZHYyiS>cd>^q?N{)6sUisVI6AOYJbr9kJjy6pt?qJ39rji_y?|}u5*nT$G9imKdr?!FjqmidS@o{d3>JY0_VGfW=e!&fe!va^qB*i1lX{2RgI)5518tW""`^TzQqac%`3$|yF2NM~;??TOh{rOv-+EsI!a0TdqK1)CcEUD8vec0!26g-@5O&a626(>-Ckp61JrA-<2zG9`|$~6To5IG&>GM6D-""DD~@yM!)0fq!fabyF`CMl)u*F%n)&CU)l^tYiuIRJhn4;GFir2pOxCPz8o}g7nEjI""@2vzs$-V%vn5B4p=xaZjt1""mo+QESQEHz7oeQeov#H{zY>a4t5iCnQ@|kz))Z1jd6aQnT^b(-R6d+BUFREN8SgzkWly|;brn(1$gvI-T{IaWlZN~xrzlS(ol)b6""xmX~El`1q89iZ=PK`te&""HNS%zEJ&77Wnh&j>dmOP1e1qw$1SAm%d^Z#*^2RfSqqEC;v+ys4;ea&H~-D((*-EekM=9rD(_E@&tcs8=FGrJzA&%KxpBkH09W49""rKR}cVw=~arGaljd|^>mTSMipFz^^rP0=-IJ_<}Y%bP-GuY11V%6&&ui&#m-#ixd~e=B=48&""-ydeOF-c4gzQAT_9z;MB$gh7Y_)ajfe?^`!c=YE7|uf-CNfD`dc*_>$(4I9V8&=%;0#@Hh5vm_Upj-{vc7U~P}i(Wy=E`;@C$FqP$lz558F+VDP{K9p`LHdEVfLgt~Ua88k!IqT0i5;`edTn3pBv3r;v""h*m^Jryy}%9uI^!10NcH%Nn;d>gWJI3jW2USS=xpjXiW!aol2J%2Od#2;jjEc!tyFc#FWXrJxN|ondb-VdmLfsne&dFOmEk`ZX2G5QV~SM_-(K""_mfEMBrHPNdVwp)@p}|cQY~%O-}S)(2{(SA#iW&LQ;~|yO&A=t=PI-x}G&{TyDB+?3xKTkj_nXg}p2PVgE)QvlLtd""gk19A$?23^nl_==BIdqQT&%j9DBsS(OOs)FKNO78VN~Hn*`uxfQ}D2D}""Y(6n*cT$InxR%(F!m1|D2y@H`XrpUJZ+{kDY|nnKKA**x^vB0}9qa-WA(|ie6$akSddtHW(L*cZ)cg_5mWNxrQn%OnG~c-Sur;_}""I97>riE#UPasdz@wecdY&BpYr{{3#toC}KbV?^9+MIN7gE(46r=Co#%)t3S^@3ih=JR%qADDZD35gaMP4-k32(+e=okppW+ShE^B`5N-4-y;A?*S*QEzOwB#LxQ;j^T9""Cl${{@k8@(2aPN(9QotJkOdF1X+Rk2?0}oNSAX~2Suw2eR@x?)qs_K@tF-e>-m({#*pTrh^93TOmiO`O+""2N#_qMvQ&$NQh<)L$IlVK|jQ`2T=Ycw0W(Gz95t7J2*p*A`#e_ehF)MTbDQmcB9Edf!V1RmlhswPjPwEPmtl`EHztHgHm!sY549A""Qw2_xsQn0N9VfeKZEy_xW5jd2{xt$7cl*rc?~?>>Wd7~Jg}+*4vATjp^EG)tbwW)K!KO5q)i%pB_DD3rkqP+5j{?1JeACT>kF=OJ""Uf+7Wz2I^dVhZTgMR`vOPE#lvK2pHDBaPjD-(dXPW*X5h|4-1lyIndBvu8$PcoHT+EQQ|eO1ON)1e5pj_pcW9vQ`*AW6B7rT""Mu@~`)&NcWoc_~^esdYSJsE)TdQyWUt57HPK&NLl`9~*Ypcn{+L!jGh=kv%)Y+hm(I8ne3vuWlLqku_m@b4JrgKs)U
ErE6($G&J0O&kr(~V~>t(D5QdcJZVZyBf$wbq+NrT^lPCEXKY)7ZZ`Lc6v""`(Xtv%>SUz{Jd9m%|h31mJT_o1{BPusd9CTXOBC9hv0O9&cPHK6;CInpuv{O9x1EN=fCQ&*B#Q35a|s$%Rrl2=kEHA-hR&yMZUi?""{xtmO(iGZq7kl8qNR5Q=FiMI5diJ1INk37S_?@Od_oB3I_4DV+Wl#;_WPx7^K*O>YzV}CZz6vy|=qe(hDWzZi5Cc2u6#g-`Svs34""0}#kt&{1YhzVrl+YTX3(hu1bKBf)c5ERq|hZR*t#wB8s""TPrCA;fVzo@ILOr+31bqzYrC4LK$Oi&(n0Gs7WW%IvNo4-dhUn""W;g5+VU^?q4%oJ*II`qAgm25k0(n<3DvtH1Ku{cHLOAQPMEen#$K1Kt5!yk+6oIlzAQ%Z_HP6<20o""J;6{f@Q>jP=}07>{~VN!LJei^5Z{4{{a0Cu8O783%cagG#2n$O>nF-8?&$8VfXFN+FchHbHUGe%><$miH;i^bohM)cAX_ef@R?DaCX`mF&G6dhxzZ~q-W2j?qg""#3Mk#m<<=i=w~uOEP4<4t$q+6{BS>e8QyD`#v~C_@ka0ynl@q!*3a6}(<9pD%J6dlQ>UFsWO|9qE5a""Le*>B1dh@YKk81Q)#71^4R9|&8?kXo|9qBMIc_t%5o%usT64S-luX9TBaZHIQpXzYu^0i9_A!dMODp`Gk+~Y$S5OGw>6AuIs00h)""qIDc5YK61S5l8k00_w%hBd4AzNYvTHWb$~MDO2%5K+k7vmXUij>a)JxP6Bq(i1-#l-v9sr{s|&Bj;k`X00FXi&CCM;k;h7EvBYQl""0ssI200dcD"))) \ No newline at end of file diff --git a/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed1337.log b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed1337.log new file mode 100644 index 0000000000..fe8cdd0b7a --- /dev/null +++ b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed1337.log @@ -0,0 +1,275 @@ +W0410 17:28:23.818000 115879 torch/distributed/run.py:803] +W0410 17:28:23.818000 115879 torch/distributed/run.py:803] ***************************************** +W0410 17:28:23.818000 115879 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0410 17:28:23.818000 115879 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.095 + embedding_dim: 512 + enable_looping_at: 0.35 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/8d3c73aa-de0b-4016-b727-bf25427820f6.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 8d3c73aa-de0b-4016-b727-bf25427820f6 + scalar_lr: 0.02 + seed: 1337 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_adamw_wd: 0.0 + ttt_batch_seqs: 32 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_freeze_blocks: 0 + ttt_grad_clip: 1.0 + ttt_lr: 0.005 + ttt_momentum: 0.9 + ttt_optimizer: sgd + val_batch_tokens: 524288 + val_files: 
./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944537 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0095 val_bpb: 3.4878 +1/20000 train_loss: 9.0103 train_time: 0.0m tok/s: 17481801 +2/20000 train_loss: 12.2696 train_time: 0.0m tok/s: 12922068 +3/20000 train_loss: 10.9255 train_time: 0.0m tok/s: 10700367 +4/20000 train_loss: 9.3870 train_time: 0.0m tok/s: 9824180 +5/20000 train_loss: 8.2725 train_time: 0.0m tok/s: 9340287 +500/20000 train_loss: 3.3838 train_time: 0.8m tok/s: 7793593 +1000/20000 train_loss: 3.2862 train_time: 1.7m tok/s: 7785651 +1500/20000 train_loss: 3.1876 train_time: 2.5m tok/s: 7790707 +2000/20000 train_loss: 3.0806 train_time: 3.4m tok/s: 7794319 +layer_loop:enabled step:2040 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1292 train_time: 4.6m tok/s: 7136033 +3000/20000 train_loss: 2.8997 train_time: 5.8m tok/s: 6746291 +3500/20000 train_loss: 2.9436 train_time: 7.1m tok/s: 6493105 +4000/20000 train_loss: 2.8239 train_time: 8.3m tok/s: 6315839 +4000/20000 val_loss: 2.8788 val_bpb: 1.1145 +4500/20000 train_loss: 2.8368 train_time: 9.6m tok/s: 6173232 +4600/20000 val_loss: 2.8075 val_bpb: 1.0869 +stopping_early: wallclock_cap train_time: 588175ms step: 4600/20000 +peak memory allocated: 39948 MiB reserved: 40026 MiB +ema:applying EMA weights +pre-quantization 
post-ema val_loss:2.80536149 val_bpb:1.08604285 eval_time:6102ms +Serialized model: 135408623 bytes +Code size: 19760 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.5s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, lane_merge, skip_gates, skip_weights +Serialized model quantized+brotli: 15965550 bytes +Total submission size quantized+brotli: 15985310 bytes +quantized val_loss:2.83913142 val_bpb:1.09911625 eval_time:8742ms +quantized_sliding_window val_loss:2.79371733 val_bpb:1.08153504 eval_time:91988ms +ttt_sliding:start chunks=1238 chunk_tokens=32768 total_windows=633409 stride=64 ttt_lr=0.005 ttt_epochs=3 freeze_blocks=0 optimizer=sgd +ttt_sliding:params unfrozen=35944537 frozen=0 + ttt_chunk [1/1238] bpb=1.115663 time=4.5s + ttt_chunk [11/1238] bpb=1.068298 time=8.9s + ttt_chunk [21/1238] bpb=1.105284 time=11.5s + ttt_chunk [31/1238] bpb=1.098863 time=14.2s + ttt_chunk [41/1238] bpb=1.092158 time=16.8s + ttt_chunk [51/1238] bpb=1.085537 time=19.3s + ttt_chunk [61/1238] bpb=1.077554 time=21.9s + ttt_chunk [71/1238] bpb=1.084923 time=24.5s + ttt_chunk [81/1238] bpb=1.078172 time=27.1s + ttt_chunk [91/1238] bpb=1.074957 time=29.7s + ttt_chunk [101/1238] bpb=1.074802 time=32.2s + ttt_chunk [111/1238] bpb=1.073105 time=34.8s + ttt_chunk [121/1238] bpb=1.076285 time=37.4s + ttt_chunk [131/1238] bpb=1.080146 time=40.0s + ttt_chunk [141/1238] bpb=1.080745 time=42.6s + ttt_chunk [151/1238] bpb=1.080495 time=46.3s + ttt_chunk [161/1238] bpb=1.081092 time=48.9s + ttt_chunk [171/1238] bpb=1.081037 time=51.6s + ttt_chunk [181/1238] bpb=1.079594 time=54.3s + ttt_chunk [191/1238] bpb=1.079371 time=56.9s + ttt_chunk [201/1238] bpb=1.077008 time=59.6s + ttt_chunk [211/1238] 
bpb=1.081377 time=62.3s + ttt_chunk [221/1238] bpb=1.081821 time=65.0s + ttt_chunk [231/1238] bpb=1.083430 time=67.6s + ttt_chunk [241/1238] bpb=1.081679 time=70.3s + ttt_chunk [251/1238] bpb=1.081678 time=73.0s + ttt_chunk [261/1238] bpb=1.082736 time=75.6s + ttt_chunk [271/1238] bpb=1.083118 time=78.3s + ttt_chunk [281/1238] bpb=1.082392 time=80.9s + ttt_chunk [291/1238] bpb=1.083544 time=83.6s + ttt_chunk [301/1238] bpb=1.083748 time=86.2s + ttt_chunk [311/1238] bpb=1.082694 time=88.9s + ttt_chunk [321/1238] bpb=1.082521 time=91.5s + ttt_chunk [331/1238] bpb=1.082746 time=94.2s + ttt_chunk [341/1238] bpb=1.081815 time=96.8s + ttt_chunk [351/1238] bpb=1.082566 time=99.5s + ttt_chunk [361/1238] bpb=1.081499 time=102.2s + ttt_chunk [371/1238] bpb=1.079964 time=104.9s + ttt_chunk [381/1238] bpb=1.080336 time=107.6s + ttt_chunk [391/1238] bpb=1.080019 time=110.2s + ttt_chunk [401/1238] bpb=1.080082 time=112.9s + ttt_chunk [411/1238] bpb=1.080628 time=115.6s + ttt_chunk [421/1238] bpb=1.080115 time=118.2s + ttt_chunk [431/1238] bpb=1.080277 time=120.9s + ttt_chunk [441/1238] bpb=1.080322 time=123.6s + ttt_chunk [451/1238] bpb=1.081518 time=126.3s + ttt_chunk [461/1238] bpb=1.079774 time=129.0s + ttt_chunk [471/1238] bpb=1.079763 time=131.7s + ttt_chunk [481/1238] bpb=1.079955 time=134.3s + ttt_chunk [491/1238] bpb=1.080431 time=137.0s + ttt_chunk [501/1238] bpb=1.080066 time=139.7s + ttt_chunk [511/1238] bpb=1.079709 time=142.3s + ttt_chunk [521/1238] bpb=1.079228 time=145.0s + ttt_chunk [531/1238] bpb=1.079181 time=147.7s + ttt_chunk [541/1238] bpb=1.079268 time=150.4s + ttt_chunk [551/1238] bpb=1.078811 time=153.1s + ttt_chunk [561/1238] bpb=1.078139 time=155.7s + ttt_chunk [571/1238] bpb=1.077603 time=158.4s + ttt_chunk [581/1238] bpb=1.077936 time=161.0s + ttt_chunk [591/1238] bpb=1.078147 time=163.7s + ttt_chunk [601/1238] bpb=1.078100 time=166.4s + ttt_chunk [611/1238] bpb=1.078704 time=169.1s + ttt_chunk [621/1238] bpb=1.079519 time=171.8s + ttt_chunk 
[631/1238] bpb=1.079612 time=174.4s + ttt_chunk [641/1238] bpb=1.080091 time=177.1s + ttt_chunk [651/1238] bpb=1.080410 time=179.7s + ttt_chunk [661/1238] bpb=1.079761 time=182.4s + ttt_chunk [671/1238] bpb=1.079528 time=185.1s + ttt_chunk [681/1238] bpb=1.080825 time=187.7s + ttt_chunk [691/1238] bpb=1.081040 time=190.3s + ttt_chunk [701/1238] bpb=1.080868 time=193.0s + ttt_chunk [711/1238] bpb=1.081575 time=195.7s + ttt_chunk [721/1238] bpb=1.081898 time=198.3s + ttt_chunk [731/1238] bpb=1.081249 time=201.0s + ttt_chunk [741/1238] bpb=1.080933 time=203.6s + ttt_chunk [751/1238] bpb=1.080023 time=206.3s + ttt_chunk [761/1238] bpb=1.079440 time=208.9s + ttt_chunk [771/1238] bpb=1.078425 time=211.6s + ttt_chunk [781/1238] bpb=1.078413 time=214.3s + ttt_chunk [791/1238] bpb=1.078739 time=216.9s + ttt_chunk [801/1238] bpb=1.079031 time=219.6s + ttt_chunk [811/1238] bpb=1.078520 time=222.3s + ttt_chunk [821/1238] bpb=1.077334 time=224.9s + ttt_chunk [831/1238] bpb=1.077004 time=227.6s + ttt_chunk [841/1238] bpb=1.076534 time=230.2s + ttt_chunk [851/1238] bpb=1.076257 time=232.9s + ttt_chunk [861/1238] bpb=1.075927 time=235.6s + ttt_chunk [871/1238] bpb=1.075805 time=238.2s + ttt_chunk [881/1238] bpb=1.075334 time=240.9s + ttt_chunk [891/1238] bpb=1.074814 time=243.5s + ttt_chunk [901/1238] bpb=1.075202 time=246.2s + ttt_chunk [911/1238] bpb=1.074872 time=248.8s + ttt_chunk [921/1238] bpb=1.075146 time=251.5s + ttt_chunk [931/1238] bpb=1.075814 time=254.1s + ttt_chunk [941/1238] bpb=1.076196 time=256.8s + ttt_chunk [951/1238] bpb=1.076099 time=259.5s + ttt_chunk [961/1238] bpb=1.076936 time=262.1s + ttt_chunk [971/1238] bpb=1.077335 time=264.8s + ttt_chunk [981/1238] bpb=1.077703 time=267.4s + ttt_chunk [991/1238] bpb=1.077494 time=270.1s + ttt_chunk [1001/1238] bpb=1.077528 time=272.8s + ttt_chunk [1011/1238] bpb=1.077874 time=275.4s + ttt_chunk [1021/1238] bpb=1.078589 time=278.1s + ttt_chunk [1031/1238] bpb=1.079046 time=280.7s + ttt_chunk [1041/1238] bpb=1.079513 
time=283.4s + ttt_chunk [1051/1238] bpb=1.079439 time=286.1s + ttt_chunk [1061/1238] bpb=1.079454 time=288.7s + ttt_chunk [1071/1238] bpb=1.079607 time=291.3s + ttt_chunk [1081/1238] bpb=1.079504 time=294.0s + ttt_chunk [1091/1238] bpb=1.079706 time=296.6s + ttt_chunk [1101/1238] bpb=1.080237 time=299.3s + ttt_chunk [1111/1238] bpb=1.080528 time=302.0s + ttt_chunk [1121/1238] bpb=1.080703 time=304.6s + ttt_chunk [1131/1238] bpb=1.080373 time=307.2s + ttt_chunk [1141/1238] bpb=1.080022 time=309.9s + ttt_chunk [1151/1238] bpb=1.080075 time=312.5s + ttt_chunk [1161/1238] bpb=1.080210 time=315.2s + ttt_chunk [1171/1238] bpb=1.079987 time=317.8s + ttt_chunk [1181/1238] bpb=1.079519 time=320.5s + ttt_chunk [1191/1238] bpb=1.079634 time=323.1s + ttt_chunk [1201/1238] bpb=1.079662 time=325.8s + ttt_chunk [1211/1238] bpb=1.079337 time=328.4s + ttt_chunk [1221/1238] bpb=1.078874 time=331.0s + ttt_chunk [1231/1238] bpb=1.078516 time=333.7s + ttt_chunk [1238/1238] bpb=1.078533 time=337.6s +ttt_sliding:done val_loss=2.786661 val_bpb=1.07880325 elapsed=337.7s +legal_ttt_exact val_loss:2.78666083 val_bpb:1.07880325 eval_time:337888ms diff --git a/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed314.log b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed314.log new file mode 100644 index 0000000000..93d88741d6 --- /dev/null +++ b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed314.log @@ -0,0 +1,275 @@ +W0410 16:46:41.509000 113840 torch/distributed/run.py:803] +W0410 16:46:41.509000 113840 torch/distributed/run.py:803] ***************************************** +W0410 16:46:41.509000 113840 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0410 16:46:41.509000 113840 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.095 + embedding_dim: 512 + enable_looping_at: 0.35 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/6d27ee1e-89e9-447e-855e-acceb039bb74.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 6d27ee1e-89e9-447e-855e-acceb039bb74 + scalar_lr: 0.02 + seed: 314 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_adamw_wd: 0.0 + ttt_batch_seqs: 32 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_freeze_blocks: 0 + ttt_grad_clip: 1.0 + ttt_lr: 0.005 + ttt_momentum: 0.9 + ttt_optimizer: sgd + val_batch_tokens: 524288 + val_files: 
./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944537 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0092 val_bpb: 3.4877 +1/20000 train_loss: 9.0113 train_time: 0.0m tok/s: 17826197 +2/20000 train_loss: 12.3580 train_time: 0.0m tok/s: 12931119 +3/20000 train_loss: 10.9653 train_time: 0.0m tok/s: 10723421 +4/20000 train_loss: 9.4675 train_time: 0.0m tok/s: 9816072 +5/20000 train_loss: 8.2956 train_time: 0.0m tok/s: 9373151 +500/20000 train_loss: 3.3924 train_time: 0.8m tok/s: 7807326 +1000/20000 train_loss: 3.2902 train_time: 1.7m tok/s: 7794891 +1500/20000 train_loss: 3.1885 train_time: 2.5m tok/s: 7790098 +2000/20000 train_loss: 3.0851 train_time: 3.4m tok/s: 7793071 +layer_loop:enabled step:2039 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1288 train_time: 4.6m tok/s: 7134186 +3000/20000 train_loss: 2.9057 train_time: 5.8m tok/s: 6744149 +3500/20000 train_loss: 2.9467 train_time: 7.1m tok/s: 6491520 +4000/20000 train_loss: 2.8248 train_time: 8.3m tok/s: 6314779 +4000/20000 val_loss: 2.8792 val_bpb: 1.1146 +4500/20000 train_loss: 2.8413 train_time: 9.6m tok/s: 6175744 +4601/20000 val_loss: 2.8080 val_bpb: 1.0871 +stopping_early: wallclock_cap train_time: 588091ms step: 4601/20000 +peak memory allocated: 39948 MiB reserved: 40026 MiB +ema:applying EMA weights +pre-quantization 
post-ema val_loss:2.80607804 val_bpb:1.08632025 eval_time:6165ms +Serialized model: 135408623 bytes +Code size: 19760 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.4s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, lane_merge, skip_gates, skip_weights +Serialized model quantized+brotli: 15963857 bytes +Total submission size quantized+brotli: 15983617 bytes +quantized val_loss:2.83790754 val_bpb:1.09864245 eval_time:8698ms +quantized_sliding_window val_loss:2.79370029 val_bpb:1.08152844 eval_time:92125ms +ttt_sliding:start chunks=1238 chunk_tokens=32768 total_windows=633409 stride=64 ttt_lr=0.005 ttt_epochs=3 freeze_blocks=0 optimizer=sgd +ttt_sliding:params unfrozen=35944537 frozen=0 + ttt_chunk [1/1238] bpb=1.116965 time=4.5s + ttt_chunk [11/1238] bpb=1.070908 time=8.9s + ttt_chunk [21/1238] bpb=1.107106 time=11.5s + ttt_chunk [31/1238] bpb=1.100712 time=14.1s + ttt_chunk [41/1238] bpb=1.093863 time=16.6s + ttt_chunk [51/1238] bpb=1.087473 time=19.2s + ttt_chunk [61/1238] bpb=1.079427 time=21.8s + ttt_chunk [71/1238] bpb=1.086245 time=24.3s + ttt_chunk [81/1238] bpb=1.079471 time=26.9s + ttt_chunk [91/1238] bpb=1.075786 time=29.5s + ttt_chunk [101/1238] bpb=1.075419 time=32.1s + ttt_chunk [111/1238] bpb=1.073441 time=34.6s + ttt_chunk [121/1238] bpb=1.076438 time=37.2s + ttt_chunk [131/1238] bpb=1.080380 time=39.8s + ttt_chunk [141/1238] bpb=1.081049 time=42.4s + ttt_chunk [151/1238] bpb=1.080838 time=46.0s + ttt_chunk [161/1238] bpb=1.081376 time=48.7s + ttt_chunk [171/1238] bpb=1.081178 time=51.3s + ttt_chunk [181/1238] bpb=1.079677 time=54.0s + ttt_chunk [191/1238] bpb=1.079428 time=56.6s + ttt_chunk [201/1238] bpb=1.077052 time=59.2s + ttt_chunk [211/1238] 
bpb=1.081489 time=61.9s + ttt_chunk [221/1238] bpb=1.081834 time=64.5s + ttt_chunk [231/1238] bpb=1.083459 time=67.1s + ttt_chunk [241/1238] bpb=1.081681 time=69.8s + ttt_chunk [251/1238] bpb=1.081655 time=72.4s + ttt_chunk [261/1238] bpb=1.082707 time=75.0s + ttt_chunk [271/1238] bpb=1.083062 time=77.7s + ttt_chunk [281/1238] bpb=1.082362 time=80.4s + ttt_chunk [291/1238] bpb=1.083604 time=83.0s + ttt_chunk [301/1238] bpb=1.083745 time=85.6s + ttt_chunk [311/1238] bpb=1.082609 time=88.3s + ttt_chunk [321/1238] bpb=1.082418 time=90.9s + ttt_chunk [331/1238] bpb=1.082666 time=93.5s + ttt_chunk [341/1238] bpb=1.081801 time=96.2s + ttt_chunk [351/1238] bpb=1.082559 time=98.8s + ttt_chunk [361/1238] bpb=1.081530 time=101.5s + ttt_chunk [371/1238] bpb=1.079983 time=104.1s + ttt_chunk [381/1238] bpb=1.080387 time=106.8s + ttt_chunk [391/1238] bpb=1.080086 time=109.4s + ttt_chunk [401/1238] bpb=1.080128 time=112.1s + ttt_chunk [411/1238] bpb=1.080695 time=114.7s + ttt_chunk [421/1238] bpb=1.080170 time=117.3s + ttt_chunk [431/1238] bpb=1.080329 time=120.0s + ttt_chunk [441/1238] bpb=1.080378 time=122.6s + ttt_chunk [451/1238] bpb=1.081508 time=125.3s + ttt_chunk [461/1238] bpb=1.079767 time=127.9s + ttt_chunk [471/1238] bpb=1.079781 time=130.5s + ttt_chunk [481/1238] bpb=1.079967 time=133.1s + ttt_chunk [491/1238] bpb=1.080405 time=135.8s + ttt_chunk [501/1238] bpb=1.080037 time=138.4s + ttt_chunk [511/1238] bpb=1.079669 time=141.1s + ttt_chunk [521/1238] bpb=1.079181 time=143.7s + ttt_chunk [531/1238] bpb=1.079140 time=146.4s + ttt_chunk [541/1238] bpb=1.079213 time=149.0s + ttt_chunk [551/1238] bpb=1.078759 time=151.6s + ttt_chunk [561/1238] bpb=1.078089 time=154.3s + ttt_chunk [571/1238] bpb=1.077522 time=156.9s + ttt_chunk [581/1238] bpb=1.077843 time=159.6s + ttt_chunk [591/1238] bpb=1.078105 time=162.2s + ttt_chunk [601/1238] bpb=1.078047 time=164.9s + ttt_chunk [611/1238] bpb=1.078644 time=167.5s + ttt_chunk [621/1238] bpb=1.079468 time=170.2s + ttt_chunk 
[631/1238] bpb=1.079545 time=172.8s + ttt_chunk [641/1238] bpb=1.080017 time=175.5s + ttt_chunk [651/1238] bpb=1.080349 time=178.1s + ttt_chunk [661/1238] bpb=1.079682 time=180.8s + ttt_chunk [671/1238] bpb=1.079449 time=183.4s + ttt_chunk [681/1238] bpb=1.080752 time=186.1s + ttt_chunk [691/1238] bpb=1.080989 time=188.7s + ttt_chunk [701/1238] bpb=1.080786 time=191.4s + ttt_chunk [711/1238] bpb=1.081514 time=194.0s + ttt_chunk [721/1238] bpb=1.081812 time=196.7s + ttt_chunk [731/1238] bpb=1.081141 time=199.3s + ttt_chunk [741/1238] bpb=1.080879 time=202.0s + ttt_chunk [751/1238] bpb=1.079965 time=204.6s + ttt_chunk [761/1238] bpb=1.079371 time=207.3s + ttt_chunk [771/1238] bpb=1.078362 time=209.9s + ttt_chunk [781/1238] bpb=1.078342 time=212.5s + ttt_chunk [791/1238] bpb=1.078679 time=215.2s + ttt_chunk [801/1238] bpb=1.078951 time=217.8s + ttt_chunk [811/1238] bpb=1.078488 time=220.5s + ttt_chunk [821/1238] bpb=1.077290 time=223.1s + ttt_chunk [831/1238] bpb=1.076988 time=225.8s + ttt_chunk [841/1238] bpb=1.076520 time=228.4s + ttt_chunk [851/1238] bpb=1.076229 time=231.1s + ttt_chunk [861/1238] bpb=1.075887 time=233.8s + ttt_chunk [871/1238] bpb=1.075772 time=236.4s + ttt_chunk [881/1238] bpb=1.075318 time=239.1s + ttt_chunk [891/1238] bpb=1.074774 time=241.7s + ttt_chunk [901/1238] bpb=1.075154 time=244.4s + ttt_chunk [911/1238] bpb=1.074849 time=247.0s + ttt_chunk [921/1238] bpb=1.075131 time=249.7s + ttt_chunk [931/1238] bpb=1.075833 time=252.3s + ttt_chunk [941/1238] bpb=1.076217 time=255.0s + ttt_chunk [951/1238] bpb=1.076126 time=257.6s + ttt_chunk [961/1238] bpb=1.076974 time=260.3s + ttt_chunk [971/1238] bpb=1.077368 time=262.9s + ttt_chunk [981/1238] bpb=1.077740 time=265.6s + ttt_chunk [991/1238] bpb=1.077522 time=268.2s + ttt_chunk [1001/1238] bpb=1.077565 time=270.9s + ttt_chunk [1011/1238] bpb=1.077920 time=273.5s + ttt_chunk [1021/1238] bpb=1.078622 time=276.2s + ttt_chunk [1031/1238] bpb=1.079108 time=278.8s + ttt_chunk [1041/1238] bpb=1.079585 
time=281.5s + ttt_chunk [1051/1238] bpb=1.079520 time=284.1s + ttt_chunk [1061/1238] bpb=1.079523 time=286.8s + ttt_chunk [1071/1238] bpb=1.079660 time=289.4s + ttt_chunk [1081/1238] bpb=1.079548 time=292.1s + ttt_chunk [1091/1238] bpb=1.079735 time=294.7s + ttt_chunk [1101/1238] bpb=1.080268 time=297.4s + ttt_chunk [1111/1238] bpb=1.080557 time=300.0s + ttt_chunk [1121/1238] bpb=1.080738 time=302.6s + ttt_chunk [1131/1238] bpb=1.080384 time=305.3s + ttt_chunk [1141/1238] bpb=1.080042 time=307.9s + ttt_chunk [1151/1238] bpb=1.080081 time=310.6s + ttt_chunk [1161/1238] bpb=1.080204 time=313.2s + ttt_chunk [1171/1238] bpb=1.079979 time=315.8s + ttt_chunk [1181/1238] bpb=1.079510 time=318.5s + ttt_chunk [1191/1238] bpb=1.079651 time=321.2s + ttt_chunk [1201/1238] bpb=1.079709 time=323.8s + ttt_chunk [1211/1238] bpb=1.079397 time=326.5s + ttt_chunk [1221/1238] bpb=1.078940 time=329.1s + ttt_chunk [1231/1238] bpb=1.078589 time=331.8s + ttt_chunk [1238/1238] bpb=1.078590 time=335.8s +ttt_sliding:done val_loss=2.786848 val_bpb=1.07887587 elapsed=335.9s +legal_ttt_exact val_loss:2.78684843 val_bpb:1.07887587 eval_time:336069ms diff --git a/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed42.log b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed42.log new file mode 100644 index 0000000000..ed3bdd4235 --- /dev/null +++ b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed42.log @@ -0,0 +1,275 @@ +W0410 16:22:58.134000 96964 torch/distributed/run.py:803] +W0410 16:22:58.134000 96964 torch/distributed/run.py:803] ***************************************** +W0410 16:22:58.134000 96964 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0410 16:22:58.134000 96964 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.095 + embedding_dim: 512 + enable_looping_at: 0.35 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/c98bf494-f7a2-4fd1-aa66-e16bc11874b3.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: c98bf494-f7a2-4fd1-aa66-e16bc11874b3 + scalar_lr: 0.02 + seed: 42 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_adamw_wd: 0.0 + ttt_batch_seqs: 32 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_freeze_blocks: 0 + ttt_grad_clip: 1.0 + ttt_lr: 0.005 + ttt_momentum: 0.9 + ttt_optimizer: sgd + val_batch_tokens: 524288 + val_files: 
./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944537 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0078 val_bpb: 3.4872 +1/20000 train_loss: 9.0109 train_time: 0.0m tok/s: 17644861 +2/20000 train_loss: 12.3669 train_time: 0.0m tok/s: 12996480 +3/20000 train_loss: 10.9491 train_time: 0.0m tok/s: 10750105 +4/20000 train_loss: 9.3755 train_time: 0.0m tok/s: 9860583 +5/20000 train_loss: 8.2196 train_time: 0.0m tok/s: 9360045 +500/20000 train_loss: 3.3880 train_time: 0.8m tok/s: 7807700 +1000/20000 train_loss: 3.2868 train_time: 1.7m tok/s: 7795982 +1500/20000 train_loss: 3.1860 train_time: 2.5m tok/s: 7793322 +2000/20000 train_loss: 3.0792 train_time: 3.4m tok/s: 7793684 +layer_loop:enabled step:2039 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1269 train_time: 4.6m tok/s: 7168520 +3000/20000 train_loss: 2.9035 train_time: 5.8m tok/s: 6767468 +3500/20000 train_loss: 2.9458 train_time: 7.0m tok/s: 6509658 +4000/20000 train_loss: 2.8225 train_time: 8.3m tok/s: 6329004 +4000/20000 val_loss: 2.8793 val_bpb: 1.1147 +4500/20000 train_loss: 2.8391 train_time: 9.5m tok/s: 6189597 +4610/20000 val_loss: 2.8074 val_bpb: 1.0868 +stopping_early: wallclock_cap train_time: 588151ms step: 4610/20000 +peak memory allocated: 39955 MiB reserved: 39980 MiB +ema:applying EMA weights +pre-quantization 
post-ema val_loss:2.80509889 val_bpb:1.08594120 eval_time:6798ms +Serialized model: 135408623 bytes +Code size: 19760 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.4s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, lane_merge, skip_gates, skip_weights +Serialized model quantized+brotli: 15969070 bytes +Total submission size quantized+brotli: 15988830 bytes +quantized val_loss:2.83810821 val_bpb:1.09872013 eval_time:8682ms +quantized_sliding_window val_loss:2.79369665 val_bpb:1.08152703 eval_time:92793ms +ttt_sliding:start chunks=1238 chunk_tokens=32768 total_windows=633409 stride=64 ttt_lr=0.005 ttt_epochs=3 freeze_blocks=0 optimizer=sgd +ttt_sliding:params unfrozen=35944537 frozen=0 + ttt_chunk [1/1238] bpb=1.121541 time=4.5s + ttt_chunk [11/1238] bpb=1.070638 time=9.5s + ttt_chunk [21/1238] bpb=1.108076 time=13.5s + ttt_chunk [31/1238] bpb=1.101265 time=16.0s + ttt_chunk [41/1238] bpb=1.094476 time=18.5s + ttt_chunk [51/1238] bpb=1.087755 time=21.0s + ttt_chunk [61/1238] bpb=1.079447 time=23.5s + ttt_chunk [71/1238] bpb=1.086318 time=26.1s + ttt_chunk [81/1238] bpb=1.079529 time=28.6s + ttt_chunk [91/1238] bpb=1.076137 time=31.1s + ttt_chunk [101/1238] bpb=1.075793 time=33.5s + ttt_chunk [111/1238] bpb=1.073865 time=36.0s + ttt_chunk [121/1238] bpb=1.076781 time=38.5s + ttt_chunk [131/1238] bpb=1.080689 time=41.1s + ttt_chunk [141/1238] bpb=1.081177 time=44.3s + ttt_chunk [151/1238] bpb=1.081006 time=46.8s + ttt_chunk [161/1238] bpb=1.081566 time=49.3s + ttt_chunk [171/1238] bpb=1.081396 time=51.8s + ttt_chunk [181/1238] bpb=1.079927 time=54.3s + ttt_chunk [191/1238] bpb=1.079586 time=56.9s + ttt_chunk [201/1238] bpb=1.077162 time=59.5s + ttt_chunk [211/1238] 
bpb=1.081604 time=62.0s + ttt_chunk [221/1238] bpb=1.081939 time=64.5s + ttt_chunk [231/1238] bpb=1.083581 time=67.0s + ttt_chunk [241/1238] bpb=1.081866 time=69.6s + ttt_chunk [251/1238] bpb=1.081838 time=72.1s + ttt_chunk [261/1238] bpb=1.082840 time=74.6s + ttt_chunk [271/1238] bpb=1.083292 time=77.1s + ttt_chunk [281/1238] bpb=1.082611 time=79.6s + ttt_chunk [291/1238] bpb=1.083771 time=82.1s + ttt_chunk [301/1238] bpb=1.083904 time=84.7s + ttt_chunk [311/1238] bpb=1.082755 time=87.2s + ttt_chunk [321/1238] bpb=1.082615 time=89.7s + ttt_chunk [331/1238] bpb=1.082881 time=92.2s + ttt_chunk [341/1238] bpb=1.081967 time=94.7s + ttt_chunk [351/1238] bpb=1.082634 time=97.2s + ttt_chunk [361/1238] bpb=1.081577 time=99.7s + ttt_chunk [371/1238] bpb=1.080020 time=102.2s + ttt_chunk [381/1238] bpb=1.080445 time=104.7s + ttt_chunk [391/1238] bpb=1.080109 time=107.3s + ttt_chunk [401/1238] bpb=1.080197 time=109.8s + ttt_chunk [411/1238] bpb=1.080763 time=112.4s + ttt_chunk [421/1238] bpb=1.080287 time=114.9s + ttt_chunk [431/1238] bpb=1.080474 time=117.4s + ttt_chunk [441/1238] bpb=1.080533 time=119.9s + ttt_chunk [451/1238] bpb=1.081696 time=122.5s + ttt_chunk [461/1238] bpb=1.079957 time=125.0s + ttt_chunk [471/1238] bpb=1.079971 time=127.5s + ttt_chunk [481/1238] bpb=1.080181 time=130.0s + ttt_chunk [491/1238] bpb=1.080673 time=132.5s + ttt_chunk [501/1238] bpb=1.080252 time=135.0s + ttt_chunk [511/1238] bpb=1.079881 time=137.5s + ttt_chunk [521/1238] bpb=1.079409 time=140.0s + ttt_chunk [531/1238] bpb=1.079372 time=142.5s + ttt_chunk [541/1238] bpb=1.079466 time=145.0s + ttt_chunk [551/1238] bpb=1.079015 time=147.5s + ttt_chunk [561/1238] bpb=1.078325 time=150.1s + ttt_chunk [571/1238] bpb=1.077778 time=152.6s + ttt_chunk [581/1238] bpb=1.078147 time=155.1s + ttt_chunk [591/1238] bpb=1.078375 time=157.6s + ttt_chunk [601/1238] bpb=1.078269 time=160.2s + ttt_chunk [611/1238] bpb=1.078854 time=162.7s + ttt_chunk [621/1238] bpb=1.079706 time=165.2s + ttt_chunk [631/1238] 
bpb=1.079746 time=167.7s + ttt_chunk [641/1238] bpb=1.080230 time=170.2s + ttt_chunk [651/1238] bpb=1.080557 time=172.7s + ttt_chunk [661/1238] bpb=1.079920 time=175.2s + ttt_chunk [671/1238] bpb=1.079691 time=177.8s + ttt_chunk [681/1238] bpb=1.080983 time=180.3s + ttt_chunk [691/1238] bpb=1.081185 time=182.8s + ttt_chunk [701/1238] bpb=1.080990 time=185.3s + ttt_chunk [711/1238] bpb=1.081682 time=187.8s + ttt_chunk [721/1238] bpb=1.081948 time=190.3s + ttt_chunk [731/1238] bpb=1.081272 time=192.8s + ttt_chunk [741/1238] bpb=1.080948 time=195.4s + ttt_chunk [751/1238] bpb=1.080028 time=197.9s + ttt_chunk [761/1238] bpb=1.079424 time=200.4s + ttt_chunk [771/1238] bpb=1.078413 time=202.9s + ttt_chunk [781/1238] bpb=1.078421 time=205.5s + ttt_chunk [791/1238] bpb=1.078779 time=208.0s + ttt_chunk [801/1238] bpb=1.079058 time=210.5s + ttt_chunk [811/1238] bpb=1.078564 time=213.0s + ttt_chunk [821/1238] bpb=1.077348 time=215.5s + ttt_chunk [831/1238] bpb=1.077053 time=218.0s + ttt_chunk [841/1238] bpb=1.076578 time=220.5s + ttt_chunk [851/1238] bpb=1.076303 time=223.1s + ttt_chunk [861/1238] bpb=1.075967 time=225.6s + ttt_chunk [871/1238] bpb=1.075862 time=228.1s + ttt_chunk [881/1238] bpb=1.075410 time=230.6s + ttt_chunk [891/1238] bpb=1.074859 time=233.1s + ttt_chunk [901/1238] bpb=1.075250 time=235.6s + ttt_chunk [911/1238] bpb=1.074938 time=238.1s + ttt_chunk [921/1238] bpb=1.075202 time=240.6s + ttt_chunk [931/1238] bpb=1.075883 time=243.1s + ttt_chunk [941/1238] bpb=1.076282 time=245.7s + ttt_chunk [951/1238] bpb=1.076182 time=248.2s + ttt_chunk [961/1238] bpb=1.077009 time=250.7s + ttt_chunk [971/1238] bpb=1.077416 time=253.2s + ttt_chunk [981/1238] bpb=1.077774 time=255.7s + ttt_chunk [991/1238] bpb=1.077565 time=258.3s + ttt_chunk [1001/1238] bpb=1.077612 time=260.8s + ttt_chunk [1011/1238] bpb=1.077961 time=263.3s + ttt_chunk [1021/1238] bpb=1.078664 time=265.8s + ttt_chunk [1031/1238] bpb=1.079116 time=268.3s + ttt_chunk [1041/1238] bpb=1.079594 time=270.8s + 
ttt_chunk [1051/1238] bpb=1.079511 time=273.3s + ttt_chunk [1061/1238] bpb=1.079528 time=275.8s + ttt_chunk [1071/1238] bpb=1.079695 time=278.3s + ttt_chunk [1081/1238] bpb=1.079613 time=280.8s + ttt_chunk [1091/1238] bpb=1.079805 time=283.3s + ttt_chunk [1101/1238] bpb=1.080354 time=285.8s + ttt_chunk [1111/1238] bpb=1.080641 time=288.3s + ttt_chunk [1121/1238] bpb=1.080820 time=290.8s + ttt_chunk [1131/1238] bpb=1.080488 time=293.3s + ttt_chunk [1141/1238] bpb=1.080164 time=295.8s + ttt_chunk [1151/1238] bpb=1.080201 time=298.3s + ttt_chunk [1161/1238] bpb=1.080321 time=300.8s + ttt_chunk [1171/1238] bpb=1.080097 time=303.3s + ttt_chunk [1181/1238] bpb=1.079630 time=305.8s + ttt_chunk [1191/1238] bpb=1.079782 time=308.3s + ttt_chunk [1201/1238] bpb=1.079846 time=310.8s + ttt_chunk [1211/1238] bpb=1.079538 time=313.3s + ttt_chunk [1221/1238] bpb=1.079079 time=315.9s + ttt_chunk [1231/1238] bpb=1.078718 time=318.4s + ttt_chunk [1238/1238] bpb=1.078718 time=321.9s +ttt_sliding:done val_loss=2.786569 val_bpb=1.07876776 elapsed=322.6s +legal_ttt_exact val_loss:2.78656915 val_bpb:1.07876776 eval_time:322819ms diff --git a/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed7.log b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed7.log new file mode 100644 index 0000000000..2d650cb086 --- /dev/null +++ b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed7.log @@ -0,0 +1,275 @@ +W0410 17:48:56.228000 116888 torch/distributed/run.py:803] +W0410 17:48:56.228000 116888 torch/distributed/run.py:803] ***************************************** +W0410 17:48:56.228000 116888 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0410 17:48:56.228000 116888 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.095 + embedding_dim: 512 + enable_looping_at: 0.35 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/17bd3462-085d-4688-b726-abb548eee4ca.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 17bd3462-085d-4688-b726-abb548eee4ca + scalar_lr: 0.02 + seed: 7 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_adamw_wd: 0.0 + ttt_batch_seqs: 32 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_freeze_blocks: 0 + ttt_grad_clip: 1.0 + ttt_lr: 0.005 + ttt_momentum: 0.9 + ttt_optimizer: sgd + val_batch_tokens: 524288 + val_files: 
./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944537 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0064 val_bpb: 3.4867 +1/20000 train_loss: 9.0083 train_time: 0.0m tok/s: 17981982 +2/20000 train_loss: 12.3005 train_time: 0.0m tok/s: 13029357 +3/20000 train_loss: 10.8994 train_time: 0.0m tok/s: 10752932 +4/20000 train_loss: 9.3730 train_time: 0.0m tok/s: 9885845 +5/20000 train_loss: 8.2350 train_time: 0.0m tok/s: 9384333 +500/20000 train_loss: 3.3929 train_time: 0.8m tok/s: 7786933 +1000/20000 train_loss: 3.2935 train_time: 1.7m tok/s: 7777873 +1500/20000 train_loss: 3.1878 train_time: 2.5m tok/s: 7782721 +2000/20000 train_loss: 3.0848 train_time: 3.4m tok/s: 7787177 +layer_loop:enabled step:2038 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1316 train_time: 4.6m tok/s: 7128072 +3000/20000 train_loss: 2.9064 train_time: 5.8m tok/s: 6740176 +3500/20000 train_loss: 2.9498 train_time: 7.1m tok/s: 6488148 +4000/20000 train_loss: 2.8269 train_time: 8.3m tok/s: 6311779 +4000/20000 val_loss: 2.8800 val_bpb: 1.1149 +4500/20000 train_loss: 2.8443 train_time: 9.6m tok/s: 6173084 +4600/20000 val_loss: 2.8094 val_bpb: 1.0876 +stopping_early: wallclock_cap train_time: 588183ms step: 4600/20000 +peak memory allocated: 39948 MiB reserved: 40026 MiB +ema:applying EMA weights +pre-quantization 
post-ema val_loss:2.80743773 val_bpb:1.08684663 eval_time:6137ms +Serialized model: 135408623 bytes +Code size: 19760 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.4s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, lane_merge, skip_gates, skip_weights +Serialized model quantized+brotli: 15966189 bytes +Total submission size quantized+brotli: 15985949 bytes +quantized val_loss:2.83706164 val_bpb:1.09831497 eval_time:8707ms +quantized_sliding_window val_loss:2.79295740 val_bpb:1.08124085 eval_time:92136ms +ttt_sliding:start chunks=1238 chunk_tokens=32768 total_windows=633409 stride=64 ttt_lr=0.005 ttt_epochs=3 freeze_blocks=0 optimizer=sgd +ttt_sliding:params unfrozen=35944537 frozen=0 + ttt_chunk [1/1238] bpb=1.117126 time=4.5s + ttt_chunk [11/1238] bpb=1.068764 time=8.9s + ttt_chunk [21/1238] bpb=1.106020 time=11.3s + ttt_chunk [31/1238] bpb=1.099846 time=13.8s + ttt_chunk [41/1238] bpb=1.093297 time=16.2s + ttt_chunk [51/1238] bpb=1.086984 time=18.7s + ttt_chunk [61/1238] bpb=1.078741 time=21.1s + ttt_chunk [71/1238] bpb=1.085899 time=23.6s + ttt_chunk [81/1238] bpb=1.079076 time=26.1s + ttt_chunk [91/1238] bpb=1.075865 time=28.5s + ttt_chunk [101/1238] bpb=1.075723 time=31.0s + ttt_chunk [111/1238] bpb=1.074058 time=33.5s + ttt_chunk [121/1238] bpb=1.076926 time=35.9s + ttt_chunk [131/1238] bpb=1.080607 time=38.4s + ttt_chunk [141/1238] bpb=1.081238 time=40.9s + ttt_chunk [151/1238] bpb=1.081030 time=44.4s + ttt_chunk [161/1238] bpb=1.081511 time=46.9s + ttt_chunk [171/1238] bpb=1.081484 time=49.3s + ttt_chunk [181/1238] bpb=1.080056 time=51.8s + ttt_chunk [191/1238] bpb=1.079827 time=54.3s + ttt_chunk [201/1238] bpb=1.077490 time=56.7s + ttt_chunk [211/1238] 
bpb=1.081866 time=59.2s + ttt_chunk [221/1238] bpb=1.082157 time=61.7s + ttt_chunk [231/1238] bpb=1.083830 time=64.2s + ttt_chunk [241/1238] bpb=1.082032 time=66.7s + ttt_chunk [251/1238] bpb=1.082011 time=69.2s + ttt_chunk [261/1238] bpb=1.083082 time=71.7s + ttt_chunk [271/1238] bpb=1.083503 time=74.2s + ttt_chunk [281/1238] bpb=1.082869 time=76.7s + ttt_chunk [291/1238] bpb=1.084010 time=79.2s + ttt_chunk [301/1238] bpb=1.084200 time=81.7s + ttt_chunk [311/1238] bpb=1.083141 time=84.1s + ttt_chunk [321/1238] bpb=1.082984 time=86.6s + ttt_chunk [331/1238] bpb=1.083268 time=89.2s + ttt_chunk [341/1238] bpb=1.082406 time=91.6s + ttt_chunk [351/1238] bpb=1.083084 time=94.1s + ttt_chunk [361/1238] bpb=1.082000 time=96.6s + ttt_chunk [371/1238] bpb=1.080407 time=99.1s + ttt_chunk [381/1238] bpb=1.080856 time=101.6s + ttt_chunk [391/1238] bpb=1.080529 time=104.1s + ttt_chunk [401/1238] bpb=1.080603 time=106.5s + ttt_chunk [411/1238] bpb=1.081149 time=109.0s + ttt_chunk [421/1238] bpb=1.080666 time=111.5s + ttt_chunk [431/1238] bpb=1.080826 time=114.0s + ttt_chunk [441/1238] bpb=1.080892 time=116.5s + ttt_chunk [451/1238] bpb=1.082026 time=119.0s + ttt_chunk [461/1238] bpb=1.080325 time=121.5s + ttt_chunk [471/1238] bpb=1.080349 time=123.9s + ttt_chunk [481/1238] bpb=1.080506 time=126.4s + ttt_chunk [491/1238] bpb=1.080962 time=128.9s + ttt_chunk [501/1238] bpb=1.080553 time=131.4s + ttt_chunk [511/1238] bpb=1.080159 time=133.9s + ttt_chunk [521/1238] bpb=1.079684 time=136.4s + ttt_chunk [531/1238] bpb=1.079661 time=138.9s + ttt_chunk [541/1238] bpb=1.079738 time=141.3s + ttt_chunk [551/1238] bpb=1.079272 time=143.9s + ttt_chunk [561/1238] bpb=1.078591 time=146.3s + ttt_chunk [571/1238] bpb=1.078074 time=148.8s + ttt_chunk [581/1238] bpb=1.078410 time=151.3s + ttt_chunk [591/1238] bpb=1.078647 time=153.8s + ttt_chunk [601/1238] bpb=1.078552 time=156.3s + ttt_chunk [611/1238] bpb=1.079150 time=158.8s + ttt_chunk [621/1238] bpb=1.079997 time=161.2s + ttt_chunk [631/1238] 
bpb=1.080087 time=163.7s + ttt_chunk [641/1238] bpb=1.080552 time=166.2s + ttt_chunk [651/1238] bpb=1.080915 time=168.7s + ttt_chunk [661/1238] bpb=1.080275 time=171.2s + ttt_chunk [671/1238] bpb=1.080058 time=173.7s + ttt_chunk [681/1238] bpb=1.081353 time=176.2s + ttt_chunk [691/1238] bpb=1.081563 time=178.7s + ttt_chunk [701/1238] bpb=1.081368 time=181.2s + ttt_chunk [711/1238] bpb=1.082055 time=183.7s + ttt_chunk [721/1238] bpb=1.082378 time=186.2s + ttt_chunk [731/1238] bpb=1.081736 time=188.7s + ttt_chunk [741/1238] bpb=1.081451 time=191.2s + ttt_chunk [751/1238] bpb=1.080530 time=193.6s + ttt_chunk [761/1238] bpb=1.079920 time=196.1s + ttt_chunk [771/1238] bpb=1.078901 time=198.6s + ttt_chunk [781/1238] bpb=1.078884 time=201.1s + ttt_chunk [791/1238] bpb=1.079233 time=203.6s + ttt_chunk [801/1238] bpb=1.079525 time=206.1s + ttt_chunk [811/1238] bpb=1.079022 time=208.6s + ttt_chunk [821/1238] bpb=1.077828 time=211.1s + ttt_chunk [831/1238] bpb=1.077535 time=213.7s + ttt_chunk [841/1238] bpb=1.077054 time=216.1s + ttt_chunk [851/1238] bpb=1.076776 time=218.6s + ttt_chunk [861/1238] bpb=1.076428 time=221.1s + ttt_chunk [871/1238] bpb=1.076329 time=223.6s + ttt_chunk [881/1238] bpb=1.075871 time=226.1s + ttt_chunk [891/1238] bpb=1.075323 time=228.5s + ttt_chunk [901/1238] bpb=1.075710 time=231.0s + ttt_chunk [911/1238] bpb=1.075414 time=233.5s + ttt_chunk [921/1238] bpb=1.075689 time=236.0s + ttt_chunk [931/1238] bpb=1.076369 time=238.5s + ttt_chunk [941/1238] bpb=1.076760 time=241.0s + ttt_chunk [951/1238] bpb=1.076668 time=243.5s + ttt_chunk [961/1238] bpb=1.077502 time=246.0s + ttt_chunk [971/1238] bpb=1.077900 time=248.4s + ttt_chunk [981/1238] bpb=1.078271 time=250.9s + ttt_chunk [991/1238] bpb=1.078068 time=253.4s + ttt_chunk [1001/1238] bpb=1.078111 time=255.9s + ttt_chunk [1011/1238] bpb=1.078456 time=258.4s + ttt_chunk [1021/1238] bpb=1.079195 time=260.9s + ttt_chunk [1031/1238] bpb=1.079646 time=263.3s + ttt_chunk [1041/1238] bpb=1.080135 time=265.8s + 
ttt_chunk [1051/1238] bpb=1.080072 time=268.3s + ttt_chunk [1061/1238] bpb=1.080067 time=270.8s + ttt_chunk [1071/1238] bpb=1.080233 time=273.4s + ttt_chunk [1081/1238] bpb=1.080116 time=275.8s + ttt_chunk [1091/1238] bpb=1.080314 time=278.3s + ttt_chunk [1101/1238] bpb=1.080879 time=280.8s + ttt_chunk [1111/1238] bpb=1.081176 time=283.3s + ttt_chunk [1121/1238] bpb=1.081349 time=285.8s + ttt_chunk [1131/1238] bpb=1.081019 time=288.2s + ttt_chunk [1141/1238] bpb=1.080687 time=290.7s + ttt_chunk [1151/1238] bpb=1.080720 time=293.2s + ttt_chunk [1161/1238] bpb=1.080847 time=295.7s + ttt_chunk [1171/1238] bpb=1.080644 time=298.1s + ttt_chunk [1181/1238] bpb=1.080178 time=300.6s + ttt_chunk [1191/1238] bpb=1.080318 time=303.1s + ttt_chunk [1201/1238] bpb=1.080380 time=305.6s + ttt_chunk [1211/1238] bpb=1.080060 time=308.0s + ttt_chunk [1221/1238] bpb=1.079617 time=310.5s + ttt_chunk [1231/1238] bpb=1.079260 time=313.0s + ttt_chunk [1238/1238] bpb=1.079257 time=316.8s +ttt_sliding:done val_loss=2.787994 val_bpb=1.07931921 elapsed=316.9s +legal_ttt_exact val_loss:2.78799360 val_bpb:1.07931921 eval_time:317072ms diff --git a/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed999.log b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed999.log new file mode 100644 index 0000000000..48f65f48c3 --- /dev/null +++ b/records/track_10min_16mb/2026-04-11_SP8192_Banking_ParResid_TripleRecur_Muon97_TTT/train_seed999.log @@ -0,0 +1,275 @@ +W0410 17:07:36.476000 114855 torch/distributed/run.py:803] +W0410 17:07:36.476000 114855 torch/distributed/run.py:803] ***************************************** +W0410 17:07:36.476000 114855 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0410 17:07:36.476000 114855 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.095 + embedding_dim: 512 + enable_looping_at: 0.35 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/309cb7f2-6e76-467c-aed5-1d9b2e5bc1f8.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 309cb7f2-6e76-467c-aed5-1d9b2e5bc1f8 + scalar_lr: 0.02 + seed: 999 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_adamw_wd: 0.0 + ttt_batch_seqs: 32 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_freeze_blocks: 0 + ttt_grad_clip: 1.0 + ttt_lr: 0.005 + ttt_momentum: 0.9 + ttt_optimizer: sgd + val_batch_tokens: 524288 + val_files: 
./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944537 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0088 val_bpb: 3.4876 +1/20000 train_loss: 9.0104 train_time: 0.0m tok/s: 17877488 +2/20000 train_loss: 12.3210 train_time: 0.0m tok/s: 12916150 +3/20000 train_loss: 10.9271 train_time: 0.0m tok/s: 10699581 +4/20000 train_loss: 9.3682 train_time: 0.0m tok/s: 9788869 +5/20000 train_loss: 8.2157 train_time: 0.0m tok/s: 9304786 +500/20000 train_loss: 3.3914 train_time: 0.8m tok/s: 7803097 +1000/20000 train_loss: 3.2924 train_time: 1.7m tok/s: 7786652 +1500/20000 train_loss: 3.1882 train_time: 2.5m tok/s: 7791909 +2000/20000 train_loss: 3.0837 train_time: 3.4m tok/s: 7797183 +layer_loop:enabled step:2040 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1267 train_time: 4.6m tok/s: 7137524 +3000/20000 train_loss: 2.9102 train_time: 5.8m tok/s: 6746928 +3500/20000 train_loss: 2.9474 train_time: 7.1m tok/s: 6493605 +4000/20000 train_loss: 2.8223 train_time: 8.3m tok/s: 6316417 +4000/20000 val_loss: 2.8806 val_bpb: 1.1152 +4500/20000 train_loss: 2.8424 train_time: 9.6m tok/s: 6173714 +4600/20000 val_loss: 2.8094 val_bpb: 1.0876 +stopping_early: wallclock_cap train_time: 588128ms step: 4600/20000 +peak memory allocated: 39948 MiB reserved: 40026 MiB +ema:applying EMA weights +pre-quantization 
post-ema val_loss:2.80732869 val_bpb:1.08680442 eval_time:6181ms +Serialized model: 135408623 bytes +Code size: 19760 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.4s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, lane_merge, skip_gates, skip_weights +Serialized model quantized+brotli: 15966656 bytes +Total submission size quantized+brotli: 15986416 bytes +quantized val_loss:2.83780389 val_bpb:1.09860232 eval_time:8848ms +quantized_sliding_window val_loss:2.79386229 val_bpb:1.08159116 eval_time:92095ms +ttt_sliding:start chunks=1238 chunk_tokens=32768 total_windows=633409 stride=64 ttt_lr=0.005 ttt_epochs=3 freeze_blocks=0 optimizer=sgd +ttt_sliding:params unfrozen=35944537 frozen=0 + ttt_chunk [1/1238] bpb=1.113886 time=4.4s + ttt_chunk [11/1238] bpb=1.068858 time=8.8s + ttt_chunk [21/1238] bpb=1.107136 time=11.3s + ttt_chunk [31/1238] bpb=1.100883 time=13.8s + ttt_chunk [41/1238] bpb=1.094188 time=16.3s + ttt_chunk [51/1238] bpb=1.087376 time=18.8s + ttt_chunk [61/1238] bpb=1.079422 time=21.3s + ttt_chunk [71/1238] bpb=1.086608 time=23.8s + ttt_chunk [81/1238] bpb=1.079957 time=26.3s + ttt_chunk [91/1238] bpb=1.076650 time=28.8s + ttt_chunk [101/1238] bpb=1.076548 time=31.2s + ttt_chunk [111/1238] bpb=1.074757 time=33.7s + ttt_chunk [121/1238] bpb=1.077737 time=36.2s + ttt_chunk [131/1238] bpb=1.081431 time=38.7s + ttt_chunk [141/1238] bpb=1.082024 time=41.2s + ttt_chunk [151/1238] bpb=1.081699 time=44.7s + ttt_chunk [161/1238] bpb=1.082216 time=47.2s + ttt_chunk [171/1238] bpb=1.082089 time=49.7s + ttt_chunk [181/1238] bpb=1.080566 time=52.2s + ttt_chunk [191/1238] bpb=1.080379 time=54.7s + ttt_chunk [201/1238] bpb=1.077905 time=57.2s + ttt_chunk [211/1238] 
bpb=1.082251 time=59.7s + ttt_chunk [221/1238] bpb=1.082574 time=62.2s + ttt_chunk [231/1238] bpb=1.084259 time=64.7s + ttt_chunk [241/1238] bpb=1.082448 time=67.2s + ttt_chunk [251/1238] bpb=1.082389 time=69.7s + ttt_chunk [261/1238] bpb=1.083412 time=72.2s + ttt_chunk [271/1238] bpb=1.083820 time=74.7s + ttt_chunk [281/1238] bpb=1.083148 time=77.2s + ttt_chunk [291/1238] bpb=1.084318 time=79.7s + ttt_chunk [301/1238] bpb=1.084477 time=82.2s + ttt_chunk [311/1238] bpb=1.083320 time=84.7s + ttt_chunk [321/1238] bpb=1.083229 time=87.2s + ttt_chunk [331/1238] bpb=1.083533 time=89.7s + ttt_chunk [341/1238] bpb=1.082589 time=92.2s + ttt_chunk [351/1238] bpb=1.083335 time=94.7s + ttt_chunk [361/1238] bpb=1.082235 time=97.2s + ttt_chunk [371/1238] bpb=1.080661 time=99.7s + ttt_chunk [381/1238] bpb=1.081082 time=102.2s + ttt_chunk [391/1238] bpb=1.080762 time=104.7s + ttt_chunk [401/1238] bpb=1.080842 time=107.2s + ttt_chunk [411/1238] bpb=1.081363 time=109.7s + ttt_chunk [421/1238] bpb=1.080867 time=112.2s + ttt_chunk [431/1238] bpb=1.081103 time=114.7s + ttt_chunk [441/1238] bpb=1.081163 time=117.2s + ttt_chunk [451/1238] bpb=1.082361 time=119.7s + ttt_chunk [461/1238] bpb=1.080622 time=122.2s + ttt_chunk [471/1238] bpb=1.080624 time=124.7s + ttt_chunk [481/1238] bpb=1.080802 time=127.2s + ttt_chunk [491/1238] bpb=1.081282 time=129.7s + ttt_chunk [501/1238] bpb=1.080917 time=132.2s + ttt_chunk [511/1238] bpb=1.080519 time=134.7s + ttt_chunk [521/1238] bpb=1.080050 time=137.2s + ttt_chunk [531/1238] bpb=1.080012 time=139.7s + ttt_chunk [541/1238] bpb=1.080094 time=142.2s + ttt_chunk [551/1238] bpb=1.079640 time=144.8s + ttt_chunk [561/1238] bpb=1.078958 time=147.3s + ttt_chunk [571/1238] bpb=1.078412 time=149.8s + ttt_chunk [581/1238] bpb=1.078777 time=152.3s + ttt_chunk [591/1238] bpb=1.078991 time=154.8s + ttt_chunk [601/1238] bpb=1.078882 time=157.3s + ttt_chunk [611/1238] bpb=1.079445 time=159.8s + ttt_chunk [621/1238] bpb=1.080265 time=162.3s + ttt_chunk [631/1238] 
bpb=1.080318 time=164.8s + ttt_chunk [641/1238] bpb=1.080788 time=167.3s + ttt_chunk [651/1238] bpb=1.081116 time=169.9s + ttt_chunk [661/1238] bpb=1.080467 time=172.3s + ttt_chunk [671/1238] bpb=1.080236 time=174.8s + ttt_chunk [681/1238] bpb=1.081524 time=177.3s + ttt_chunk [691/1238] bpb=1.081716 time=179.8s + ttt_chunk [701/1238] bpb=1.081551 time=182.3s + ttt_chunk [711/1238] bpb=1.082241 time=184.9s + ttt_chunk [721/1238] bpb=1.082546 time=187.4s + ttt_chunk [731/1238] bpb=1.081896 time=189.9s + ttt_chunk [741/1238] bpb=1.081575 time=192.4s + ttt_chunk [751/1238] bpb=1.080666 time=194.9s + ttt_chunk [761/1238] bpb=1.080065 time=197.4s + ttt_chunk [771/1238] bpb=1.079068 time=199.9s + ttt_chunk [781/1238] bpb=1.079041 time=202.4s + ttt_chunk [791/1238] bpb=1.079389 time=204.9s + ttt_chunk [801/1238] bpb=1.079681 time=207.4s + ttt_chunk [811/1238] bpb=1.079166 time=209.9s + ttt_chunk [821/1238] bpb=1.077985 time=212.4s + ttt_chunk [831/1238] bpb=1.077673 time=215.0s + ttt_chunk [841/1238] bpb=1.077199 time=217.5s + ttt_chunk [851/1238] bpb=1.076908 time=220.0s + ttt_chunk [861/1238] bpb=1.076544 time=222.5s + ttt_chunk [871/1238] bpb=1.076447 time=225.0s + ttt_chunk [881/1238] bpb=1.076004 time=227.5s + ttt_chunk [891/1238] bpb=1.075473 time=230.0s + ttt_chunk [901/1238] bpb=1.075849 time=232.6s + ttt_chunk [911/1238] bpb=1.075532 time=235.0s + ttt_chunk [921/1238] bpb=1.075818 time=237.5s + ttt_chunk [931/1238] bpb=1.076509 time=240.1s + ttt_chunk [941/1238] bpb=1.076862 time=242.6s + ttt_chunk [951/1238] bpb=1.076783 time=245.1s + ttt_chunk [961/1238] bpb=1.077610 time=247.6s + ttt_chunk [971/1238] bpb=1.078012 time=250.1s + ttt_chunk [981/1238] bpb=1.078384 time=252.6s + ttt_chunk [991/1238] bpb=1.078168 time=255.2s + ttt_chunk [1001/1238] bpb=1.078217 time=257.7s + ttt_chunk [1011/1238] bpb=1.078568 time=260.2s + ttt_chunk [1021/1238] bpb=1.079285 time=262.6s + ttt_chunk [1031/1238] bpb=1.079763 time=265.1s + ttt_chunk [1041/1238] bpb=1.080224 time=267.6s + 
ttt_chunk [1051/1238] bpb=1.080151 time=270.1s + ttt_chunk [1061/1238] bpb=1.080175 time=272.6s + ttt_chunk [1071/1238] bpb=1.080323 time=275.1s + ttt_chunk [1081/1238] bpb=1.080213 time=277.6s + ttt_chunk [1091/1238] bpb=1.080418 time=280.1s + ttt_chunk [1101/1238] bpb=1.080960 time=282.6s + ttt_chunk [1111/1238] bpb=1.081264 time=285.1s + ttt_chunk [1121/1238] bpb=1.081442 time=287.6s + ttt_chunk [1131/1238] bpb=1.081105 time=290.1s + ttt_chunk [1141/1238] bpb=1.080773 time=292.7s + ttt_chunk [1151/1238] bpb=1.080789 time=295.2s + ttt_chunk [1161/1238] bpb=1.080925 time=297.7s + ttt_chunk [1171/1238] bpb=1.080719 time=300.2s + ttt_chunk [1181/1238] bpb=1.080249 time=302.7s + ttt_chunk [1191/1238] bpb=1.080388 time=305.2s + ttt_chunk [1201/1238] bpb=1.080407 time=307.7s + ttt_chunk [1211/1238] bpb=1.080090 time=310.2s + ttt_chunk [1221/1238] bpb=1.079646 time=312.7s + ttt_chunk [1231/1238] bpb=1.079283 time=315.2s + ttt_chunk [1238/1238] bpb=1.079290 time=319.1s +ttt_sliding:done val_loss=2.788424 val_bpb=1.07948565 elapsed=319.1s +legal_ttt_exact val_loss:2.78842354 val_bpb:1.07948565 eval_time:319353ms