From 7ed3f08e61281dbab846f6215fbf5005e8a8379d Mon Sep 17 00:00:00 2001
From: Aryan Bhosale
Date: Sat, 4 Apr 2026 14:42:20 +0530
Subject: [PATCH] =?UTF-8?q?Record:=20SP4096=20+=20Depth=20Recurrence=20+?=
 =?UTF-8?q?=20Parallel=20Residuals=20+=20MuonEq-R=20+=20Causal=20SLOT=20?=
 =?UTF-8?q?=E2=80=94=20val=5Fbpb=201.0766=20(3-seed=20mean)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

8 train-time techniques + causal context-only SLOT at eval.
3-seed mean: 1.0766 BPB, delta -0.0381 vs merged SOTA.
---
 .../README.md         |  57 ++++++++
 .../submission.json   |  20 +++
 .../train_gpt.py      |   2 +
 .../train_seed314.log | 129 ++++++++++++++++++
 .../train_seed42.log  | 129 ++++++++++++++++++
 .../train_seed999.log | 129 ++++++++++++++++++
 6 files changed, 466 insertions(+)
 create mode 100644 records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/README.md
 create mode 100644 records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/submission.json
 create mode 100644 records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_gpt.py
 create mode 100644 records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed314.log
 create mode 100644 records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed42.log
 create mode 100644 records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed999.log

diff --git a/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/README.md b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/README.md
new file mode 100644
index 0000000000..c62b90db54
--- /dev/null
+++ b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/README.md
@@ -0,0 +1,57 @@
+# Record: SP4096 + Depth Recurrence + Parallel Residuals + MuonEq-R + Causal SLOT — val_bpb 1.0766 (3-seed mean)
+
+**val_bpb = 1.0766** (3-seed mean, std 0.0004) | **~16.00 MB** | 8xH100 SXM
+
+## 3-Seed Results (8xH100 80GB SXM, PyTorch 2.9.1+cu128)
+
+| Seed | Sliding BPB | **Causal SLOT BPB** | SLOT gain | Artifact bytes |
+|------|-------------|---------------------|-----------|----------------|
+| 42 | 1.0893 | **1.0762** | -0.0131 | 15,999,461 |
+| 314 | 1.0897 | **1.0766** | -0.0131 | 15,997,932 |
+| 999 | 1.0897 | **1.0770** | -0.0127 | 15,994,941 |
+| **Mean** | | **1.0766** | **-0.0130** | |
+
+Merged SOTA (PR #1019): **1.1147 BPB**. Delta: **-0.0381 BPB**.
+
+## Key Techniques
+
+### Training (8 techniques)
+
+1. **4096-Vocab + MLP 4x + WD 0.090** — PR #1218 @clarkkev, PR #1285 @dexhunter
+2. **Depth Recurrence (layers 4,5)** — PR #1204 @msisovic, PR #1260 @dexhunter
+3. **Parallel Residuals (from layer 7)** — PR #1204 @msisovic, PR #1289 @MatoTeziTanka
+4. **MuonEq-R** — arXiv:2603.28254, PR #1260 @dexhunter
+5. **QK-Gain 5.0** — PR #1217 @bigbag
+6. **Full GPTQ int6 + Brotli + LZMA Compressed Wrapper**
+
+### Evaluation: Causal SLOT (context-only delta optimization)
+
+A per-batch additive delta vector (dim=512) is optimized with AdamW (lr=0.008, 16 steps) on **context-only positions** during sliding-window eval. Only already-scored tokens contribute to the optimization loss. The delta is re-initialized to zeros for each batch, and the model weights stay completely frozen.
+
+This is provably causal: the delta at position t depends only on tokens x_1,...,x_{t-stride}, which have all been previously scored. New positions (the last stride=64 tokens per window) are scored with the context-adapted delta but do not influence its optimization.
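+
+Per window, the procedure looks roughly like the sketch below. The helper name `causal_slot_window` and the `model(tokens, delta=...)` hook (which adds the delta to the hidden states before the output head) are assumptions for illustration; the actual implementation is in the compressed `train_gpt.py`.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def causal_slot_window(model, tokens, stride=64, d_model=512, lr=0.008, steps=16):
+    # tokens: (1, T) sliding-window slice; the last `stride` positions are
+    # new, everything earlier was already scored by previous windows.
+    # Assumes the model's own parameters have requires_grad=False.
+    delta = torch.zeros(d_model, device=tokens.device, requires_grad=True)
+    opt = torch.optim.AdamW([delta], lr=lr)  # only the delta is updated
+    ctx = tokens.size(1) - stride            # number of context positions
+    for _ in range(steps):
+        # assumed hook: `delta` is broadcast-added to the hidden states
+        # before the output head; model weights stay frozen throughout
+        logits = model(tokens, delta=delta)
+        loss = F.cross_entropy(logits[0, :ctx - 1],  # context-only loss:
+                               tokens[0, 1:ctx])     # targets stop before
+        opt.zero_grad()                              # the first new token
+        loss.backward()
+        opt.step()
+    with torch.no_grad():                    # new tokens scored only AFTER
+        logits = model(tokens, delta=delta)  # the delta is fixed
+        nll = F.cross_entropy(logits[0, ctx - 1:-1], tokens[0, ctx:],
+                              reduction="sum")
+    return nll  # summed NLL of the `stride` new tokens in this window
+```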
+
+Source: arXiv:2505.12392v2, PR #1306 @resouer (causal variant), PR #1176 @bigbag (SLOT concept).
+
+## Compliance
+
+- **Condition 1** (causal): delta optimized on context-only positions (already scored); new tokens excluded from the optimization loss.
+- **Condition 2** (full distribution): standard softmax over the full 4096-token vocabulary.
+- **Condition 3** (score-before-update): new tokens are scored AFTER delta optimization on the context; the delta does not use new-token information.
+- **Condition 4** (single pass): single left-to-right sliding window, no rescoring.
+- Model weights frozen during eval; only the delta vector is optimized per batch.
+- GPTQ calibration within the training budget.
+- Total eval: ~520s (sliding ~76s + SLOT ~444s), within the 600s budget.
+
+## Reproduction
+
+```bash
+pip install brotli
+MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf python3 data/cached_challenge_fineweb.py --variant sp4096 --skip-manifest
+SEED=42 RECUR_LAYERS=4,5 RECUR_START_STEP=3000 PARALLEL_START_LAYER=7 \
+SLOT_ENABLED=1 SLOT_LR=0.008 SLOT_STEPS=16 \
+torchrun --standalone --nproc_per_node=8 train_gpt.py
+```
+
+## Credits
+
+PR #1218 @clarkkev, PR #1285 @dexhunter, PR #1204 @msisovic, PR #1289 @MatoTeziTanka, PR #1260 @dexhunter, PR #1019 @abaybektursun, PR #1287 @dentity007, PR #1217 @bigbag, PR #493 @parinzee, PR #1306 @resouer (causal SLOT), PR #1176 @bigbag (SLOT concept)
diff --git a/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/submission.json b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/submission.json
new file mode 100644
index 0000000000..c5f0a7d95b
--- /dev/null
+++ b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/submission.json
@@ -0,0 +1,20 @@
+{
+  "author": "aryanbhosale",
+  "github_id": "aryanbhosale",
+  "name": "SP4096 + Depth Recurrence + Parallel Residuals + MuonEq-R + Causal SLOT-16",
+  "date": "2026-04-04",
+  "track": "10min_16mb",
+  "val_bpb": 1.07660790,
+  "val_bpb_std": 0.00039902,
+  "seeds": [42, 314, 999],
+  "seed_results": {
+    "42": {"val_bpb": 1.07620919, "artifact_bytes": 15999461},
+    "314": {"val_bpb": 1.07660728, "artifact_bytes": 15997932},
+    "999": {"val_bpb": 1.07700722, "artifact_bytes": 15994941}
+  },
+  "comparison_baseline_pr": 1019,
+  "delta_vs_pr1019_bpb": -0.03813,
+  "hardware": "8xH100 80GB SXM",
+  "pytorch_version": "2.9.1+cu128",
+  "technique_summary": "SP4096 + MLP 4x + WD 0.090 + Depth Recurrence + Parallel Residuals + MuonEq-R + QK-Gain 5.0 + Causal SLOT-16 + Full GPTQ int6 + Brotli"
+}
diff --git a/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_gpt.py b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_gpt.py
new file mode 100644
index 0000000000..057f3d5b02
--- /dev/null
+++ b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_gpt.py
@@ -0,0 +1,2 @@
+import lzma as L,base64 as B
+exec(L.decompress(B.b85decode("{Wp48S^xk9=GL@E0stWa8~^|S5YJf5;W(g3d0hZBn@VT6Qap3bu0*kgCR~YUqB0W9R)iarr*QtEZpesGY3>~CZRiK|6Dwut$nH#N""!RYqQnA}G^`ZsFO;ar92)Xt#3E3Ki5S1}OfSx<=$c<4=h|J{kt$27^CQ01M+lVgZ0tGgX0&I*V@{U&JgYc0U!(4F-btCy*+qzv6D""p~UW!y~6{U*}y$E@2-R}vd?t*s#fnDO{!j>OImt34A(d+9n>hnnvzmd((_D1Cghg~(bQ$Yj)>!Y%{*o9ex8FWa#U)!OI!!5Prl^?bnBX2V(=(Bvc+CvGo!S{LhLn7pSsR!}@""U=OBW0)h6IYneQ1{|$<&k9TS^qGQpb-;#vEPAl%11UF)?6mtC8c04XzR$+h2=j84E2|i`pOEt$uyM`lGs*ejIF-}^SvSRZK$ePh1""`gt+?%1r#=OVy3pW`{ofc)6PhQQRP|_h56zl+sQ(le1eJ^&&qZxdGb15""aOb^-R1ouqi-H1_w|H;g(()bKz_!0+L{#HFmtQSw%~n|MX3ij_2{lW(_*6gdIz`XT%tzkhK-k5tAu}`=>u|z+|uP4UnMNw^{d&KAm;P6`40&zphh*D=e*8?KGZuo~y*`y#Wg}r(PV}?J$Oae&vv%2(eb|cSn}oTUc1&B%M^B@xlvn10$Ol5{Rc(||g*`0AO8zk14;L7H$""%dzpg>)#q=gM_#uqc-I?vb;GXT^_H=R{GSK>=VuvqSe%#z|2c2YlT&*kjZ2;Kl!jcm$Q-fVo|ict&Ja4ywb`i8dvCk+h?3OCDUXX""&zMsrr>Pa>PMm^gxGuU~bBkga*GhQ$y8PeK(vV5>_veh8WQRi`fT)Py)OBaHdkXS7+2I5uiFuTjpn{|fPzQ@LVpRws+Pag!Jmkh&s`Rf#8p6@GE%rDwB!|iWyv#W3""fc4s5V~t#!AZbycFoe=5_^dmnYiDah?iytPc4Cd-vA=_4gHR76=T$8{3_ZlX8M&|puQAfna*J4ik!z%hrP@gAZRC@#7kjdaqYLPNXkd|J5C{v5M<=KgwjY<<""RG%}XMSx;M(-Q4ydT)Cu?zXy|62!hO5GK6?)N(B646MfaTmnZ!8;mBW&1sHGMHlrCL(`)&M`v2tW(sAF!DlstPKLk(793yV|;%hqX8yRQ~7qt>z*_1Q}nrqG?-_2+I@FEcp""Dk22T^*y|IeLsyh%&Jl=BD=^s;7%TMB)OIR3?x3zlGcx&|%5L>qz|#y$tK6`UcRj%|3!3cHJeEdDutE&Q""!G$@K!c(G=cTif8@R2K}X55$`>%yi10kG)XZU#a&wVatKSfR}qmbsvbuA8xfcJ=b|if1ag%PVmk$*qdh=#$2Yuf`nUGo(5rSqZXh""4gDR0J=E*HR(o&Wy37z}fksW*{`BufC1SN!A=jwucIoPXF8+iM+=c-JZ=K&I>v0p0p79-kBWK9;U*WHS36pGGky+u`PcECOw6y$&""mj@^nxm{D9Hmd|i9(@Fo%xP-Hbnz%$>#*;Xg;Ty6GouTFUN@kU$8!%q7o$?!?g%#7~%o61m!""2x#d}z3R{6h?%6}ca^*5_Jf!&ZvI)?sdMg`0A=^GDVU{IGZqL3rj%z$d)jlx6~ytx4J^s~E*`4;!As$aUrKZxSj0YH7l+5BVaLBH""b)Zuh5IyXe0RwO^zm{10elh-OVL&!P#Ekb>_)~}*-y!d(M1wk0tQ>8x;Cx&>pBuBm5ki{0H`K(1*VT((TMZ>N!u2nHz4U@&9MvRJ""+*kH}WUU6WouG!F2L&YMTb^;NFj{Dd)bI>t0LRSCWauXUSe>@#8MGxhk236R>W9w4|Xlg1}cc!QjkZnVVRvC;+y""x+3PoS+Dt@=_m5jIg2t5obGzUYgI=o6+4a~mXa2e&Jc9}Vj9^+4yTg$qFbM=y75;hce>4zrztxN?eZ+w6UWZST24`D35j&4SxOeb""%PHMu-RH5LVp(R&9AZ`x(`xf9dSqZ04)Tce`@Hk-A7*@$4-U}W)Tuy_CVnBPa%DI_=Gjh4wJR04xHatM#l$3S&t}wcLb?oscu0=c"")sQ0fXY6JTtbdH$<6fgxMr;T9H8!nxmh>TLLBVFU>B>$N%Jf-HGUs<9@RuUu6WO*pcSNN~QebhViH5}HQ^ydDtS!aer&ix?8!nc>9+x2yqIY2-Y""ukRJw*em=|OzOZMd-ibb5gnRMj0k3$uvJi0@-n+Yt{yUw5YN2s99o$dE)a}BeXkzi)a()Exw*#qBd;h+@rpe^fOrvGpJX+AGkXRr""#+te2z5RnNsGJdkG+VJXy`{p;J>%g`69JIw_+igLVc~|##zh;Eob4ny;iDp5DhEt#eRR1}c`?d}pWyq$S*s>rnwJ*sdZ}E~oX%88""zC_J*BWBu+5XSa_$+=B4&P8F3%CKnXqHS@bfN~MaNmY-`vaASzaQc9)z_G(aR^c;V""Vs`jd5M&3`^Z;||fnbL90Xupa7SGMnNo}DKJ|h@>l!N=*-GAe)TZ@A|Pf)qRf*H{8GrxK8KqRRN&@@VNNXPwUeP`4c2>c+|2gdpp""9*DPc?5})3sh|_Jn0)|Ip)B)_^4%v$hTOPC0<~s`z6%IPYUwmBrxD7a{YlXCh+KW%=#p""{cFc<7?2)g=j|&#!{k7#5U>L3k7amS|7W2y?zDJ_j?%5yg5H%64qcn%xTQ@AOhXq6B&_IBsx%#XV$YX9NUEtO(jIU_eXvQgXDRHu-X4Git0Nz~T7t$dFxsTX4TeqP)K~VY^+-YO-kiPvm^O!Wg1fQN%8Q""eo4hv20;7f3}#O$M|raU2EVChNsFk4Ly_keh*UU<+vyQI@txDr_!gs_9B`99c#S8v7*h%!-#NcSg5XO9HDsR=%ULM!S#K%vAvbR01bCiJA`A$sbY)HB@j4{-L2*_vbBU^W$;D4FAIuKK""Y)MubSyyR!F%=jilziE~sYMXgizEOp3Lv2sQrnMnXC8a+gs@5s$MH^Yun{>S<2qZMAusv+0}<(_sR_?RT`Hs}$AF?US&scm*$7}p""Hg$nU8WvlDF15bOkNz#1o0ex-oWxJe)&""-By_S&2Pp-n!$>f_D{%|0!M;>y^7i4-hnbZ*PKqe@(~Ui?C5qrJkIW=bbG(&bPeRooS(;%>r#N0=$IRk*+hWYr{fV2u%a;mxLzfK>6jAeHT)#Ave(zV0Z@#7807tMacFq<0Q~VWi""3`qb=ABhvQGP~U4YC<>89I<|#xJ9)J6Z`){?=Zo)(~2OTL#dHE>+GBBbu!l|=4mH_xf}mF60EH2UF6R~-!IO`gpYj>63E_8q1b%hPa-i$8=siT0y_le@""j_uZi0&cl9{I;Hm?%Y1CR=t5Q*AGZa5N87K-%2dv?|1l05H7wTcwAnW6JEE8xq}v0@sDJ4LX)4=UNP89>?YR=McY?>@@&{}PM6Ay""Xj)Cw6Q0M3Eh1^WAaS4yqdA7gZ?$s{+Vm`BHeg%#g?E-4tQwSc;BP)Us24zga(cHffug)>@pbGYUvoZvxxuh2tQSi2w6S(v?dPt1c6+sd8VKxNTwy$_=Sq4JOr<&MX5Cr""!k
fNrbYGi""NPUlc#*x8+i3voP;9bZI3OE&xh4M""YfJ+0W0kAt{E3bWT1?hsDvbVMYw~66yiVIw6}$V)FoZ*CQyfwfKyAU1lL7B_%`Y!>T#B}6+v(LW9AI3r0<|MHc&gz4F8b3+okXx$""%<4=wcb#_uS$=dYQ|){kjl2N|mVjpd~q;);a-5nt5y`d1X?WVBuB}%JYttTO$HwKFwzqh23BpCzuG|dEwvNAhU""qdfvegwigP3-Hy!u~=*il}WI0Yc1rH7Olm)s^LMRCg%wxOou+kflx8&v$yqL6Q7>#xpO~692k%IxBfcC^r571C87sHs4U(!){s)@""UOb|HIBQCG=cjM{(N^2cE^*pxhJot~%d!;6y0~CU)!#GKQZ2J`nq1Hl?(BwUCkFEl-A1Ng|3#96k`xm1g`x9`EdW>H=f){?`v^(6""=oI-lA5qaBgLu>Gktp2<=_6G&=M+Rz*zebg3p~WW3%Ro^ywwF(QFXBCuIy5fn~tFXWg29Q)GA7|qxAYn{j(JM`rY1pNCWBITCpSv""&gDoH%e!C;8^F}4FDMFm{_2Jg9+9gR@FnR=_fZZD+IRU-8)aDo&#XQ""+wzoQ=$JM5Vjmr;i)yOsdj7V6`Smy}wR{dy3=I2gC*^rOkNn2hvz}G%lo*OQDXYYW(;7|e6)0WY3_r-g-B_RjPpFxdMxbEKYRz98""%81f_i2^txTT-7h6LB;K!T^R_tT+oXMbYy00L(^%muSUZ=qjtKU6@V?G^!}D|J_XqYjI_A`9JlO~Zom^2}f&sF53oGhpu#m{dMA`h$+9>!GZv|vX^*0Ucnb~k9n{aJfuLmD+vT9MwErUAB""J2n_PmN+b`-h+|NLr@8Sfb|$;+BhckbAh%hxp*EaQK{TJ`u5%rFnBYxV<%u(aI7KXKlF~zzAWU$uTP|F)fjThL%B_99M#F3Kz6&^""QF@O7c}JG?Jg7WgpTO{3%2YR=tU#Z>?*9j;*|j@rAW|?dKH)kvnFe_fs}F-cn*7N9d)2a0DS&NZwz~-4aU&M~P}S8MR{uu@js`Zi""A90iJsCBa&W{=*c&7nnz;MH8H)xi0N>MDmlGM1B23iFwCr*P!YZH)ohb2cg%z#W#E=D+%#m21OvK>dw_S6E(C(W3D@D<2Ipd>PE*""%<^=Loh)`FO6o-^Q2%c^=I5!O&6U2^@@c?_C*SFS*$SucM5~=<$;~STl9M~lR-ac{15&V~e^X}(@(d`DY#-jKTBb-5@&=+P?8T=t""bnJntl=QV*-SMWrLe%wy>=ukFz(|vxE7TJNZ65CJl(tJg_z1gJN0sdQ9_p1mhLTQKteAgb@2;h1gIpAWMt=j6*;a&ex*HH=m$5@*{Y9*wiS2B>iTX7x6^8qIhv3>rwtqcPZsh!Gx2UWwmOLg=fjL5kn`CpBojBl""cyzWFMvB~rD&I=;ZhWJ=-P0afs_fF`qJMi6i+-`z0n17D{t>CQiW~GIypO7!uas1VHh$n&2G`Nr67>)dlwiugFZWc$OnW=5q{10S5V&&SfARytG1wXU*T#`0Mg$kuc)RbSr1(01z1oy@M|NC4ILir2g""`BQ%b;UP?NHwC1)p1Bsdp&4`6jt=~PD{Zwr$B^%""@oVD>c@M^>M>RUC2II=zAw-Iz@+nA+U(!mw(>H`IO0GULr3WNllTcNkm{vvdGl<$~CoU%VToMg#Sd&bXYotQ0fnWo4P7RWeyO{vj""4E8&hXP|6JYGFwjzrdI`62GasL0-W&sSJjMdaLql!-YuGxIG_gb~Ed>NO0Tij9gi%FPE#LaS1JxGS^P7ig+Ceo!X!IUBy8Zr6^xb""DXrGTPXKr}gKPyGQhB~nch$~8bbydjyfWe$Cs<3TCPfP82&P#sLzbeSxWol?!}87F=KBhKDdO|Ux0EDEV!jXt-!}wznUSwbeMUGm""EU*sDqr9*Er-_(^3h6Qks(gf!w7F4*W|HhHdDU@Kajt}LZKqh|_n~f@?WSuf@iWH8N7f-Pr6O?-SkgsY>u=LV!iSWj5(ApIlPvi0""*5JM&tDoQN876T~a3>L2$~tSTOU@~x@@Cf@NnLCz5Fps-E>|$jw!Nt^t`?T|2X{Abh3W3|8rvc;^>}Ux5ky%P=y9m>`(7c4FDDgI""&FRZwerEb8lQ9L@|B(G05|V(|0k2aFVTrLn@Qx_}viqkV0ML4@CH+-n{kxYDdeShu6D{b1X)In*QW09~NnW}&*c0~kXzgTAbCOSF""Op1abcyvXFEMoHYShDG#%#0n(x!-p*TyEkqnQAPib?lsg0i_7r-=>M*dL-Rnl!CisL3HM;""asXghy9-pA=c-sPkW$6wsAgY2t$jZEFj361V*x=0GemlCmBrr{^387TO6c{D{NHYWuB3DNirF|+&X&YxtpbegR""{rx>Qv43WyYXLekJ!wAOC4^i(62KAo)GWvp*WG($1ueS$n1Z%sk?+yQ1u!*v00zR|hMiE;#_8QfH7VH1@Sagg9DSRTJ1szua6qcT""xtal?|zlI$^2}t-O6cz-KVtuf|4qKm*a~;Dp=Bd3tX3Nzh$+T4{Q=|zRCSRZ+rn`zQjbb5""i^>=oFort2v!ZZtnm8veRTr!LQXIK;F-_!O(L--o^)8}Wq-Qx?Q_gZEcG$#Aza1x;rkPO^lGSn-;5Rxvsa%3^+lYk1laeGgzIT}-""V$zMB14)!2ONJt@1r!TRbMj<2qn-PbQKgcr8)qGv`~pi|MdacJ4GhFt*$HltGi*_<{y82U`tToyXsFZ(UHKk1$8P@h+%0&ZG5T`=""pueljV>ltHRZ;g;zCEzbO^k*J;1GZ?V-nxe3_sIJ(Onilj4u{U&2Ql(QiQk*51HtB(7B`f7|>m_?MmX6xD>E9%Lsg-5JUq@NN$f?""L+)Ad1V917X?!a2?Rq|TPfe%EeV;|SlT0%EyzPT2k~bZ|3lA#dj}g~hS}=lBd~y|Qg;Rvh)D#}k6!SCpCb!0DHTiewXCbG-=j@xI""Rr7<*Nc4WG}|lz9g{}PU$X>4$wRe#TC9Bvc_W&$gHbr$eH#="";3EoaTVsY!eiNd(bLR%V{al&~Tq~JHFCGbziDnz^QmMH1KDZj0iWQKroiNV`LVAt{3aK?WpJR2PWhUEdsNbX@s6h}d(O}eo6$xK_""euVN?4ZzEBeDHYgS&x4`Xq){emn=%76~ActGCNJb=+FnFK6)%;wuJ=iG|6sjrnU}ijg{y4Tr263?Xe8sze3YrMHt9|Lq>1h700K?"")NXv}P>wuztxrSSLJg0Tyz(1}c;%&aT3HMX-DF~>?b3o#>VU~1SN+{i;`}hXjV_9ERr7(A)I}ee^hi6=C$km;^xh`epn2aXzm@Ap%J!TjNvpq3*@H(Ky~^{jVMJYylJ!+q-C8@WO?p6J""M7<$4ZLTmP9$7G#?($4CQqS4epFM3`aHv%2K+~w_<&u31#f#|rX>tsBj{aIwrM{5}I^P`y4)#FjutsHM#""D1_J80huX|3%J3g1eJv~PX<`fML!)u4dR3_oae7uez47eG#&ZZ0%LjE-?%DFr=F*j5dk^g!kznGG*0y@=Yni$?$g!oo=""MFH59NmxMPe!R$z{zIv$1q_<}8tz
NV?e8g0F}_Cn4@qa}@~A<+0NIR+?u#5#TJ#p%1d4>uuzkLcMP1mw(o^I;)CceUyU#fZ20maF""J}v($o;kIo9Su5ia@1k*&by*;Dh$snuW_AGcf1iNW_CCX3fz""oE&#t8KM6jj4zuYAUJb$EgrH-8(O%hO(i$HrakLSw~4ERno_%k(zp)-JucL85G+GS!3gLodz|(emGE)&;D78Ur`CO}19riqLGjEl""Lqcu%pWc8oxOBw""O+QIjHFwi7gUwoGtwa9*0`5w)ba-V@*z{+q?wc{|P$&*_{bLkh2c6@Hk0l7>ZfKaV@j)_h^sS=+S4(nrAG2{Yqc~pj)^h7tjzBT~""RDR%u9Ov*`=4RtDR1J80is)J?Wn?M{6)ZjGEfkuWOOMDSQ4f3*xXy-CA;y*OO!-sFeU_H7;;z}IxvD?+&qtid3$qO(U7n7`SdCrW*l7GCe9V9wEp{MYp(W;tOJtuGAg9exFz2XWG@Sw8Ze^<1v`A6OqR}+!&i<1RTY_Th""Cw4^c7ZN-eU%cnXJ@?KoU~YT0bn)0SHK{@SPv{}DkAJ8CYL^U3jR3VtHkD^ifFP<-iT-bkHY~MrV3sPO*_*pgV|)~wtlK2cdby1G""(+&1$OB9;T@jM2l-ug_a?P-}gsdrhipE)-Uzy}n3UQ>1He$rl^Eqd9PIPlM{ews@Lpi{g=kH;!)r65ij4-$dUeUbPdu#1cUSF0Vb""h3%dBDd+#8gRlX(&+rp=g%H`Kdm&_|2OI`fenesN+1^a1AU5NZaxeGI2=sb!O*@a#KMfWBEljeIY~hHA)V8D!p_?(n&{2NDP=AH@""4TGv~cDz1ND>TN9lCdg6gLB-}dNv^F0EldzrA$@vU^4;Og&o%7b*H0EZWz$SXvL1U7Z6aj4j|g}l>0;nGpya2#(dau*ykGa6Q4&c""0(nyA_!qW0zkR%tldG7>#__efd9XGq**?19K#U{@*~bqCIy7vG@6=_(ne!^pkJ*G{8%8Etij%hf&Sa+`ba9HH#BPaN1vxLW~T*MJm(%OE`Dtm@jr2A&ymXyD^{52V7b_!L""Jx~(N`!D^dP~Dx?aoqmWwN_>tRZj1vgFD{J@?5aXd|p0q&M~6|BRW0w2IdxQD2i1+3fTrdlA_9R!Ni~xPts3*ZsRfmo00ASI5m=8""iUQUWssdP{{qlP`Qt=vmH58w{7I|!N4C%|d^+T_6S-0!M4#0qyCR=b&sg#Vr{S99n""x0+io4Hp#qISBAFqNTYE|9KCp09BF@3Q%PI?dC;YtNP#`GHY(-zCX~LS`p#OG$$6""p55YR)v{{CL~+kV+L#Y>%Jp~CoX{iRx?#&?&(#;FAYbgAauv9tmK?0oxHcKoQccWIW?&04R)CgHl=7}LPBpW;t>GKB""c{<)fObM(5W2tof$98tbRpbAbU~}PA%|-f1*xeMXIj^)$+hzm%KX0!{754m~0%C+Z&4U?_?kBz~)n=ihZ0*YK8E5}o3NT_yjP3@C""hEf$>^z3@PmvKm(;QNA)KZ7)55#UnC{{r{-^R35iQmSZ|Fj|A5mgu5v""iAZB&6fm7Euz_&^q$@IC&^QK@G7B>^Cu(?bt6(bq*38?{;Z11B^sbZ90|<_"">~Khakz;nB1?KHKgzrM3t1QH+dWLiPf-d@NxZlS+4G#a~K}|{Kzt>Appq#wK{LAKgt3a2#bA)!*V_wusSKB8^pX(P&V""b&Uy9;lUDX(D0wEiM~7|<$MdHK&^GVTSN(2X{$mS#9o4=EBdW;%-r1|hIX8yJJ7CQjIi27mHacgaC^Lz8KNEeIgX4T&8dV>g@_qR""v0uVr$8>5`4`ZejJ6d1d""Af=oV=rKNM(OQ*d3v;}sjRQ~mOpm>7Y&fF&U#Tz0DZS9c^qp1{a&V8Ad8NaWeSjA+Y1rJV%%5l<<3K!`H>>oFzwI;cO`zY{NT&}j""3$4t~N5oY6JaWDGD)Li7?r>gS4~qGaGeVF34G|$BC1}y~{mzyPDV-zN|hgIlHLYX{T;kT^{$LR7j5`bokIz^u(l6dSN*w=~1q)JImP|EGsMM0i&@VBPFDL-DKoOV9l`roaY(~epN4y~m$tx1>M;5&eqfN>;5OnC`H2-=?DZdYEUy9+EAxUZ""Dt;A=)+YrbJx3(#)e)XQw5qxvG9jwerg=C8WVeG^Z0%<~m-~%WtKu3*n~bE^!?Vl6!Qbx3yzr+x4FYf2u$MDcO=&o5)Y5BJK7f^J""##T{qy?@MxKuu?n_4mKL;s!GrM2$O29jbu;2^gj?&U_tDaNZ>$s-ITaKo?ZCXhyN1AWW8!7_(KVDy<~mQTU0txTZ73zYDCp4FYq@""5v{s_mT|T2drmdk2iM-c$dd0xa}}h2zsIvsn1bU{J!-dHIc>kEHxW2SBW0prmkrW-3(|DX=Tj)!H(BF5WojA1X2so#@G1*w&uY_c""l$67K#RTtekqIs9@6j)6@eyOj51BZ`giXYJAPXlHwFp>278PuS?7x{jA^2cZGMPHL5Vk=BBOEgX=IT-G_K}TUsx<=C`1(bP|NIZ@""Ge#x12+o?F$QdLAl|O}E5^F&%$C`z<-@fV0$ePo5Eb3xE8A$k7P;*N4>n5%8yESf4UVEL-@fgDAmTST7s?361`GZ5*4BwH&j085h""fN0kwE~Q=wpfw(QzR|M?P-d9uuz5o{~C7jL`(y9F0fbYP-@k@YMvmCoXh$?FRRibSZT(b""ht!7UtvHhO4Egwlf^anuMx>QnisSnWw""!In|?pl#c?Bnb{6s{Q?bW!Um&kffkpUdWg|Zv9r&(YZe?vAR-oO1hW)N*51NpHjPm2w)#QRTA1{?j;47Q{0z^?o06?NG*&&a22C!""?0T;!zaLZ$H9S}Bw*pi8Ke~}o+edxa0kK~!b+oy(%`#$BQ2?^;e(y6FWP+GR=x>_zA5)bomNkQp=WHae*pwRhcOv*qfy}=9@}!SV""+om_$H>4dvdJ*Xxt-NsUWH^~wIY_jse>SCgMT$~uZ""vgDbkTByxJV0i@<<-Ja2zlR_J&ANF%FTdD_BHd{3!|(5pTo$y6aY>3#""sje1KgH#xRU-rF;RzCv~5;mf8q!MI)2*jB-5LFxV%Fe7i!C-3$rozU&g;1*96JwgGb%<`-6-M?*RJ4*l47;d45{!eb@>xQ+b!52th3_Zgk)=QkVzv3*G9=}7ijl{gDmA$dP3|@kHz?D4o&)lprgW{xxH+qYD5TQjpV&1-K}jhRwwBBn>To;qy|Y%C`cj@""PyjK{&bI7TMM>VrT@y{x56^5m7=t9cRhegOfFv)-;migK$$>>Zl<@~AZyjd&t8QBt_g^~y3Hj|>Rl%kltlYD*qF^c+2RkFt1-a*^""Z?3gk7uPrxC8(r7mqnKJfoSruZwr6FuQZ=^l$TDICKh1kKBBOFoRlL2?;!pQkL%)g1sS54&A^Nd5hj;#XI*?ykk$wwQ1>xnq5CikxA$awqN)ryL={fG6N)Zh0JDo)TJsI-Ba(`l^S$)qTcNCo9hc?ube9s-$b&-k=Uq!3{|*GSyb(""eY7v&DvdTrJgLdvhHn8<*2JZ(Xr@dF#f1?NE~`)I)STne+V(lK>uo^`{|dZXd)I_~Y^EPo%
Lq1}?nPayx=w+6aAiNx9tLiK`9FDi""K?u|b${19!ja1Nb1(dyvO5E525Ru)PDGAUUL!H7!RW0M%LF-DfNrP4+$tqTza%ZCrNf#m;ZWgT""Tbt_8jIRDM2WW_SeWFNx?~Xl02a|H~hBVij&EgrLPW-Fu(zO=fuxoG+wi;LdX`Oy?qLo?Em__>wdEWJ2U$OgYYEHhG^hde##n@s7""rVaXbLSj%m;^}TNL~x8CQ?0=$mGaCX>RYKIZ<7s*?q$45^rTm0(v{?ky2GN-KFjTQQR(sL#T|j7@@9T3zI@$CNUwUn)k2qiRM*-R""e@dh?;+?RMhjlvPfGqm}cGP$kwL_pcL;&Eq;2mHbUWgGNbx@zx#q6%Yl;1cWj(Zz4?3X!xkvuuxT`qCG2q~qJoYmAibEJ+Mgu(?XcaoFhKNq9z=<*iJXvPlMzLA4Uxs#=%z+JKx;MzT)+NKJGaDMxt*m!u4{6fkwNf`9o#g$a}FSp_#y*|6d@o78CLJ=)7+!t<4(hOw}""(lga~x)a@Kz4iZOAYa4gSEJKu2+3OdRJimY)zepR#Ao5(k%(OLYmj7j;8B{nXVViSkLW|AJ&Jr0AN!UgMcr}a47T5@DGB`D$*H<{er^Fn`"")~;fDoyUa~_B(zT~4moGT+`""j(pK$TYWh*8hgkVc}XKg(pi+@4?&N%f_or0j^(MMsS~kD5_{S|$y$Ljc;kTuAdI|$xzG`|TpwjI4vBrx)tz-6At~9wpYvbQDr$r-""L(qlWq@v{-S+%a5GoxLdu!<8eNUv78DuSDGlap1u-w@8%DS<2&Y3*=|uJAst8DD(e_5B$MZeHO3kI4zfR)X}0r~;B>@V96ZtL-r6""RZmUeQ1PFsL&3k^)VjzY^iqKva}LHzk^cF8HWsAeJr}$BDAN>*$#Z?1o~Bm|wMDwqnZFdsu57*)yhw+OJk$0eE)Oq~9$PL!mPHgIPO4(8WaU*=T8RQ>pR#0W42p{i)Kl2%|jJl!No?S#E*AIh;=<$!%ey""yYhyAX!}ntfe#{9RAR;WJRdA^va>vS;V>%qyYy6L47+hr7K_Ksyl)`a%L2J?uul_efZ""-5XP|^couQXGZNpIHS`;O&tbL**CL9k7P#5;""s~kkY@9#ooquRTPJ1ojY=C!C@soXkCdI6n1qCo~f;n?$Q>;k|6;hNZwWr~YzCYa;b""vnENz&XM?nk|)xUnrPObhpd;+;k-=2k$-Wp$Zy>oeW*pD0(<5kegz92D1JDG6P7OLf;4=b0j~%3HH@lteH%|a3wJHOjML}~HBZ=cL?2_)MokB5;SJ%H01""#vS#js%r%3RZDU_L-$*^A*Fh|ZPXR+6A-WfX=dce4hx1Kz%+@>oG;S&KaP#n!nzs1Zf}s|5vjyas@jAZpqiV+uS*m%=4EKG@sxOw""Z4!*4C+Pan$bKixP&kZhGJu#e*7Ut`<5S{{Mz4$4J!Sa{Dg(AR-mB^j;OnMw-h-tq6u~%C>mgYvWSA|rAjp`wAJ4-<%19DH*giSq""EN%$%^_Qv`m1SSyx_Ge&Xy$^cNf`y*Fj?j81ye^}61|>;{WB+clck)G=if+b3p{ySH$zoQmm}TWT49q}+aje%rSfj)s!fs7S_T!T"";c_IWfD8rix^Q+mB68Xy5bMXl-?m)|x19YlWb={o`9)NJBviPCp0^PLCy-qvc05EO!<&F+kU$WOZs|hP9Wtmu6Gsxe-lc;uwt`Wj""2~0{G;{S%^g^l=xzD0Vc@wbBTJ>fY{+CeYaR_)UvyVR`ydt3i)vitp#%$1kqnut`so|X4{%|}(4~Ical5TfBi=wp+;Rm=4cg6yO(L5MwvV1i?jsR6d{FB3)k6heau-+I_r_bWYKLaJ+~Z)+B<;Q86+W`}""gTdK%+`mD_Tqxv{?Ch@`Gi+hEVj;6b}^s%0nsK!#dR26#=y3uRSCQWZuiNXTO$|`wijGA^d510""qvbYZB(2j)#;Qg>ZzcxbCNElMHsGYWg""PqN0_dS(xgG98U5iBJzhFWEEd""zBPmsc5&voku!_Jg6eImXOId^%sO>jREELmFnvv~)}5=08!+^RArxdQQIvJ-bNXgX)tNHQ0Tj9{+x;ey1ZEAN{@}81rdC#|2}URxRYZ(R!0Gf*I5i%rGe%Tand9UlBof_K|0jpNqyL?M~$Ea?h9McoEz8sSRSiAE$xSG>D(rGRcNT9NIrL""E9BRauw>6f=l~beb1mx(fw6R(bxKXE)@V$r$""ln(>1H>MDziy@}6^t<`KdTW0m*`TYcqi05=a5~vR*=ZNsPIv+L?acS#QV}w~Z)AinsFqhCLeAF44z9v-$RSJzz=YFnGN!DK(EYpFW&4KB5q!A1}raAFQeu}mn_!r_Lhi$0gXG4(E7qke^8v*2gC*{*tGGFHm{UA0Ww3uuo1n+`Cz$ciLL=;R6Hs`d{my~Kf"">$?8{KC`WWt?Pyp?;&yp6knHcOI_Oc&J*t(_F@#-3^rO3tLWY^d2#5e`TD5c`lg&s$NH2GP0yjK{7_qIf`QTq(stsd1b}pYkyCHP""qHAM8TN~iG*BQ?~@CNk$sQ)_5E|{Ch+%1QD=Vo{uZoZ>=X>Fg?fx>b;9jdi_P&H5Iy+bD=?O_BLes5azxHm!JIMXFcPQ}0gm{xR0""3Ma`9{B)7**3Q;7=h8gzT`3O?$_>|uGs`1JbQ1J+GqJ6STuP1@e?Yk*eEITmQ>v@XiHs4gsvNLdLl9L#5XL3Xev~;@XQ?=F21eYd""^>Kj^Cs(3fjDk|-(CIf;l6zoS8)UytX6k3P`sRvYdWT57-R1;TPnhde`&FqXa3;Oo?b+1-On0HpuUt)*kZgyf^V;!0s*a_lxy@eul%%bH{XAv~QPkV~JIm+J`0W`m0mNV7HJy""T!&L`|{X5gtji->O9E|j9Ly2Xeg?<-*$X+}*x`x){5b##^G#pUjXe)A9G?_och<@a6PT?I0rsG>o1|;G!7oY>5Vl""w@$>nk{w{-|IYkdnk&yf?pkXK!&-;jAi81)Dh}MO^VxnJsh_^bVG4J<3V5GDMHoJ)k&nBsOwPWDhAu`CkFns^^Gu{h""gW{;Lo8A`V-1_SQ)m+kX2HEhxiB}AqwZc2G76mp~X@KPtJK-=I#n|vHEWES$8A?7qdacc3u-{U#{tG8s<(%~$YbAmd0QaWXrzw&Y""$66_{a""&@7o0(+iTyj6BGC$^BA>W-lAFXxnyy000001zPg!OGXJx00EVe0ip2(S{YObvBYQl0ssI200dcD"))) \ No newline at end of file diff --git a/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed314.log b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed314.log new file mode 100644 index 0000000000..91efb6f6e9 --- /dev/null +++ 
b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed314.log @@ -0,0 +1,129 @@ +W0404 08:08:26.228000 78847 torch/distributed/run.py:803] +W0404 08:08:26.228000 78847 torch/distributed/run.py:803] ***************************************** +W0404 08:08:26.228000 78847 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0404 08:08:26.228000 78847 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp4096 + distributed: True + ema_decay: 0.997 + embed_lr: 0.6 + embed_wd: 0.09 + embedding_dim: 512 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_enabled: True + gptq_reserve_seconds: 10.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/f7607a20-c299-450b-9170-973578a8b2ce.txt + logit_softcap: 30.0 + matrix_lr: 0.02 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_wd: 0.09 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + parallel_start_layer: 7 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + recur_layers: 4,5 + recur_start_step: 3000 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: f7607a20-c299-450b-9170-973578a8b2ce + scalar_lr: 0.02 + seed: 314 + skip_gates_enabled: True + sliding_window_enabled: True + slot_enabled: True + slot_lr: 0.008 + slot_steps: 16 + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_4096_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp4096/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp4096/fineweb_val_*.bin + val_loss_every: 4000 + ve_dim: 128 + ve_enabled: True + ve_layers: 9,10 + vocab_size: 4096 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 45508608 +model_params:34401372 +gptq:reserving 10s, effective=590000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +0/20000 val_loss: 8.3172 val_bpb: 3.6146 +1/20000 train_loss: 8.3192 train_time: 0.0m tok/s: 8507828 +2/20000 train_loss: 12.1995 train_time: 0.0m tok/s: 8377177 +3/20000 train_loss: 10.6851 train_time: 0.0m tok/s: 8288110 +4/20000 train_loss: 8.8318 train_time: 0.0m tok/s: 8233714 +5/20000 train_loss: 7.6631 train_time: 0.0m tok/s: 8203041 +500/20000 train_loss: 2.9028 train_time: 0.8m tok/s: 7976717 +1000/20000 train_loss: 2.8869 train_time: 1.7m tok/s: 7942538 +1500/20000 train_loss: 2.9120 train_time: 2.5m tok/s: 7935326 +2000/20000 train_loss: 2.6523 train_time: 3.3m tok/s: 7932106 +2500/20000 train_loss: 2.7109 train_time: 4.1m tok/s: 7930042 +3000/20000 train_loss: 2.7611 train_time: 5.0m tok/s: 7929894 +recurrence:activated at step 3000, virtual_layers=[0, 1, 2, 3, 
4, 5, 4, 5, 6, 7, 8, 9, 10] +3500/20000 train_loss: 2.6827 train_time: 6.1m tok/s: 7529226 +4000/20000 train_loss: 2.6169 train_time: 7.1m tok/s: 7435459 +4000/20000 val_loss: 2.6413 val_bpb: 1.1479 +4500/20000 train_loss: 2.5702 train_time: 8.0m tok/s: 7365310 +5000/20000 train_loss: 2.5111 train_time: 9.0m tok/s: 7309592 +5454/20000 val_loss: 2.5262 val_bpb: 1.0978 +stopping_early: wallclock_cap train_time: 590094ms step: 5454/20000 +peak memory allocated: 30120 MiB reserved: 30154 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.52368690 val_bpb:1.09676525 eval_time:2005ms +Serialized model: 132406149 bytes +Code size: 23803 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 66 Hessians in 9.8s +GPTQ quantization: 66 layers with full GPTQ, 0 fallback to clip-search +selective_prune: unpruned=16.00MB target=16.0MB +selective_prune: already fits, no pruning needed +Serialized model int6+brotli: 15974129 bytes +Total submission size int6+brotli: 15997932 bytes +final_int6_roundtrip val_loss:2.55027811 val_bpb:1.10832149 eval_time:7527ms +final_int6_sliding_window val_loss:2.50739734 val_bpb:1.08968600 eval_time:76169ms +final_causal_slot val_loss:2.47727670 val_bpb:1.07660728 eval_time:444871ms diff --git a/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed42.log b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed42.log new file mode 100644 index 0000000000..4d66277169 --- /dev/null +++ b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed42.log @@ -0,0 +1,129 @@ +W0404 07:38:35.392000 77439 torch/distributed/run.py:803] +W0404 07:38:35.392000 77439 torch/distributed/run.py:803] ***************************************** +W0404 07:38:35.392000 77439 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0404 07:38:35.392000 77439 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp4096 + distributed: True + ema_decay: 0.997 + embed_lr: 0.6 + embed_wd: 0.09 + embedding_dim: 512 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_enabled: True + gptq_reserve_seconds: 10.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/923648bf-e0a6-40d4-b29e-0299c4f40422.txt + logit_softcap: 30.0 + matrix_lr: 0.02 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_wd: 0.09 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + parallel_start_layer: 7 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + recur_layers: 4,5 + recur_start_step: 3000 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 923648bf-e0a6-40d4-b29e-0299c4f40422 + scalar_lr: 0.02 + seed: 42 + skip_gates_enabled: True + sliding_window_enabled: True + slot_enabled: True + slot_lr: 0.008 + slot_steps: 16 + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_4096_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp4096/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp4096/fineweb_val_*.bin + val_loss_every: 4000 + ve_dim: 128 + ve_enabled: True + ve_layers: 9,10 + vocab_size: 4096 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 45508608 +model_params:34401372 +gptq:reserving 10s, effective=590000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +0/20000 val_loss: 8.3187 val_bpb: 3.6152 +1/20000 train_loss: 8.3201 train_time: 0.0m tok/s: 8475031 +2/20000 train_loss: 12.1482 train_time: 0.0m tok/s: 8359023 +3/20000 train_loss: 10.6752 train_time: 0.0m tok/s: 8275865 +4/20000 train_loss: 8.8831 train_time: 0.0m tok/s: 8193201 +5/20000 train_loss: 7.6882 train_time: 0.0m tok/s: 8153963 +500/20000 train_loss: 2.8980 train_time: 0.8m tok/s: 7964606 +1000/20000 train_loss: 2.8826 train_time: 1.7m tok/s: 7943614 +1500/20000 train_loss: 2.9046 train_time: 2.5m tok/s: 7936900 +2000/20000 train_loss: 2.6485 train_time: 3.3m tok/s: 7933540 +2500/20000 train_loss: 2.7097 train_time: 4.1m tok/s: 7931972 +3000/20000 train_loss: 2.7596 train_time: 5.0m tok/s: 7931646 +recurrence:activated at step 3000, virtual_layers=[0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10] +3500/20000 train_loss: 2.6817 train_time: 6.1m tok/s: 7528857 +4000/20000 train_loss: 2.6179 train_time: 7.1m tok/s: 7435705 +4000/20000 val_loss: 2.6409 val_bpb: 1.1477 +4500/20000 train_loss: 2.5735 train_time: 8.0m tok/s: 7365391 +5000/20000 train_loss: 2.5137 train_time: 9.0m tok/s: 7309483 +5454/20000 val_loss: 2.5257 val_bpb: 1.0976 +stopping_early: wallclock_cap train_time: 590101ms step: 5454/20000 +peak memory allocated: 30120 MiB reserved: 30154 MiB +ema:applying EMA weights +pre-quantization post-ema 
val_loss:2.52314384 val_bpb:1.09652925 eval_time:2008ms +Serialized model: 132406149 bytes +Code size: 23803 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 66 Hessians in 9.7s +GPTQ quantization: 66 layers with full GPTQ, 0 fallback to clip-search +selective_prune: unpruned=16.00MB target=16.0MB +selective_prune: already fits, no pruning needed +Serialized model int6+brotli: 15975658 bytes +Total submission size int6+brotli: 15999461 bytes +final_int6_roundtrip val_loss:2.54928373 val_bpb:1.10788934 eval_time:7568ms +final_int6_sliding_window val_loss:2.50641155 val_bpb:1.08925759 eval_time:76200ms +final_causal_slot val_loss:2.47636068 val_bpb:1.07620919 eval_time:444138ms diff --git a/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed999.log b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed999.log new file mode 100644 index 0000000000..6e5391303e --- /dev/null +++ b/records/track_10min_16mb/2026-04-04_SP4096_DepthRecurrence_ParallelResid_CausalSLOT/train_seed999.log @@ -0,0 +1,129 @@ +W0404 08:37:52.226000 79859 torch/distributed/run.py:803] +W0404 08:37:52.226000 79859 torch/distributed/run.py:803] ***************************************** +W0404 08:37:52.226000 79859 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0404 08:37:52.226000 79859 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp4096 + distributed: True + ema_decay: 0.997 + embed_lr: 0.6 + embed_wd: 0.09 + embedding_dim: 512 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_enabled: True + gptq_reserve_seconds: 10.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/6718bb0f-6787-4ab1-8b9c-0876b188bd43.txt + logit_softcap: 30.0 + matrix_lr: 0.02 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_wd: 0.09 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + parallel_start_layer: 7 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + recur_layers: 4,5 + recur_start_step: 3000 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 6718bb0f-6787-4ab1-8b9c-0876b188bd43 + scalar_lr: 0.02 + seed: 999 + skip_gates_enabled: True + sliding_window_enabled: True + slot_enabled: True + slot_lr: 0.008 + slot_steps: 16 + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_4096_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp4096/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp4096/fineweb_val_*.bin + val_loss_every: 4000 + ve_dim: 128 + ve_enabled: True + ve_layers: 9,10 + vocab_size: 4096 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 45508608 
+model_params:34401372 +gptq:reserving 10s, effective=590000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +0/20000 val_loss: 8.3152 val_bpb: 3.6137 +1/20000 train_loss: 8.3175 train_time: 0.0m tok/s: 8490411 +2/20000 train_loss: 12.1471 train_time: 0.0m tok/s: 8357569 +3/20000 train_loss: 10.6845 train_time: 0.0m tok/s: 8272830 +4/20000 train_loss: 8.8562 train_time: 0.0m tok/s: 8224111 +5/20000 train_loss: 7.6730 train_time: 0.0m tok/s: 8195557 +500/20000 train_loss: 2.8950 train_time: 0.8m tok/s: 7978369 +1000/20000 train_loss: 2.8864 train_time: 1.7m tok/s: 7940168 +1500/20000 train_loss: 2.9143 train_time: 2.5m tok/s: 7931463 +2000/20000 train_loss: 2.6564 train_time: 3.3m tok/s: 7926848 +2500/20000 train_loss: 2.7095 train_time: 4.1m tok/s: 7924812 +3000/20000 train_loss: 2.7595 train_time: 5.0m tok/s: 7924426 +recurrence:activated at step 3000, virtual_layers=[0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10] +3500/20000 train_loss: 2.6851 train_time: 6.1m tok/s: 7523493 +4000/20000 train_loss: 2.6191 train_time: 7.1m tok/s: 7430400 +4000/20000 val_loss: 2.6418 val_bpb: 1.1481 +4500/20000 train_loss: 2.5732 train_time: 8.0m tok/s: 7360123 +5000/20000 train_loss: 2.5157 train_time: 9.0m tok/s: 7304375 +5450/20000 val_loss: 2.5266 val_bpb: 1.0980 +stopping_early: wallclock_cap train_time: 590074ms step: 5450/20000 +peak memory allocated: 30120 MiB reserved: 30154 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.52419234 val_bpb:1.09698491 eval_time:2010ms +Serialized model: 132406149 bytes +Code size: 23803 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 66 Hessians in 9.8s +GPTQ quantization: 66 layers with full GPTQ, 0 fallback to clip-search +selective_prune: unpruned=16.00MB target=16.0MB +selective_prune: pruning 32832/9366434 lowest-error +-1 values (excess=4104B) +Serialized model int6+brotli: 15971138 bytes +Total submission size int6+brotli: 15994941 bytes +final_int6_roundtrip val_loss:2.55120246 val_bpb:1.10872320 eval_time:7398ms +final_int6_sliding_window val_loss:2.50826843 val_bpb:1.09006456 eval_time:76282ms +final_causal_slot val_loss:2.47819696 val_bpb:1.07700722 eval_time:443052ms