diff --git a/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/README.md b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/README.md
new file mode 100644
index 0000000000..091252136d
--- /dev/null
+++ b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/README.md
@@ -0,0 +1,59 @@
+# Record: SP8192 + Parallel Residuals + Coprime-Stride Loader
+
+**val_bpb = 1.08459** (3-seed mean, std 0.00069) | 15.99 MB | 8xH100 SXM | ~115s eval
+
+## Results (3-seed)
+
+| Seed | BPB | val_loss (nats) | Artifact (bytes) |
+|------|-----|-----------------|------------------|
+| 1337 | **1.08414** | 2.80045 | 15,985,531 |
+| 42 | **1.08424** | 2.80070 | 15,989,295 |
+| 2025 | **1.08538** | 2.80365 | 15,986,932 |
+| **Mean** | **1.08459** | **2.80160** | |
+
+Merged SOTA (PR #1019, 3-seed mean): **2.88218 nats** (1.1147 BPB). This run: **2.80160 nats**. Delta: **-0.0806 nats**, clearing the 0.005-nat threshold.
+
+## Changes from Base (PR #1394)
+
+### 1. Parallel Residuals (from layer 7)
+Layers 7-10 execute attention and MLP in parallel (PaLM-style) rather than sequentially: the normalized input feeds both branches simultaneously, and learned per-channel scales (`attn_scale`, `mlp_scale`) control the contribution of each. Zero additional parameters beyond the existing scale vectors. Nearest PR: #1334 (parallel residuals on SP4096). The difference here: the pattern is applied to the SP8192 stack with depth recurrence, where the parallel execution interacts with the looped layers 4-5 differently than it does on SP4096. See the sketch under "Technique Sketches" below.
+
+### 2. Coprime-Stride Data Loader
+Replaces standard sequential shard traversal with coprime-stride ordering. For each shard, a stride coprime to the number of sequences is selected; because the stride and the sequence count share no common factor, every sequence is visited exactly once per epoch, in a pseudo-random order, with no repetition. This improves data diversity within each epoch at no additional compute cost. Not present in any prior SP8192 submission. See the sketch under "Technique Sketches" below.
+
+## Architecture
+- SP8192 vocabulary (8192 BPE tokens via SentencePiece)
+- 11 transformer layers, dim 512, MLP 4x, 8 heads / 4 KV heads (GQA)
+- Depth recurrence: layers 4-5 looped 2x (effective 13 layers)
+- XSA-all (exclusive self-attention on all 11 layers)
+- Skip gates, RMSNorm, LeakyReLU(0.5)^2 activation
+- MuonEq-R optimizer (row-normalized Newton-Schulz)
+- GPTQ int6 weights + int8 embeddings + brotli compression
+- SDClip (std-dev based quantization clipping)
+- EMA (decay 0.997)
+
+## Compression
+- Code: lzma+base85 self-extracting (43 KB -> 15.8 KB)
+- Model: GPTQ int6 + brotli-11 (~15.97 MB)
+- Total artifact: ~15.99 MB (under the 16 MB limit)
+
+## Compliance
+- All techniques are training-side architecture changes. No eval-time adaptation.
+- No SLOT, no TTT, no n-gram caches.
+- Eval uses `torch.inference_mode()` for scoring; model weights are frozen at eval time.
+- GPTQ calibration uses autoregressively (AR) self-generated training data (not validation data).
+- Sliding-window evaluation with stride 64, standard BPB calculation.
+
+## Reproduction
+
+```bash
+pip install brotli
+pip install flash_attn_3 --no-deps --find-links https://windreamer.github.io/flash-attention3-wheels/cu128_torch291/
+torchrun --standalone --nproc_per_node=8 train_gpt.py
+```
+
+No env vars are needed; the code defaults are the submission config. SP8192 data downloads automatically from `kevclark/parameter-golf` on first run.
+
+## Credits
+Base: PR #1394 (@clarkkev) — SP8192 + Depth Recurrence + MuonEq-R + SDClip + GPTQ int6.
+Parallel residuals pattern: PR #1334 (@aryanbhosale) — first demonstrated on SP4096.
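+
+## Technique Sketches
+
+The sketches below are illustrative: names, signatures, and module boundaries are simplified and are not the packed `train_gpt.py`. First, a minimal parallel-residual block, assuming a pre-norm residual stream and `torch.nn.RMSNorm` (PyTorch >= 2.4):
+
+```python
+import torch
+import torch.nn as nn
+
+
+class ParallelBlock(nn.Module):
+    """PaLM-style block: attention and MLP read the same normalized input."""
+
+    def __init__(self, dim: int, attn: nn.Module, mlp: nn.Module):
+        super().__init__()
+        self.norm = nn.RMSNorm(dim)
+        self.attn = attn  # any module mapping (B, T, dim) -> (B, T, dim)
+        self.mlp = mlp
+        # Learned per-channel contribution scales. The sequential block
+        # already carries these vectors, so going parallel adds no parameters.
+        self.attn_scale = nn.Parameter(torch.ones(dim))
+        self.mlp_scale = nn.Parameter(torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h = self.norm(x)  # one normalization feeds both branches
+        return x + self.attn_scale * self.attn(h) + self.mlp_scale * self.mlp(h)
+```
+
+Second, the coprime-stride visit order, assuming each shard is addressed by sequence index (`coprime_stride_order` is a hypothetical helper name):
+
+```python
+import math
+import random
+
+
+def coprime_stride_order(n_sequences: int, seed: int):
+    """Yield each index in [0, n_sequences) exactly once.
+
+    Because gcd(stride, n) == 1, the map i -> (start + i*stride) % n is a
+    bijection on [0, n): every sequence is visited exactly once per epoch,
+    in a pseudo-random order, with no shuffle buffer and no extra compute.
+    """
+    rng = random.Random(seed)
+    stride = 1  # 1 is coprime to everything; covers n_sequences == 1
+    if n_sequences > 1:
+        stride = rng.randrange(1, n_sequences)
+        while math.gcd(stride, n_sequences) != 1:
+            stride = rng.randrange(1, n_sequences)
+    start = rng.randrange(n_sequences)
+    for i in range(n_sequences):
+        yield (start + i * stride) % n_sequences
+```
+
+For a shard of, say, 4096 sequences, `sorted(coprime_stride_order(4096, seed=1337)) == list(range(4096))` holds, so one pass over the order is exactly one epoch over the shard.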
diff --git a/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/requirements.txt b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/requirements.txt new file mode 100644 index 0000000000..71867b8334 --- /dev/null +++ b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/requirements.txt @@ -0,0 +1 @@ +brotli diff --git a/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/submission.json b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/submission.json new file mode 100644 index 0000000000..74144b1d48 --- /dev/null +++ b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/submission.json @@ -0,0 +1,15 @@ +{ + "track": "10min_16mb", + "val_bpb_mean": 1.08459, + "val_bpb_std": 0.00069, + "seeds": [1337, 42, 2025], + "results": { + "1337": {"val_bpb": 1.08414061, "val_loss": 2.80044779, "bytes_total": 15985531}, + "42": {"val_bpb": 1.08423848, "val_loss": 2.80070059, "bytes_total": 15989295}, + "2025": {"val_bpb": 1.08538096, "val_loss": 2.80365174, "bytes_total": 15986932} + }, + "base_pr": 1394, + "hardware": "8xH100 SXM", + "training_time_seconds": 588, + "eval_method": "sliding_window" +} diff --git a/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_gpt.py b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_gpt.py new file mode 100644 index 0000000000..8d977ea376 --- /dev/null +++ b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_gpt.py @@ -0,0 +1,2 @@ +# VOCAB_SIZE, 8192 # for evaluate.py auto-detection +import lzma,base64;exec(compile(lzma.decompress(base64.b85decode(b'{Wp48S^xk9=GL@E0stWa8~^|S5YJf5;Hv5{s9gXwn@VT6Qap3bt~@<3h>ok~)Km^%c^ys%R{D_%yAk9-_tV7^coUOo3$w>`(`ci)t`2F7>r>Ltx>>S2CRw|7ov>Wn1e~_!RLQ=%V9g?)G3yPsu%SBy!lj1PaC-x%dDmCDOZ^r^!)+WWz}ejKXTJ#^U6Ra!};QocHHXQC+4UM!QQ!-N5Xd|%~a(9)bTYIO+>B~8~@lqmri%^qEkQUy074Rh6w7V_#^s9J-3BNA`G;qyR$LYcI?e+loZVWi~B$n=TKFp{%SeHYp{oNWh;U@Ahk8M2$OU%K8B$lb*dRQXd-GR_@*KAZdRdwSd#X_bO(lvJ3fp9Otblkh?o!zlDF02+sRjLV6IqG{ieQx44UY(f20c)^AD5kE{7_@f9?Q-ePHMY$wCTcn5ij2k?>T>CFcZ<|5Bh`%hA!j2d4G(X-Bbwu<(#drck2`tR2eo$wi$p$UEHkQdFiFmlJR#zIG@3*smdlqZ?s>Cn@I!i44iGk>T1KUmKDUWEJXYFF3Mh*&Tbca$esa+z^`enxeV%UmK_#Ex_)>$lBJA(Wj|4yV%J<~unPL@@@KfP=NTcv-SVPiG3BDdu=*>C1izrS~RvqEe6Re7Xf)zp2fR3F%Ntl(>3N{Nxb8vzZkhK?{Rpe$4_KYFAQQI8+y6>9jc4wHeJ2^*|#6%ShCfQ3zc=<|0Z`Ja7Y9E|lQy~CwnZkBE%teQ=I^Ddvss1V~h8|jSxW!&||)rKK)$#K{ntoatClG0YVqyY7Qmr3Mv0zyH;LQ+{zu5s@m_NC1q7o?u96Zu_lD}&wKWnBj8K_s&H!?(%4lDiW`^h3XfN^w7^NZ}p;fxjfP(oriMP93@G`Hokf^LPBO_tUOCA9Y?Ycbc|&Ozk{6p6^Be5B~llA6b0L6cbtv^Vkq!Kv#A|GL0_5%6C1U%s;o?{>h@9KI+Dwrzrw4p&PwOoXNEPg+i3?tCZo5gvp{$48`j*%6?1_2lYE_<5L|gF)I;xo=L^B}fgC34&i7g6#!NR1C%VE$;9A`1jy3(?aL)zptjpu@kuTmhtvwcM@3tw!`sx~L@_mU?cj#>c8IPpDOH&&j7o|Y3p|58R~&?Y{_mut<{Tnf&sxD9f3-X;AF4fUcuMBob=Cwz5gMSe4H<2Y58wc%s`q3kLoo?}Ounl{`xs5`2}>uTeQZf*|u+~MF52p?tSO5E2=N$0BDF4qUB-&k0+hQ`IXxLj6EcKaUYxWA@INm)8lVMY`UnZLg?lsz)JGZI$YMwMVQvy5L6ot48TZW4Ce=QbprK{6#vN}~oe4*NMzv%h|gAIZcf^UIHf460B!F3yBkcE}f8codomJt7D6T&`WwKkaIZ#PdEIRj1iM5Dg~vQfg<~lvTA6%=u3OxWLCehbC(Dz%VW-Li+^3#s3kwr4g#={BM5U>1fk|?Qf_d!cDQy=6c4lYhBnzZ1-ek9??{sRGuzCw85FX?`;NiXC9ULdAYgG#e}}~P}@5dT#m~HCbjF8X|AIyzv}0nI66v8Ecq}knsz5_8>0lb|MfnOUC+?~Wr&&xx8~4p6pCLBs~t;Qo$6t96lqD_4`1C8r}u_VnsLc4r0sjJ&jV#E)9@lk0vwVqwOrRCcvhq<5oC=|3+Zd3KUm>sT9%XHP)CBB^D3Ig#o6U*S;!Z%aY4YjOY3fc)Gamb-As0K^GfP0Nsbx5l1lNhuG%rTB#P|lX*@9z7UHid}4*hF}JV)V<+D&WTX{yEL>{6>$t!_6Y)@np{JD0~EYdiXTrPrR2lcj8v7(##<^bIPz{AjB}2C{v2{gIEe)Th*hFbnnNGv8Nv#WTYJSS=dR|
2uOP=cK)I~-xY>D-5d$(_GjRL0xP_S#M@`RCd=2VjVio~hg8ieN@wK?B36~~1aW+G0q0Wh3hmEcaWPODLH*IiCG#AUXWBl|S9XHbc@FH`}8N%pw6h%2Is!STS;YHhUGnpj0i^gM7NXaG1<5yi^*H~dZP}kH3_R&Sj(vcKtKziSdH7lWGLd3l58x=xJ@V993OlE6X6}Ij^!U-MxCkwG3k*NBmP04c#koXtaqYpo5)>UyIND(hjWT%SN~rP_&e>@k={z#9~$%k&c5u**qU~ji*F!uGYvC=g6aX@!`QXuk{EsEk~x4IL7>2wHbfU*Xq)OneWpwl2whd3W3sC|5Xa96g(n9H;4ym3lmIdMpoY+npex#4`RU}>SD_6Zpxmvz9TsP_Z%pIu5Zqr#fLfiqRn?_RDKAW=#QNXN@&cG|g5?6WWtUl|L5(9nlvA3dpC9bq3Q{WvS%Vi{2M;WU~6Ym^n{Md_AygF@Z*NByLJX(p>d_0K_Jd-X_oj+r_jhVOzRstpVhVFl|%-9rI6}CFHYE=w7=cQr(Fj`NMl`+Dv9cGOcVvcT>h;~Rm#|={JKl`0#`!Qg-DoL`ZUa?L|kKs?E)GE+h?2I$_p2GX5ycyr`*dn@~$Zy4WplGhJco)tSYMW5P@JTr{H&1Rj&?S(m{17?+5z0)Wv@ZwY6i8l@X^5V;3(#3DB8E73aPg&PY$`qdEag;(uuP(#bpb7@_1W!7Q!GvyD1xQHr;{)QMWNXG-?NPDt>5aWzfoN&yZ{P4`mn6Vs6;_z3rHG?kbRb0h%yklV{Z5_OXKHm=oSSC;RFLbIfK&w=$+2u|C$>a3&YRj-o%mcEwu!=)IHIsxS?H^j<8hkEosvtfPXmljt^2vgah$4+3IDoN0~i53_}>*f$|#;`KSzpkQcb6pKnlb{PPojYm|z4PO{MZ$PLvCdbi#t|qz9}%so+#%RSjg70yYd1FTOm#<#LX=l^b+UC8m!1NBt9(wTOIWA#>vs61cqgjx){yf+%?W83JIS`Z0F|S(FKL6*Lq2rT()Ysz*gVQKMaCiTYW+NbsI_Nj$Fmmy)^5RO_wh;A>ZQt1LM#E`~77iAm|H?K9qHE0&W_m2_`0Gq28A5Ktrv#=XM1RgN*{GHx^A@?eADCWum0=6UNNw%?)ew;A0-s*O&%-EF`Rhw$MsPP22C;EkU-W*8vd8Y*(TgI|$lO{!xP`Sa7#JOpt~cQIr1c!Y3cKAnyXyn$8|!f@X_`xCEZPb-3(@Pip`>b<%K9*{OhVCvbNSsOVm3LOc6&C+-byY7iW$sTa6_?cojL3P756VE4Z^HqE*CCGOy&jsQFfY*D&Oz|J^>W^f7+YTVC0}&R|&HHgl#X>b50##u2OM?zEC1KDc{GlId7wIvLbB2I*-aSnO$U3N=)hInkyN3xmpYZ2SS@8c^hE|XHuv(9y$^HfS*Mxp`Qf4)fc9+klDQmud-MK?#84L2lx`G4VIpi_4bLu}!)+(kTy|2^99d`=+GUTB{0U9i^-~w%`Xjw_n6y4edygu?S9(_j*g{9Wd{W=UjM#K+aAZeFLKXUz0Xe!*Sp4r(|&)2*#bDTwoLVEvHURF2DkZQ{@L$zV&|J1!ah(qz5$d(prm3wjad;U3`H4$H-PbieNp+6@W)JPafNq1nde4x#<_ix0r8@qmWD+>zJOq|kndW8Yt>Q%+rN5t*X!mH>IuIZH>ZmgJS-xEFxwNtDT%X{@q#Qg)?kn?@HsY@sdXyfdYxSXy&5F+P^2y~n)5~x-#o!R;gJnKBIj(i=%Br3jcv)e-B*S*XE80`!hj*=iGT=@-xpEit)7ML=p&lVIh8TGoMm;d)g0*drnj5V~l9j1kN4gBb2m$_pK9mtvk1S%Y^6IhL5eHona7g1y0ScdcIew~ipsPw~|Y@rs(!y{5GJYop%2iEk&08gnS*P&=jYb~PS+mv-(hcYfkr3)u0H(nrdQ)kCx>AJq7#QoW&#`|g=`Rk!&mt;xr-Q_0TCjs-ApiLQmsr*J&mjB)2d`PHXhcH-2HecP_c%7-;Tkrx=Vvc*pKRq!`*wQ7fE&yUmVA5aRE|zzebuW#?9yyOTJQKKYpBLMSoLfw2@eDKaY2G!`^DFJmErP~7IGimPK*!k7<3)Haj9A63Zs$(y?6Bq2Vjq-f(I&vn&@UoRDW&#g}}C6I8_84COJpt0ns@Ap?j(5sjAGCZ1#^7Q8Gh9;p+Od=t{hA*U)jT}BUpAyu2(xwFeAvD@&lxr8QwM1Ho`$+1MDX{o3#H|H54tCDh4JTQ`xu)^(TOf!80W}FQFl;O64;?s#KO{72&Yf$l~hX~|;t2+hEB$u6B37qq6krbxM0Tv|GLIs0tCNG4nekbc^sq-O5N{OUN&+-3jfAW%*aC#f8Lyi7R5sAKEh}^R}42fc^aPRqUWfA(JeGx#U2H`M=SCf@c-5h9$AeW*L>F&!!-R0_zi5k_WQk3q4)}l%P$=^P}io`XM>@ELnE5W6r$5dKh+}F<&hX7!IakWv)a1cp(`SUsV_x8CIM(bU9SMG>zc(;3JCo>5@MO;$p%aL+{X{r^tJee#qbnq$BE;--vJ?$}DO9u9Qu9UrfCeD}E{M+&q={z1c`tftmnnm!%M=}w_bHkQ0P!V-;(*}n`6IZKf4Qn5SuKpPA1Q4fV7f%|qtY;x+{D6Vc17N;D>?}+B_=Z}gBiWn{VXx%xj@u1HS=k&zK`#2cRK)<^5v;kL>e|if5nzu=|>n}f~xQjAG<%N6Q{H@$Udpl#tCj!HT*HnOseR1}`aN3D>XT#*{5hTjn)=yQV8JAIWV6Z2mvZ4|RBrR%edMhqHgD^w-J}LJ>7#C-)TJn#9|a8G1dRL4Nv4&yzeuUlb?gN!et`g6lxluH%~y!2N)d^Ag;ZzI+9KPTU|{3XIAVgoNusLGlA+;|HtWPr2JMj_0VSC-2%snOf0pX$lFf~H9@MJoR2O3v=G!!lLc**r?fVTYz{U!%82<}E0I^ATxlD*!F_CqjnDjiHnbjBVh5^QD#_wo(}fTkYjkrOiPyn-)j$qrE-cZE=HacnknBb>J-*lSPz~+);iJB$!Dj3X(+*mULY&hKhA-5CR=wT`iZ7&UQ~tvn8gE%HMW2>oM`W!8OP5`(#=AiKI_&%#Q!F%>E}s4-c9hm-=;C?EKD+FEJ5p34D$xUP|-`^kjCwX_KpM&Qyt-+U5*^C!E;I?!ABp|#ipP}Yh+BHRn3nY62qAN_OV4sOjwOKQ|KKC)qLT`SD*^SP*2i?`49XRfhsCb_@sYT#ns;soSk=T1Rj_!{=lA?0qMFEp`?_`*HbRehw_cgvS%%D)&GF4S`aZ(B$9H|h-n6wExSb$Oj(#){*iphhoUp%{k!dcY2B*>wg~9qwj#*mGKe_m4iLDNUyH=0LOdTP3&Qb1d_0H2Qw=4!^PZH-5|z8=%n}zt_4%jQ!dpMzB+VH_3`B@KC$r<2>3qQkfOSh!JtKuIHpYhz7mUKS_4#Ckk>qoU~9fkUC}KJMroVoy5gRV8?j#bTxO{9SZaNe>c7D1!LCjpQ`&!9~!XWJ{zJCmKee7-SVA%DfYo(BUouqAh8yy-&;!xN6~eY@i8ED`;+JAt>7su=fs`Ul0Hh16--i8n&2Z|
xsFt6To2xB>7COq2W0+-GmQ5Lrdu#2K07hsUNaCZy&-rzcvJ+}2XlYI(|jqd;m9gYpBay@OvfUT@Jiw+iA*ndR_*U#jp-QTyxwk%bavCEv0IAO<*c20q6IfsQWAQLBJVT;8eGye@joo%AO@UMJGBI8+UXe`Uw2f-qtaRCVKoAqS1mMpQeP#Ul6i-5NN&T!lxv{#@JWm4&zoMvJdlSI3+F2f983YKFHBRSK)v+rysm1!HBF>nj^hPjJ?Vi#L_n#HC8Gi?$J?7u4v#1x-HI2N7@v)(My3uG0^{`>nC5$2lw@zf$q4xM~%c$|{7}2;|CFN9p+V7Ma6?J5qaJU}swEiuLhrS5#)TAb_YIVSOE%vh}uIsn`u&%!;MvEmp@BO`>kUy~@e#TAIaEHh``AiC&K@X5J;F$g?uv;r}IaF)IX%V;j8Z6o_t+t6$@|W5y}!e?)<&BSFHPAn#5uT0>(3z>TpFLFVW&kH?5gqW(cuWbkI6+;G<0n81$&T2-DD^?RG7&#C}t*xIO|R0$+zPa}2{E%}*HXLHbc#TqRiVky=^X-W8@)(OYhYK1|iKTKlf(1M#)?X;pWlGi7u%Z)5sfuN9DOdJ;N@9LCzF=<{SdO&c`PgR8Rn>93jJ~0J)51*wKnRZU;S}fgh23U4%knBTHw{K5LZ60&B(5}s~xRm89Q(U$bZixVUv{mqe5?lj1}W!Zl^fPCclJy(ltmZ)0pfG1ER@OqM4q#GY6N%N~r8JEfaV^^DWq;;}2obb~nn)UQcTrZazSIi=eLL$FkWu}h=YZZ2PPgjZfTC&ywiDz}RrlQwy{L>KXu{>^$S5SWC|wjf#MBH3)0yS`}%1mE!=EpVr=uRk8PZep$@aNp-K`}zLR8fV%bf(4ig<>0lCOZVTL(={AXvm~y*w+3+>#(sDe5g=cgV2J>14BJSED;S;^dK59O73v_*|2i5gRM$wU;_L4g91GAICCCTeVh_T-Dt*|GXZ-~DAP!DgyBa4V3ehy4p>}g0J61qhQ%V#}qrH6qo`OTcy54j%e^TZWG+T0HiGnK#W@{<(faTNEv~B)WVK-seaRcU=0y71nuX~5p7!P3GP0%*LQht433Cf*f|6GT~o)OOUjpMlAjpdof&nfn2Lf5X-k1a(1LsT$$t%}Lj*mZyAsZMOc7P+7ss$huz6Hh2_j?oa1dn=Zvjv;|D&Aln59|CbC8R-76Y8Gh)-Kbv*Y3kn9d?H&G5{wok9r|8#Jf(?xyoaAq^!HjaiQ<;_xv@;~BTj?LAvr5UiVf)mc?5BAi8K;jF?2!!H7Rpy?|+McCb|Da_~Fvy!RzzwIuQF6aAo&3VA8UiJR#cHR18lAoC(Ja38$$Lc+I8A++$BN&Y7mDsw%&SpH4Ko+4Mu?>?5Z}b}ZbnXj#5sZ5{DNkp}v*p4E}iB9mgX0I{$hppe9LQ0bKhK5m&iH=HAvV!X+e7OUnTfE^=fC?5u*=jQe}S&>8pioi0?Q}@qg4&#kmEdd?J<^R7}^PnjCWw>7rT%K<7Z{FuzIBgZl>+@z=4hz7ZZ&~tblYH6b-=(=66zWRpq1_O!@5%6(e88M8;@9y#28GlCetzQ;@}w9pq)H*~GJGR|Q>zCAWxVytWWuvE%VcMqqXH}s{PGXF$btlAlT52Te>RXWMlQ{%!A!qc0#w6WBL)qFkr}G1VNcbUQ|;R!&~-rqzrW9u^hi;Fh7MMGyi~@alg5Zglb_lPupyb;+4`eX76|puRdv|=PAdIxT1JyI10eLn=?XvRd4tw}fWvEPtVw9wEyooLt&+oG($M;9ZWUl!1sXD)ZNfi@YeKm6M|=79pTiZ3VkZGoGi}e8hED1Gqnh@rFA+MIz2wdsXqg0-if9Q}P-xqUAjGZA&TluML8v_OviO>cHxXGT;hD-%_MAq7&Qeh_fv8D3A#`Wtn4Jh7BT?%g@4>R0EDp$W75>&C1;L}5_byyP6IdeR=DSh?V~@^&p2GHqrGai2E6uV8H2$x&aHa32dW6SX*6WLudZA9nYl8tvrYPrOD#?!xm{J#~wrI=Y!-yp|(P;}Y0HB^FmVngimmIarwBG!gr7E}$f2y@~qduVrZi}W2;Aw0vAxSNsB9h9;P1Ce;=psY*eE)r=9kppXNEC=AyfmqX|J3(FQ#ZY4wd>$O}h5kretP1qm)(bM(s|n#0a0ggvngf+4xNu{CRHwg64@qih*H%=TM8AGuXSa3FYbEy+8x_VJDfl;)ve7YW?HC~S7+Z?e?=Y5y(YTxS4CU?#nmU=$zDUuhTbkiK2tdfN^EILdE6p7R#h}xZNzEip^+XX@Whsz;pH!$@jC+tHr4=iMn?->YwSAKI5u_T&q(bT3fLd0&=9Q`JUwr`6^e+qpVQxTvl!t(hjt6BGR&i=>NterjcB_SjNJ4VVrJwJsx~ou<&TVj%y?4BN@h`zW9MLjo*&l}G3(f`aBq?j(-v91;@d=wOpmk0Z=8c2m$rjo*BU+nyswe^pY@nO8Cm|OADGQ|HZBvAR(W-)-QGf2T#^~E=vUPfN|*n1)6fQqj!$S09g-mE-f3edBTEy%WI=ruBNe+-y+g>qIMYPI>CeIfi0h{rZ*yo&-G4?9L83{eZzd=071Q=V%dbPk{ox;;)H$GPWh*}a0ntF#d^#2o!rtQ#_^!XT$Rlz{V096Qk5q|^ke+t86F3Lhud?pQUiA$%N?RT2xS_HMa#|kePt_oAwGmVi!-uMkOBD@6k<12{!)0aA0Nl8NP=NQWGGk8*#X^)Z*?o!x5Q93EwV=Of0hzG!7Dv=?6;?hpigwG589E?wV5?JeDJ$xEXh`l;SFvB_Wm+xUKw(j7^e=x?f5b~OzPgrE7oa6~no{P&$af_+_36!a`lsoOIN~WBfh^p~J#Uo4}@}a#Ne7X7FwP$$)me`Ug^K|VU=2YeCg6)%7v>?#diz)E<-Rc6$hZ_hk5URC^Ts)Xv9K$AAW`lzkxtu1t*>C+Z&Fp|Tb}DV>2XhO&^T%$Qyn^$ICcNDW<~--cre;v%4t2`~6yfE@p@LsPxa#u3*RK}n9t8yhx@d-@-^*3n6Jkht$ix;Uf;Z@-9+Ys6m3uLeX^QyS6^(!i4i!=oFlW$d9Dd&<|T7yGiHc*zFdHeSs@S4d@+r43ec(La&al0wCBV0h`n!*rWQ$PM|ngAuBIbe4K{y$*m|L|c{iQL@@RA%DWXF`k;J0Id!^-HV`@nk5SNAN!QaCUPE7h%KCfcV_993z_wcn~!0{z6^>|YjpE|G&?vuz@_uTgEuPx6|G}EqH@OpZfQU0U(SC}iP$cFu>{Uy$ieQrVtyL5+(q;hSCI{e=3QD4(}{A&S2nxrTl$1jI^2ce>?NJmjFa&solP8n1vf)OMw&0le->%bBgBa1Fp~UM9Otj;3A!sb<95Q0w3>_cF?VC(1Mb!G%R+cY<1_HVH%0|7v5S`Q2dXW+9s;RD%82N7SEv!PE|sI$q$t{`SUK*X3G$4aVqa<+B`VigwLw0=@*?;tq-Xy7@vObTvOK8d6zd)ZdTchhv%)AtuKJSbb+tlGG3KV@!USm8oy0lZLIft#=K*m@>UhPxSqB-7H`U6e;gfC!qhWfbS}Ita2(fIeo(02N|F*%KE*pGAb5Wi#K}6=2F-
cC1$PJ*R#+W{-g7%>v?@sys(Li^^|%x;Qls1m0hP{y3iZtcJ;Bg?QO2l%1s{GacDIfs1H&$T4rR$GH(U7F9_#nkWsy9N#ixr0#l^MOe=|VfNFy<8dIyg(^w4z4jbX+75U&|DvEN;Oery0Y65r1t@qWv9j1Dpu2$=SwSrooWp~`Ad5Wu67}}c(dyaNffcPh%gdY$s-mHDtn;6-BLrB3UQ9rzvnI6Y1!wWzEHkvONe4|@CiZVPC}-N@;=rJEzs!8LAka6Zu+l)$wFlgWtCxto_%d;a8|+AcxrV(bybquFWzIlNDoGDOya9%{09?N%iS>u_hy~>e2$m30Xw4N)H$T`+wht)RVvFfO>Py}&Z$1lW7EnLpJ&uN7%i{@K@o`XwbSJv6C(;&Em^hfZV%PuL@B`*W!AutdUpCzqdt-GJ)-Wze~@AQ)1ni>_u?NRaG@4?-rS_JUd#Iq-L`xY^^|0+Uby+I=FEeR#9llHJDU#H26l5WEa8v?t*n^y?P&mlSZnX`5fH<3s68^q_Hon*=8E}=ff?9tKdGP}!sr#31D+dx12%g8EZklc#xp|8*V#auzpU6jp6nDhqTOz)cr!@}lWZCv6pDCbwK+lPw)!=*0K*|2r$MJ;e2dcW4|7c~12icGykpn=wC#i;WD>D{75|@>Nq`o8D$%r2uT+?Tv{xE2zJQ^n0@(4$h2f+g4)gK1H4LY2}iV2^V2kn(G1PL8h%8hSeaFPff_HjFE3wgrfm{5O~PbMcG(oQBX1oIbdJkP`~Gj1`X|ApkNo0mQ7ZPIuoVMhD}jnck|5I5NVzPWU&um`0qY);(4`olc5>(p(4A}cI+hn6<@=Hhe1)t_K$MSMc&u<5)^Q(>oDHx@ZIA~aCTi1hq4#iUs1^6f72Nw0AN!gmqsdUlQT{y-`2hYUP9H{7YA@Dp7o9GndKm}ZcVUScnzf|4og&Kn)Dr%iAEUBh7(0$SGQveSp)fd#jkaj_Jy}pYWOtYqis{O|zgnmYRZewjOt*7NNr6&N{k8k0#`kNWQD+KU47&WFJ%7A6_@?Em&M0-&ih3#Y~(v4P;)QfovBYQl0ssI200dcD')),'train_gpt.py','exec')) diff --git a/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_seed1337.log b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_seed1337.log new file mode 100644 index 0000000000..81d4513d33 --- /dev/null +++ b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_seed1337.log @@ -0,0 +1,152 @@ +From https://github.com/resouer/parameter-golf + * branch exp/round-11/w1 -> FETCH_HEAD +Note: switching to 'dfba2272e4bdfffc308593ecf87cd1c407cc617e'. +You are in 'detached HEAD' state. You can look around, make experimental +changes and commit them, and you can discard any commits you make in this +state without impacting any branches by switching back to a branch. +If you want to create a new branch to retain commits you create, you may +do so (now or later) by using -c with the switch command. Example: + git switch -c +Or undo this operation with: + git switch - +Turn off this advice by setting config variable advice.detachedHead to false +HEAD is now at dfba227 Pack train_gpt.py: lzma+base85 (43755→15798 bytes) with vocab detection header +data_setup: vocab=8192 shards=128 +Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. +W0406 06:54:10.597000 1 torch/distributed/run.py:803] +W0406 06:54:10.597000 1 torch/distributed/run.py:803] ***************************************** +W0406 06:54:10.597000 1 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+Hyperparameters:
+ adam_eps: 1e-08
+ adam_wd: 0.02
+ beta1: 0.9
+ beta2: 0.95
+ compressor: brotli
+ data_dir: ./data/
+ datasets_dir: ./data/datasets/fineweb10B_sp8192
+ distributed: True
+ ema_decay: 0.997
+ embed_bits: 8
+ embed_clip_sigmas: 20.0
+ embed_lr: 0.6
+ embed_wd: 0.085
+ embedding_dim: 512
+ enable_looping_at: 0.5
+ eval_seq_len: 2048
+ eval_stride: 64
+ gptq_calibration_batches: 64
+ gptq_reserve_seconds: 12.0
+ grad_accum_steps: 1
+ grad_clip_norm: 0.3
+ head_lr: 0.008
+ is_main_process: True
+ iterations: 20000
+ ln_scale: True
+ local_rank: 0
+ logfile: logs/4d251436-8780-4ade-a0e4-8cdb4c91338b.txt
+ logit_softcap: 30.0
+ loop_end: 5
+ loop_start: 4
+ matrix_bits: 6
+ matrix_clip_sigmas: 12.85
+ matrix_lr: 0.02
+ max_wallclock_seconds: 600.0
+ min_lr: 0.0
+ mlp_mult: 4.0
+ model_dim: 512
+ model_path: final_model.pt
+ muon_backend_steps: 5
+ muon_beta2: 0.95
+ muon_momentum: 0.99
+ muon_momentum_warmup_start: 0.92
+ muon_momentum_warmup_steps: 1500
+ muon_row_normalize: True
+ muon_wd: 0.085
+ num_heads: 8
+ num_kv_heads: 4
+ num_layers: 11
+ num_loops: 2
+ parallel_start_layer: 7
+ qk_gain_init: 4.0
+ quantized_model_path: final_model.int6.ptz
+ rank: 0
+ rope_base: 10000.0
+ rope_dims: 16
+ rope_train_seq_len: 2048
+ run_id: 4d251436-8780-4ade-a0e4-8cdb4c91338b
+ scalar_lr: 0.02
+ seed: 1337
+ skip_gates_enabled: True
+ sliding_window_enabled: True
+ tie_embeddings: True
+ tied_embed_init_std: 0.005
+ tied_embed_lr: 0.03
+ tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model
+ train_batch_tokens: 786432
+ train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin
+ train_log_every: 500
+ train_seq_len: 2048
+ val_batch_tokens: 524288
+ val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin
+ val_loss_every: 4000
+ vocab_size: 8192
+ warmdown_frac: 0.667
+ warmup_steps: 20
+ world_size: 8
+ xsa_last_n: 11
+train_shards: 128
+val_tokens: 40540160
+model_params:35943512
+gptq:reserving 12s, effective=588000ms
+warmup_step: 1/20
+warmup_step: 2/20
+warmup_step: 3/20
+warmup_step: 4/20
+warmup_step: 5/20
+warmup_step: 6/20
+warmup_step: 10/20
+warmup_step: 20/20
+loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10]
+loop_warmup_step: 1/20
+loop_warmup_step: 2/20
+loop_warmup_step: 3/20
+loop_warmup_step: 4/20
+loop_warmup_step: 5/20
+loop_warmup_step: 6/20
+loop_warmup_step: 10/20
+loop_warmup_step: 20/20
+0/20000 val_loss: 9.0047 val_bpb: 3.4860
+1/20000 train_loss: 9.0084 train_time: 0.0m tok/s: 8033007
+2/20000 train_loss: 12.3253 train_time: 0.0m tok/s: 7985277
+3/20000 train_loss: 11.0666 train_time: 0.0m tok/s: 7915768
+4/20000 train_loss: 9.4821 train_time: 0.0m tok/s: 7884688
+5/20000 train_loss: 8.4563 train_time: 0.0m tok/s: 7866009
+500/20000 train_loss: 3.3996 train_time: 0.9m tok/s: 7650116
+1000/20000 train_loss: 3.2070 train_time: 1.7m tok/s: 7642596
+1500/20000 train_loss: 3.2021 train_time: 2.6m tok/s: 7645568
+2000/20000 train_loss: 3.1245 train_time: 3.4m tok/s: 7648645
+2500/20000 train_loss: 2.9898 train_time: 4.3m tok/s: 7645810
+layer_loop:enabled step:2859 frac:0.500 encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10]
+3000/20000 train_loss: 2.9915 train_time: 5.2m tok/s: 7535851
+3500/20000 train_loss: 3.0112 train_time: 6.3m tok/s: 7232615
+4000/20000 train_loss: 2.9493 train_time: 7.5m tok/s: 7020410
+4000/20000 val_loss: 2.9178 val_bpb: 1.1296
+4500/20000 train_loss: 2.9293 train_time: 8.6m tok/s: 6831062
+5000/20000 train_loss: 2.9130 train_time: 9.8m tok/s: 6716278
+5019/20000 val_loss: 2.8154 val_bpb: 1.0899
+stopping_early: wallclock_cap train_time: 588031ms step: 5019/20000
+peak memory allocated: 34604 MiB reserved: 34708 MiB
+ema:applying EMA weights
+pre-quantization post-ema val_loss:2.81300466 val_bpb:1.08900177 eval_time:6298ms
+Serialized model: 135426937 bytes
+Code size: 15798 bytes
+GPTQ:collecting Hessians from calibration data...
+GPTQ:collected 67 Hessians in 11.5s
+Quantized weights:
+ gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight
+ gptq (int8): tok_emb.weight
+ passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights
+Serialized model quantized+brotli: 15969733 bytes
+Total submission size quantized+brotli: 15985531 bytes
+final_int6_roundtrip_exact val_loss:2.84353000 val_bpb:1.10081908 eval_time:28402ms
+final_int6_sliding_window val_loss:2.80044779 val_bpb:1.08414061 eval_time:115492ms
diff --git a/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_seed2025.log b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_seed2025.log
new file mode 100644
index 0000000000..a7749617ce
--- /dev/null
+++ b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_seed2025.log
@@ -0,0 +1,152 @@
+From https://github.com/resouer/parameter-golf
+ * branch submission/r10-packed -> FETCH_HEAD
+Note: switching to '30313f4091c56dc9b307db986433a927276a9248'.
+You are in 'detached HEAD' state. You can look around, make experimental
+changes and commit them, and you can discard any commits you make in this
+state without impacting any branches by switching back to a branch.
+If you want to create a new branch to retain commits you create, you may
+do so (now or later) by using -c with the switch command. Example:
+ git switch -c <new-branch-name>
+Or undo this operation with:
+ git switch -
+Turn off this advice by setting config variable advice.detachedHead to false
+HEAD is now at 30313f4 R10 W3 packed submission code (15798 bytes)
+data_setup: vocab=8192 shards=128
+Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
+W0406 07:16:13.664000 1 torch/distributed/run.py:803]
+W0406 07:16:13.664000 1 torch/distributed/run.py:803] *****************************************
+W0406 07:16:13.664000 1 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+Hyperparameters:
+ adam_eps: 1e-08
+ adam_wd: 0.02
+ beta1: 0.9
+ beta2: 0.95
+ compressor: brotli
+ data_dir: ./data/
+ datasets_dir: ./data/datasets/fineweb10B_sp8192
+ distributed: True
+ ema_decay: 0.997
+ embed_bits: 8
+ embed_clip_sigmas: 20.0
+ embed_lr: 0.6
+ embed_wd: 0.085
+ embedding_dim: 512
+ enable_looping_at: 0.5
+ eval_seq_len: 2048
+ eval_stride: 64
+ gptq_calibration_batches: 64
+ gptq_reserve_seconds: 12.0
+ grad_accum_steps: 1
+ grad_clip_norm: 0.3
+ head_lr: 0.008
+ is_main_process: True
+ iterations: 20000
+ ln_scale: True
+ local_rank: 0
+ logfile: logs/98bb7cd6-1d43-4167-bcc9-19200d6d7fb2.txt
+ logit_softcap: 30.0
+ loop_end: 5
+ loop_start: 4
+ matrix_bits: 6
+ matrix_clip_sigmas: 12.85
+ matrix_lr: 0.02
+ max_wallclock_seconds: 600.0
+ min_lr: 0.0
+ mlp_mult: 4.0
+ model_dim: 512
+ model_path: final_model.pt
+ muon_backend_steps: 5
+ muon_beta2: 0.95
+ muon_momentum: 0.99
+ muon_momentum_warmup_start: 0.92
+ muon_momentum_warmup_steps: 1500
+ muon_row_normalize: True
+ muon_wd: 0.085
+ num_heads: 8
+ num_kv_heads: 4
+ num_layers: 11
+ num_loops: 2
+ parallel_start_layer: 7
+ qk_gain_init: 4.0
+ quantized_model_path: final_model.int6.ptz
+ rank: 0
+ rope_base: 10000.0
+ rope_dims: 16
+ rope_train_seq_len: 2048
+ run_id: 98bb7cd6-1d43-4167-bcc9-19200d6d7fb2
+ scalar_lr: 0.02
+ seed: 2025
+ skip_gates_enabled: True
+ sliding_window_enabled: True
+ tie_embeddings: True
+ tied_embed_init_std: 0.005
+ tied_embed_lr: 0.03
+ tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model
+ train_batch_tokens: 786432
+ train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin
+ train_log_every: 500
+ train_seq_len: 2048
+ val_batch_tokens: 524288
+ val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin
+ val_loss_every: 4000
+ vocab_size: 8192
+ warmdown_frac: 0.667
+ warmup_steps: 20
+ world_size: 8
+ xsa_last_n: 11
+train_shards: 128
+val_tokens: 40540160
+model_params:35943512
+gptq:reserving 12s, effective=588000ms
+warmup_step: 1/20
+warmup_step: 2/20
+warmup_step: 3/20
+warmup_step: 4/20
+warmup_step: 5/20
+warmup_step: 6/20
+warmup_step: 10/20
+warmup_step: 20/20
+loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10]
+loop_warmup_step: 1/20
+loop_warmup_step: 2/20
+loop_warmup_step: 3/20
+loop_warmup_step: 4/20
+loop_warmup_step: 5/20
+loop_warmup_step: 6/20
+loop_warmup_step: 10/20
+loop_warmup_step: 20/20
+0/20000 val_loss: 9.0067 val_bpb: 3.4868
+1/20000 train_loss: 9.0104 train_time: 0.0m tok/s: 7986673
+2/20000 train_loss: 12.3810 train_time: 0.0m tok/s: 7990325
+3/20000 train_loss: 11.1231 train_time: 0.0m tok/s: 7950587
+4/20000 train_loss: 9.5025 train_time: 0.0m tok/s: 7919191
+5/20000 train_loss: 8.4666 train_time: 0.0m tok/s: 7905525
+500/20000 train_loss: 3.4009 train_time: 0.8m tok/s: 7713323
+1000/20000 train_loss: 3.2113 train_time: 1.7m tok/s: 7696059
+1500/20000 train_loss: 3.2054 train_time: 2.6m tok/s: 7696571
+2000/20000 train_loss: 3.1325 train_time: 3.4m tok/s: 7697640
+2500/20000 train_loss: 2.9975 train_time: 4.3m tok/s: 7695430
+layer_loop:enabled step:2877 frac:0.500 encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10]
+3000/20000 train_loss: 2.9987 train_time: 5.2m tok/s: 7597947
+3500/20000 train_loss: 3.0134 train_time: 6.3m tok/s: 7288254
+4000/20000 train_loss: 2.9576 train_time: 7.4m tok/s: 7071881
+4000/20000 val_loss: 2.9232 val_bpb: 1.1317
+4500/20000 train_loss: 2.9343 train_time: 8.5m tok/s: 6913667
+5000/20000 train_loss: 2.9191 train_time: 9.7m tok/s: 6770717
+5055/20000 val_loss: 2.8180 val_bpb: 1.0909
+stopping_early: wallclock_cap train_time: 588116ms step: 5055/20000
+peak memory allocated: 34604 MiB reserved: 34708 MiB
+ema:applying EMA weights
+pre-quantization post-ema val_loss:2.81558464 val_bpb:1.09000056 eval_time:7304ms
+Serialized model: 135426937 bytes
+Code size: 15798 bytes
+GPTQ:collecting Hessians from calibration data...
+GPTQ:collected 67 Hessians in 11.4s
+Quantized weights:
+ gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight
+ gptq (int8): tok_emb.weight
+ passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights
+Serialized model quantized+brotli: 15971134 bytes
+Total submission size quantized+brotli: 15986932 bytes
+final_int6_roundtrip_exact val_loss:2.84667194 val_bpb:1.10203542 eval_time:28343ms
+final_int6_sliding_window val_loss:2.80365174 val_bpb:1.08538096 eval_time:114844ms
diff --git a/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_seed42.log b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_seed42.log
new file mode 100644
index 0000000000..c5dc2c2ea1
--- /dev/null
+++ b/records/track_10min_16mb/2026-04-06_SP8192_ParallelResiduals_CoprimeStride/train_seed42.log
@@ -0,0 +1,152 @@
+From https://github.com/resouer/parameter-golf
+ * branch submission/r10-packed -> FETCH_HEAD
+Note: switching to 'e079f2aa5278fa377d5ebcc6cbc851017537589b'.
+You are in 'detached HEAD' state. You can look around, make experimental
+changes and commit them, and you can discard any commits you make in this
+state without impacting any branches by switching back to a branch.
+If you want to create a new branch to retain commits you create, you may
+do so (now or later) by using -c with the switch command. Example:
+ git switch -c <new-branch-name>
+Or undo this operation with:
+ git switch -
+Turn off this advice by setting config variable advice.detachedHead to false
+HEAD is now at e079f2a R10 W3 packed submission code (15798 bytes)
+data_setup: vocab=8192 shards=128
+Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
+W0406 06:54:10.462000 1 torch/distributed/run.py:803]
+W0406 06:54:10.462000 1 torch/distributed/run.py:803] *****************************************
+W0406 06:54:10.462000 1 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+Hyperparameters:
+ adam_eps: 1e-08
+ adam_wd: 0.02
+ beta1: 0.9
+ beta2: 0.95
+ compressor: brotli
+ data_dir: ./data/
+ datasets_dir: ./data/datasets/fineweb10B_sp8192
+ distributed: True
+ ema_decay: 0.997
+ embed_bits: 8
+ embed_clip_sigmas: 20.0
+ embed_lr: 0.6
+ embed_wd: 0.085
+ embedding_dim: 512
+ enable_looping_at: 0.5
+ eval_seq_len: 2048
+ eval_stride: 64
+ gptq_calibration_batches: 64
+ gptq_reserve_seconds: 12.0
+ grad_accum_steps: 1
+ grad_clip_norm: 0.3
+ head_lr: 0.008
+ is_main_process: True
+ iterations: 20000
+ ln_scale: True
+ local_rank: 0
+ logfile: logs/27e5376c-2590-48ed-a7ea-5e650ca7456f.txt
+ logit_softcap: 30.0
+ loop_end: 5
+ loop_start: 4
+ matrix_bits: 6
+ matrix_clip_sigmas: 12.85
+ matrix_lr: 0.02
+ max_wallclock_seconds: 600.0
+ min_lr: 0.0
+ mlp_mult: 4.0
+ model_dim: 512
+ model_path: final_model.pt
+ muon_backend_steps: 5
+ muon_beta2: 0.95
+ muon_momentum: 0.99
+ muon_momentum_warmup_start: 0.92
+ muon_momentum_warmup_steps: 1500
+ muon_row_normalize: True
+ muon_wd: 0.085
+ num_heads: 8
+ num_kv_heads: 4
+ num_layers: 11
+ num_loops: 2
+ parallel_start_layer: 7
+ qk_gain_init: 4.0
+ quantized_model_path: final_model.int6.ptz
+ rank: 0
+ rope_base: 10000.0
+ rope_dims: 16
+ rope_train_seq_len: 2048
+ run_id: 27e5376c-2590-48ed-a7ea-5e650ca7456f
+ scalar_lr: 0.02
+ seed: 42
+ skip_gates_enabled: True
+ sliding_window_enabled: True
+ tie_embeddings: True
+ tied_embed_init_std: 0.005
+ tied_embed_lr: 0.03
+ tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model
+ train_batch_tokens: 786432
+ train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin
+ train_log_every: 500
+ train_seq_len: 2048
+ val_batch_tokens: 524288
+ val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin
+ val_loss_every: 4000
+ vocab_size: 8192
+ warmdown_frac: 0.667
+ warmup_steps: 20
+ world_size: 8
+ xsa_last_n: 11
+train_shards: 128
+val_tokens: 40540160
+model_params:35943512
+gptq:reserving 12s, effective=588000ms
+warmup_step: 1/20
+warmup_step: 2/20
+warmup_step: 3/20
+warmup_step: 4/20
+warmup_step: 5/20
+warmup_step: 6/20
+warmup_step: 10/20
+warmup_step: 20/20
+loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10]
+loop_warmup_step: 1/20
+loop_warmup_step: 2/20
+loop_warmup_step: 3/20
+loop_warmup_step: 4/20
+loop_warmup_step: 5/20
+loop_warmup_step: 6/20
+loop_warmup_step: 10/20
+loop_warmup_step: 20/20
+0/20000 val_loss: 9.0090 val_bpb: 3.4877
+1/20000 train_loss: 9.0125 train_time: 0.0m tok/s: 7917075
+2/20000 train_loss: 12.3897 train_time: 0.0m tok/s: 7951107
+3/20000 train_loss: 11.1326 train_time: 0.0m tok/s: 7919545
+4/20000 train_loss: 9.5319 train_time: 0.0m tok/s: 7891117
+5/20000 train_loss: 8.4757 train_time: 0.0m tok/s: 7885644
+500/20000 train_loss: 3.3990 train_time: 0.8m tok/s: 7717413
+1000/20000 train_loss: 3.2145 train_time: 1.7m tok/s: 7702962
+1500/20000 train_loss: 3.2036 train_time: 2.6m tok/s: 7699670
+2000/20000 train_loss: 3.1254 train_time: 3.4m tok/s: 7697218
+2500/20000 train_loss: 2.9918 train_time: 4.3m tok/s: 7695502
+layer_loop:enabled step:2877 frac:0.500 encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10]
+3000/20000 train_loss: 2.9932 train_time: 5.2m tok/s: 7596786
+3500/20000 train_loss: 3.0111 train_time: 6.3m tok/s: 7287585
+4000/20000 train_loss: 2.9540 train_time: 7.4m tok/s: 7071779
+4000/20000 val_loss: 2.9203 val_bpb: 1.1305
+4500/20000 train_loss: 2.9321 train_time: 8.5m tok/s: 6901862
+5000/20000 train_loss: 2.9166 train_time: 9.7m tok/s: 6771780
+5055/20000 val_loss: 2.8149 val_bpb: 1.0897
+stopping_early: wallclock_cap train_time: 588024ms step: 5055/20000
+peak memory allocated: 34604 MiB reserved: 34708 MiB
+ema:applying EMA weights
+pre-quantization post-ema val_loss:2.81255343 val_bpb:1.08882708 eval_time:6245ms
+Serialized model: 135426937 bytes
+Code size: 15798 bytes
+GPTQ:collecting Hessians from calibration data...
+GPTQ:collected 67 Hessians in 11.4s
+Quantized weights:
+ gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight
+ gptq (int8): tok_emb.weight
+ passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights
+Serialized model quantized+brotli: 15973497 bytes
+Total submission size quantized+brotli: 15989295 bytes
+final_int6_roundtrip_exact val_loss:2.84382865 val_bpb:1.10093469 eval_time:28257ms
+final_int6_sliding_window val_loss:2.80070059 val_bpb:1.08423848 eval_time:114963ms