Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions real_ttt_ng0_nogradfix.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
logs/real_ttt_ng0_nogradfix.txt
val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=/content/parameter-golf/data/tokenizers/fineweb_1024_bpe.model
train_loader:dataset:fineweb10B_sp1024 train_shards:1
val_loader:shards pattern=/content/parameter-golf/data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:32768
model_params:1492059
mtp_num_heads:0 mtp_loss_weight:0.2 mtp_params:0
XSA:last_2 active_layers:[2, 3]
world_size:1 grad_accum_steps:8
sdp_backends:cudnn=False flash=True mem_efficient=False math=False
attention_mode:gqa num_heads:6 num_kv_heads:3
tie_embeddings:True embed_lr:0.035 head_lr:0.0 matrix_lr:0.025 scalar_lr:0.025
train_batch_tokens:4096 train_seq_len:256 iterations:300 warmup_steps:20 max_wallclock_seconds:420.000
seed:2026
warmup_step:1/20
warmup_step:2/20
warmup_step:3/20
warmup_step:4/20
warmup_step:5/20
warmup_step:6/20
warmup_step:7/20
warmup_step:8/20
warmup_step:9/20
warmup_step:10/20
warmup_step:11/20
warmup_step:12/20
warmup_step:13/20
warmup_step:14/20
warmup_step:15/20
warmup_step:16/20
warmup_step:17/20
warmup_step:18/20
warmup_step:19/20
warmup_step:20/20
step:1/300 train_loss:6.9279 train_time:206ms step_avg:205.96ms
step:2/300 train_loss:7.1854 train_time:400ms step_avg:200.01ms
step:3/300 train_loss:7.3355 train_time:606ms step_avg:202.02ms
step:4/300 train_loss:7.1885 train_time:794ms step_avg:198.54ms
step:5/300 train_loss:6.7816 train_time:985ms step_avg:197.05ms
step:6/300 train_loss:6.7315 train_time:1174ms step_avg:195.71ms
step:7/300 train_loss:6.4685 train_time:1364ms step_avg:194.84ms
step:8/300 train_loss:6.2437 train_time:1556ms step_avg:194.55ms
step:9/300 train_loss:6.1190 train_time:1756ms step_avg:195.10ms
step:10/300 train_loss:6.1810 train_time:1942ms step_avg:194.16ms
step:25/300 train_loss:5.7491 train_time:4801ms step_avg:192.04ms
step:50/300 train_loss:4.9939 train_time:10252ms step_avg:205.04ms
step:75/300 train_loss:4.7562 train_time:15040ms step_avg:200.53ms
step:100/300 train_loss:4.5184 train_time:20458ms step_avg:204.58ms
step:125/300 train_loss:4.2640 train_time:25326ms step_avg:202.61ms
step:150/300 train_loss:4.1456 train_time:30144ms step_avg:200.96ms
step:175/300 train_loss:4.2562 train_time:35602ms step_avg:203.44ms
step:200/300 train_loss:4.2994 train_time:40430ms step_avg:202.15ms
step:225/300 train_loss:4.0574 train_time:45897ms step_avg:203.99ms
step:250/300 train_loss:4.0453 train_time:50704ms step_avg:202.81ms
step:275/300 train_loss:4.1673 train_time:55663ms step_avg:202.41ms
step:300/300 train_loss:4.0045 train_time:60921ms step_avg:203.07ms
step:300/300 val_loss:4.1373 val_bpb:2.3953 train_time:60922ms step_avg:203.07ms
peak memory allocated: 95 MiB reserved: 106 MiB
ema:applying EMA weights
DIAGNOSTIC post_ema val_loss:5.0071 val_bpb:2.8989 eval_time:514ms
Serialized model: 5388121 bytes
Code size: 95423 bytes
Serialized model int6+lzma: 1225892 bytes
Total submission size int6+lzma: 1321315 bytes
final_int6_roundtrip val_loss:5.0810 val_bpb:2.9417 eval_time:488ms
final_int6_roundtrip_exact val_loss:5.08102238 val_bpb:2.94169822
final_int6_sliding_window val_loss:5.0748 val_bpb:2.9349 stride:64 eval_time:589ms
final_int6_sliding_window_exact val_loss:5.07475208 val_bpb:2.93490344
final_int8_zlib_roundtrip_exact val_loss:5.07475208 val_bpb:2.93490344
ttt_sliding:start chunks=8 chunk_tokens=4096 total_windows=512 stride=64 ttt_lr=0.002 ttt_epochs=1 freeze_blocks=2
ttt_sliding:params unfrozen=1490511 frozen=1548
ttt_chunk [1/8] bpb=2.893501 time=0.2s
ttt_chunk [8/8] bpb=2.921239 time=1.0s
ttt_sliding:done val_loss=5.051125 val_bpb=2.921239 elapsed=1.0s
legal_ttt val_loss:5.0511 val_bpb:2.9212 eval_time:1029ms
legal_ttt_exact val_loss:5.05112508 val_bpb:2.92123914
75 changes: 75 additions & 0 deletions real_ttt_ng1_nogradfix.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
logs/real_ttt_ng1_nogradfix.txt
val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=/content/parameter-golf/data/tokenizers/fineweb_1024_bpe.model
train_loader:dataset:fineweb10B_sp1024 train_shards:1
val_loader:shards pattern=/content/parameter-golf/data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:32768
model_params:1492059
mtp_num_heads:0 mtp_loss_weight:0.2 mtp_params:0
XSA:last_2 active_layers:[2, 3]
world_size:1 grad_accum_steps:8
sdp_backends:cudnn=False flash=True mem_efficient=False math=False
attention_mode:gqa num_heads:6 num_kv_heads:3
tie_embeddings:True embed_lr:0.035 head_lr:0.0 matrix_lr:0.025 scalar_lr:0.025
train_batch_tokens:4096 train_seq_len:256 iterations:300 warmup_steps:20 max_wallclock_seconds:420.000
seed:2026
warmup_step:1/20
warmup_step:2/20
warmup_step:3/20
warmup_step:4/20
warmup_step:5/20
warmup_step:6/20
warmup_step:7/20
warmup_step:8/20
warmup_step:9/20
warmup_step:10/20
warmup_step:11/20
warmup_step:12/20
warmup_step:13/20
warmup_step:14/20
warmup_step:15/20
warmup_step:16/20
warmup_step:17/20
warmup_step:18/20
warmup_step:19/20
warmup_step:20/20
step:1/300 train_loss:6.9279 train_time:224ms step_avg:223.71ms
step:2/300 train_loss:7.1854 train_time:447ms step_avg:223.74ms
step:3/300 train_loss:7.3233 train_time:645ms step_avg:214.90ms
step:4/300 train_loss:7.1710 train_time:844ms step_avg:210.92ms
step:5/300 train_loss:6.7682 train_time:1036ms step_avg:207.29ms
step:6/300 train_loss:6.7159 train_time:1234ms step_avg:205.65ms
step:7/300 train_loss:6.4614 train_time:1460ms step_avg:208.55ms
step:8/300 train_loss:6.2392 train_time:1662ms step_avg:207.78ms
step:9/300 train_loss:6.1148 train_time:1859ms step_avg:206.57ms
step:10/300 train_loss:6.1764 train_time:2057ms step_avg:205.67ms
step:25/300 train_loss:5.7544 train_time:5130ms step_avg:205.20ms
step:50/300 train_loss:5.0143 train_time:10849ms step_avg:216.98ms
step:75/300 train_loss:4.7533 train_time:15840ms step_avg:211.20ms
step:100/300 train_loss:4.5147 train_time:21816ms step_avg:218.16ms
step:125/300 train_loss:4.2680 train_time:26773ms step_avg:214.18ms
step:150/300 train_loss:4.1545 train_time:32113ms step_avg:214.09ms
step:175/300 train_loss:4.2556 train_time:37308ms step_avg:213.19ms
step:200/300 train_loss:4.3005 train_time:42260ms step_avg:211.30ms
step:225/300 train_loss:4.0681 train_time:47834ms step_avg:212.59ms
step:250/300 train_loss:4.0530 train_time:52963ms step_avg:211.85ms
step:275/300 train_loss:4.1599 train_time:58561ms step_avg:212.95ms
step:300/300 train_loss:4.0000 train_time:63871ms step_avg:212.90ms
step:300/300 val_loss:4.1363 val_bpb:2.3948 train_time:63871ms step_avg:212.90ms
peak memory allocated: 95 MiB reserved: 106 MiB
ema:applying EMA weights
DIAGNOSTIC post_ema val_loss:5.0888 val_bpb:2.9462 eval_time:576ms
Serialized model: 5388121 bytes
Code size: 95423 bytes
Serialized model int6+lzma: 1219864 bytes
Total submission size int6+lzma: 1315287 bytes
final_int6_roundtrip val_loss:5.1846 val_bpb:3.0017 eval_time:574ms
final_int6_roundtrip_exact val_loss:5.18460592 val_bpb:3.00166873
final_int6_sliding_window val_loss:4.4451 val_bpb:2.5708 stride:64 eval_time:3455ms
final_int6_sliding_window_exact val_loss:4.44510223 val_bpb:2.57075530
final_int8_zlib_roundtrip_exact val_loss:4.44510223 val_bpb:2.57075530
ttt_sliding:start chunks=8 chunk_tokens=4096 total_windows=512 stride=64 ttt_lr=0.002 ttt_epochs=1 freeze_blocks=2
ttt_sliding:params unfrozen=1490511 frozen=1548
ttt_chunk [1/8] bpb=2.787920 time=0.4s
ttt_chunk [8/8] bpb=2.562853 time=3.1s
ttt_sliding:done val_loss=4.431438 val_bpb=2.562853 elapsed=3.1s
legal_ttt val_loss:4.4314 val_bpb:2.5629 eval_time:3104ms
legal_ttt_exact val_loss:4.43143776 val_bpb:2.56285268
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# CAGE5 Colab T4 smoke (non-record 16MB)

This folder captures a non-record smoke submission for Parameter Golf.

This is an in-progress submission used to validate a complete training -> quantization/export -> evaluation pipeline for a strictly causal hashed 5-gram mixer, and then verify that the same mixer also stacks with legal score-first TTT.

## Summary

- Hardware: 1x Tesla T4 (Google Colab GPU)
- Track: non-record-16mb
- Core idea: interpolate the neural model with a strictly causal hashed 5-gram cache during sliding-window evaluation and legal score-first TTT evaluation

## Best result in `train.log`

- `legal_ttt_exact val_loss: 4.43143776`
- `legal_ttt_exact val_bpb: 2.56285268`
- `final_int6_sliding_window_exact val_loss: 4.44510223`
- `final_int6_sliding_window_exact val_bpb: 2.57075530`
- `Total submission size int6+lzma: 1315287 bytes`
- `Serialized model int6+lzma: 1219864 bytes`
- `Code size: 95423 bytes`

## A/B result against baseline (`ablation_baseline.log`)

- Baseline with legal TTT, no n-gram: `legal_ttt_exact val_bpb = 2.92123914`
- Legal TTT + n-gram: `legal_ttt_exact val_bpb = 2.56285268`
- Absolute gain: `0.35838646 BPB`

## Included files

- `train_gpt.py` — Colab-tested script used for the winning smoke run
- `train.log` — winning legal TTT + n-gram run
- `ablation_baseline.log` — matched baseline without n-gram
- `submission.json` — metadata for this non-record smoke submission
- `README.md` — summary and results
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import math
import torch

def flash_attn_func(q, k, v, causal=True):
    """Drop-in replacement for flash attention on GPUs without flash kernels.

    Computes (optionally causal) scaled dot-product attention by hand, with
    grouped-query support: each kv head is tiled to serve its group of q heads.

    Args:
        q: [B, T, H, D] query tensor.
        k: [B, T, Hkv, D] key tensor (Hkv must divide H).
        v: [B, T, Hkv, D] value tensor.
        causal: when True, positions may not attend to later positions.

    Returns:
        [B, T, H, D] attention output, cast back to q's dtype.

    Raises:
        ValueError: if any input is not 4-D, or Hkv does not divide H.
    """
    if q.ndim != 4 or k.ndim != 4 or v.ndim != 4:
        raise ValueError(f"Unexpected shapes: q={tuple(q.shape)} k={tuple(k.shape)} v={tuple(v.shape)}")

    _, seqlen, q_heads, head_dim = q.shape
    kv_heads = k.shape[2]
    out_dtype = q.dtype

    # GQA: tile kv heads so every query head has a matching key/value head.
    if q_heads != kv_heads:
        if q_heads % kv_heads != 0:
            raise ValueError(f"q_heads={q_heads} must be divisible by kv_heads={kv_heads}")
        group = q_heads // kv_heads
        k = k.repeat_interleave(group, dim=2)
        v = v.repeat_interleave(group, dim=2)

    # Work in [B, H, T, D] layout, and in fp32 for numerical stability.
    qf, kf, vf = (t.permute(0, 2, 1, 3).contiguous().float() for t in (q, k, v))

    scores = (qf @ kf.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))

    if causal:
        # Strict upper triangle marks the future positions to be blocked.
        future = torch.ones(seqlen, seqlen, dtype=torch.bool, device=scores.device).triu(1)
        scores = scores.masked_fill(future, float("-inf"))

    out = torch.softmax(scores, dim=-1) @ vf

    # Cast back to the caller's dtype and restore [B, T, H, D].
    return out.to(out_dtype).permute(0, 2, 1, 3).contiguous()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
zstandard
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"author": "Chandrasen Pandey",
"github_id": "Devchandrasen",
"name": "CAGE5 Colab T4 smoke with legal TTT + 5-gram mixer",
"blurb": "Non-record 16MB smoke submission on a Colab Tesla T4 validating a strictly causal hashed 5-gram mixer that stacks with legal score-first TTT.",
"date": "2026-03-26T00:00:00Z",
"track": "non-record-16mb",
"val_loss": 4.43143776,
"val_bpb": 2.56285268,
"wallclock_seconds": 82.48,
"bytes_total": 1315287,
"bytes_model_int6_lzma": 1219864,
"bytes_code": 95423
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
logs/confirm_best_alpha_seed2026.txt
val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=/content/parameter-golf/data/tokenizers/fineweb_1024_bpe.model
train_loader:dataset:fineweb10B_sp1024 train_shards:1
val_loader:shards pattern=/content/parameter-golf/data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:32768
model_params:417035
mtp_num_heads:0 mtp_loss_weight:0.2 mtp_params:0
XSA:last_1 active_layers:[1]
world_size:1 grad_accum_steps:8
sdp_backends:cudnn=False flash=True mem_efficient=False math=False
attention_mode:gqa num_heads:4 num_kv_heads:2
tie_embeddings:True embed_lr:0.035 head_lr:0.0 matrix_lr:0.025 scalar_lr:0.025
train_batch_tokens:4096 train_seq_len:256 iterations:300 warmup_steps:20 max_wallclock_seconds:300.000
seed:2026
warmup_step:1/20
warmup_step:2/20
warmup_step:3/20
warmup_step:4/20
warmup_step:5/20
warmup_step:6/20
warmup_step:7/20
warmup_step:8/20
warmup_step:9/20
warmup_step:10/20
warmup_step:11/20
warmup_step:12/20
warmup_step:13/20
warmup_step:14/20
warmup_step:15/20
warmup_step:16/20
warmup_step:17/20
warmup_step:18/20
warmup_step:19/20
warmup_step:20/20
step:1/300 train_loss:6.9320 train_time:131ms step_avg:131.44ms
step:2/300 train_loss:6.9578 train_time:246ms step_avg:122.91ms
step:3/300 train_loss:6.8724 train_time:357ms step_avg:118.89ms
step:4/300 train_loss:6.7315 train_time:468ms step_avg:116.92ms
step:5/300 train_loss:6.4418 train_time:591ms step_avg:118.28ms
step:6/300 train_loss:6.5043 train_time:703ms step_avg:117.08ms
step:7/300 train_loss:6.3753 train_time:814ms step_avg:116.33ms
step:8/300 train_loss:6.1810 train_time:928ms step_avg:116.03ms
step:9/300 train_loss:6.0762 train_time:1041ms step_avg:115.62ms
step:10/300 train_loss:6.1478 train_time:1151ms step_avg:115.09ms
step:25/300 train_loss:5.6688 train_time:2864ms step_avg:114.57ms
step:50/300 train_loss:4.9650 train_time:6360ms step_avg:127.21ms
step:75/300 train_loss:4.7800 train_time:9243ms step_avg:123.24ms
step:100/300 train_loss:4.5894 train_time:12128ms step_avg:121.28ms
step:125/300 train_loss:4.3424 train_time:14947ms step_avg:119.57ms
step:150/300 train_loss:4.2386 train_time:18404ms step_avg:122.69ms
step:175/300 train_loss:4.3761 train_time:21245ms step_avg:121.40ms
step:200/300 train_loss:4.4160 train_time:24078ms step_avg:120.39ms
step:225/300 train_loss:4.1972 train_time:26940ms step_avg:119.73ms
step:250/300 train_loss:4.1978 train_time:30427ms step_avg:121.71ms
step:275/300 train_loss:4.2685 train_time:33372ms step_avg:121.35ms
step:300/300 train_loss:4.1652 train_time:36238ms step_avg:120.79ms
step:300/300 val_loss:4.2688 val_bpb:2.4714 train_time:36238ms step_avg:120.79ms
peak memory allocated: 43 MiB reserved: 60 MiB
ema:applying EMA weights
DIAGNOSTIC post_ema val_loss:5.1194 val_bpb:2.9639 eval_time:270ms
Serialized model: 1315153 bytes
Code size: 94276 bytes
Serialized model int6+lzma: 562412 bytes
Total submission size int6+lzma: 656688 bytes
final_int6_roundtrip val_loss:5.1202 val_bpb:2.9644 eval_time:268ms
final_int6_roundtrip_exact val_loss:5.12024499 val_bpb:2.96440646
final_int6_sliding_window_s64 val_loss:4.6690 val_bpb:2.7002 stride:64 eval_time:2212ms
final_int6_sliding_window_s64_exact val_loss:4.66898128 val_bpb:2.70023225
final_int8_zlib_roundtrip_exact val_loss:4.66898128 val_bpb:2.70023225
Loading