Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions real_ttt_ng0_nogradfix.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
logs/real_ttt_ng0_nogradfix.txt
val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=/content/parameter-golf/data/tokenizers/fineweb_1024_bpe.model
train_loader:dataset:fineweb10B_sp1024 train_shards:1
val_loader:shards pattern=/content/parameter-golf/data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:32768
model_params:1492059
mtp_num_heads:0 mtp_loss_weight:0.2 mtp_params:0
XSA:last_2 active_layers:[2, 3]
world_size:1 grad_accum_steps:8
sdp_backends:cudnn=False flash=True mem_efficient=False math=False
attention_mode:gqa num_heads:6 num_kv_heads:3
tie_embeddings:True embed_lr:0.035 head_lr:0.0 matrix_lr:0.025 scalar_lr:0.025
train_batch_tokens:4096 train_seq_len:256 iterations:300 warmup_steps:20 max_wallclock_seconds:420.000
seed:2026
warmup_step:1/20
warmup_step:2/20
warmup_step:3/20
warmup_step:4/20
warmup_step:5/20
warmup_step:6/20
warmup_step:7/20
warmup_step:8/20
warmup_step:9/20
warmup_step:10/20
warmup_step:11/20
warmup_step:12/20
warmup_step:13/20
warmup_step:14/20
warmup_step:15/20
warmup_step:16/20
warmup_step:17/20
warmup_step:18/20
warmup_step:19/20
warmup_step:20/20
step:1/300 train_loss:6.9279 train_time:206ms step_avg:205.96ms
step:2/300 train_loss:7.1854 train_time:400ms step_avg:200.01ms
step:3/300 train_loss:7.3355 train_time:606ms step_avg:202.02ms
step:4/300 train_loss:7.1885 train_time:794ms step_avg:198.54ms
step:5/300 train_loss:6.7816 train_time:985ms step_avg:197.05ms
step:6/300 train_loss:6.7315 train_time:1174ms step_avg:195.71ms
step:7/300 train_loss:6.4685 train_time:1364ms step_avg:194.84ms
step:8/300 train_loss:6.2437 train_time:1556ms step_avg:194.55ms
step:9/300 train_loss:6.1190 train_time:1756ms step_avg:195.10ms
step:10/300 train_loss:6.1810 train_time:1942ms step_avg:194.16ms
step:25/300 train_loss:5.7491 train_time:4801ms step_avg:192.04ms
step:50/300 train_loss:4.9939 train_time:10252ms step_avg:205.04ms
step:75/300 train_loss:4.7562 train_time:15040ms step_avg:200.53ms
step:100/300 train_loss:4.5184 train_time:20458ms step_avg:204.58ms
step:125/300 train_loss:4.2640 train_time:25326ms step_avg:202.61ms
step:150/300 train_loss:4.1456 train_time:30144ms step_avg:200.96ms
step:175/300 train_loss:4.2562 train_time:35602ms step_avg:203.44ms
step:200/300 train_loss:4.2994 train_time:40430ms step_avg:202.15ms
step:225/300 train_loss:4.0574 train_time:45897ms step_avg:203.99ms
step:250/300 train_loss:4.0453 train_time:50704ms step_avg:202.81ms
step:275/300 train_loss:4.1673 train_time:55663ms step_avg:202.41ms
step:300/300 train_loss:4.0045 train_time:60921ms step_avg:203.07ms
step:300/300 val_loss:4.1373 val_bpb:2.3953 train_time:60922ms step_avg:203.07ms
peak memory allocated: 95 MiB reserved: 106 MiB
ema:applying EMA weights
DIAGNOSTIC post_ema val_loss:5.0071 val_bpb:2.8989 eval_time:514ms
Serialized model: 5388121 bytes
Code size: 95423 bytes
Serialized model int6+lzma: 1225892 bytes
Total submission size int6+lzma: 1321315 bytes
final_int6_roundtrip val_loss:5.0810 val_bpb:2.9417 eval_time:488ms
final_int6_roundtrip_exact val_loss:5.08102238 val_bpb:2.94169822
final_int6_sliding_window val_loss:5.0748 val_bpb:2.9349 stride:64 eval_time:589ms
final_int6_sliding_window_exact val_loss:5.07475208 val_bpb:2.93490344
final_int8_zlib_roundtrip_exact val_loss:5.07475208 val_bpb:2.93490344
ttt_sliding:start chunks=8 chunk_tokens=4096 total_windows=512 stride=64 ttt_lr=0.002 ttt_epochs=1 freeze_blocks=2
ttt_sliding:params unfrozen=1490511 frozen=1548
ttt_chunk [1/8] bpb=2.893501 time=0.2s
ttt_chunk [8/8] bpb=2.921239 time=1.0s
ttt_sliding:done val_loss=5.051125 val_bpb=2.921239 elapsed=1.0s
legal_ttt val_loss:5.0511 val_bpb:2.9212 eval_time:1029ms
legal_ttt_exact val_loss:5.05112508 val_bpb:2.92123914
75 changes: 75 additions & 0 deletions real_ttt_ng1_nogradfix.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
logs/real_ttt_ng1_nogradfix.txt
val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=/content/parameter-golf/data/tokenizers/fineweb_1024_bpe.model
train_loader:dataset:fineweb10B_sp1024 train_shards:1
val_loader:shards pattern=/content/parameter-golf/data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:32768
model_params:1492059
mtp_num_heads:0 mtp_loss_weight:0.2 mtp_params:0
XSA:last_2 active_layers:[2, 3]
world_size:1 grad_accum_steps:8
sdp_backends:cudnn=False flash=True mem_efficient=False math=False
attention_mode:gqa num_heads:6 num_kv_heads:3
tie_embeddings:True embed_lr:0.035 head_lr:0.0 matrix_lr:0.025 scalar_lr:0.025
train_batch_tokens:4096 train_seq_len:256 iterations:300 warmup_steps:20 max_wallclock_seconds:420.000
seed:2026
warmup_step:1/20
warmup_step:2/20
warmup_step:3/20
warmup_step:4/20
warmup_step:5/20
warmup_step:6/20
warmup_step:7/20
warmup_step:8/20
warmup_step:9/20
warmup_step:10/20
warmup_step:11/20
warmup_step:12/20
warmup_step:13/20
warmup_step:14/20
warmup_step:15/20
warmup_step:16/20
warmup_step:17/20
warmup_step:18/20
warmup_step:19/20
warmup_step:20/20
step:1/300 train_loss:6.9279 train_time:224ms step_avg:223.71ms
step:2/300 train_loss:7.1854 train_time:447ms step_avg:223.74ms
step:3/300 train_loss:7.3233 train_time:645ms step_avg:214.90ms
step:4/300 train_loss:7.1710 train_time:844ms step_avg:210.92ms
step:5/300 train_loss:6.7682 train_time:1036ms step_avg:207.29ms
step:6/300 train_loss:6.7159 train_time:1234ms step_avg:205.65ms
step:7/300 train_loss:6.4614 train_time:1460ms step_avg:208.55ms
step:8/300 train_loss:6.2392 train_time:1662ms step_avg:207.78ms
step:9/300 train_loss:6.1148 train_time:1859ms step_avg:206.57ms
step:10/300 train_loss:6.1764 train_time:2057ms step_avg:205.67ms
step:25/300 train_loss:5.7544 train_time:5130ms step_avg:205.20ms
step:50/300 train_loss:5.0143 train_time:10849ms step_avg:216.98ms
step:75/300 train_loss:4.7533 train_time:15840ms step_avg:211.20ms
step:100/300 train_loss:4.5147 train_time:21816ms step_avg:218.16ms
step:125/300 train_loss:4.2680 train_time:26773ms step_avg:214.18ms
step:150/300 train_loss:4.1545 train_time:32113ms step_avg:214.09ms
step:175/300 train_loss:4.2556 train_time:37308ms step_avg:213.19ms
step:200/300 train_loss:4.3005 train_time:42260ms step_avg:211.30ms
step:225/300 train_loss:4.0681 train_time:47834ms step_avg:212.59ms
step:250/300 train_loss:4.0530 train_time:52963ms step_avg:211.85ms
step:275/300 train_loss:4.1599 train_time:58561ms step_avg:212.95ms
step:300/300 train_loss:4.0000 train_time:63871ms step_avg:212.90ms
step:300/300 val_loss:4.1363 val_bpb:2.3948 train_time:63871ms step_avg:212.90ms
peak memory allocated: 95 MiB reserved: 106 MiB
ema:applying EMA weights
DIAGNOSTIC post_ema val_loss:5.0888 val_bpb:2.9462 eval_time:576ms
Serialized model: 5388121 bytes
Code size: 95423 bytes
Serialized model int6+lzma: 1219864 bytes
Total submission size int6+lzma: 1315287 bytes
final_int6_roundtrip val_loss:5.1846 val_bpb:3.0017 eval_time:574ms
final_int6_roundtrip_exact val_loss:5.18460592 val_bpb:3.00166873
final_int6_sliding_window val_loss:4.4451 val_bpb:2.5708 stride:64 eval_time:3455ms
final_int6_sliding_window_exact val_loss:4.44510223 val_bpb:2.57075530
final_int8_zlib_roundtrip_exact val_loss:4.44510223 val_bpb:2.57075530
ttt_sliding:start chunks=8 chunk_tokens=4096 total_windows=512 stride=64 ttt_lr=0.002 ttt_epochs=1 freeze_blocks=2
ttt_sliding:params unfrozen=1490511 frozen=1548
ttt_chunk [1/8] bpb=2.787920 time=0.4s
ttt_chunk [8/8] bpb=2.562853 time=3.1s
ttt_sliding:done val_loss=4.431438 val_bpb=2.562853 elapsed=3.1s
legal_ttt val_loss:4.4314 val_bpb:2.5629 eval_time:3104ms
legal_ttt_exact val_loss:4.43143776 val_bpb:2.56285268
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# CAGE5 Colab T4 smoke (non-record 16MB)

This folder captures a non-record smoke submission for Parameter Golf.

This is an in-progress submission used to validate a complete training -> quantization/export -> evaluation pipeline for a strictly causal hashed 5-gram mixer, and then verify that the same mixer also stacks with legal score-first TTT.

## Summary

- Hardware: 1x Tesla T4 (Google Colab GPU)
- Track: non-record-16mb
- Core idea: interpolate the neural model with a strictly causal hashed 5-gram cache during sliding-window evaluation and legal score-first TTT evaluation

## Best result in `train.log`

- `legal_ttt_exact val_loss: 4.43143776`
- `legal_ttt_exact val_bpb: 2.56285268`
- `final_int6_sliding_window_exact val_loss: 4.44510223`
- `final_int6_sliding_window_exact val_bpb: 2.57075530`
- `Total submission size int6+lzma: 1315287 bytes`
- `Serialized model int6+lzma: 1219864 bytes`
- `Code size: 95423 bytes`

## A/B result against baseline (`ablation_baseline.log`)

- Baseline with legal TTT, no n-gram: `legal_ttt_exact val_bpb = 2.92123914`
- Legal TTT + n-gram: `legal_ttt_exact val_bpb = 2.56285268`
- Absolute gain: `0.35838646 BPB`

## Included files

- `train_gpt.py` — Colab-tested script used for the winning smoke run
- `train.log` — winning legal TTT + n-gram run
- `ablation_baseline.log` — matched baseline without n-gram
- `submission.json` — metadata for this non-record smoke submission
- `README.md` — summary and results
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import math
import torch

def flash_attn_func(q, k, v, causal=True):
    """Drop-in replacement for flash attention on GPUs without flash kernels.

    Computes (optionally causal) scaled dot-product attention by hand, with
    grouped-query support: each kv head is tiled to serve its group of q heads.

    Args:
        q: [B, T, H, D] query tensor.
        k: [B, T, Hkv, D] key tensor (Hkv must divide H).
        v: [B, T, Hkv, D] value tensor.
        causal: when True, positions may not attend to later positions.

    Returns:
        [B, T, H, D] attention output, cast back to q's dtype.

    Raises:
        ValueError: if any input is not 4-D, or Hkv does not divide H.
    """
    if q.ndim != 4 or k.ndim != 4 or v.ndim != 4:
        raise ValueError(f"Unexpected shapes: q={tuple(q.shape)} k={tuple(k.shape)} v={tuple(v.shape)}")

    _, seqlen, q_heads, head_dim = q.shape
    kv_heads = k.shape[2]
    out_dtype = q.dtype

    # GQA: tile kv heads so every query head has a matching key/value head.
    if q_heads != kv_heads:
        if q_heads % kv_heads != 0:
            raise ValueError(f"q_heads={q_heads} must be divisible by kv_heads={kv_heads}")
        group = q_heads // kv_heads
        k = k.repeat_interleave(group, dim=2)
        v = v.repeat_interleave(group, dim=2)

    # Work in [B, H, T, D] layout, and in fp32 for numerical stability.
    qf, kf, vf = (t.permute(0, 2, 1, 3).contiguous().float() for t in (q, k, v))

    scores = (qf @ kf.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))

    if causal:
        # Strict upper triangle marks the future positions to be blocked.
        future = torch.ones(seqlen, seqlen, dtype=torch.bool, device=scores.device).triu(1)
        scores = scores.masked_fill(future, float("-inf"))

    out = torch.softmax(scores, dim=-1) @ vf

    # Cast back to the caller's dtype and restore [B, T, H, D].
    return out.to(out_dtype).permute(0, 2, 1, 3).contiguous()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
zstandard
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"author": "Chandrasen Pandey",
"github_id": "Devchandrasen",
"name": "CAGE5 Colab T4 smoke with legal TTT + 5-gram mixer",
"blurb": "Non-record 16MB smoke submission on a Colab Tesla T4 validating a strictly causal hashed 5-gram mixer that stacks with legal score-first TTT.",
"date": "2026-03-26T00:00:00Z",
"track": "non-record-16mb",
"val_loss": 4.43143776,
"val_bpb": 2.56285268,
"wallclock_seconds": 82.48,
"bytes_total": 1315287,
"bytes_model_int6_lzma": 1219864,
"bytes_code": 95423
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
logs/confirm_best_alpha_seed2026.txt
val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=/content/parameter-golf/data/tokenizers/fineweb_1024_bpe.model
train_loader:dataset:fineweb10B_sp1024 train_shards:1
val_loader:shards pattern=/content/parameter-golf/data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:32768
model_params:417035
mtp_num_heads:0 mtp_loss_weight:0.2 mtp_params:0
XSA:last_1 active_layers:[1]
world_size:1 grad_accum_steps:8
sdp_backends:cudnn=False flash=True mem_efficient=False math=False
attention_mode:gqa num_heads:4 num_kv_heads:2
tie_embeddings:True embed_lr:0.035 head_lr:0.0 matrix_lr:0.025 scalar_lr:0.025
train_batch_tokens:4096 train_seq_len:256 iterations:300 warmup_steps:20 max_wallclock_seconds:300.000
seed:2026
warmup_step:1/20
warmup_step:2/20
warmup_step:3/20
warmup_step:4/20
warmup_step:5/20
warmup_step:6/20
warmup_step:7/20
warmup_step:8/20
warmup_step:9/20
warmup_step:10/20
warmup_step:11/20
warmup_step:12/20
warmup_step:13/20
warmup_step:14/20
warmup_step:15/20
warmup_step:16/20
warmup_step:17/20
warmup_step:18/20
warmup_step:19/20
warmup_step:20/20
step:1/300 train_loss:6.9320 train_time:131ms step_avg:131.44ms
step:2/300 train_loss:6.9578 train_time:246ms step_avg:122.91ms
step:3/300 train_loss:6.8724 train_time:357ms step_avg:118.89ms
step:4/300 train_loss:6.7315 train_time:468ms step_avg:116.92ms
step:5/300 train_loss:6.4418 train_time:591ms step_avg:118.28ms
step:6/300 train_loss:6.5043 train_time:703ms step_avg:117.08ms
step:7/300 train_loss:6.3753 train_time:814ms step_avg:116.33ms
step:8/300 train_loss:6.1810 train_time:928ms step_avg:116.03ms
step:9/300 train_loss:6.0762 train_time:1041ms step_avg:115.62ms
step:10/300 train_loss:6.1478 train_time:1151ms step_avg:115.09ms
step:25/300 train_loss:5.6688 train_time:2864ms step_avg:114.57ms
step:50/300 train_loss:4.9650 train_time:6360ms step_avg:127.21ms
step:75/300 train_loss:4.7800 train_time:9243ms step_avg:123.24ms
step:100/300 train_loss:4.5894 train_time:12128ms step_avg:121.28ms
step:125/300 train_loss:4.3424 train_time:14947ms step_avg:119.57ms
step:150/300 train_loss:4.2386 train_time:18404ms step_avg:122.69ms
step:175/300 train_loss:4.3761 train_time:21245ms step_avg:121.40ms
step:200/300 train_loss:4.4160 train_time:24078ms step_avg:120.39ms
step:225/300 train_loss:4.1972 train_time:26940ms step_avg:119.73ms
step:250/300 train_loss:4.1978 train_time:30427ms step_avg:121.71ms
step:275/300 train_loss:4.2685 train_time:33372ms step_avg:121.35ms
step:300/300 train_loss:4.1652 train_time:36238ms step_avg:120.79ms
step:300/300 val_loss:4.2688 val_bpb:2.4714 train_time:36238ms step_avg:120.79ms
peak memory allocated: 43 MiB reserved: 60 MiB
ema:applying EMA weights
DIAGNOSTIC post_ema val_loss:5.1194 val_bpb:2.9639 eval_time:270ms
Serialized model: 1315153 bytes
Code size: 94276 bytes
Serialized model int6+lzma: 562412 bytes
Total submission size int6+lzma: 656688 bytes
final_int6_roundtrip val_loss:5.1202 val_bpb:2.9644 eval_time:268ms
final_int6_roundtrip_exact val_loss:5.12024499 val_bpb:2.96440646
final_int6_sliding_window_s64 val_loss:4.6690 val_bpb:2.7002 stride:64 eval_time:2212ms
final_int6_sliding_window_s64_exact val_loss:4.66898128 val_bpb:2.70023225
final_int8_zlib_roundtrip_exact val_loss:4.66898128 val_bpb:2.70023225
Loading