Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
429 changes: 429 additions & 0 deletions experiment_log.md

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions run_best.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Run the best configuration found from 131 experiments
# Best result: 1.5207 BPB on RTX 4000 Ada (276 steps in 600s)
# Note: ITERATIONS=400 is the RTX 4000 proxy schedule horizon used in the experiments.
# The competition constraint is 10 minutes on 8xH100, not 400 fixed steps.
#
# Usage: bash run_best.sh
# Requires: GPU with PyTorch, same environment as train_gpt.py

# Fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

# Run from the script's own directory so train_gpt_focal_fixed.py resolves
# regardless of where the script is invoked from. Abort if the cd fails —
# otherwise we would launch training against the wrong working directory.
cd "$(dirname "$0")" || exit 1

# env(1) sets the trainer's hyperparameter overrides for this single run.
env \
  ITERATIONS=400 \
  TIDAL_LR=1 \
  LOGIT_SOFTCAP=15.0 \
  ROPE_BASE=5000 \
  PARALLEL_BLOCK=1 \
  MLP_ACT=silu2 \
  HEAD_DIVERSITY=1e-4 \
  EMBED_LR=0.8 \
  MATRIX_LR=0.11 \
  ENCODER_LAYERS=0 \
  NUM_KV_HEADS=2 \
  TIE_EMBEDDINGS=0 \
  python train_gpt_focal_fixed.py
1,523 changes: 1,523 additions & 0 deletions train_gpt_focal_fixed.py

Large diffs are not rendered by default.

87 changes: 87 additions & 0 deletions wave10_finetune.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/bin/bash
# Wave 10: Fine-grained tuning around best config from Wave 9
# This script assumes 1/8 asymmetric is confirmed best.
# Fine-tune softcap and matrix LR with small increments.

# Everything below assumes this working directory; abort early if it is missing
# (the original unchecked cd would otherwise run experiments from the wrong dir).
cd /workspace/parameter-golf || exit 1
readonly LOG="/workspace/wave10_results.log"

# '>' truncates any previous wave-10 log; all later writes append.
echo "=== WAVE 10: FINE-TUNE $(date) ===" > "$LOG"
echo "Building on best from Wave 9" >> "$LOG"
echo "" >> "$LOG"

#######################################
# Append the newest trainer log's final "step:... val_bpb" line to $LOG,
# then make sure no trainer process survives into the next experiment.
# Globals:   LOG (appended)
# Arguments: $1 - experiment name (recorded by the caller; unused here)
#######################################
grab() {
  local name="$1"
  # Declare and assign separately so a failing command substitution is not
  # masked by 'local' (SC2155).
  local logfile result
  # Newest-first by mtime; silence the error when no logs exist yet so we do
  # not hand grep an empty filename.
  logfile=$(ls -t /workspace/parameter-golf/logs/*.txt 2>/dev/null | head -1)
  if [[ -n "$logfile" ]]; then
    result=$(grep "^step:.*val_bpb" "$logfile" | tail -1)
  else
    result="(no trainer log found)"
  fi
  echo "$result" >> "$LOG"
  echo "END: $(date)" >> "$LOG"
  echo "" >> "$LOG"
  sleep 2
  # NOTE(review): SIGKILL as first resort — assumes the trainer needs no
  # graceful shutdown. pkill is non-zero when nothing matches; that is fine.
  pkill -9 -f train_gpt_focal 2>/dev/null
  sleep 3
}

#######################################
# Launch one experiment and record its result.
# Globals:   LOG (appended)
# Arguments: $1 - experiment name (log/file label)
#            $@ - remaining NAME=value pairs forwarded to the trainer via env(1)
#######################################
run() {
  local name="$1"
  shift
  echo "--- $name ---" >> "$LOG"
  echo "START: $(date)" >> "$LOG"
  # Fixed 400-step proxy horizon; capture all trainer output per experiment.
  # A crashed run is deliberately non-fatal: grab still records whatever the
  # trainer managed to log.
  env ITERATIONS=400 "$@" python train_gpt_focal_fixed.py > "/workspace/${name}.txt" 2>&1
  grab "$name"
}

# NOTE(review): $LOG is expanded unquoted throughout this section; harmless
# only because the path contains no whitespace or glob characters.
# Experiment IDs (F1..F10) and env-var sets are the experiment record — kept
# verbatim. Runs execute sequentially; each appends its result to $LOG.
# ============================================
# PHASE 1: FINE-GRAINED SOFTCAP SWEEP ON 1/8
# If SC15 won Wave 9, sweep 13-17 in steps of 1
# ============================================
echo "========== SOFTCAP FINE SWEEP ===========" >> $LOG

# Sweep softcap around SC15 with MatLR=0.10 (E8 base)
# Only LOGIT_SOFTCAP varies across F1-F4; everything else is the E8 base.
run "F1_SC13_MatLR10" TIDAL_LR=1 LOGIT_SOFTCAP=13.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.10 ENCODER_LAYERS=1
run "F2_SC14_MatLR10" TIDAL_LR=1 LOGIT_SOFTCAP=14.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.10 ENCODER_LAYERS=1
run "F3_SC16_MatLR10" TIDAL_LR=1 LOGIT_SOFTCAP=16.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.10 ENCODER_LAYERS=1
run "F4_SC17_MatLR10" TIDAL_LR=1 LOGIT_SOFTCAP=17.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.10 ENCODER_LAYERS=1

# ============================================
# PHASE 2: FINE-GRAINED MATRIX LR SWEEP ON 1/8
# ============================================
echo "========== MATRIX LR FINE SWEEP ===========" >> $LOG

# E8 showed SC15+MatLR0.10 = 1.5354. Sweep around MatLR0.10 with SC15.
run "F5_SC15_MatLR009" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.09 ENCODER_LAYERS=1
run "F6_SC15_MatLR011" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.11 ENCODER_LAYERS=1

# ============================================
# PHASE 3: GQA ON 1/8
# 2 KV heads was decent before (A17 = 1.5761)
# With asymmetric it might be different.
# ============================================
echo "========== GQA ON 1/8 ===========" >> $LOG

run "F7_GQA2" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.10 ENCODER_LAYERS=1 NUM_KV_HEADS=2

# ============================================
# PHASE 4: TIDAL WARMUP RATIO ON 1/8
# Default Tidal = 38.2% warmup. With more decoder
# layers and faster steps, maybe different ratio helps.
# ============================================
echo "========== TIDAL VARIANT ===========" >> $LOG

# Try 30% warmup (shorter warmup, more time at high LR)
run "F8_Tidal30" TIDAL_LR=1 TIDAL_WARMUP=0.30 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.10 ENCODER_LAYERS=1

# ============================================
# PHASE 5: STACK BEST COMBO
# Combine the best softcap + best LR from above
# ============================================
echo "========== FINAL STACK ===========" >> $LOG

# Stack: E8 config (SC15+MatLR0.10) + QK2.0
run "F9_E8_QK2" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.10 ENCODER_LAYERS=1 QK_GAIN_INIT=2.0

# Rerun E8 config for confidence
run "F10_E8_Rerun" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.10 ENCODER_LAYERS=1

# Dump the accumulated results to stdout at the end of the wave.
echo "" >> $LOG
echo "=== WAVE 10 COMPLETE $(date) ===" >> $LOG
cat $LOG
39 changes: 39 additions & 0 deletions wave11_final.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Wave 11: Trimmed to 2 key experiments only

# Abort early if the workspace is missing; everything below assumes this cwd.
cd /workspace/parameter-golf || exit 1
readonly LOG="/workspace/wave11_results.log"

# '>' truncates any previous wave-11 log; all later writes append.
echo "=== WAVE 11: TRIMMED $(date) ===" > "$LOG"
echo "Only 2 key experiments" >> "$LOG"
echo "" >> "$LOG"

#######################################
# Append the newest trainer log's final "step:... val_bpb" line to $LOG,
# then make sure no trainer process survives into the next experiment.
# Globals:   LOG (appended)
# Arguments: $1 - experiment name (recorded by the caller; unused here)
#######################################
grab() {
  local name="$1"
  # Declare and assign separately so a failing command substitution is not
  # masked by 'local' (SC2155).
  local logfile result
  # Newest-first by mtime; silence the error when no logs exist yet so we do
  # not hand grep an empty filename.
  logfile=$(ls -t /workspace/parameter-golf/logs/*.txt 2>/dev/null | head -1)
  if [[ -n "$logfile" ]]; then
    result=$(grep "^step:.*val_bpb" "$logfile" | tail -1)
  else
    result="(no trainer log found)"
  fi
  echo "$result" >> "$LOG"
  echo "END: $(date)" >> "$LOG"
  echo "" >> "$LOG"
  sleep 2
  # NOTE(review): SIGKILL as first resort — assumes the trainer needs no
  # graceful shutdown. pkill is non-zero when nothing matches; that is fine.
  pkill -9 -f train_gpt_focal 2>/dev/null
  sleep 3
}

#######################################
# Launch one experiment and record its result.
# Globals:   LOG (appended)
# Arguments: $1 - experiment name (log/file label)
#            $@ - remaining NAME=value pairs forwarded to the trainer via env(1)
#######################################
run() {
  local name="$1"
  shift
  echo "--- $name ---" >> "$LOG"
  echo "START: $(date)" >> "$LOG"
  # Fixed 400-step proxy horizon; capture all trainer output per experiment.
  # A crashed run is deliberately non-fatal: grab still records whatever the
  # trainer managed to log.
  env ITERATIONS=400 "$@" python train_gpt_focal_fixed.py > "/workspace/${name}.txt" 2>&1
  grab "$name"
}

# NOTE(review): IDs skip G2 — presumably dropped when the wave was trimmed to
# two experiments; verify against the experiment log before reusing the ID.
# G1: Untied embeddings — novel, could be big win
run "G1_Untied" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.11 ENCODER_LAYERS=1 TIE_EMBEDDINGS=0

# G3: WD Schedule — competition winners use this
run "G3_WDSched" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.11 ENCODER_LAYERS=1 WD_SCHEDULE=0.01

# Dump the accumulated results to stdout at the end of the wave.
echo "" >> $LOG
echo "=== WAVE 11 COMPLETE $(date) ===" >> $LOG
cat $LOG
80 changes: 80 additions & 0 deletions wave12_aggressive.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/bin/bash
# Wave 12: Aggressive experiments — pure decoder, wider MLP, model scaling
# Uses findings from Waves 9-11 + 3.1MB artifact headroom

# Abort early if the workspace is missing; everything below assumes this cwd.
cd /workspace/parameter-golf || exit 1
readonly LOG="/workspace/wave12_results.log"

# '>' truncates any previous wave-12 log; all later writes append.
echo "=== WAVE 12: AGGRESSIVE $(date) ===" > "$LOG"
echo "Pure decoder, wider MLP, scaling experiments" >> "$LOG"
echo "" >> "$LOG"

#######################################
# Append the newest trainer log's final "step:... val_bpb" line to $LOG,
# then make sure no trainer process survives into the next experiment.
# Globals:   LOG (appended)
# Arguments: $1 - experiment name (recorded by the caller; unused here)
#######################################
grab() {
  local name="$1"
  # Declare and assign separately so a failing command substitution is not
  # masked by 'local' (SC2155).
  local logfile result
  # Newest-first by mtime; silence the error when no logs exist yet so we do
  # not hand grep an empty filename.
  logfile=$(ls -t /workspace/parameter-golf/logs/*.txt 2>/dev/null | head -1)
  if [[ -n "$logfile" ]]; then
    result=$(grep "^step:.*val_bpb" "$logfile" | tail -1)
  else
    result="(no trainer log found)"
  fi
  echo "$result" >> "$LOG"
  echo "END: $(date)" >> "$LOG"
  echo "" >> "$LOG"
  sleep 2
  # NOTE(review): SIGKILL as first resort — assumes the trainer needs no
  # graceful shutdown. pkill is non-zero when nothing matches; that is fine.
  pkill -9 -f train_gpt_focal 2>/dev/null
  sleep 3
}

#######################################
# Launch one experiment and record its result.
# Globals:   LOG (appended)
# Arguments: $1 - experiment name (log/file label)
#            $@ - remaining NAME=value pairs forwarded to the trainer via env(1)
#######################################
run() {
  local name="$1"
  shift
  echo "--- $name ---" >> "$LOG"
  echo "START: $(date)" >> "$LOG"
  # Fixed 400-step proxy horizon; capture all trainer output per experiment.
  # A crashed run is deliberately non-fatal: grab still records whatever the
  # trainer managed to log.
  env ITERATIONS=400 "$@" python train_gpt_focal_fixed.py > "/workspace/${name}.txt" 2>&1
  grab "$name"
}

# NOTE(review): $LOG is expanded unquoted throughout this section; harmless
# only because the path contains no whitespace or glob characters.
# Experiment IDs (H1..H6) and env-var sets are the experiment record — kept
# verbatim. Runs execute sequentially; each appends its result to $LOG.
# ============================================
# PHASE 1: PURE DECODER (ENCODER_LAYERS=0)
# Bug fixed: default=-1, so ENCODER_LAYERS=0 now works
# All 9 layers as decoder, no encoder skip connections
# ============================================
echo "========== PURE DECODER ===========" >> $LOG

# H1: Pure decoder with best config (E8 base)
run "H1_PureDecoder" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.11 ENCODER_LAYERS=0 NUM_KV_HEADS=2 TIE_EMBEDDINGS=0

# ============================================
# PHASE 2: WIDER MLP (3x)
# Top competition entries use 3x MLP width
# MLP_MULT=3 + SiLU² → hidden=1024 (vs current 682)
# More params but potentially much better quality
# Will be slower per step but might make up in quality
# ============================================
echo "========== WIDER MLP ===========" >> $LOG

# H2: 3x MLP on best config (1/8 split)
run "H2_MLP3x" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.11 ENCODER_LAYERS=1 MLP_MULT=3 NUM_KV_HEADS=2 TIE_EMBEDDINGS=0

# H3: 3x MLP + pure decoder
run "H3_MLP3x_PureDec" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.11 ENCODER_LAYERS=0 MLP_MULT=3 NUM_KV_HEADS=2 TIE_EMBEDDINGS=0

# ============================================
# PHASE 3: WD SCHEDULE (only env var that exists)
# WD_SCHEDULE ramps weight decay from 0 to target
# Competition winners use WD=0.04
# ============================================
echo "========== WEIGHT DECAY SCHEDULE ===========" >> $LOG

# H4: WD schedule ramping to 0.04
run "H4_WDSched04" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.11 ENCODER_LAYERS=1 WD_SCHEDULE=0.04 NUM_KV_HEADS=2 TIE_EMBEDDINGS=0

# ============================================
# PHASE 4: STACK WINNERS FROM ABOVE
# ============================================
echo "========== STACK ===========" >> $LOG

# H5: Pure decoder + 3x MLP + WD (aggressive combo)
run "H5_AllStack" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.11 ENCODER_LAYERS=0 MLP_MULT=3 WD_SCHEDULE=0.04 NUM_KV_HEADS=2 TIE_EMBEDDINGS=0

# H6: Best config rerun for final confidence
run "H6_BestRerun" TIDAL_LR=1 LOGIT_SOFTCAP=15.0 ROPE_BASE=5000 PARALLEL_BLOCK=1 MLP_ACT=silu2 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.11 ENCODER_LAYERS=1 NUM_KV_HEADS=2 TIE_EMBEDDINGS=0

# Dump the accumulated results to stdout at the end of the wave.
echo "" >> $LOG
echo "=== WAVE 12 COMPLETE $(date) ===" >> $LOG
cat $LOG
107 changes: 107 additions & 0 deletions wave5_arch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/bin/bash
# Wave 5: Architecture experiments — static changes, torch.compile safe

# Abort early if the workspace is missing; everything below assumes this cwd.
cd /workspace/parameter-golf || exit 1
readonly LOG="/workspace/wave5_results.log"

# '>' truncates any previous wave-5 log; all later writes append.
echo "=== WAVE 5: ARCHITECTURE $(date) ===" > "$LOG"
echo "BASELINE: Cosine = 1.6117 | Best: 1.5744 (Tidal+SC20+RoPE5k+HD+AggrLR)" >> "$LOG"
echo "" >> "$LOG"

#######################################
# Append the newest trainer log's final "step:... val_bpb" line to $LOG,
# then make sure no trainer process survives into the next experiment.
# Globals:   LOG (appended)
# Arguments: $1 - experiment name (recorded by the caller; unused here)
#######################################
grab() {
  local name="$1"
  # Declare and assign separately so a failing command substitution is not
  # masked by 'local' (SC2155).
  local logfile result
  # Newest-first by mtime; silence the error when no logs exist yet so we do
  # not hand grep an empty filename.
  logfile=$(ls -t /workspace/parameter-golf/logs/*.txt 2>/dev/null | head -1)
  if [[ -n "$logfile" ]]; then
    result=$(grep "^step:.*val_bpb" "$logfile" | tail -1)
  else
    result="(no trainer log found)"
  fi
  echo "$result" >> "$LOG"
  echo "END: $(date)" >> "$LOG"
  echo "" >> "$LOG"
  sleep 2
  # NOTE(review): SIGKILL as first resort — assumes the trainer needs no
  # graceful shutdown. pkill is non-zero when nothing matches; that is fine.
  pkill -9 -f train_gpt_focal 2>/dev/null
  sleep 3
}

#######################################
# Launch one experiment and record its result.
# Globals:   LOG (appended)
# Arguments: $1 - experiment name (log/file label)
#            $@ - remaining NAME=value pairs forwarded to the trainer via env(1)
#######################################
run() {
  local name="$1"
  shift
  echo "--- $name ---" >> "$LOG"
  echo "START: $(date)" >> "$LOG"
  # Fixed 400-step proxy horizon; capture all trainer output per experiment.
  # A crashed run is deliberately non-fatal: grab still records whatever the
  # trainer managed to log.
  env ITERATIONS=400 "$@" python train_gpt_focal_fixed.py > "/workspace/${name}.txt" 2>&1
  grab "$name"
}

# NOTE(review): $LOG is expanded unquoted throughout this section; harmless
# only because the path contains no whitespace or glob characters.
# Experiment IDs (A1..A18) and env-var sets are the experiment record — kept
# verbatim. Runs execute sequentially; each appends its result to $LOG.
# ============================================
# PHASE 0: ACTIVATION FUNCTIONS (on best config)
# ============================================
echo "========== ACTIVATION FUNCTIONS ==========" >> $LOG

# A1: LeakyReLU² — on the leaderboard! (#2 entry uses this)
run "A1_LeakyReLU2" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 MLP_ACT=leaky_relu2

# A2: LeakyReLU² on cosine baseline (to isolate activation effect)
run "A2_LeakyReLU2_cosine" COSINE_LR=1 MLP_ACT=leaky_relu2

# A3: SwiGLU — standard in Llama/Mistral
run "A3_SwiGLU" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 MLP_ACT=swiglu

# A4: SwiGLU on cosine baseline
run "A4_SwiGLU_cosine" COSINE_LR=1 MLP_ACT=swiglu

# A5: GELU² — smoother than ReLU²
run "A5_GELU2" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 MLP_ACT=gelu2

# A6: SiLU² — like SwiGLU but without gate
run "A6_SiLU2" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 MLP_ACT=silu2

# ============================================
# PHASE 1: BLOCK STRUCTURE
# ============================================
echo "========== BLOCK STRUCTURE ==========" >> $LOG

# A7: Parallel attention + MLP (PaLM-style)
run "A7_Parallel" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 PARALLEL_BLOCK=1

# A8: Parallel on cosine
run "A8_Parallel_cosine" COSINE_LR=1 PARALLEL_BLOCK=1

# A9: Sandwich norm (extra norm after attention)
run "A9_Sandwich" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 SANDWICH_NORM=1

# ============================================
# PHASE 2: ACTIVATION + STRUCTURE COMBOS
# ============================================
echo "========== COMBOS ==========" >> $LOG

# A10: LeakyReLU² + Parallel (combine two arch changes)
run "A10_LeakyParallel" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 MLP_ACT=leaky_relu2 PARALLEL_BLOCK=1

# A11: SwiGLU + Parallel
run "A11_SwiGLUParallel" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 MLP_ACT=swiglu PARALLEL_BLOCK=1

# A12: LeakyReLU² + full best config
run "A12_LeakyBest" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.06 MLP_ACT=leaky_relu2

# A13: SwiGLU + full best config
run "A13_SwiGLUBest" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.06 MLP_ACT=swiglu

# A14: LeakyReLU² + Parallel + full best (EVERYTHING)
run "A14_EVERYTHING" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 HEAD_DIVERSITY=1e-4 EMBED_LR=0.8 MATRIX_LR=0.06 MLP_ACT=leaky_relu2 PARALLEL_BLOCK=1

# ============================================
# PHASE 3: WIDER/DEEPER WITH ARCH CHANGES
# ============================================
echo "========== SCALE WITH ARCH ==========" >> $LOG

# A15: LeakyReLU² + wider MLP (3x)
run "A15_LeakyMLP3" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 MLP_ACT=leaky_relu2 MLP_MULT=3

# A16: SwiGLU + wider dim (since SwiGLU has fewer params per layer)
run "A16_SwiGLU_wide" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 MLP_ACT=swiglu MODEL_DIM=576 NUM_LAYERS=8

# A17: LeakyReLU² + 2 KV heads (more aggressive GQA)
run "A17_LeakyGQA2" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 MLP_ACT=leaky_relu2 NUM_KV_HEADS=2

# A18: LeakyReLU² + 1 KV head (MQA)
run "A18_LeakyMQA" TIDAL_LR=1 LOGIT_SOFTCAP=20.0 ROPE_BASE=5000 MLP_ACT=leaky_relu2 NUM_KV_HEADS=1

# Dump the accumulated results to stdout at the end of the wave.
echo "" >> $LOG
echo "=== WAVE 5 COMPLETE $(date) ===" >> $LOG
cat $LOG
Loading