Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
c607e49
Rakı Training v3 - Markov curriculum + BigramHash + EMA
Mertyandimata Apr 3, 2026
9df8a78
Rakı v5 - Full integrated system: Stochastic Depth, TTT, BigramHash e…
Mertyandimata Apr 3, 2026
9d2c07b
Rakí v5 - hybrid entropy×surprise scoring
Mertyandimata Apr 3, 2026
925517d
Rakí v5 CUDA - H100 ready
Mertyandimata Apr 3, 2026
57f6510
v6 cuda bugfix
Mertyandimata Apr 3, 2026
07379fa
v7 all bugs fixed
Mertyandimata Apr 3, 2026
9ba11c1
Raki patcher for baseline train_gpt.py
Mertyandimata Apr 3, 2026
20f8a8d
Delete train_raki_v6.py
Mertyandimata Apr 3, 2026
f57362a
Delete train_raki_v3.py
Mertyandimata Apr 3, 2026
33f213f
Delete train_raki_v5.py
Mertyandimata Apr 3, 2026
36b99fd
Delete train_raki_v5_cuda.py
Mertyandimata Apr 3, 2026
d5e29c4
Raki Training: OEE-inspired Markov curriculum + EMA
Mertyandimata Apr 3, 2026
d902413
Yandimata v2: GPU-Markov + sliding window + int6 + zstd + all meta
Mertyandimata Apr 3, 2026
8dcfbaa
Delete train_raki_v7.py
Mertyandimata Apr 3, 2026
ff42cd6
Raki V2: INT8 fix + GPTQ-lite + all meta techniques
Mertyandimata Apr 3, 2026
b3dedfc
Çift Raki: 6B×2rec + trigram + int8
Mertyandimata Apr 3, 2026
b33b017
Çift Raki: 6B×2rec + trigram + int8
Mertyandimata Apr 3, 2026
152421b
V2: BigramHash + sliding window + SOTA params
Mertyandimata Apr 3, 2026
be9fec8
V2: BigramHash + sliding window + SOTA params
Mertyandimata Apr 3, 2026
00b43f8
V2: BigramHash + sliding window + SOTA params
Mertyandimata Apr 3, 2026
60e96dc
V2: add pruning for 16MB fit
Mertyandimata Apr 3, 2026
d31e17b
V3: mixed int6/int8 + Partial RoPE
Mertyandimata Apr 3, 2026
afb1b8d
V4: Adaptive Markov curriculum
Mertyandimata Apr 4, 2026
7d8b4dd
V4: Adaptive Markov + auto qmax
Mertyandimata Apr 4, 2026
a5c43de
V5: Raki triple role - curriculum + adaptive + logit boost
Mertyandimata Apr 4, 2026
10ee02e
V3: auto qmax, V5: triple Raki
Mertyandimata Apr 4, 2026
1b2593c
fix: global→globals() for auto qmax
Mertyandimata Apr 4, 2026
8f37271
Delete patch_v2.py
Mertyandimata Apr 4, 2026
85678fb
Delete patch_v4.py
Mertyandimata Apr 4, 2026
fd88fef
Delete patch_v3.py
Mertyandimata Apr 4, 2026
5707147
Delete patch_raki.py
Mertyandimata Apr 4, 2026
a5e4ffc
Delete patch_v5.py
Mertyandimata Apr 4, 2026
cbf0fee
feat: Raki V6 — Hadamard rotation + SVD boost + depth recycling + lay…
Mertyandimata Apr 4, 2026
6058dc7
V7: mulaw companding + bigram KL injection
Mertyandimata Apr 4, 2026
b5d1953
V8: LeakyReLU² + Late QAT + XSA4 + LN Scale + MLP3x
Mertyandimata Apr 4, 2026
8485317
V8 + comparison script
Mertyandimata Apr 4, 2026
2851bc4
V5 V7 V8 patches
Mertyandimata Apr 4, 2026
a7a731b
V8: LeakyReLU² + Late QAT + XSA4 + LN Scale + MLP3x
Mertyandimata Apr 4, 2026
e022bf9
V10
Mertyandimata Apr 4, 2026
c885184
V10
Mertyandimata Apr 4, 2026
d2262f2
V10
Mertyandimata Apr 4, 2026
5b2e1f8
V10 fix: GPTQ device mismatch
Mertyandimata Apr 5, 2026
98700da
V11: GPTQ fix + Brotli-11 + qTTT + decay prior
Mertyandimata Apr 5, 2026
0858a2f
V11: fix all 12 bugs
Mertyandimata Apr 5, 2026
e0e5a19
V11: GPTQ fix + Brotli-11 + qTTT + decay prior
Mertyandimata Apr 5, 2026
0d582ff
v12: SLOT-24 + pre-quant TTT
Mertyandimata Apr 5, 2026
cf5fe59
v13
Mertyandimata Apr 5, 2026
78e36d2
v14: PR1339 base + Markov curriculum + TurboMuon AOL + EMA-SWA blend
Mertyandimata Apr 5, 2026
79cfd27
v14
Mertyandimata Apr 5, 2026
4337e26
v14.1 auto qmax
Mertyandimata Apr 5, 2026
e3bfbe2
v14.2 auto qmax + dynamo reset + full audit
Mertyandimata Apr 5, 2026
a6bbe18
LoRA TTT
Mertyandimata Apr 7, 2026
c53bb92
Revert "LoRA TTT"
Mertyandimata Apr 7, 2026
5b2e3c4
Raki v6: EngramLite + Mousse + TTT — val_bpb 1.1026
Mertyandimata Apr 7, 2026
8c0a493
Add files via upload
Mertyandimata Apr 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions compare_all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash
# ============================================================
# Raki A/B/C Test — V5 vs V7 vs V8 (RunPod 1xGPU, 5min each)
# ============================================================
set -e

# Wall-clock budget per training run, in seconds.
SECS=300

# Hyper-parameters shared by every run. Kept as one flat string on purpose:
# it is expanded unquoted into `env` later so each KEY=VALUE pair becomes
# its own argument.
COMMON="MUON_WD=0.04 MATRIX_LR=0.025 SCALAR_LR=0.025 TIED_EMBED_LR=0.035 \
MUON_MOMENTUM=0.99 MUON_MOMENTUM_WARMUP_START=0.92 MUON_MOMENTUM_WARMUP_STEPS=500 \
EMA_DECAY=0.997 EVAL_STRIDE=64 TRAIN_BATCH_TOKENS=786432 \
MAX_WALLCLOCK_SECONDS=$SECS WARMUP_STEPS=10 VAL_LOSS_EVERY=500 SEED=1337"

echo "============================================"
echo " Raki Comparison — 5min × 3 runs (1xGPU)"
echo "============================================"

# --- data: download the tokenized shards once; skip when already cached ---
if [[ ! -d "./data/datasets/fineweb10B_sp1024" ]]; then
  echo "[DATA] Downloading..."
  python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10
else
  echo "[DATA] Already present."
fi

mkdir -p logs
# Keep a pristine copy of the trainer; every run patches a fresh copy.
cp train_gpt.py train_gpt_backup.py

#######################################
# Patch a fresh copy of train_gpt.py and run one timed training job.
# Globals:   COMMON (read) — shared KEY=VALUE env settings
# Arguments: $1 - version label (used for RUN_ID and the log filename)
#            $2 - patch script applied to train_gpt.py
#            $3 - extra KEY=VALUE env settings (may be empty)
# Outputs:   training output streamed to stdout and to logs/test_<ver>.txt
#######################################
run_version() {
  local VER=$1
  local PATCH=$2
  local EXTRA=$3
  echo ""
  echo "===== Running $VER ====="
  # Always start from the pristine baseline so patches never stack.
  cp train_gpt_backup.py train_gpt.py
  python3 "$PATCH"
  # $COMMON/$EXTRA are intentionally unquoted: each KEY=VALUE word must be
  # split into a separate argument for env. ($EXTRA may expand to nothing.)
  # shellcheck disable=SC2086
  env $COMMON $EXTRA RUN_ID="test_$VER" \
    torchrun --standalone --nproc_per_node=1 train_gpt.py 2>&1 | tee "logs/test_$VER.txt"
}

# Each call below patches a fresh trainer copy and runs one timed job.
# --- V5 ---
run_version "v5" "patch_v5.py" ""

# --- V7 --- (third argument is an extra env override for this run only)
run_version "v7" "patch_v7.py" "MLP_MULT=3"

# --- V8 ---
run_version "v8" "patch_v8.py" ""

# --- restore the unpatched baseline trainer ---
cp train_gpt_backup.py train_gpt.py

# --- results: scrape each run's log and print a comparison table ---
echo ""
echo "============================================"
echo " RESULTS"
echo "============================================"
printf "%-6s %-14s %-14s %-10s %-8s\n" "Ver" "Pre-quant BPB" "Post-quant BPB" "Quant gap" "Steps"

for V in v5 v7 v8; do
  LOG="logs/test_$V.txt"
  # NB: when grep matches nothing, the pipeline still exits 0 (sed's status),
  # so the `|| echo "?"` fallback never fires — default empty results to "?".
  PRE=$(grep "val_bpb" "$LOG" | grep -v "final" | tail -1 | sed 's/.*val_bpb:\([0-9.]*\).*/\1/' 2>/dev/null || echo "?")
  PRE=${PRE:-"?"}
  POST=$(grep "roundtrip_exact" "$LOG" | tail -1 | sed 's/.*val_bpb:\([0-9.]*\).*/\1/' 2>/dev/null || echo "?")
  POST=${POST:-"?"}
  STEP=$(grep "stopping_early\|^step:" "$LOG" | tail -1 | sed 's/.*step:\([0-9]*\).*/\1/' 2>/dev/null || echo "?")
  STEP=${STEP:-"?"}
  if [[ "$PRE" != "?" && "$POST" != "?" ]]; then
    # Pass values as argv (not interpolated into the -c source) so stray log
    # content cannot break the expression; any parse failure falls back to "?".
    GAP=$(python3 -c 'import sys; print(f"{float(sys.argv[1]) - float(sys.argv[2]):.4f}")' "$POST" "$PRE" 2>/dev/null || echo "?")
  else
    GAP="?"
  fi
  printf "%-6s %-14s %-14s %-10s %-8s\n" "$V" "$PRE" "$POST" "$GAP" "$STEP"
done

echo ""
echo "V8 quant gap << V5 quant gap = Late QAT calisiyor"
echo "V8 pre-quant < V5 pre-quant = LeakyReLU²+XSA+LN etkisi"
echo "============================================"
143 changes: 143 additions & 0 deletions fix_v11.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""Fix all bugs in patch_v11.py identified by Gemini/Grok/Opus."""
import sys

# Load the patch script that will be rewritten in place at the end.
with open("patch_v11.py", "r") as f:
    code = f.read()

# Running count of substitutions actually applied (incremented by fix()).
fixes = 0

def fix(old, new, label):
    """Replace the first occurrence of `old` with `new` in the module-level
    `code` buffer, bumping the `fixes` counter; report a SKIP when the
    exact-match snippet is absent."""
    global code, fixes
    if old not in code:
        print(f" SKIP: {label} (not found)")
        return
    code = code.replace(old, new, 1)
    fixes += 1
    print(f" FIXED: {label}")

# FIX 1: Turbo-Muon bf16 overflow — compute D_r/D_c in float32.
# NOTE(review): these exact-match snippets are whitespace-sensitive; confirm
# their leading indentation still matches patch_v11.py byte-for-byte.
fix(
''' X = G.bfloat16()
# AOL preconditioning: D_r^{-1/2} @ X @ D_c^{-1/2}
D_r = (X * X).sum(dim=1, keepdim=True).clamp_min(eps * eps)
D_c = (X * X).sum(dim=0, keepdim=True).clamp_min(eps * eps)
X = X / (D_r * D_c).pow(0.25)''',
''' X = G.bfloat16()
# AOL preconditioning in float32 to prevent bf16 overflow
Xf = X.float()
D_r = (Xf * Xf).sum(dim=1, keepdim=True).clamp_min(eps)
D_c = (Xf * Xf).sum(dim=0, keepdim=True).clamp_min(eps)
X = (Xf / (D_r * D_c).pow(0.25)).bfloat16()''',
"Turbo-Muon bf16 overflow")

# FIX 2: GPTQ double diagonal division — the block-propagation term already
# uses the inverse-Hessian rows; drop the extra per-diagonal division.
fix(
''' if j2 < n_cols:
W[:, j2:] -= Err @ (Hinv[j1:j2, j2:] / Hinv[j1:j2, j1:j2].diag().clamp_min(1e-10).unsqueeze(1))''',
''' if j2 < n_cols:
W[:, j2:] -= Err @ Hinv[j1:j2, j2:]''',
"GPTQ double diagonal division")

# FIX 3: Brotli quality in binary search — use q=4 for speed during the
# search, q=11 only for the final check; also free the temp buffers.
fix(
''' _tsz = len(brotli.compress(_tbuf.getvalue(), quality=11))
if _tsz + _code_bytes <= 16_000_000:''',
''' _tsz = len(brotli.compress(_tbuf.getvalue(), quality=4))
del _tobj, _tbuf; import gc as _gc; _gc.collect()
if _tsz + _code_bytes <= 16_000_000:''',
"Brotli speed in binary search")

# FIX 4: Final GPTQ qmax loop — keep quality=11 for the last check but
# release buffers between attempts (the two call sites share the same old
# snippet; FIX 3 already consumed the first occurrence, so this targets
# the remaining one).
fix(
''' _tsz = len(brotli.compress(_tbuf.getvalue(), quality=11))
if _tsz + _code_bytes <= 16_000_000:
globals()["BLOCK_QUANT_MAX"] = _try_qmax
log0(f"raki_v11:gptq_final_qmax={_try_qmax} est_bytes={_tsz + _code_bytes}")''',
''' _tsz = len(brotli.compress(_tbuf.getvalue(), quality=11))
del _tobj, _tbuf; import gc as _gc2; _gc2.collect()
if _tsz + _code_bytes <= 16_000_000:
globals()["BLOCK_QUANT_MAX"] = _try_qmax
log0(f"raki_v11:final_qmax={_try_qmax} bytes={_tsz + _code_bytes}")''',
"GPTQ final loop memory cleanup")

# FIX 5: best_mse device mismatch — allocate on the same device as t32.
fix(
"best_mse = torch.full((t32.shape[0],), float('inf'))",
"best_mse = torch.full((t32.shape[0],), float('inf'), device=t32.device)",
"best_mse device mismatch")

# FIX 6: TTT AdamW momentum conflict — two-step edit: first remove the
# single shared optimizer, then (below) recreate a fresh one per chunk.
fix(
''' ttt_opt = torch.optim.AdamW(ttt_params, lr=TTT_LR, weight_decay=0.0)
seq_len = args.train_seq_len''',
''' seq_len = args.train_seq_len''',
"TTT remove global optimizer (moved to per-chunk)")

# Second half of FIX 6: instantiate the optimizer inside the chunk loop so
# stale momentum from a previous chunk cannot leak into the next.
fix(
''' # TRAIN: fine-tune on scored chunk (AdamW, cosine LR)
base_ttt.train()''',
''' # TRAIN: fresh AdamW per chunk (no momentum conflict)
ttt_opt = torch.optim.AdamW(ttt_params, lr=TTT_LR, weight_decay=0.0)
base_ttt.train()''',
"TTT fresh optimizer per chunk")

# FIX 7: TTT decay prior + AdamW conflict — remove the decay prior entirely
# (the in-place pull toward pre-TTT weights fights AdamW's momentum state).
fix(
''' # Decay prior: pull toward pre-TTT weights
if TTT_DECAY > 0:
with torch.no_grad():
for p in ttt_params:
p.data.add_(_pre_ttt[id(p)] - p.data, alpha=TTT_DECAY)''',
''' pass # No decay prior (conflicts with AdamW momentum)''',
"Remove TTT decay prior")

# FIX 8: Remove the _pre_ttt weight snapshot (only the decay prior used it).
fix(
''' # Save pre-TTT weights for decay prior
_pre_ttt = {id(p): p.data.clone() for p in ttt_params}

ttt_opt''',
''' ttt_opt''',
"Remove pre-TTT weight copy")

# FIX 9: ttt_tok_count — a plain int avoids a per-step CUDA sync that a
# device tensor counter would incur.
fix(
''' ttt_tok_count = torch.zeros((), device=device, dtype=torch.float64)''',
''' ttt_tok_count = 0''',
"TTT tok count as int")

# FIX 10: Hessian calibration read only the first shard; concatenate tokens
# across all files until n_batches * seq_len + 1 tokens are collected.
fix(
''' hdr = np.fromfile(files[0], dtype="<i4", count=256)
ntok = min(int(hdr[2]), n_batches * seq_len + 1)
tokens = np.fromfile(files[0], dtype="<u2", count=ntok, offset=hdr_bytes)''',
''' all_tokens = []
remaining = n_batches * seq_len + 1
for f in files:
if remaining <= 0:
break
hdr = np.fromfile(f, dtype="<i4", count=256)
ntok = min(int(hdr[2]), remaining)
tok = np.fromfile(f, dtype="<u2", count=ntok, offset=hdr_bytes)
all_tokens.append(tok)
remaining -= len(tok)
tokens = np.concatenate(all_tokens) if all_tokens else np.array([], dtype=np.uint16)''',
"Hessian multi-file")

# FIX 11: Remove the now-unused TTT_DECAY env config (dead after FIX 7).
fix(
"TTT_DECAY = float(os.environ.get(\"TTT_DECAY\", \"0.001\"))\n",
"",
"Remove TTT_DECAY config")

# FIX 12: Update version strings (plain replace; not counted in `fixes`).
code = code.replace("raki_v11:ttt_starting", "raki_v11:ttt")
# A second replace of "raki_v11:ttt lr=" with the identical string was a
# no-op (it could never change `code`) and has been removed.

# Write the repaired patch script back in place.
with open("patch_v11.py", "w") as f:
    f.write(code)

print(f"\n{fixes} fixes applied to patch_v11.py")
print("All critical bugs resolved: bf16 overflow, GPTQ double-div, brotli speed,")
print("device mismatch, memory leak, TTT momentum, hessian multi-file")
Loading