Skip to content

Commit e22a1e5

Browse files
Antigravity Agent and claude
committed
fix(train): relax early kill thresholds + farm --fresh/--seed-start (EXP-025)
72/72 W7 runs killed by aggressive thresholds calibrated to outlier seeds. v13 proof: PPL 73@60K killed by threshold 50. Kill thresholds (old → new): 10K: 200 → 500, 30K: 50 → 200, 60K: — → 100, 80K: — → 50 New features: - 4 configurable kill stages via --kill-ppl-* / HSLM_KILL_PPL_* env vars - Force-save checkpoint at 32K (historical PPL minimum) - checkpoint_best keeper (hslm_best.bin, never deleted) - Enhanced kill logging (step/ppl/threshold/seed) - tri farm recycle --fresh (HSLM_FRESH=1, was hardcoded 0) - tri farm recycle --seed-start N (was hardcoded 601) - T-JEPA objective support (--objective jepa/hybrid/ntp) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7dd2f3f commit e22a1e5

File tree

4 files changed

+502
-24
lines changed

4 files changed

+502
-24
lines changed

EXPERIENCE_LOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,3 +206,11 @@ Structured knowledge base for HSLM training. Every significant event gets an ent
206206
**Context**: grad_clip=1.0 analysis across all code paths.
207207
**Outcome**: clip=1.0 is hardcoded default in trainer.zig, cli.zig, entrypoint_train.zig, tri_farm.zig. Applied per-parameter via clipGradNorm on 8 tensors (q/k/v/o, shadow_up/down, output_shadow/bias).
208208
**Lesson**: Clip is always on. It prevents catastrophic spikes but doesn't fix wrong optimizer/LR config. The 90x PPL difference (265 vs 2.96) is optimizer/LR, not clip.
209+
210+
### EXP-025 | FAILURE | 2026-03-15 | training
211+
**Impact**: CRITICAL
212+
**Context**: 72 W7 farm workers (FARM-4/5/6) ALL killed by aggressive early termination. Hardcoded thresholds: `step >= 10K && PPL > 200 → kill`, `step >= 30K && PPL > 50 → kill`. Thresholds calibrated to outlier seeds (R5/R23v2 PPL ~3 at 32K), not median convergence (PPL 50-112 at 30K). Local v13 proof: PPL 151 @ 30K → 73 @ 60K — healthy convergence killed prematurely.
213+
**Outcome**: All 72 runs wasted. Zero data collected beyond 30K steps. Equivalent to ~2160 GPU-hours lost.
214+
**Fix**: Relaxed defaults (10K→500, 30K→200, 60K→100, 80K→50) + 4 configurable stages via `--kill-ppl-*` flags and `HSLM_KILL_PPL_*` env vars. Added force-save at 32K (historical PPL minimum), checkpoint_best keeper (always saves best PPL), detailed kill logging (step/ppl/threshold/seed).
215+
**Lesson**: Never calibrate kill thresholds to best-case seeds. Use median convergence curve + 2x safety margin. v13 proof: PPL 73@60K would be killed by threshold 50 but survives with 100. Make thresholds configurable via env vars so farm operators can tune per-experiment.
216+
**Prevention**: Always validate new thresholds against ALL known runs (v13, W5-19, R18, R5, typical seed, bad seed) before deploying.

src/cli/entrypoint_train.zig

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ const TrainConfig = struct {
4545
adaptive_sparsity: bool = false,
4646
ternary_schedule: bool = false,
4747

48+
// T-JEPA objective
49+
objective: []const u8 = "ntp", // ntp | jepa | hybrid
50+
4851
// Data sharding (T10)
4952
data_shard: []const u8 = "0",
5053
num_shards: []const u8 = "1",
@@ -55,6 +58,12 @@ const TrainConfig = struct {
5558

5659
// Gradient clipping
5760
grad_clip: []const u8 = "1.0",
61+
62+
// Early kill thresholds (EXP-025: relaxed to match median convergence)
63+
kill_ppl_10k: []const u8 = "500",
64+
kill_ppl_30k: []const u8 = "200",
65+
kill_ppl_60k: []const u8 = "100",
66+
kill_ppl_80k: []const u8 = "50",
5867
};
5968

6069
fn envStr(key: []const u8, default: []const u8) []const u8 {
@@ -97,6 +106,9 @@ fn readConfig() TrainConfig {
97106
.adaptive_sparsity = envBool("HSLM_ADAPTIVE_SPARSITY", false),
98107
.ternary_schedule = envBool("HSLM_TERNARY_SCHEDULE", false),
99108

109+
// T-JEPA objective
110+
.objective = envStr("HSLM_OBJECTIVE", "ntp"),
111+
100112
// Data sharding
101113
.data_shard = envStr("HSLM_DATA_SHARD", "0"),
102114
.num_shards = envStr("HSLM_NUM_SHARDS", "1"),
@@ -105,6 +117,12 @@ fn readConfig() TrainConfig {
105117
// Validation split
106118
.val_split = envStr("HSLM_VAL_SPLIT", "0.1"),
107119
.grad_clip = envStr("HSLM_GRAD_CLIP", "1.0"),
120+
121+
// Early kill thresholds
122+
.kill_ppl_10k = envStr("HSLM_KILL_PPL_10K", "500"),
123+
.kill_ppl_30k = envStr("HSLM_KILL_PPL_30K", "200"),
124+
.kill_ppl_60k = envStr("HSLM_KILL_PPL_60K", "100"),
125+
.kill_ppl_80k = envStr("HSLM_KILL_PPL_80K", "50"),
108126
};
109127
}
110128

@@ -277,6 +295,10 @@ pub fn main() !void {
277295
.{ .flag = "--total-lines", .val = config.total_lines, .default = "15600056" },
278296
.{ .flag = "--val-split", .val = config.val_split, .default = "0.0" },
279297
.{ .flag = "--grad-clip", .val = config.grad_clip, .default = "1.0" },
298+
.{ .flag = "--kill-ppl-10k", .val = config.kill_ppl_10k, .default = "500" },
299+
.{ .flag = "--kill-ppl-30k", .val = config.kill_ppl_30k, .default = "200" },
300+
.{ .flag = "--kill-ppl-60k", .val = config.kill_ppl_60k, .default = "100" },
301+
.{ .flag = "--kill-ppl-80k", .val = config.kill_ppl_80k, .default = "50" },
280302
};
281303
for (optionals) |opt| {
282304
if (!std.mem.eql(u8, opt.val, opt.default)) {
@@ -368,6 +390,15 @@ pub fn main() !void {
368390
log.info("Zero initialization mode enabled", .{});
369391
}
370392

393+
// T-JEPA objective
394+
if (!std.mem.eql(u8, config.objective, "ntp")) {
395+
buf[argc] = "--objective";
396+
argc += 1;
397+
buf[argc] = config.objective;
398+
argc += 1;
399+
log.info("Objective: {s}", .{config.objective});
400+
}
401+
371402
const argv = buf[0..argc];
372403

373404
// Log the full command

0 commit comments

Comments (0)