Merged
80 commits
45e0937
Plotting Code
Helw150 Dec 8, 2025
4076d48
Run Evals in ISOFlop
Helw150 Dec 9, 2025
61e66f4
Write Out JSONL for eval metrics rather than only storing in WandB
Helw150 Dec 16, 2025
58535f5
Infra for jobs which read eval metrics
Helw150 Dec 16, 2025
c26760b
Stash
Helw150 Dec 16, 2025
55d9074
IsoFLOPS into ExecutorStep
Helw150 Dec 16, 2025
e49bd99
Lots of refactoring
Helw150 Dec 18, 2025
348339d
More Cleanly Updates
Helw150 Dec 18, 2025
84b14aa
Remove Experiment Deps
Helw150 Dec 21, 2025
6ffaee8
LoB
Helw150 Dec 21, 2025
fd68b3e
More Refactor
Helw150 Dec 21, 2025
614827d
Refactoring
Helw150 Dec 22, 2025
e3f961b
Lint
Helw150 Dec 22, 2025
32fb074
Tweak
Helw150 Dec 22, 2025
6371fa4
Fix Caller
Helw150 Dec 22, 2025
06179a3
I don't actually want this, migrating is simpler
Helw150 Dec 22, 2025
81ed7a0
First Refactor
Helw150 Dec 22, 2025
d06e7f2
Fixes
Helw150 Dec 22, 2025
aa62db9
Lint
Helw150 Dec 22, 2025
681e73c
Full Run Scales
Helw150 Dec 22, 2025
3039dd8
Snapshot Test
Helw150 Dec 23, 2025
b141510
Fix UV Sync
Helw150 Dec 24, 2025
8acbdb3
Lint
Helw150 Dec 24, 2025
7ba8073
Just one
Helw150 Dec 26, 2025
05b7e96
More Complete Writing
Helw150 Dec 26, 2025
ae28885
Move to using metadata for all info
Helw150 Dec 26, 2025
66a7a30
Round counts now they are accurate
Helw150 Dec 26, 2025
001af32
New Tests and Merge Main
Helw150 Dec 26, 2025
c9349a6
Range Fix
Helw150 Dec 26, 2025
c9d3039
Lint
Helw150 Dec 26, 2025
06c3e5e
Lint
Helw150 Dec 26, 2025
cb8e36f
Claude Review Comment Fixes
Helw150 Dec 26, 2025
c395e85
Top Level Code for Validation Sets
Helw150 Jan 5, 2026
4691d5e
Focus tests in response to Russell PR comments
Helw150 Jan 5, 2026
23c87a7
Remove Parsing Since I moved to getting metadata directly
Helw150 Jan 6, 2026
0ffaa6e
Keep Only More General Test
Helw150 Jan 6, 2026
23ccd6b
Oversafe Claude
Helw150 Jan 6, 2026
c922953
Split Analysis, Plotting, and WandB steps
Helw150 Jan 6, 2026
6074f9e
Use Typed Return Values
Helw150 Jan 6, 2026
878d38b
Try to segment out the opinionated stuff
Helw150 Jan 6, 2026
20a584c
Fix FLOP counting bug
Helw150 Jan 7, 2026
3cc9214
Try to separate opinions from main code
Helw150 Jan 7, 2026
acc0ff4
Try to fix inversion
Helw150 Jan 7, 2026
a0ecf86
Lint
Helw150 Jan 7, 2026
90c2b38
Move the Optimizer stuff outside of the unopinionated section
Helw150 Jan 7, 2026
1e83300
Fix Mismatch Now
Helw150 Jan 7, 2026
86aedf1
Tmp
Helw150 Jan 7, 2026
c8a2d13
Try to separate concerns to experiments more
Helw150 Jan 8, 2026
13fdd1f
Lint
Helw150 Jan 8, 2026
5343625
Missing Tabs
Helw150 Jan 8, 2026
3258061
Keep Moving Stuff into the Recipe in Experiments
Helw150 Jan 8, 2026
fa453ae
Name Consistently with Chinchilla
Helw150 Jan 8, 2026
ff33db6
Lint
Helw150 Jan 8, 2026
19e6847
Keep Lib Opinion Clean Even More
Helw150 Jan 8, 2026
9cdeebf
Wandb is always available
Helw150 Jan 8, 2026
4421d2a
Claude Code got out of hand here
Helw150 Jan 8, 2026
569e301
Note Differences
Helw150 Jan 8, 2026
0150292
Remove all 6ND
Helw150 Jan 8, 2026
6ce4376
Simplify
Helw150 Jan 8, 2026
9c99d8d
Comment Tweak
Helw150 Jan 8, 2026
3f1fa57
Move Naming to the Experiment Code
Helw150 Jan 8, 2026
f50b879
Legacy Support
Helw150 Jan 9, 2026
c04ac2d
Path Bugs and Logging Bugs in training
Helw150 Jan 9, 2026
f946f83
Serialization Issues
Helw150 Jan 9, 2026
114e828
More Grugging
Helw150 Jan 9, 2026
5418131
Try to remove a lot of the batch_size dependency
Helw150 Jan 10, 2026
cb3939c
Refactor: move vocab_size inside ScalingRecipe
claude Jan 10, 2026
b0efefa
Use get_vocab_size_for_tokenizer to derive vocab_size from tokenizer
claude Jan 10, 2026
78620f7
Weird Roundtripping
Helw150 Jan 12, 2026
916ca6a
Requested Error
Helw150 Jan 13, 2026
75d8e1a
Refactor
Helw150 Jan 13, 2026
101cd87
Delete More
Helw150 Jan 13, 2026
e39d074
Comment
Helw150 Jan 13, 2026
d70596a
as_input_path and pre-estimate memory
Helw150 Jan 13, 2026
0fd52e4
Remove Needless Protocl
Helw150 Jan 13, 2026
dbe4784
Read Eval Records
Helw150 Jan 13, 2026
0c7592c
This can run the full ladder now
Helw150 Jan 14, 2026
20427d4
Mild Simplify
Helw150 Jan 14, 2026
705a47a
More Grad Accum
Helw150 Jan 17, 2026
2330ff8
Merge branch 'main' into will/scaling_plots
Helw150 Jan 19, 2026
6 changes: 0 additions & 6 deletions docs/references/default-steps.md
@@ -27,12 +27,6 @@ In general, you should reach for the default steps before writing your own.

::: experiments.defaults.simulated_epoching_train

-## Scaling Law Prediction
-
-::: marin.scaling_laws.create_ladder_suite.scaling_law_suite
-
-::: experiments.defaults.default_scaling_law_pred
-
## Evaluation

::: experiments.evals.evals.default_eval
41 changes: 0 additions & 41 deletions experiments/defaults.py
@@ -46,7 +46,6 @@
CORE_TASKS,
MMLU_TASKS,
convert_to_levanter_task_config,
-    convert_to_task_metrics,
)
from experiments.llama import compute_num_parameters, llama_8b
from experiments.paloma import paloma_tokenized
@@ -59,7 +58,6 @@
InputName,
VersionedValue,
ensure_versioned,
-    get_executor_step,
this_output_path,
unwrap_versioned_value,
)
@@ -72,7 +70,6 @@
tokenize,
)
from marin.processing.tokenize.tokenize import HfTokenizeConfig, TokenizeConfigBase
-from marin.scaling_laws.scaling_laws import ScalingLawConfig, run_scaling_law_analysis
from marin.training.training import (
TrainLmOnPodConfig,
run_levanter_train_lm,
@@ -637,41 +634,3 @@ def _get_tokenizer_for_train(tokenized: InputName | ExecutorStep | LMMixtureData
raise ValueError(f"Could not determine tokenizer from {tokenized}")

    return tokenizer
-
-
-def default_scaling_law_pred(
-    ladder_runs: Sequence[ExecutorStep | InputName | str],
-    pred_run: ExecutorStep | InputName | str | None = None,
-    task_losses: Sequence[str] = ("eval/paloma/c4_en/bpb",),
-    task_accuracies: Sequence[str] | Sequence[EvalTaskConfig] | None = None,
-):
-    """
-    Given a suite of small models, predict the performance on a number of (N, D) values.
-    """
-    # get the executor steps or run IDs for the ladder runs and the pred run
-    ladder_steps_or_ids = [get_executor_step(run) if not isinstance(run, str) else run for run in ladder_runs]
-
-    pred_run_or_id = None
-    if pred_run:
-        pred_run_or_id = get_executor_step(pred_run) if not isinstance(pred_run, str) else pred_run
-
-    # convert the task accuracies to strings if they are `EvalTaskConfig`s
-    if task_accuracies is not None:
-        task_accuracies = convert_to_task_metrics(task_accuracies, metric="acc")
-
-    if pred_run_or_id:
-        name = pred_run_or_id if isinstance(pred_run_or_id, str) else pred_run_or_id.name
-    else:
-        name = "projection"
-
-    return ExecutorStep(
-        name=f"""scaling_laws/{name}""",
-        fn=run_scaling_law_analysis,
-        config=ScalingLawConfig(
-            name=name,
-            ladder_model_steps=ladder_steps_or_ids,
-            pred_model_step=pred_run_or_id,
-            task_losses=task_losses,
-            task_accuracies=task_accuracies,
-        ),
-    )
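
The deleted helper above took a hand-assembled list of ladder runs and fit scaling laws over them. This PR routes callers through the isoflop-sweep API instead, as the exp1600_perpcorr.py diff below shows. A minimal migration sketch, assuming only the signature visible in this PR; the mixture argument and experiment name are illustrative placeholders, not code from the repository:

# Hypothetical migration sketch for former default_scaling_law_pred callers.
# create_isoflop_sweep_steps and MARIN_2025_RECIPE are taken from this PR;
# `my_mixture` and the experiment name are placeholders.
from experiments.isoflop_sweep import MARIN_2025_RECIPE, create_isoflop_sweep_steps

sweep_steps, sweep_candidates = create_isoflop_sweep_steps(
    my_mixture,                    # tokenized data mixture (placeholder)
    experiment_name="my-isoflop",  # illustrative
    recipe=MARIN_2025_RECIPE,      # opinionated defaults factored into the recipe
)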
18 changes: 9 additions & 9 deletions experiments/exp1600_perpcorr.py
@@ -24,7 +24,7 @@

from experiments.evals.evals import evaluate_levanter_lm_evaluation_harness
from experiments.evals.task_configs import EvalTaskConfig
-from experiments.isoflop_sweep import generate_isoflop_sweep
+from experiments.isoflop_sweep import MARIN_2025_RECIPE, create_isoflop_sweep_steps
from experiments.llama import llama3_tokenizer
from experiments.models import ModelConfig as HFModelConfig, download_model_step
from experiments.paloma import paloma_tokenized
@@ -56,22 +56,22 @@
@lru_cache(maxsize=1)
def build_steps():
steps = []
-    isoflop_steps, isoflop_metadatas = generate_isoflop_sweep(
+    isoflop_steps, isoflop_candidates = create_isoflop_sweep_steps(
nemotron_mix,
experiment_name="nemo-wider-depth-adapt",
+        recipe=MARIN_2025_RECIPE,
)
-    for isoflop_step, isoflop_metadata in zip(isoflop_steps, isoflop_metadatas, strict=False):
+    for isoflop_step, candidate in zip(isoflop_steps, isoflop_candidates, strict=False):
experiment_name = isoflop_step.name.split("/")[-1]
paloma_tokenized_dict = paloma_tokenized(tokenizer=llama3_tokenizer)
uncheatable_eval_tokenized_dict = uncheatable_eval_tokenized(tokenizer=llama3_tokenizer)
eval_data = mixture_for_evaluation(paloma_tokenized_dict | uncheatable_eval_tokenized_dict)
-        budget, hidden_size, num_layers, batch_size, train_steps = isoflop_metadata
wandb_tags = [
f"FLOPs={budget:.1e}",
f"d={hidden_size}",
f"L={num_layers}",
f"B={batch_size}",
f"steps={train_steps}",
f"FLOPs={candidate.flops_budget:.1e}",
f"d={candidate.hidden_size}",
f"L={candidate.num_layers}",
f"B={candidate.batch_size}",
f"steps={candidate.train_steps}",
]
model_config = isoflop_step.config.train_config.model
checkpoint_path = output_path_of(isoflop_step)
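The sweep now returns typed candidate objects in place of positional metadata tuples (see the "Use Typed Return Values" commit), so call sites read named fields rather than tuple indices. Inferred purely from the fields this PR accesses, a stand-in for the candidate type would look like the sketch below; the actual class in marin may have a different name and extra fields:

from dataclasses import dataclass
from typing import Any

@dataclass(frozen=True)
class IsoflopCandidate:  # hypothetical name; fields are those referenced in this PR
    flops_budget: float  # compute budget C for this ladder point
    hidden_size: int     # model width d
    num_layers: int      # depth L
    batch_size: int      # batch size B
    train_steps: int     # optimizer steps
    tokens: float        # total training tokens D
    model_config: Any    # model config; exposes total_trainable_params(vocab_size)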
47 changes: 30 additions & 17 deletions experiments/exp1603_subgroup_evals.py
@@ -24,6 +24,10 @@
from experiments.models import ModelConfig, download_model_step
from marin.execution.executor import executor_main, output_path_of, versioned
from marin.evaluation.log_probs import default_lm_log_probs
+from marin.processing.tokenize import get_vocab_size_for_tokenizer
+
+# Vocab size for building model configs
+VOCAB_SIZE = get_vocab_size_for_tokenizer("stanford-crfm/marin-tokenizer")

# This is painfully slow to run in dry run mode
# nodryrun
@@ -40,8 +44,10 @@ def create_eval_steps() -> list:

steps = []
dist_eval = distributional_eval_sets(llama3_tokenizer)
for model, metadata in list(zip(*MARIN_SCALING_SUITES["nemotron"], strict=False)):
name = f"marin-nemo-{metadata[0]}C-{metadata[-3] * metadata[-2] * 4096}T-{metadata[1]}W-{metadata[2]}D"
for model, candidate in list(zip(*MARIN_SCALING_SUITES["nemotron"], strict=False)):
total_tokens = int(candidate.tokens)
params = candidate.model_config.total_trainable_params(VOCAB_SIZE)
name = f"marin-nemo-{candidate.flops_budget:.0e}C-{total_tokens}T-N{params:.0e}"

step = evaluate_levanter_lm_evaluation_harness(
model_name=name,
@@ -51,9 +57,10 @@
)
steps.append(step)

+        model_config = candidate.model_config
logprobs_step = default_lm_log_probs(
output_path_of(model).cd("checkpoints"),
-            metadata[-1],
+            model_config,
dist_eval,
resource_config=ResourceConfig.with_tpu("v5p-8"),
checkpoint_is_hf=False,
@@ -62,8 +69,10 @@

steps.append(logprobs_step)

for model, metadata in list(zip(*MARIN_SCALING_SUITES["common_pile"], strict=False)):
name = f"marin-comma-{metadata[0]}C-{metadata[-3] * metadata[-2] * 4096}T-{metadata[1]}W-{metadata[2]}D"
for model, candidate in list(zip(*MARIN_SCALING_SUITES["common_pile"], strict=False)):
total_tokens = int(candidate.tokens)
params = candidate.model_config.total_trainable_params(VOCAB_SIZE)
name = f"marin-comma-{candidate.flops_budget:.0e}C-{total_tokens}T-N{params:.0e}"

step = evaluate_levanter_lm_evaluation_harness(
model_name=name,
@@ -73,9 +82,10 @@
)
steps.append(step)

+        model_config = candidate.model_config
logprobs_step = default_lm_log_probs(
output_path_of(model).cd("checkpoints"),
-            metadata[-1],
+            model_config,
dist_eval,
resource_config=ResourceConfig.with_tpu("v5p-8"),
checkpoint_is_hf=False,
@@ -84,8 +94,10 @@

steps.append(logprobs_step)

for model, metadata in list(zip(*MARIN_SCALING_SUITES["dclm-default"], strict=False)):
name = f"marin-dclm-{metadata[0]}C-{metadata[-3] * metadata[-2] * 4096}T-{metadata[1]}W-{metadata[2]}D"
for model, candidate in list(zip(*MARIN_SCALING_SUITES["dclm-default"], strict=False)):
total_tokens = int(candidate.tokens)
params = candidate.model_config.total_trainable_params(VOCAB_SIZE)
name = f"marin-dclm-{candidate.flops_budget:.0e}C-{total_tokens}T-N{params:.0e}"

step = evaluate_levanter_lm_evaluation_harness(
model_name=name,
@@ -95,16 +107,17 @@
)
steps.append(step)

-    logprobs_step = default_lm_log_probs(
-        output_path_of(model).cd("checkpoints"),
-        metadata[-1],
-        dist_eval,
-        resource_config=ResourceConfig.with_tpu("v5p-8"),
-        checkpoint_is_hf=False,
-        name=versioned(f"{name}-DistRobust-ICE-logprobs"),
-    )
+        model_config = candidate.model_config
+        logprobs_step = default_lm_log_probs(
+            output_path_of(model).cd("checkpoints"),
+            model_config,
+            dist_eval,
+            resource_config=ResourceConfig.with_tpu("v5p-8"),
+            checkpoint_is_hf=False,
+            name=versioned(f"{name}-DistRobust-ICE-logprobs"),
+        )

-    steps.append(logprobs_step)
+        steps.append(logprobs_step)

baselines = [
("allenai/OLMo-2-1124-7B", "stage2-ingredient3-step8000-tokens34B"),
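For concreteness, the new run names encode compute (C), tokens (T), and parameter count (N) instead of raw width/depth. With illustrative values (not taken from the PR), the f-string above produces:

flops_budget = 1e19              # C, illustrative
total_tokens = 21_000_000_000    # D, illustrative
params = 5.2e8                   # N, illustrative
name = f"marin-dclm-{flops_budget:.0e}C-{total_tokens}T-N{params:.0e}"
# name == "marin-dclm-1e+19C-21000000000T-N5e+08"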
158 changes: 0 additions & 158 deletions experiments/exp1752_simulated_epoching.py

This file was deleted.
