79 changes: 79 additions & 0 deletions experiments/evals/exp1602b_lm_eval_harness_selected.py
@@ -0,0 +1,79 @@
# Copyright 2025 The Marin Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Run the selected LM Eval Harness tasks across a set of Marin, Qwen 2.5, OLMo 2, Llama 3, and OLMo 3 models.
"""

from collections.abc import Iterable
from dataclasses import replace

from fray.cluster import ResourceConfig
from experiments.evals.evals import default_eval
from experiments.evals.task_configs import LM_EVAL_HARNESS_SELECTED_TASKS
from experiments.models import (
llama_3_1_8b,
llama_3_70b,
marin_8b_base,
marin_32b_base,
olmo_2_base_32b,
olmo_2_base_8b,
olmo_3_32b,
olmo_3_7b,
qwen2_5_32b,
)
from marin.evaluation.evaluation_config import EvalTaskConfig
from marin.execution.executor import ExecutorStep, executor_main

MARIN_MODELS: tuple[ExecutorStep, ...] = (marin_8b_base, marin_32b_base)
QWEN_2_5_MODELS: tuple[ExecutorStep, ...] = (qwen2_5_32b,)
OLMO_2_MODELS: tuple[ExecutorStep, ...] = (olmo_2_base_8b, olmo_2_base_32b)
LLAMA_3_MODELS: tuple[ExecutorStep, ...] = (llama_3_1_8b, llama_3_70b)
OLMO_3_MODELS: tuple[ExecutorStep, ...] = (olmo_3_7b, olmo_3_32b)

ALL_MODEL_STEPS: tuple[ExecutorStep, ...] = (
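    # Only the uncommented model groups below are evaluated; uncomment the others to include them.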
# *MARIN_MODELS,
# *QWEN_2_5_MODELS,
# *OLMO_2_MODELS,
# *LLAMA_3_MODELS,
*OLMO_3_MODELS,
)


def _create_per_task_eval_steps(model_step: ExecutorStep, tasks: Iterable[EvalTaskConfig]) -> list[ExecutorStep]:
"""Return one evaluation step per LM Eval Harness task for a given model."""

per_task_steps: list[ExecutorStep] = []
for task in tasks:
eval_step = default_eval(
step=model_step,
resource_config=ResourceConfig.with_tpu("v5p-8"),
evals=(task,),
discover_latest_checkpoint=False,
)
task_label = task.task_alias or task.name
# Make it obvious which harness task is running to simplify scheduling/debugging.
per_task_steps.append(replace(eval_step, name=f"{eval_step.name}/{task_label}"))

return per_task_steps


eval_steps: list[ExecutorStep] = []
for model_step in ALL_MODEL_STEPS:
eval_steps.extend(_create_per_task_eval_steps(model_step, LM_EVAL_HARNESS_SELECTED_TASKS))

if __name__ == "__main__":
# executor_main(steps=eval_steps)
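    # Submit the eval steps in batches of four rather than all at once in a single executor run.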
for i in range(0, len(eval_steps), 4):
executor_main(steps=eval_steps[i : i + 4])
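The experiment is presumably launched by running this module directly (for example, python experiments/evals/exp1602b_lm_eval_harness_selected.py), since executor_main is invoked under __main__; the exact launch command depends on the surrounding Marin/Ray setup and is not shown in this diff.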
29 changes: 29 additions & 0 deletions experiments/evals/task_configs.py
@@ -111,6 +111,34 @@
)

# LM-Eval-Harness Tasks
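# Each EvalTaskConfig below is (task name, number of few-shot examples).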
LM_EVAL_HARNESS_SELECTED_TASKS = (  # mostly multiple-choice tasks; squadv2 and minerva_math require free-form generation
EvalTaskConfig("copa", 0),
EvalTaskConfig("mmlu", 5),
EvalTaskConfig("leaderboard_musr", 0),
EvalTaskConfig("anli_r1", 0),
EvalTaskConfig("anli_r2", 0),
EvalTaskConfig("anli_r3", 0),
EvalTaskConfig("truthfulqa_mc2", 6),
EvalTaskConfig("race", 0),
EvalTaskConfig("toxigen", 0),
EvalTaskConfig("agieval_lsat_ar", 3),
EvalTaskConfig("arc_easy", 10),
EvalTaskConfig("arc_challenge", 10),
EvalTaskConfig("leaderboard_bbh", 3),
EvalTaskConfig("boolq", 10),
EvalTaskConfig("commonsense_qa", 10),
EvalTaskConfig("leaderboard_gpqa", 0),
EvalTaskConfig("hellaswag", 10),
EvalTaskConfig("leaderboard_mmlu_pro", 5),
EvalTaskConfig("openbookqa", 0),
EvalTaskConfig("piqa", 10),
EvalTaskConfig("winogrande", 0),
EvalTaskConfig("wsc273", 0),
EvalTaskConfig("squadv2", 0),
EvalTaskConfig("minerva_math", 4),
)


# Reasoning and Logic Tasks
REASONING_TASKS = (
EvalTaskConfig("anli_r1", 0, task_alias="anli_r1_0shot"),
Expand Down Expand Up @@ -312,6 +340,7 @@
)


def convert_to_levanter_task_config(tasks: Sequence[EvalTaskConfig]) -> list[TaskConfig]:
"""
Convert a list of EvalTaskConfig to a list of TaskConfig that Levanter's eval_harness expects.
29 changes: 29 additions & 0 deletions experiments/models.py
@@ -128,6 +128,14 @@ def get_model_local_path(step: ExecutorStep) -> str:
)
)

llama_3_70b = download_model_step(
ModelConfig(
hf_repo_id="meta-llama/Meta-Llama-3-70B",
hf_revision="main",
)
)


tulu_3_1_8b_sft = download_model_step(
ModelConfig(
hf_repo_id="allenai/Llama-3.1-Tulu-3-8B-SFT",
@@ -163,6 +171,20 @@ def get_model_local_path(step: ExecutorStep) -> str:
)
)

olmo_3_7b = download_model_step(
ModelConfig(
hf_repo_id="allenai/OLMo-3-1025-7B",
hf_revision="main",
)
)

olmo_3_32b = download_model_step(
ModelConfig(
hf_repo_id="allenai/OLMo-3-1125-32B",
hf_revision="main",
)
)

amber_base_7b = download_model_step(
ModelConfig(
hf_repo_id="LLM360/Amber",
@@ -185,6 +207,13 @@ def get_model_local_path(step: ExecutorStep) -> str:
)
)

marin_32b_base = download_model_step(
ModelConfig(
hf_repo_id="marin-community/marin-32b-base",
hf_revision="main",
)
)

llama_3_2_1b = download_model_step(
ModelConfig(
hf_repo_id="meta-llama/Llama-3.2-1B",
17 changes: 12 additions & 5 deletions lib/levanter/src/levanter/eval_harness.py
@@ -1309,11 +1309,18 @@ def _compute_averages(outputs):
for metric in metric_keys:
# Collect valid tasks for this metric
# We iterate over "n-samples" because only real tasks (as opposed to aggregates like "mmlu") have counts.
-        valid_tasks = [
-            (outputs["results"][task_name].get(metric), outputs["n-samples"][task_name]["effective"])
-            for task_name in outputs["n-samples"]
-            if outputs["results"][task_name].get(metric, None) is not None
-        ]
+        valid_tasks = []
+        for task_name, sample_counts in outputs["n-samples"].items():
+            task_results = outputs["results"].get(task_name)
+            if task_results is None:
+                logger.debug("Skipping %s because no results were produced.", task_name)
+                continue
+
+            metric_value = task_results.get(metric)
+            if metric_value is None:
+                continue
+
+            valid_tasks.append((metric_value, sample_counts["effective"]))

if not valid_tasks:
continue # Skip metrics with no valid tasks
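For context, a minimal sketch of how the (metric value, effective sample count) pairs collected above could be folded into averages, assuming the rest of _compute_averages (not shown in this diff) computes an unweighted macro average and a count-weighted micro average per metric; the helper below is illustrative, not Levanter's actual code.

def _macro_micro(valid_tasks: list[tuple[float, int]]) -> tuple[float, float]:
    """Combine (metric value, effective sample count) pairs into macro/micro averages."""
    values = [value for value, _ in valid_tasks]
    counts = [count for _, count in valid_tasks]
    macro = sum(values) / len(values)  # unweighted mean over tasks
    micro = sum(v * c for v, c in zip(values, counts)) / sum(counts)  # weighted by effective sample count
    return macro, micro

# Example: tasks scoring 0.8 on 100 examples and 0.5 on 300 examples give
# macro = 0.65 and micro = (0.8 * 100 + 0.5 * 300) / 400 = 0.575.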