33 changes: 33 additions & 0 deletions experiments/evals/run_ruler_evals.py
@@ -0,0 +1,33 @@
# Copyright 2025 The Marin Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Evaluate Llama-3.1-8B-Instruct on the RULER long-context benchmark.
"""

from experiments.evals.evals import default_eval
from experiments.evals.task_configs import LONG_CONTEXT_TASKS
from experiments.models import llama_3_1_8b_instruct
from fray.cluster import ResourceConfig
from marin.execution.executor import executor_main

ruler_eval_step = default_eval(
    step=llama_3_1_8b_instruct,
    resource_config=ResourceConfig.with_tpu("v4-8"),
    evals=LONG_CONTEXT_TASKS,
    apply_chat_template=True,
)

if __name__ == "__main__":
    executor_main(steps=[ruler_eval_step])
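
For orientation, the step can also be driven programmatically; a minimal sketch reusing only names from the file above (assuming the new module imports cleanly outside the script entry point):

# Hypothetical programmatic launch, equivalent to executing the file directly.
from experiments.evals.run_ruler_evals import ruler_eval_step
from marin.execution.executor import executor_main

executor_main(steps=[ruler_eval_step])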
9 changes: 6 additions & 3 deletions experiments/evals/task_configs.py
@@ -14,8 +14,6 @@

from collections.abc import Sequence

from levanter.eval_harness import TaskConfig

from marin.evaluation.evaluation_config import EvalTaskConfig

# tasks to run (corresponding to lm_eval_harness tasks)
@@ -311,11 +309,16 @@
EvalTaskConfig("webqs", 0, task_alias="webqs_0shot"),
)

# Long context evals
LONG_CONTEXT_TASKS = (EvalTaskConfig("ruler", 0, task_alias="ruler_0shot"),)


def convert_to_levanter_task_config(tasks: Sequence[EvalTaskConfig]) -> list[TaskConfig]:
def convert_to_levanter_task_config(tasks: Sequence[EvalTaskConfig]) -> list:
"""
Convert a list of EvalTaskConfig to a list of TaskConfig that Levanter's eval_harness expects.
"""
from levanter.eval_harness import TaskConfig

return [
TaskConfig(
task=task.name,
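Moving the TaskConfig import inside the function means importing task_configs.py no longer requires levanter at module-import time, at the cost of the looser bare-list annotation. A sketch of one alternative that keeps the precise annotation, assuming a TYPE_CHECKING guard is acceptable here (an illustration, not what this diff does; the remaining TaskConfig fields are elided):

from __future__ import annotations

from collections.abc import Sequence
from typing import TYPE_CHECKING

from marin.evaluation.evaluation_config import EvalTaskConfig

if TYPE_CHECKING:
    from levanter.eval_harness import TaskConfig  # visible to type-checkers only


def convert_to_levanter_task_config(tasks: Sequence[EvalTaskConfig]) -> list[TaskConfig]:
    from levanter.eval_harness import TaskConfig  # deferred, so levanter stays optional at import time

    # the real function also passes the task alias and few-shot count, as the hunk above shows
    return [TaskConfig(task=task.name) for task in tasks]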
7 changes: 6 additions & 1 deletion lib/levanter/src/levanter/eval_harness.py
@@ -985,6 +985,11 @@ class LmEvalHarnessConfig:

    These can be overridden on a per-request basis by the evaluation harness.
    """
    metadata: dict | None = None
    """
    Metadata forwarded to lm-eval's TaskManager. Some tasks (e.g., RULER) require a
    'pretrained' key naming the tokenizer used for data preprocessing.
    """

    @property
    def max_gen_toks(self) -> int:
@@ -1011,7 +1016,7 @@ def to_task_dict(self) -> dict:
logger.info("Loading tasks...")
import lm_eval.tasks as tasks

manager = tasks.TaskManager()
manager = tasks.TaskManager(metadata=self.metadata)
# we need to do it this way b/c i can't figure out how to run e.g. hellaswag 0 shot and 10 shot in a single run
this_tasks = {}
for task in tqdm(self.to_task_spec()):
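For reference, what the new metadata field enables downstream — a minimal sketch, assuming this lm-eval fork matches upstream in accepting a metadata mapping on TaskManager and in having RULER read metadata["pretrained"] to load the tokenizer that sizes its haystack contexts ("ruler" resolving as a task group is likewise an assumption):

import lm_eval.tasks as tasks

# The tokenizer identity travels via TaskManager metadata, not per-task arguments.
manager = tasks.TaskManager(metadata={"pretrained": "meta-llama/Llama-3.1-8B-Instruct"})
task_dict = tasks.get_task_dict(["ruler"], manager)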
3 changes: 3 additions & 0 deletions lib/marin/pyproject.toml
@@ -129,6 +129,9 @@ rl = [

eval = [
    "lm-eval[math]@git+https://github.com/stanford-crfm/lm-evaluation-harness@d5e3391f22cde186c827674d5c3ec7c5f4fe0cab",
    "accelerate",
    "nltk",
    "wonderwords",
]

[[tool.uv.index]]
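The three new packages are runtime needs of RULER's data generation rather than of scoring (inferred, not stated in this diff): wonderwords supplies random filler words for the haystacks, nltk sentence-tokenizes the filler text, and accelerate is pulled in by lm-eval's HuggingFace code paths. A quick sanity check of the extras (hypothetical snippet):

import nltk
from wonderwords import RandomWord

nltk.download("punkt", quiet=True)  # tokenizer data that RULER-style preprocessing relies on
r = RandomWord()
print([r.word() for _ in range(3)])  # the kind of filler vocabulary RULER draws on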
@@ -113,6 +113,7 @@ def evaluate(
        apply_chat_template=model.apply_chat_template,
        confirm_run_unsafe_code=True,
        sample_logging=eval_harness.SampleLoggingConfig(max_samples_per_benchmark=20),
        metadata={"pretrained": model_path},
    ),
    tokenizer=model_path,  # levanter picks up the tokenizer from the model path
    checkpoint_path=model_path,
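Taken together with the harness change above, the tokenizer identity now flows one hop further than before. A runnable schematic of that wiring, echoing only the argument names visible in the hunks (real classes elided; model_path is hypothetical):

model_path = "meta-llama/Llama-3.1-8B-Instruct"  # in the real call, the checkpoint under evaluation

harness_kwargs = {
    "apply_chat_template": True,
    "metadata": {"pretrained": model_path},  # new in this PR -> LmEvalHarnessConfig.metadata
}
# LmEvalHarnessConfig(...).to_task_dict() then builds
# tasks.TaskManager(metadata=self.metadata), where RULER reads "pretrained".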
15 changes: 15 additions & 0 deletions uv.lock

Some generated files are not rendered by default.