33 changes: 33 additions & 0 deletions experiments/evals/run_ruler_evals.py
@@ -0,0 +1,33 @@
# Copyright 2025 The Marin Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Evaluate Llama-3.1-8B-Instruct on the RULER long-context benchmark.
"""

from experiments.evals.evals import default_eval
from experiments.evals.task_configs import LONG_CONTEXT_TASKS
from experiments.models import llama_3_1_8b_instruct
from fray.cluster import ResourceConfig
from marin.execution.executor import executor_main

ruler_eval_step = default_eval(
    step=llama_3_1_8b_instruct,
    resource_config=ResourceConfig.with_tpu("v4-8"),
    evals=LONG_CONTEXT_TASKS,
    apply_chat_template=True,
)

if __name__ == "__main__":
    executor_main(steps=[ruler_eval_step])
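
For orientation, the step can also be driven programmatically; a minimal sketch reusing only names from the file above (assuming the new module imports cleanly outside the script entry point):

# Hypothetical programmatic launch, equivalent to executing the file directly.
from experiments.evals.run_ruler_evals import ruler_eval_step
from marin.execution.executor import executor_main

executor_main(steps=[ruler_eval_step])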
9 changes: 6 additions & 3 deletions experiments/evals/task_configs.py
@@ -14,8 +14,6 @@

from collections.abc import Sequence

from levanter.eval_harness import TaskConfig

from marin.evaluation.evaluation_config import EvalTaskConfig

# tasks to run (corresponding to lm_eval_harness tasks)
@@ -311,11 +309,16 @@
EvalTaskConfig("webqs", 0, task_alias="webqs_0shot"),
)

# Long context evals
LONG_CONTEXT_TASKS = (EvalTaskConfig("ruler", 0, task_alias="ruler_0shot"),)


def convert_to_levanter_task_config(tasks: Sequence[EvalTaskConfig]) -> list[TaskConfig]:
def convert_to_levanter_task_config(tasks: Sequence[EvalTaskConfig]) -> list:
"""
Convert a list of EvalTaskConfig to a list of TaskConfig that Levanter's eval_harness expects.
"""
from levanter.eval_harness import TaskConfig

return [
TaskConfig(
task=task.name,
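Moving the TaskConfig import inside the function means importing task_configs.py no longer requires levanter at module-import time, at the cost of the looser bare-list annotation. A sketch of one alternative that keeps the precise annotation, assuming a TYPE_CHECKING guard is acceptable here (an illustration, not what this diff does; the remaining TaskConfig fields are elided):

from __future__ import annotations

from collections.abc import Sequence
from typing import TYPE_CHECKING

from marin.evaluation.evaluation_config import EvalTaskConfig

if TYPE_CHECKING:
    from levanter.eval_harness import TaskConfig  # visible to type-checkers only


def convert_to_levanter_task_config(tasks: Sequence[EvalTaskConfig]) -> list[TaskConfig]:
    from levanter.eval_harness import TaskConfig  # deferred, so levanter stays optional at import time

    # the real function also passes the task alias and few-shot count, as the hunk above shows
    return [TaskConfig(task=task.name) for task in tasks]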
7 changes: 6 additions & 1 deletion lib/levanter/src/levanter/eval_harness.py
@@ -985,6 +985,11 @@ class LmEvalHarnessConfig:

    These can be overridden on a per-request basis by the evaluation harness.
    """
    metadata: dict | None = None
    """
    Metadata forwarded to lm-eval's TaskManager. Some tasks (e.g., RULER) require a
    'pretrained' key naming the tokenizer used for data preprocessing.
    """

    @property
    def max_gen_toks(self) -> int:
@@ -1011,7 +1016,7 @@ def to_task_dict(self) -> dict:
logger.info("Loading tasks...")
import lm_eval.tasks as tasks

manager = tasks.TaskManager()
manager = tasks.TaskManager(metadata=self.metadata)
# we need to do it this way b/c i can't figure out how to run e.g. hellaswag 0 shot and 10 shot in a single run
this_tasks = {}
for task in tqdm(self.to_task_spec()):
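For reference, what the new metadata field enables downstream — a minimal sketch, assuming this lm-eval fork matches upstream in accepting a metadata mapping on TaskManager and in having RULER read metadata["pretrained"] to load the tokenizer that sizes its haystack contexts ("ruler" resolving as a task group is likewise an assumption):

import lm_eval.tasks as tasks

# The tokenizer identity travels via TaskManager metadata, not per-task arguments.
manager = tasks.TaskManager(metadata={"pretrained": "meta-llama/Llama-3.1-8B-Instruct"})
task_dict = tasks.get_task_dict(["ruler"], manager)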
3 changes: 3 additions & 0 deletions lib/marin/pyproject.toml
@@ -129,6 +129,9 @@ rl = [

eval = [
    "lm-eval[math]@git+https://github.com/stanford-crfm/lm-evaluation-harness@d5e3391f22cde186c827674d5c3ec7c5f4fe0cab",
    "accelerate",
    "nltk",
    "wonderwords",
]

[[tool.uv.index]]
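The three new packages are runtime needs of RULER's data generation rather than of scoring (inferred, not stated in this diff): wonderwords supplies random filler words for the haystacks, nltk sentence-tokenizes the filler text, and accelerate is pulled in by lm-eval's HuggingFace code paths. A quick sanity check of the extras (hypothetical snippet):

import nltk
from wonderwords import RandomWord

nltk.download("punkt", quiet=True)  # tokenizer data that RULER-style preprocessing relies on
r = RandomWord()
print([r.word() for _ in range(3)])  # the kind of filler vocabulary RULER draws on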
@@ -113,6 +113,7 @@ def evaluate(
        apply_chat_template=model.apply_chat_template,
        confirm_run_unsafe_code=True,
        sample_logging=eval_harness.SampleLoggingConfig(max_samples_per_benchmark=20),
        metadata={"pretrained": model_path},
    ),
    tokenizer=model_path,  # levanter picks up the tokenizer from the model path
    checkpoint_path=model_path,
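Taken together with the harness change above, the tokenizer identity now flows one hop further than before. A runnable schematic of that wiring, echoing only the argument names visible in the hunks (real classes elided; model_path is hypothetical):

model_path = "meta-llama/Llama-3.1-8B-Instruct"  # in the real call, the checkpoint under evaluation

harness_kwargs = {
    "apply_chat_template": True,
    "metadata": {"pretrained": model_path},  # new in this PR -> LmEvalHarnessConfig.metadata
}
# LmEvalHarnessConfig(...).to_task_dict() then builds
# tasks.TaskManager(metadata=self.metadata), where RULER reads "pretrained".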
15 changes: 15 additions & 0 deletions uv.lock

Some generated files are not rendered by default.