diff --git a/experiments/evals/run_ruler_evals.py b/experiments/evals/run_ruler_evals.py
new file mode 100644
index 0000000000..ea36d6073c
--- /dev/null
+++ b/experiments/evals/run_ruler_evals.py
@@ -0,0 +1,33 @@
+# Copyright 2025 The Marin Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Evaluate Llama-3.1-8B-Instruct on the RULER long-context benchmark.
+"""
+
+from experiments.evals.evals import default_eval
+from experiments.evals.task_configs import LONG_CONTEXT_TASKS
+from experiments.models import llama_3_1_8b_instruct
+from fray.cluster import ResourceConfig
+from marin.execution.executor import executor_main
+
+ruler_eval_step = default_eval(
+    step=llama_3_1_8b_instruct,
+    resource_config=ResourceConfig.with_tpu("v4-8"),
+    evals=LONG_CONTEXT_TASKS,
+    apply_chat_template=True,
+)
+
+if __name__ == "__main__":
+    executor_main(steps=[ruler_eval_step])
diff --git a/experiments/evals/task_configs.py b/experiments/evals/task_configs.py
index 38762345c0..25c274259b 100644
--- a/experiments/evals/task_configs.py
+++ b/experiments/evals/task_configs.py
@@ -14,8 +14,6 @@
 
 from collections.abc import Sequence
 
-from levanter.eval_harness import TaskConfig
-
 from marin.evaluation.evaluation_config import EvalTaskConfig
 
 # tasks to run (corresponding to lm_eval_harness tasks)
@@ -311,11 +309,16 @@
     EvalTaskConfig("webqs", 0, task_alias="webqs_0shot"),
 )
 
+# Long context evals
+LONG_CONTEXT_TASKS = (EvalTaskConfig("ruler", 0, task_alias="ruler_0shot"),)
+
 
-def convert_to_levanter_task_config(tasks: Sequence[EvalTaskConfig]) -> list[TaskConfig]:
+def convert_to_levanter_task_config(tasks: Sequence[EvalTaskConfig]) -> list:
     """
     Convert a list of EvalTaskConfig to a list of TaskConfig that Levanter's eval_harness expects.
     """
+    from levanter.eval_harness import TaskConfig
+
     return [
         TaskConfig(
             task=task.name,
diff --git a/lib/levanter/src/levanter/eval_harness.py b/lib/levanter/src/levanter/eval_harness.py
index e0591a37d5..336f082fe9 100644
--- a/lib/levanter/src/levanter/eval_harness.py
+++ b/lib/levanter/src/levanter/eval_harness.py
@@ -985,6 +985,11 @@ class LmEvalHarnessConfig:
 
     These can be overridden on a per-request basis by the evaluation harness.
     """
+    metadata: dict | None = None
+    """
+    Metadata to pass to lm-eval's TaskManager. Some tasks (e.g., RULER) require
+    'pretrained' to specify the tokenizer for data preprocessing.
+    """
 
     @property
     def max_gen_toks(self) -> int:
@@ -1011,7 +1016,7 @@ def to_task_dict(self) -> dict:
         logger.info("Loading tasks...")
         import lm_eval.tasks as tasks
 
-        manager = tasks.TaskManager()
+        manager = tasks.TaskManager(metadata=self.metadata)
         # we need to do it this way b/c i can't figure out how to run e.g. hellaswag 0 shot and 10 shot in a single run
         this_tasks = {}
         for task in tqdm(self.to_task_spec()):
diff --git a/lib/marin/pyproject.toml b/lib/marin/pyproject.toml
index abc51b0fd9..cadd3f8993 100644
--- a/lib/marin/pyproject.toml
+++ b/lib/marin/pyproject.toml
@@ -129,6 +129,9 @@ rl = [
 
 eval = [
     "lm-eval[math]@git+https://github.com/stanford-crfm/lm-evaluation-harness@d5e3391f22cde186c827674d5c3ec7c5f4fe0cab",
+    "accelerate",
+    "nltk",
+    "wonderwords",
 ]
 
 [[tool.uv.index]]
diff --git a/lib/marin/src/marin/evaluation/evaluators/levanter_lm_eval_evaluator.py b/lib/marin/src/marin/evaluation/evaluators/levanter_lm_eval_evaluator.py
index 486764a8bb..cbc65de8ef 100644
--- a/lib/marin/src/marin/evaluation/evaluators/levanter_lm_eval_evaluator.py
+++ b/lib/marin/src/marin/evaluation/evaluators/levanter_lm_eval_evaluator.py
@@ -113,6 +113,7 @@ def evaluate(
                 apply_chat_template=model.apply_chat_template,
                 confirm_run_unsafe_code=True,
                 sample_logging=eval_harness.SampleLoggingConfig(max_samples_per_benchmark=20),
+                metadata={"pretrained": model_path},
             ),
             tokenizer=model_path,  # levanter picks up the tokenizer from the model path
             checkpoint_path=model_path,
diff --git a/uv.lock b/uv.lock
index f1f759cc42..e228521b5a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2779,7 +2779,10 @@ cpu = [
     { name = "torch", version = "2.9.1+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(sys_platform != 'darwin' and extra == 'extra-5-marin-cpu') or (extra == 'extra-5-marin-cpu' and extra == 'extra-5-marin-gpu') or (extra == 'extra-5-marin-gpu' and extra == 'extra-5-marin-tpu') or (extra != 'extra-5-marin-cpu' and extra == 'extra-8-levanter-gpu' and extra == 'extra-8-levanter-tpu') or (extra != 'extra-5-marin-tpu' and extra == 'extra-8-levanter-gpu' and extra == 'extra-8-levanter-tpu') or (extra != 'extra-5-marin-gpu' and extra == 'extra-8-levanter-gpu' and extra == 'extra-8-levanter-tpu')" },
 ]
 eval = [
+    { name = "accelerate" },
     { name = "lm-eval", extra = ["math"] },
+    { name = "nltk" },
+    { name = "wonderwords" },
 ]
 gpu = [
     { name = "jax", extra = ["cuda12"], marker = "extra == 'extra-5-marin-gpu' or (extra == 'extra-8-levanter-gpu' and extra == 'extra-8-levanter-tpu')" },
@@ -2869,6 +2872,7 @@ test = [
 
 [package.metadata]
 requires-dist = [
+    { name = "accelerate", marker = "extra == 'eval'" },
     { name = "braceexpand" },
     { name = "cryptography", specifier = ">=45" },
     { name = "datasets" },
@@ -2897,6 +2901,7 @@ requires-dist = [
     { name = "lz4" },
     { name = "markdownify", specifier = "==0.12.1" },
     { name = "multiprocess", specifier = "==0.70.16" },
+    { name = "nltk", marker = "extra == 'eval'" },
     { name = "numpy" },
     { name = "openai" },
     { name = "pandas" },
@@ -2918,6 +2923,7 @@ requires-dist = [
     { name = "verifiers", marker = "extra == 'rl'", specifier = "==0.1.5" },
     { name = "wandb" },
     { name = "warcio" },
+    { name = "wonderwords", marker = "extra == 'eval'" },
     { name = "zephyr", editable = "lib/zephyr" },
 ]
 provides-extras = ["gpu", "tpu", "cpu", "rl", "eval"]
@@ -6429,6 +6435,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2f/f9/9e082990c2585c744734f85bec79b5dae5df9c974ffee58fe421652c8e91/werkzeug-3.1.4-py3-none-any.whl", hash = "sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905", size = 224960, upload-time = "2025-11-29T02:15:21.13Z" },
 ]
 
+[[package]]
+name = "wonderwords"
+version = "3.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ff/23/e144fc3dfabb845dc1d94c45315d97b308cf75a664e3db3a89aeb1cb505d/wonderwords-3.0.1.tar.gz", hash = "sha256:5ee43ab6f13823a857a7c3d58c7b4db6a1350bd3aa5f914ed379ad49042a1c36", size = 73339, upload-time = "2025-10-30T17:30:44.231Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a4/75/855c2062d28b8e9247939f8262fb2f4ff3b12a49e4bab9fd1ba16cc5df82/wonderwords-3.0.1-py3-none-any.whl", hash = "sha256:4dd66deb6a76ca9e0b0422d1d3e111f9b910d7c16922d42de733ee8def98f8d0", size = 51658, upload-time = "2025-10-30T17:30:42.785Z" },
+]
+
 [[package]]
 name = "word2number"
 version = "1.1"