stanford-crfm · yifanmai · Jan 13, 2026 · Aug 18, 2025 · Aug 20, 2025 · Aug 20, 2025
diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -92,8 +92,14 @@ def get_medhelm_configurable_benchmark_spec(config_path: str) -> RunSpec:
 
 
 @run_spec_function("medcalc_bench")
-def get_medcalc_bench_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchScenario")
+def get_medcalc_bench_spec(version: Optional[str] = None) -> RunSpec:
+    """Build the MedCalc-Bench run spec; a given `version` (e.g. "v1.1") is passed on via ScenarioSpec args."""
+    scenario_args = {} if version is None else {"version": version}
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchScenario",
+        args=scenario_args,
+    )
 
     adapter_spec = get_generation_adapter_spec(
         instructions="Given a patient note and a clinical question, compute the requested medical value.",
@@ -112,8 +118,10 @@ def get_medcalc_bench_spec() -> RunSpec:
         )
     ] + get_exact_match_metric_specs()
 
+    # Keep the original "medcalc_bench" name for the default dataset; only versioned runs get a suffix.
+    run_spec_name = "medcalc_bench" if version is None else f"medcalc_bench:version={version}"
     return RunSpec(
-        name="medcalc_bench",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,

diff --git a/src/helm/benchmark/scenarios/math_scenario.py b/src/helm/benchmark/scenarios/math_scenario.py
@@ -20,7 +20,7 @@
 
 
 def remove_boxed(string: str) -> Optional[str]:
-    r"""Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
     Extract the text within a \boxed{...} environment.
 
@@ -294,7 +294,7 @@ def is_equiv_chain_of_thought(str1: str, str2: str) -> float:
 
 
 class MATHScenario(Scenario):
-    """
+    r"""
     The MATH dataset from the paper
     "Measuring Mathematical Problem Solving With the MATH Dataset"
     by Hendrycks et al. (2021):

diff --git a/src/helm/benchmark/scenarios/medcalc_bench_scenario.py b/src/helm/benchmark/scenarios/medcalc_bench_scenario.py
@@ -1,4 +1,4 @@
-from typing import Dict, List
+from typing import Dict, List, Optional
 from datasets import load_dataset
 
 from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
@@ -28,7 +28,7 @@ class MedCalcBenchScenario(Scenario):
     This dataset contains a training dataset of 10,053 instances and a testing
     dataset of 1,047 instances.
 
-    Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench-v1.0
+    Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench (or ...-<version>, e.g. MedCalc-Bench-v1.0)
     Paper: https://arxiv.org/abs/2406.12036
 
     Sample Prompt:
@@ -112,9 +112,17 @@ def process_csv(self, data, split: str) -> List[Instance]:
             instances.append(instance)
         return instances
 
-    def get_instances(self, output_path: str) -> List[Instance]:
+    def get_instances(self, output_path: str, version: Optional[str] = None) -> List[Instance]:
+        # NOTE(review): the run spec passes `version` through ScenarioSpec args rather than through this
+        # call, so fall back to a `version` attribute if one was stored on the instance. Confirm that the
+        # constructor accepts and stores `version`; nothing in this diff adds that.
+        if version is None:
+            version = getattr(self, "version", None)
+
         # Load the MedCalc-Bench dataset from Hugging Face
-        dataset = load_dataset("ncbi/MedCalc-Bench-v1.0")
+        # An unset or empty version selects the unsuffixed repo; otherwise e.g. "ncbi/MedCalc-Bench-v1.1".
+        dataset_suffix = f"-{version}" if version else ""
+        dataset = load_dataset(f"ncbi/MedCalc-Bench{dataset_suffix}")
 
         # Process all the instances - limit to zero shot setting
         instances: List[Instance] = []

diff --git a/src/helm/benchmark/scenarios/test_medcalc_bench_scenario.py b/src/helm/benchmark/scenarios/test_medcalc_bench_scenario.py
@@ -0,0 +1,41 @@
+from tempfile import TemporaryDirectory
+from typing import Optional
+
+import pytest
+
+from helm.benchmark.scenarios.medcalc_bench_scenario import MedCalcBenchScenario
+
+
+def _assert_first_instance_in_test_split(version: Optional[str] = None) -> None:
+    """Load MedCalc-Bench (optionally a specific release) and check the first instance's split.
+
+    `version` is forwarded to `get_instances`; `None` exercises the default dataset path.
+    """
+    with TemporaryDirectory() as tmpdir:
+        scenario = MedCalcBenchScenario()
+        instances = scenario.get_instances(tmpdir, version)
+
+        # Fail with a readable message rather than an IndexError if nothing was loaded.
+        assert instances, f"no instances loaded for version={version!r}"
+        assert instances[0].split == "test"
+
+
+@pytest.mark.scenarios
+def test_medcalc_bench_scenario():
+    # Default dataset (no version suffix).
+    _assert_first_instance_in_test_split()
+
+
+@pytest.mark.scenarios
+def test_medcalc_bench_v1_0_scenario():
+    _assert_first_instance_in_test_split("v1.0")
+
+
+@pytest.mark.scenarios
+def test_medcalc_bench_v1_1_scenario():
+    _assert_first_instance_in_test_split("v1.1")
+
+
+@pytest.mark.scenarios
+def test_medcalc_bench_v1_2_scenario():
+    _assert_first_instance_in_test_split("v1.2")