diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py
index dfda85b8f64..9f4e46cd180 100644
--- a/src/helm/benchmark/run_specs/medhelm_run_specs.py
+++ b/src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -92,8 +92,14 @@ def get_medhelm_configurable_benchmark_spec(config_path: str) -> RunSpec:
 
 
 @run_spec_function("medcalc_bench")
-def get_medcalc_bench_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchScenario")
+def get_medcalc_bench_spec(version: Optional[str] = None) -> RunSpec:
+
+    scenario_args = {} if version is None else {"version": version}
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchScenario",
+        args=scenario_args,
+    )
 
     adapter_spec = get_generation_adapter_spec(
         instructions="Given a patient note and a clinical question, compute the requested medical value.",
@@ -112,8 +118,10 @@
         )
     ] + get_exact_match_metric_specs()
 
+    run_spec_name = "medcalc_bench" if version is None else f"medcalc_bench:version={version}"
+
     return RunSpec(
-        name="medcalc_bench",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
diff --git a/src/helm/benchmark/scenarios/math_scenario.py b/src/helm/benchmark/scenarios/math_scenario.py
index aae2dcd3f7e..a157f8bbeec 100644
--- a/src/helm/benchmark/scenarios/math_scenario.py
+++ b/src/helm/benchmark/scenarios/math_scenario.py
@@ -20,7 +20,7 @@
 
 
 def remove_boxed(string: str) -> Optional[str]:
-    r"""Source: https://github.com/hendrycks/math
+    """Source: https://github.com/hendrycks/math
 
     Extract the text within a \boxed{...} environment.
 
@@ -294,7 +294,7 @@ def is_equiv_chain_of_thought(str1: str, str2: str) -> float:
 
 
 class MATHScenario(Scenario):
-    """
+    r"""
     The MATH dataset from the paper
     "Measuring Mathematical Problem Solving With the MATH Dataset"
     by Hendrycks et al. (2021):
diff --git a/src/helm/benchmark/scenarios/medcalc_bench_scenario.py b/src/helm/benchmark/scenarios/medcalc_bench_scenario.py
index 89083235320..15a2b28ab96 100644
--- a/src/helm/benchmark/scenarios/medcalc_bench_scenario.py
+++ b/src/helm/benchmark/scenarios/medcalc_bench_scenario.py
@@ -1,4 +1,4 @@
-from typing import Dict, List
+from typing import Dict, List, Optional
 from datasets import load_dataset
 from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 
@@ -28,7 +28,7 @@ class MedCalcBenchScenario(Scenario):
     This dataset contains a training dataset of 10,053 instances and a testing
     dataset of 1,047 instances.
 
-    Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench-v1.0
+    Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench
     Paper: https://arxiv.org/abs/2406.12036
 
     Sample Prompt:
@@ -112,9 +112,10 @@ def process_csv(self, data, split: str) -> List[Instance]:
             instances.append(instance)
         return instances
 
-    def get_instances(self, output_path: str) -> List[Instance]:
+    def get_instances(self, output_path: str, version: Optional[str] = None) -> List[Instance]:
         # Load the MedCalc-Bench dataset from Hugging Face
-        dataset = load_dataset("ncbi/MedCalc-Bench-v1.0")
+        dataset_path = f"ncbi/MedCalc-Bench{('-' + version) if version else ''}"
+        dataset = load_dataset(dataset_path)
 
         # Process all the instances - limit to zero shot setting
         instances: List[Instance] = []
diff --git a/src/helm/benchmark/scenarios/test_medcalc_bench_scenario.py b/src/helm/benchmark/scenarios/test_medcalc_bench_scenario.py
new file mode 100644
index 00000000000..ae2d32be75f
--- /dev/null
+++ b/src/helm/benchmark/scenarios/test_medcalc_bench_scenario.py
@@ -0,0 +1,44 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.medcalc_bench_scenario import MedCalcBenchScenario
+
+
+@pytest.mark.scenarios
+def test_medcalc_bench_scenario():
+    with TemporaryDirectory() as tmpdir:
+        # Test for the MedCalc-Bench scenario
+        scenario = MedCalcBenchScenario()
+        instances = scenario.get_instances(tmpdir)
+
+        assert instances[0].split == "test"
+
+
+@pytest.mark.scenarios
+def test_medcalc_bench_v1_0_scenario():
+    with TemporaryDirectory() as tmpdir:
+        # Test for the MedCalc-Bench scenario
+        scenario = MedCalcBenchScenario()
+        instances = scenario.get_instances(tmpdir, "v1.0")
+
+        assert instances[0].split == "test"
+
+
+@pytest.mark.scenarios
+def test_medcalc_bench_v1_1_scenario():
+    with TemporaryDirectory() as tmpdir:
+        # Test for the MedCalc-Bench scenario
+        scenario = MedCalcBenchScenario()
+        instances = scenario.get_instances(tmpdir, "v1.1")
+
+        assert instances[0].split == "test"
+
+
+@pytest.mark.scenarios
+def test_medcalc_bench_v1_2_scenario():
+    with TemporaryDirectory() as tmpdir:
+        # Test for the MedCalc-Bench scenario
+        scenario = MedCalcBenchScenario()
+        instances = scenario.get_instances(tmpdir, "v1.2")
+
+        assert instances[0].split == "test"