Merged
Changes from 12 commits
90 changes: 90 additions & 0 deletions src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -121,6 +121,96 @@ def get_medcalc_bench_spec() -> RunSpec:
)


@run_spec_function("medcalc_bench_v1_0")
def get_medcalc_bench_v1_0_spec() -> RunSpec:
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchV1_0Scenario")

adapter_spec = get_generation_adapter_spec(
instructions="Given a patient note and a clinical question, compute the requested medical value.",
input_noun=None,
newline_after_input_noun=False,
output_noun="Answer only the requested quantity without units. No explanation needed",
max_tokens=10,
max_train_instances=0,
stop_sequences=[],
)

metric_specs = [
MetricSpec(
class_name="helm.benchmark.metrics.medcalc_bench_metrics.MedCalcBenchMetric",
args={},
)
] + get_exact_match_metric_specs()

return RunSpec(
name="medcalc_bench_v1_0",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["medcalc_bench"],
)


@run_spec_function("medcalc_bench_v1_1")
def get_medcalc_bench_v1_1_spec() -> RunSpec:
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchV1_1Scenario")

adapter_spec = get_generation_adapter_spec(
instructions="Given a patient note and a clinical question, compute the requested medical value.",
input_noun=None,
newline_after_input_noun=False,
output_noun="Answer only the requested quantity without units. No explanation needed",
max_tokens=10,
max_train_instances=0,
stop_sequences=[],
)

metric_specs = [
MetricSpec(
class_name="helm.benchmark.metrics.medcalc_bench_metrics.MedCalcBenchMetric",
args={},
)
] + get_exact_match_metric_specs()

return RunSpec(
name="medcalc_bench_v1_1",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["medcalc_bench"],
)


@run_spec_function("medcalc_bench_v1_2")
def get_medcalc_bench_v1_2_spec() -> RunSpec:
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchV1_2Scenario")

adapter_spec = get_generation_adapter_spec(
instructions="Given a patient note and a clinical question, compute the requested medical value.",
input_noun=None,
newline_after_input_noun=False,
output_noun="Answer only the requested quantity without units. No explanation needed",
max_tokens=10,
max_train_instances=0,
stop_sequences=[],
)

metric_specs = [
MetricSpec(
class_name="helm.benchmark.metrics.medcalc_bench_metrics.MedCalcBenchMetric",
args={},
)
] + get_exact_match_metric_specs()

return RunSpec(
name="medcalc_bench_v1_2",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["medcalc_bench"],
)


@run_spec_function("clear")
def get_clear_spec(condition: str, data_path: str) -> RunSpec:
scenario_spec = ScenarioSpec(
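For reference, a run spec registered above can be fetched by calling the decorated function directly; a minimal sketch, assuming @run_spec_function returns the wrapped function unchanged (the registered names can likewise be passed to helm-run as run entries):

from helm.benchmark.run_specs.medhelm_run_specs import get_medcalc_bench_v1_0_spec

# Build the RunSpec and inspect the pieces wired together above.
run_spec = get_medcalc_bench_v1_0_spec()
print(run_spec.name)                     # medcalc_bench_v1_0
print(run_spec.adapter_spec.max_tokens)  # 10
print(run_spec.groups)                   # ['medcalc_bench']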
2 changes: 1 addition & 1 deletion src/helm/benchmark/scenarios/math_scenario.py
@@ -294,7 +294,7 @@ def is_equiv_chain_of_thought(str1: str, str2: str) -> float:


class MATHScenario(Scenario):
"""
r"""
Collaborator: Revert unrelated change.

The MATH dataset from the paper
"Measuring Mathematical Problem Solving With the MATH Dataset"
by Hendrycks et al. (2021):
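A plausible motivation for the r""" prefix flagged above: the MATH docstring contains LaTeX, and in a non-raw string a sequence such as \f in \frac is silently parsed as a form-feed escape. A minimal illustration (standalone Python, not from this PR):

def solve():
    r"""Compute \frac{1}{2} of the input."""

# Without the r prefix, "\f" would become a form-feed character and corrupt
# the LaTeX when the docstring is read or rendered.
print(solve.__doc__)  # Compute \frac{1}{2} of the input.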
93 changes: 91 additions & 2 deletions src/helm/benchmark/scenarios/medcalc_bench_scenario.py
@@ -28,7 +28,7 @@ class MedCalcBenchScenario(Scenario):
    This dataset contains 10,053 training instances and 1,047 test instances.

-    Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench-v1.0
+    Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench
Paper: https://arxiv.org/abs/2406.12036

Sample Prompt:
@@ -114,7 +114,7 @@ def process_csv(self, data, split: str) -> List[Instance]:

def get_instances(self, output_path: str) -> List[Instance]:
# Load the MedCalc-Bench dataset from Hugging Face
-        dataset = load_dataset("ncbi/MedCalc-Bench-v1.0")
+        dataset = load_dataset("ncbi/MedCalc-Bench")

# Process all the instances - limit to zero shot setting
instances: List[Instance] = []
@@ -147,3 +147,92 @@ def get_metadata(self):
main_metric="medcalc_bench_accuracy",
main_split="test",
)


class MedCalcBenchV1_0Scenario(MedCalcBenchScenario):
"""
    MedCalc-Bench v1.0 is the original release of the MedCalc-Bench dataset, designed to
evaluate LLMs' capabilities in medical calculations. This version serves as a baseline
for assessing the performance of language models in computing clinically relevant values
from patient notes.

Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench-v1.0
Paper: https://arxiv.org/abs/2406.12036
"""

name = "medcalc_bench_v1_0"

def get_instances(self, output_path: str) -> List[Instance]:
# Load the MedCalc-Bench v1.0 dataset from Hugging Face
dataset = load_dataset("ncbi/MedCalc-Bench-v1.0")

# Process all the instances - limit to zero shot setting
instances: List[Instance] = []
splits: Dict[str, str] = {
# "train": TRAIN_SPLIT,
"test": TEST_SPLIT,
}
for hf_split, split in splits.items():
data = dataset[hf_split]
instances.extend(self.process_csv(data, split))

return instances


class MedCalcBenchV1_1Scenario(MedCalcBenchScenario):
"""
MedCalc-Bench v1.1 is an updated version of the MedCalc-Bench dataset with minor
corrections and clarifications to improve the quality of the questions and answers.
This version aims to provide a more accurate assessment of LLMs' capabilities in
medical calculations.

Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench-v1.1
Paper: https://arxiv.org/abs/2406.12036
"""

name = "medcalc_bench_v1_1"

def get_instances(self, output_path: str) -> List[Instance]:
# Load the MedCalc-Bench v1.1 dataset from Hugging Face
dataset = load_dataset("ncbi/MedCalc-Bench-v1.1")

# Process all the instances - limit to zero shot setting
instances: List[Instance] = []
splits: Dict[str, str] = {
# "train": TRAIN_SPLIT,
"test": TEST_SPLIT,
}
for hf_split, split in splits.items():
data = dataset[hf_split]
instances.extend(self.process_csv(data, split))

return instances


class MedCalcBenchV1_2Scenario(MedCalcBenchScenario):
"""
MedCalc-Bench v1.2 is an updated version of the MedCalc-Bench dataset with improved
question clarity and answer accuracy. This version addresses ambiguities in the original
dataset to enhance the evaluation of LLMs' capabilities in medical calculations.

Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench-v1.2
Paper: https://arxiv.org/abs/2406.12036
"""

name = "medcalc_bench_v1_2"

def get_instances(self, output_path: str) -> List[Instance]:
# Load the MedCalc-Bench v1.2 dataset from Hugging Face
dataset = load_dataset("ncbi/MedCalc-Bench-v1.2")

# Process all the instances - limit to zero shot setting
instances: List[Instance] = []
splits: Dict[str, str] = {
# "train": TRAIN_SPLIT,
"test": TEST_SPLIT,
}
for hf_split, split in splits.items():
data = dataset[hf_split]
instances.extend(self.process_csv(data, split))

return instances
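The three version subclasses above differ only in the dataset ID passed to load_dataset. A possible consolidation, sketched here under the assumption of a new dataset_name class attribute (hypothetical, not part of this PR):

class MedCalcBenchVersionedScenario(MedCalcBenchScenario):
    """Hypothetical shared base: each version subclass sets only `dataset_name`."""

    dataset_name: str = "ncbi/MedCalc-Bench-v1.0"  # overridden per release

    def get_instances(self, output_path: str) -> List[Instance]:
        # Load the requested MedCalc-Bench release from Hugging Face.
        dataset = load_dataset(self.dataset_name)

        # Zero-shot setting: only the test split is processed.
        instances: List[Instance] = []
        for hf_split, split in {"test": TEST_SPLIT}.items():
            instances.extend(self.process_csv(dataset[hf_split], split))
        return instances

Each concrete scenario would then shrink to a two-line subclass overriding name and dataset_name.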
49 changes: 49 additions & 0 deletions src/helm/benchmark/scenarios/test_medcalc_bench_scenario.py
@@ -0,0 +1,49 @@
import pytest
from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.medcalc_bench_scenario import (
MedCalcBenchScenario,
MedCalcBenchV1_0Scenario,
MedCalcBenchV1_1Scenario,
MedCalcBenchV1_2Scenario,
)


@pytest.mark.scenarios
def test_medcalc_bench_scenario():
with TemporaryDirectory() as tmpdir:
# Test for the MedCalc-Bench scenario
scenario = MedCalcBenchScenario()
instances = scenario.get_instances(tmpdir)

assert instances[0].split == "test"


@pytest.mark.scenarios
def test_medcalc_bench_v1_0_scenario():
with TemporaryDirectory() as tmpdir:
        # Test for the MedCalc-Bench v1.0 scenario
scenario = MedCalcBenchV1_0Scenario()
instances = scenario.get_instances(tmpdir)

assert instances[0].split == "test"


@pytest.mark.scenarios
def test_medcalc_bench_v1_1_scenario():
with TemporaryDirectory() as tmpdir:
        # Test for the MedCalc-Bench v1.1 scenario
scenario = MedCalcBenchV1_1Scenario()
instances = scenario.get_instances(tmpdir)

assert instances[0].split == "test"


@pytest.mark.scenarios
def test_medcalc_bench_v1_2_scenario():
with TemporaryDirectory() as tmpdir:
        # Test for the MedCalc-Bench v1.2 scenario
scenario = MedCalcBenchV1_2Scenario()
instances = scenario.get_instances(tmpdir)

assert instances[0].split == "test"
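These tests download the datasets from Hugging Face, hence the scenarios marker gating them. A sketch of running only this file programmatically (file path taken from the diff header):

import pytest

# Select only tests tagged with the `scenarios` marker in this module.
pytest.main(["-m", "scenarios", "src/helm/benchmark/scenarios/test_medcalc_bench_scenario.py"])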