1 change: 1 addition & 0 deletions setup.cfg
@@ -309,6 +309,7 @@ medhelm =
openpyxl~=3.1
python-docx~=1.1
transformers~=4.45,<4.50
evaluation-instruments @ git+https://github.com/epic-open-source/evaluation-instruments.git@1c4637e84fe4dc54f6695e438f3baca6b2cd4573
@yifanmai (Collaborator), Aug 7, 2025:

PyPI packages cannot depend on packages outside PyPI. You should instead provide instructions for users to manually install this package, either by printing the installation command in an error message or by documenting it in ReadTheDocs.


audiolm =
crfm-helm[openai]
10 changes: 6 additions & 4 deletions src/helm/benchmark/annotation/model_as_judge.py
@@ -220,7 +220,7 @@ def annotate(self, request_state: RequestState) -> Dict[str, Any]:
annotations: Dict[str, Union[Optional[str], Optional[bool], Dict[str, Any]]] = {"prompt_text": annotator_prompt}

# Track failed annotations for each model
failed_counts: Dict[str, int] = {name: 0 for name in self._annotator_models}
failed_annotators: Set[str] = set()

# Annotate using multiple models
for annotator_name, annotator_model_info in self._annotator_models.items():
@@ -230,13 +230,15 @@ def annotate(self, request_state: RequestState) -> Dict[str, Any]:
if annotator_criteria is not None:
annotations[annotator_name] = annotator_criteria
else:
failed_counts[annotator_name] += 1
failed_annotators.add(annotator_name)

except Exception as e:
hlog(f"ERROR annotating with {annotator_name}: {e}")
failed_counts[annotator_name] += 1
failed_annotators.add(annotator_name)

hlog(f"Failed model annotations: {failed_counts}")
total_failed = len(failed_annotators)
if total_failed != 0:
hlog(f"Some model annotations failed: {failed_annotators}")
return annotations

def _annotate_with_model(
51 changes: 51 additions & 0 deletions src/helm/benchmark/annotation/note_summary_annotator.py
@@ -0,0 +1,51 @@
import evaluation_instruments.instruments.pdsqi_9.pdsqi_prompt as pdsqi

from typing import Dict, Optional

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
from helm.clients.auto_client import AutoClient

from evaluation_instruments import prep
Collaborator comment:

Display an error if this is not installed:

from helm.common.optional_dependencies import OptionalDependencyNotInstalled

try:
    from evaluation_instruments import prep
    import evaluation_instruments.instruments.pdsqi_9.pdsqi_prompt as pdsqi
except ModuleNotFoundError as e:
    # Provide manual instructions for installing evaluation-instruments from GitHub
    # because PyPI does not allow installing dependencies directly from GitHub.
    raise OptionalDependencyNotInstalled(
        f"Optional dependency {e.name} is not installed. "
        "Please run `pip install 'evaluation-instruments @ git+https://github.com/epic-open-source/evaluation-instruments.git@1c4637e84fe4dc54f6695e438f3baca6b2cd4573'` to install it."
    ) from e  # noqa: E501



class NoteSummaryAnnotator(LLMAsJuryAnnotator):
"""The NoteSummary autograder."""

name = "note_summary"

def __init__(
self,
auto_client: AutoClient,
annotator_models: Dict[str, AnnotatorModelInfo],
template_name: Optional[str] = None,
):
super().__init__(
auto_client=auto_client,
prompt_template="",
annotation_criteria={},
annotator_models=annotator_models,
)

def _interpolate_prompt(
self, request_state: RequestState, custom_replacements: Optional[Dict[str, str]] = None
) -> str:
"""
Interpolate prompt template with request state information.
:param request_state: The current request state
:param custom_replacements: Optional dictionary of additional replacements
:return: Interpolated prompt
"""
notes = (request_state.instance.extra_data or {}).get("notes", [])
prompt = pdsqi.resolve_prompt(
summary_to_evaluate=(
request_state.result.completions[0].text
if request_state.result and request_state.result.completions
else ""
),
notes=notes,
target_specialty="emergency medicine",
output_mode=prep.OutputMode.EXPLAINED_SCORE,
)
return prompt[1]["content"]
82 changes: 77 additions & 5 deletions src/helm/benchmark/metrics/llm_jury_metrics.py
@@ -1,14 +1,70 @@
from typing import Any, Dict, List
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
from helm.common.hierarchical_logger import hlog, hwarn
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


@dataclass
class RubricItem:
name: str
min: float
max: float
weight: float
higher_is_better: bool


@dataclass
class Rubric:
Collaborator comment:

I did not look at the rubric logic too closely, but let me know if there's anything you want me to check.

items: Dict[str, RubricItem]

@classmethod
def from_config(cls, rubric_config: Dict[str, Any]) -> "Rubric":
items = {}
for name, attrs in rubric_config.items():
item = RubricItem(
name=name,
min=attrs["min"],
max=attrs["max"],
weight=attrs["weight"],
higher_is_better=attrs["higher_is_better"],
)
items[name] = item
return cls(items)

def normalize(self, name: str, score: float) -> float:
"""Normalize the score according to the rubric item config."""
item = self.items[name]
raw = (score - item.min) / (item.max - item.min)
return raw if item.higher_is_better else 1 - raw

def aggregate(self, scores: Dict[str, float]) -> float:
"""Weighted aggregation of normalized scores."""
total = 0.0
weight_offset = 0.0
invalid_scores = [name for name in scores.keys() if not isinstance(scores[name], (int, float))]
if invalid_scores:
n_valid_scores = len(scores) - len(invalid_scores)
weight_offset = sum(self.items[name].weight for name in invalid_scores) / n_valid_scores
hwarn(
f"Invalid scores found for {invalid_scores}. "
f"Using average weight offset of {weight_offset} to adjust the total score."
)
for name, score in scores.items():
if not isinstance(score, (int, float)):
hwarn(f"Skipping non-numeric score for {name}: {score}")
continue
norm = self.normalize(name, score)
total += norm * (self.items[name].weight + weight_offset)
return total
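
As a quick illustration of the new Rubric type (not part of the diff itself), here is a minimal sketch of how a hypothetical two-item rubric would behave; the item names, bounds, and weights are invented:

from helm.benchmark.metrics.llm_jury_metrics import Rubric

# Hypothetical two-item rubric; names, bounds, and weights are illustrative only.
rubric = Rubric.from_config(
    {
        "accuracy": {"min": 1, "max": 5, "weight": 0.6, "higher_is_better": True},
        "verbosity": {"min": 1, "max": 5, "weight": 0.4, "higher_is_better": False},
    }
)

rubric.normalize("accuracy", 4)   # (4 - 1) / (5 - 1) = 0.75
rubric.normalize("verbosity", 2)  # 1 - (2 - 1) / (5 - 1) = 0.75

# All scores numeric: weighted sum of normalized scores.
rubric.aggregate({"accuracy": 4, "verbosity": 2})  # 0.75 * 0.6 + 0.75 * 0.4 = 0.75

# A non-numeric score is skipped and its weight is redistributed over the valid items.
rubric.aggregate({"accuracy": 4, "verbosity": None})  # 0.75 * (0.6 + 0.4) = 0.75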


class LLMJuryMetric(Metric):
"""Score metrics for LLM Jury."""

@@ -18,11 +74,13 @@ def __init__(
scenario_name: str,
annotator_models: Dict[str, AnnotatorModelInfo],
default_score: float = 0.0,
rubric: Optional[Rubric] = None,
):
self.metric_name = metric_name
self.scenario_name = scenario_name
self.annotator_models = annotator_models
self.default_score = default_score
self.rubric = rubric

def evaluate_generation(
self,
@@ -32,15 +90,29 @@ def evaluate_generation(
eval_cache_path: str,
) -> List[Stat]:
assert request_state.annotations
if self.rubric:
hlog(f"Using rubric for {self.scenario_name} with items: {list(self.rubric.items.keys())}")
else:
hlog(f"No rubric defined for {self.scenario_name}, using raw scores.")
annotations: Dict[str, Any] = request_state.annotations[self.scenario_name]
scores: List[int] = []
score = self.default_score
for annotation_key, annotation_dict in annotations.items():
if annotation_key in self.annotator_models.keys() and annotation_dict is not None:
for val in annotation_dict.values():
scores.append(int(val["score"]))
if scores:
score = sum(scores) / len(scores)
if self.rubric:
# Use rubric to normalize and aggregate scores
scores_dict = {
item: annotation_dict[item]["score"]
for item in self.rubric.items.keys()
if item in annotation_dict
}
score = self.rubric.aggregate(scores_dict)
else:
# Fallback to using the raw score
for val in annotation_dict.values():
scores.append(int(val["score"]))
if scores:
score = sum(scores) / len(scores)
return [
Stat(MetricName(self.metric_name)).add(score),
]
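
To make the metric logic concrete, here is a rough, hypothetical sketch of the per-instance annotation structure that evaluate_generation appears to consume; the judge names, criterion names, and the "explanation" field are invented for illustration:

# Hypothetical shape of request_state.annotations["note_summary"].
annotations = {
    "prompt_text": "...",  # added by the LLMAsJuryAnnotator
    "gpt_judge": {  # one entry per annotator model listed in annotator_models
        "accuracy": {"score": 4, "explanation": "..."},
        "verbosity": {"score": 2, "explanation": "..."},
    },
    "claude_judge": None,  # a judge whose annotation failed may be absent or None
}

With a rubric configured, each judge's criterion scores are normalized and combined by Rubric.aggregate; without one, the raw integer scores are averaged.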
72 changes: 71 additions & 1 deletion src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -3,7 +3,11 @@
Website: https://crfm.stanford.edu/helm/medhelm/
"""

from typing import Union
import importlib.resources as pkg_resources
import os
import yaml

from typing import Optional, Union

from helm.benchmark.adaptation.adapter_spec import (
ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -13,6 +17,7 @@
get_multiple_choice_adapter_spec,
)
from helm.benchmark.annotation.annotator import AnnotatorSpec
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
from helm.benchmark.metrics.common_metric_specs import (
get_basic_metric_specs,
get_exact_match_metric_specs,
@@ -21,6 +26,7 @@
get_generic_metric_specs,
)
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.metrics.llm_jury_metrics import Rubric
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec
from helm.common.gpu_utils import get_torch_device_name
@@ -1258,3 +1264,67 @@ def get_shc_proxy_spec(data_path: str) -> RunSpec:
metric_specs=get_exact_match_metric_specs(),
groups=["shc_proxy_med"],
)


@run_spec_function("note_summary")
def get_note_summary_spec(config_path: Optional[str] = None) -> RunSpec:
if config_path is None:
package = "helm.benchmark.scenarios"
config_path = str(pkg_resources.files(package).joinpath("note_summary_scenario.yaml"))
Collaborator comment:

You need to add *.yaml to the manifest, or this file will not actually get included in the package.

recursive-include src/helm/benchmark/ *.json
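
Presumably the matching rule to add would be along the lines of: recursive-include src/helm/benchmark/ *.yaml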


assert os.path.exists(config_path), f"Config path not found: {config_path}."

with open(config_path, "r") as f:
config = yaml.safe_load(f)

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.note_summary_scenario.NoteSummaryScenario",
args={
"data_path": config["data_path"],
},
)

adapter_spec = get_generation_adapter_spec(
instructions="",
input_noun=None,
newline_after_input_noun=False,
output_noun=None,
max_tokens=500,
stop_sequences=[],
max_train_instances=0,
)

annotator_models = {
judge["name"]: AnnotatorModelInfo(
model_name=judge["model"],
model_deployment=judge["model_deployment"],
)
for judge in config["judges"]
}

annotator_specs = [
AnnotatorSpec(
class_name="helm.benchmark.annotation.note_summary_annotator.NoteSummaryAnnotator",
args={"annotator_models": annotator_models},
)
]

metric_specs = get_basic_metric_specs([]) + [
MetricSpec(
class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
args={
"metric_name": "note_summary_accuracy",
"scenario_name": "note_summary",
"annotator_models": annotator_models,
"rubric": Rubric.from_config(config["rubric"]),
},
)
]
return RunSpec(
name="note_summary",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["note_summary"],
)
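
Since the expected config schema is only implied by how get_note_summary_spec consumes it, here is a minimal sketch of a compatible config, shown as the YAML that yaml.safe_load would parse; the data path, judge entry, model names, and rubric items are all invented placeholders rather than values from this PR:

import yaml

# Hypothetical note_summary_scenario.yaml content covering the keys read above
# (data_path, judges, rubric); every concrete value is a placeholder.
config = yaml.safe_load(
    """
data_path: /data/physionet.org
judges:
  - name: gpt_judge
    model: openai/gpt-4o-2024-05-13
    model_deployment: openai/gpt-4o-2024-05-13
rubric:
  accuracy:
    min: 1
    max: 5
    weight: 0.6
    higher_is_better: true
  verbosity:
    min: 1
    max: 5
    weight: 0.4
    higher_is_better: false
"""
)

assert set(config) == {"data_path", "judges", "rubric"}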
91 changes: 91 additions & 0 deletions src/helm/benchmark/scenarios/note_summary_scenario.py
@@ -0,0 +1,91 @@
import pandas as pd

from typing import List
from helm.common.general import check_file_exists
from helm.benchmark.scenarios.scenario import (
Input,
Scenario,
Instance,
TEST_SPLIT,
)
from helm.benchmark.scenarios.note_summary_scenario_helper import Summarizer # type: ignore


def file_preprocessing(data_path: str) -> pd.DataFrame:
"""
Preprocess the data files to create a DataFrame with the necessary columns.
task_objective: 'brief_hospital_course' or 'discharge_instructions'
Use command to download: wget -r -N -c -np --user {PHYSIONET_USERNAME} \
--ask-password https://physionet.org/files/discharge-me/1.3/
data_path is the directory that contains the downloaded files: '{base_dir}/physionet.org/'
"""
# Load the first CSV file
discharge_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge.csv.gz"
check_file_exists(
discharge_path, msg=f"[NoteSummaryScenario] Required discharge file not found: '{discharge_path}'"
)
radiology_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/radiology.csv.gz"
check_file_exists(
radiology_path, msg=f"[NoteSummaryScenario] Required radiology file not found: '{radiology_path}'"
)
df_discharge = pd.read_csv(discharge_path, compression="gzip", keep_default_na=False)
df_radiology = pd.read_csv(radiology_path, compression="gzip", keep_default_na=False)

final_df = pd.concat([df_discharge, df_radiology], ignore_index=True)
return final_df


class NoteSummaryScenario(Scenario):
"""
NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs.
In this scenario, we only consider the discharge text as well as the radiology report text.
We are using the phase I test set which is composed of 14,702 hospital admission instances.
@inproceedings{Xu_2024,
title={ Discharge me: Bionlp acl’24 shared task on streamlining discharge documentation.},
url={https://doi.org/10.13026/4a0k-4360},
DOI={10.13026/27pt-1259},
booktitle={ Proceedings of the 23rd Workshop on Biomedical Natural Language Processing (BioNLP) at ACL 2024},
publisher={Association for Computational Linguistics},
author={Xu, Justin and Delbrouck, Jean-Benoit and Johnston, Andrew and Blankemeier, Louis and Langlotz, Curtis},
year={2024}
}
"""

name = "note_summary"
description = "NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs."
tags = ["biomedical"]

def __init__(self, data_path: str):
super().__init__()
self.data_path = data_path

def get_instances(self, output_path: str) -> List[Instance]:
instances: List[Instance] = []
df = file_preprocessing(self.data_path)
admissions = df["hadm_id"].unique()
for admission in admissions:
df_admission = df[df["hadm_id"] == admission]
summarizer = Summarizer(
notes=df_admission["text"].tolist(),
authors=df_admission["note_type"].tolist(),
timestamps=df_admission["charttime"].tolist(),
target_specialty="emergency medicine",
)
prompt_di, _ = summarizer.build_prompt(anti_rules=0, omit_rules=0)
instances.append(
Instance(
input=Input(text=prompt_di),
references=[],
split=TEST_SPLIT,
extra_data={"notes": df_admission["text"].tolist()},
)
)

return instances

def read_file(self, file_path: str) -> List[str]:
Collaborator comment:

Delete unused method.

with open(file_path, "r") as file:
lines = file.readlines()
lines = [line.strip() for line in lines]
return lines