1 change: 1 addition & 0 deletions setup.cfg
@@ -309,6 +309,7 @@ medhelm =
openpyxl~=3.1
python-docx~=1.1
transformers~=4.45,<4.50
evaluation-instruments @ git+https://github.com/epic-open-source/evaluation-instruments.git@1c4637e84fe4dc54f6695e438f3baca6b2cd4573
@yifanmai (Collaborator), Aug 7, 2025:

PyPI packages cannot depend on packages outside PyPI. You should instead provide instructions for users to manually install this package, either by printing the installation command in an error message or by documenting it in ReadTheDocs.


audiolm =
crfm-helm[openai]
10 changes: 6 additions & 4 deletions src/helm/benchmark/annotation/model_as_judge.py
@@ -220,7 +220,7 @@ def annotate(self, request_state: RequestState) -> Dict[str, Any]:
annotations: Dict[str, Union[Optional[str], Optional[bool], Dict[str, Any]]] = {"prompt_text": annotator_prompt}

# Track failed annotations for each model
failed_counts: Dict[str, int] = {name: 0 for name in self._annotator_models}
failed_annotators: Set[str] = set()

# Annotate using multiple models
for annotator_name, annotator_model_info in self._annotator_models.items():
@@ -230,13 +230,15 @@ def annotate(self, request_state: RequestState) -> Dict[str, Any]:
if annotator_criteria is not None:
annotations[annotator_name] = annotator_criteria
else:
failed_counts[annotator_name] += 1
failed_annotators.add(annotator_name)

except Exception as e:
hlog(f"ERROR annotating with {annotator_name}: {e}")
failed_counts[annotator_name] += 1
failed_annotators.add(annotator_name)

hlog(f"Failed model annotations: {failed_counts}")
total_failed = len(failed_annotators)
if total_failed != 0:
hlog(f"Some model annotations failed: {failed_annotators}")
return annotations

def _annotate_with_model(
51 changes: 51 additions & 0 deletions src/helm/benchmark/annotation/note_summary_annotator.py
@@ -0,0 +1,51 @@
import evaluation_instruments.instruments.pdsqi_9.pdsqi_prompt as pdsqi

from typing import Dict, Optional

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
from helm.clients.auto_client import AutoClient

from evaluation_instruments import prep
Collaborator comment:

Display an error if this is not installed:

from helm.common.optional_dependencies import OptionalDependencyNotInstalled

try:
    from evaluation_instruments import prep
    import evaluation_instruments.instruments.pdsqi_9.pdsqi_prompt as pdsqi
except ModuleNotFoundError as e:
    # Provide manual instructions for installing evaluation-instruments from GitHub
    # because PyPI does not allow installing dependencies directly from GitHub.
    raise OptionalDependencyNotInstalled(
        f"Optional dependency {e.name} is not installed. "
        "Please run `pip install 'evaluation-instruments @ git+https://github.com/epic-open-source/evaluation-instruments.git@1c4637e84fe4dc54f6695e438f3baca6b2cd4573'` to install it."
    ) from e  # noqa: E501



class NoteSummaryAnnotator(LLMAsJuryAnnotator):
"""The NoteSummary autograder."""

name = "note_summary"

def __init__(
self,
auto_client: AutoClient,
annotator_models: Dict[str, AnnotatorModelInfo],
template_name: Optional[str] = None,
):
super().__init__(
auto_client=auto_client,
prompt_template="",
annotation_criteria={},
annotator_models=annotator_models,
)

def _interpolate_prompt(
self, request_state: RequestState, custom_replacements: Optional[Dict[str, str]] = None
) -> str:
"""
Interpolate prompt template with request state information.
:param request_state: The current request state
:param custom_replacements: Optional dictionary of additional replacements
:return: Interpolated prompt
"""
notes = (request_state.instance.extra_data or {}).get("notes", [])
prompt = pdsqi.resolve_prompt(
summary_to_evaluate=(
request_state.result.completions[0].text
if request_state.result and request_state.result.completions
else ""
),
notes=notes,
target_specialty="emergency medicine",
output_mode=prep.OutputMode.EXPLAINED_SCORE,
)
return prompt[1]["content"]
82 changes: 77 additions & 5 deletions src/helm/benchmark/metrics/llm_jury_metrics.py
@@ -1,14 +1,70 @@
from typing import Any, Dict, List
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
from helm.common.hierarchical_logger import hlog, hwarn
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


@dataclass
class RubricItem:
name: str
min: float
max: float
weight: float
higher_is_better: bool


@dataclass
class Rubric:
Collaborator comment:

I did not look at the rubric logic too closely, but let me know if there's anything you want me to check.

items: Dict[str, RubricItem]

@classmethod
def from_config(cls, rubric_config: Dict[str, Any]) -> "Rubric":
items = {}
for name, attrs in rubric_config.items():
item = RubricItem(
name=name,
min=attrs["min"],
max=attrs["max"],
weight=attrs["weight"],
higher_is_better=attrs["higher_is_better"],
)
items[name] = item
return cls(items)

def normalize(self, name: str, score: float) -> float:
"""Normalize the score according to the rubric item config."""
item = self.items[name]
raw = (score - item.min) / (item.max - item.min)
return raw if item.higher_is_better else 1 - raw

def aggregate(self, scores: Dict[str, float]) -> float:
"""Weighted aggregation of normalized scores."""
total = 0.0
weight_offset = 0.0
invalid_scores = [name for name in scores.keys() if not isinstance(scores[name], (int, float))]
if invalid_scores:
n_valid_scores = len(scores) - len(invalid_scores)
weight_offset = sum(self.items[name].weight for name in invalid_scores) / n_valid_scores
hwarn(
f"Invalid scores found for {invalid_scores}. "
f"Using average weight offset of {weight_offset} to adjust the total score."
)
for name, score in scores.items():
if not isinstance(score, (int, float)):
hwarn(f"Skipping non-numeric score for {name}: {score}")
continue
norm = self.normalize(name, score)
total += norm * (self.items[name].weight + weight_offset)
return total
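
As a quick illustration of the new Rubric type (not part of the diff itself), here is a minimal sketch of how a hypothetical two-item rubric would behave; the item names, bounds, and weights are invented:

from helm.benchmark.metrics.llm_jury_metrics import Rubric

# Hypothetical two-item rubric; names, bounds, and weights are illustrative only.
rubric = Rubric.from_config(
    {
        "accuracy": {"min": 1, "max": 5, "weight": 0.6, "higher_is_better": True},
        "verbosity": {"min": 1, "max": 5, "weight": 0.4, "higher_is_better": False},
    }
)

rubric.normalize("accuracy", 4)   # (4 - 1) / (5 - 1) = 0.75
rubric.normalize("verbosity", 2)  # 1 - (2 - 1) / (5 - 1) = 0.75

# All scores numeric: weighted sum of normalized scores.
rubric.aggregate({"accuracy": 4, "verbosity": 2})  # 0.75 * 0.6 + 0.75 * 0.4 = 0.75

# A non-numeric score is skipped and its weight is redistributed over the valid items.
rubric.aggregate({"accuracy": 4, "verbosity": None})  # 0.75 * (0.6 + 0.4) = 0.75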


class LLMJuryMetric(Metric):
"""Score metrics for LLM Jury."""

@@ -18,11 +74,13 @@ def __init__(
scenario_name: str,
annotator_models: Dict[str, AnnotatorModelInfo],
default_score: float = 0.0,
rubric: Optional[Rubric] = None,
):
self.metric_name = metric_name
self.scenario_name = scenario_name
self.annotator_models = annotator_models
self.default_score = default_score
self.rubric = rubric

def evaluate_generation(
self,
@@ -32,15 +90,29 @@ def evaluate_generation(
eval_cache_path: str,
) -> List[Stat]:
assert request_state.annotations
if self.rubric:
hlog(f"Using rubric for {self.scenario_name} with items: {list(self.rubric.items.keys())}")
else:
hlog(f"No rubric defined for {self.scenario_name}, using raw scores.")
annotations: Dict[str, Any] = request_state.annotations[self.scenario_name]
scores: List[int] = []
score = self.default_score
for annotation_key, annotation_dict in annotations.items():
if annotation_key in self.annotator_models.keys() and annotation_dict is not None:
for val in annotation_dict.values():
scores.append(int(val["score"]))
if scores:
score = sum(scores) / len(scores)
if self.rubric:
# Use rubric to normalize and aggregate scores
scores_dict = {
item: annotation_dict[item]["score"]
for item in self.rubric.items.keys()
if item in annotation_dict
}
score = self.rubric.aggregate(scores_dict)
else:
# Fallback to using the raw score
for val in annotation_dict.values():
scores.append(int(val["score"]))
if scores:
score = sum(scores) / len(scores)
return [
Stat(MetricName(self.metric_name)).add(score),
]
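
To make the metric logic concrete, here is a rough, hypothetical sketch of the per-instance annotation structure that evaluate_generation appears to consume; the judge names, criterion names, and the "explanation" field are invented for illustration:

# Hypothetical shape of request_state.annotations["note_summary"].
annotations = {
    "prompt_text": "...",  # added by the LLMAsJuryAnnotator
    "gpt_judge": {  # one entry per annotator model listed in annotator_models
        "accuracy": {"score": 4, "explanation": "..."},
        "verbosity": {"score": 2, "explanation": "..."},
    },
    "claude_judge": None,  # a judge whose annotation failed may be absent or None
}

With a rubric configured, each judge's criterion scores are normalized and combined by Rubric.aggregate; without one, the raw integer scores are averaged.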
72 changes: 71 additions & 1 deletion src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -3,7 +3,11 @@
Website: https://crfm.stanford.edu/helm/medhelm/
"""

from typing import Union
import importlib.resources as pkg_resources
import os
import yaml

from typing import Optional, Union

from helm.benchmark.adaptation.adapter_spec import (
ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -13,6 +17,7 @@
get_multiple_choice_adapter_spec,
)
from helm.benchmark.annotation.annotator import AnnotatorSpec
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
from helm.benchmark.metrics.common_metric_specs import (
get_basic_metric_specs,
get_exact_match_metric_specs,
@@ -21,6 +26,7 @@
get_generic_metric_specs,
)
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.metrics.llm_jury_metrics import Rubric
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec
from helm.common.gpu_utils import get_torch_device_name
@@ -1258,3 +1264,67 @@ def get_shc_proxy_spec(data_path: str) -> RunSpec:
metric_specs=get_exact_match_metric_specs(),
groups=["shc_proxy_med"],
)


@run_spec_function("note_summary")
def get_note_summary_spec(config_path: Optional[str] = None) -> RunSpec:
if config_path is None:
package = "helm.benchmark.scenarios"
config_path = str(pkg_resources.files(package).joinpath("note_summary_scenario.yaml"))
Collaborator comment:

You need to add *.yaml to the manifest, or this file will not actually get included in the package.

recursive-include src/helm/benchmark/ *.json
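
Presumably the matching rule to add would be along the lines of: recursive-include src/helm/benchmark/ *.yaml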


assert os.path.exists(config_path), f"Config path not found: {config_path}."

with open(config_path, "r") as f:
config = yaml.safe_load(f)

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.note_summary_scenario.NoteSummaryScenario",
args={
"data_path": config["data_path"],
},
)

adapter_spec = get_generation_adapter_spec(
instructions="",
input_noun=None,
newline_after_input_noun=False,
output_noun=None,
max_tokens=500,
stop_sequences=[],
max_train_instances=0,
)

annotator_models = {
judge["name"]: AnnotatorModelInfo(
model_name=judge["model"],
model_deployment=judge["model_deployment"],
)
for judge in config["judges"]
}

annotator_specs = [
AnnotatorSpec(
class_name="helm.benchmark.annotation.note_summary_annotator.NoteSummaryAnnotator",
args={"annotator_models": annotator_models},
)
]

metric_specs = get_basic_metric_specs([]) + [
MetricSpec(
class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
args={
"metric_name": "note_summary_accuracy",
"scenario_name": "note_summary",
"annotator_models": annotator_models,
"rubric": Rubric.from_config(config["rubric"]),
},
)
]
return RunSpec(
name="note_summary",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["note_summary"],
)
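
Since the expected config schema is only implied by how get_note_summary_spec consumes it, here is a minimal sketch of a compatible config, shown as the YAML that yaml.safe_load would parse; the data path, judge entry, model names, and rubric items are all invented placeholders rather than values from this PR:

import yaml

# Hypothetical note_summary_scenario.yaml content covering the keys read above
# (data_path, judges, rubric); every concrete value is a placeholder.
config = yaml.safe_load(
    """
data_path: /data/physionet.org
judges:
  - name: gpt_judge
    model: openai/gpt-4o-2024-05-13
    model_deployment: openai/gpt-4o-2024-05-13
rubric:
  accuracy:
    min: 1
    max: 5
    weight: 0.6
    higher_is_better: true
  verbosity:
    min: 1
    max: 5
    weight: 0.4
    higher_is_better: false
"""
)

assert set(config) == {"data_path", "judges", "rubric"}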
91 changes: 91 additions & 0 deletions src/helm/benchmark/scenarios/note_summary_scenario.py
@@ -0,0 +1,91 @@
import pandas as pd

from typing import List
from helm.common.general import check_file_exists
from helm.benchmark.scenarios.scenario import (
Input,
Scenario,
Instance,
TEST_SPLIT,
)
from helm.benchmark.scenarios.note_summary_scenario_helper import Summarizer # type: ignore


def file_preprocessing(data_path: str) -> pd.DataFrame:
"""
Preprocess the data files to create a DataFrame with the necessary columns.
task_objective: 'brief_hospital_course' or 'discharge_instructions'
Use command to download: wget -r -N -c -np --user {PHYSIONET_USERNAME} \
--ask-password https://physionet.org/files/discharge-me/1.3/
data_path is the directory that contains the downloaded files: '{base_dir}/physionet.org/'
"""
# Load the first CSV file
discharge_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge.csv.gz"
check_file_exists(
discharge_path, msg=f"[NoteSummaryScenario] Required discharge file not found: '{discharge_path}'"
)
radiology_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/radiology.csv.gz"
check_file_exists(
radiology_path, msg=f"[NoteSummaryScenario] Required radiology file not found: '{radiology_path}'"
)
df_discharge = pd.read_csv(discharge_path, compression="gzip", keep_default_na=False)
df_radiology = pd.read_csv(radiology_path, compression="gzip", keep_default_na=False)

final_df = pd.concat([df_discharge, df_radiology], ignore_index=True)
return final_df


class NoteSummaryScenario(Scenario):
"""
NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs.
In this scenario, we only consider the discharge text as well as the radiology report text.
We are using the phase I test set which is composed of 14,702 hospital admission instances.
@inproceedings{Xu_2024,
title={ Discharge me: Bionlp acl’24 shared task on streamlining discharge documentation.},
url={https://doi.org/10.13026/4a0k-4360},
DOI={10.13026/27pt-1259},
booktitle={ Proceedings of the 23rd Workshop on Biomedical Natural Language Processing (BioNLP) at ACL 2024},
publisher={Association for Computational Linguistics},
author={Xu, Justin and Delbrouck, Jean-Benoit and Johnston, Andrew and Blankemeier, Louis and Langlotz, Curtis},
year={2024}
}
"""

name = "note_summary"
description = "NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs."
tags = ["biomedical"]

def __init__(self, data_path: str):
super().__init__()
self.data_path = data_path

def get_instances(self, output_path: str) -> List[Instance]:
instances: List[Instance] = []
df = file_preprocessing(self.data_path)
admissions = df["hadm_id"].unique()
for admission in admissions:
df_admission = df[df["hadm_id"] == admission]
summarizer = Summarizer(
notes=df_admission["text"].tolist(),
authors=df_admission["note_type"].tolist(),
timestamps=df_admission["charttime"].tolist(),
target_specialty="emergency medicine",
)
prompt_di, _ = summarizer.build_prompt(anti_rules=0, omit_rules=0)
instances.append(
Instance(
input=Input(text=prompt_di),
references=[],
split=TEST_SPLIT,
extra_data={"notes": df_admission["text"].tolist()},
)
)

return instances

def read_file(self, file_path: str) -> List[str]:
Collaborator comment:

Delete unused method.

with open(file_path, "r") as file:
lines = file.readlines()
lines = [line.strip() for line in lines]
return lines