diff --git a/setup.cfg b/setup.cfg index 79922f4be1b..f4c216d7e8c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -309,6 +309,7 @@ medhelm = openpyxl~=3.1 python-docx~=1.1 transformers~=4.45,<4.50 + evaluation-instruments @ git+https://github.com/epic-open-source/evaluation-instruments.git@1c4637e84fe4dc54f6695e438f3baca6b2cd4573 audiolm = crfm-helm[openai] diff --git a/src/helm/benchmark/annotation/model_as_judge.py b/src/helm/benchmark/annotation/model_as_judge.py index 64a9e0b3fa1..a3c22ee489a 100644 --- a/src/helm/benchmark/annotation/model_as_judge.py +++ b/src/helm/benchmark/annotation/model_as_judge.py @@ -220,7 +220,7 @@ def annotate(self, request_state: RequestState) -> Dict[str, Any]: annotations: Dict[str, Union[Optional[str], Optional[bool], Dict[str, Any]]] = {"prompt_text": annotator_prompt} # Track failed annotations for each model - failed_counts: Dict[str, int] = {name: 0 for name in self._annotator_models} + failed_annotators: Set[str] = set() # Annotate using multiple models for annotator_name, annotator_model_info in self._annotator_models.items(): @@ -230,13 +230,15 @@ def annotate(self, request_state: RequestState) -> Dict[str, Any]: if annotator_criteria is not None: annotations[annotator_name] = annotator_criteria else: - failed_counts[annotator_name] += 1 + failed_annotators.add(annotator_name) except Exception as e: hlog(f"ERROR annotating with {annotator_name}: {e}") - failed_counts[annotator_name] += 1 + failed_annotators.add(annotator_name) - hlog(f"Failed model annotations: {failed_counts}") + total_failed = len(failed_annotators) + if total_failed != 0: + hlog(f"Some model annotations failed: {failed_annotators}") return annotations def _annotate_with_model( diff --git a/src/helm/benchmark/annotation/note_summary_annotator.py b/src/helm/benchmark/annotation/note_summary_annotator.py new file mode 100644 index 00000000000..d2d5ce9d58e --- /dev/null +++ b/src/helm/benchmark/annotation/note_summary_annotator.py @@ -0,0 +1,51 @@ +import 
class NoteSummaryAnnotator(LLMAsJuryAnnotator):
    """The NoteSummary autograder.

    Scores model-generated clinical note summaries with a jury of LLM judges.
    The judging prompt is built per request by the PDSQI-9 instrument in
    `_interpolate_prompt`, so no static prompt template or criteria are used.
    """

    name = "note_summary"

    def __init__(
        self,
        auto_client: AutoClient,
        annotator_models: Dict[str, AnnotatorModelInfo],
        template_name: Optional[str] = None,
        target_specialty: str = "emergency medicine",
    ):
        """
        :param auto_client: Client used to issue judge requests
        :param annotator_models: Judge models keyed by short name
        :param template_name: Unused; accepted for interface compatibility
        :param target_specialty: Specialty the summary is evaluated for
            (previously hard-coded to "emergency medicine")
        """
        # Empty template/criteria: the prompt is produced dynamically by
        # pdsqi.resolve_prompt in the override below.
        super().__init__(
            auto_client=auto_client,
            prompt_template="",
            annotation_criteria={},
            annotator_models=annotator_models,
        )
        self._target_specialty = target_specialty

    def _interpolate_prompt(
        self, request_state: RequestState, custom_replacements: Optional[Dict[str, str]] = None
    ) -> str:
        """
        Build the PDSQI-9 judging prompt for the completion in `request_state`.

        :param request_state: The current request state
        :param custom_replacements: Unused; kept for interface compatibility
        :return: Content of the second resolved prompt message
        """
        notes = (request_state.instance.extra_data or {}).get("notes", [])
        # Fall back to an empty summary when the model produced no completions.
        summary_text = (
            request_state.result.completions[0].text
            if request_state.result and request_state.result.completions
            else ""
        )
        prompt = pdsqi.resolve_prompt(
            summary_to_evaluate=summary_text,
            notes=notes,
            target_specialty=self._target_specialty,
            output_mode=prep.OutputMode.EXPLAINED_SCORE,
        )
        # resolve_prompt returns a sequence of chat-style messages; we return the
        # second message's content (presumably the user turn — TODO confirm
        # against the pdsqi_prompt module).
        return prompt[1]["content"]
@dataclass
class RubricItem:
    """Configuration for a single rubric criterion."""

    name: str
    min: float
    max: float
    weight: float
    higher_is_better: bool


@dataclass
class Rubric:
    """A set of rubric items used to normalize and aggregate judge scores."""

    items: Dict[str, RubricItem]

    @classmethod
    def from_config(cls, rubric_config: Dict[str, Any]) -> "Rubric":
        """Build a Rubric from a mapping of item name -> attribute dict
        (keys: min, max, weight, higher_is_better)."""
        items = {
            name: RubricItem(
                name=name,
                min=attrs["min"],
                max=attrs["max"],
                weight=attrs["weight"],
                higher_is_better=attrs["higher_is_better"],
            )
            for name, attrs in rubric_config.items()
        }
        return cls(items)

    def normalize(self, name: str, score: float) -> float:
        """Normalize the score to [0, 1] according to the rubric item config.

        For lower-is-better items the scale is inverted so that 1.0 is
        always the best outcome.
        """
        item = self.items[name]
        raw = (score - item.min) / (item.max - item.min)
        return raw if item.higher_is_better else 1 - raw

    def aggregate(self, scores: Dict[str, float]) -> float:
        """Weighted aggregation of normalized scores.

        Non-numeric scores are skipped; their weight is redistributed evenly
        across the remaining valid items so the total weight is preserved.

        :param scores: Raw scores keyed by rubric item name
        :return: Weighted sum of normalized scores (0.0 if no valid scores)
        """
        invalid_scores = [name for name, value in scores.items() if not isinstance(value, (int, float))]
        n_valid_scores = len(scores) - len(invalid_scores)
        # Guard: previously this divided by zero when every score was invalid.
        if n_valid_scores == 0:
            if invalid_scores:
                hwarn(f"No valid scores; all entries invalid: {invalid_scores}. Returning 0.0.")
            return 0.0
        weight_offset = 0.0
        if invalid_scores:
            weight_offset = sum(self.items[name].weight for name in invalid_scores) / n_valid_scores
            hwarn(
                f"Invalid scores found for {invalid_scores}. "
                f"Using average weight offset of {weight_offset} to adjust the total score."
            )
        total = 0.0
        for name, score in scores.items():
            if not isinstance(score, (int, float)):
                hwarn(f"Skipping non-numeric score for {name}: {score}")
                continue
            total += self.normalize(name, score) * (self.items[name].weight + weight_offset)
        return total


class LLMJuryMetric(Metric):
    """Score metrics for LLM Jury.

    Averages judge scores across the configured annotator models, either raw
    (no rubric) or normalized/weighted via a Rubric.
    """

    def __init__(
        self,
        metric_name: str,
        scenario_name: str,
        annotator_models: Dict[str, AnnotatorModelInfo],
        default_score: float = 0.0,
        rubric: Optional[Rubric] = None,
    ):
        self.metric_name = metric_name
        self.scenario_name = scenario_name
        self.annotator_models = annotator_models
        # Score reported when no annotations are usable.
        self.default_score = default_score
        self.rubric = rubric

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """Compute the jury score for one instance.

        With a rubric, each annotator's scores are normalized and weighted by
        `Rubric.aggregate`, then averaged across annotators. Without a rubric,
        all raw criterion scores from all annotators are averaged.
        """
        assert request_state.annotations
        if self.rubric:
            hlog(f"Using rubric for {self.scenario_name} with items: {list(self.rubric.items.keys())}")
        else:
            hlog(f"No rubric defined for {self.scenario_name}, using raw scores.")
        annotations: Dict[str, Any] = request_state.annotations[self.scenario_name]
        collected_scores: List[float] = []
        score = self.default_score
        for annotation_key, annotation_dict in annotations.items():
            if annotation_key in self.annotator_models.keys() and annotation_dict is not None:
                if self.rubric:
                    scores_dict = {
                        item: annotation_dict[item]["score"]
                        for item in self.rubric.items.keys()
                        if item in annotation_dict
                    }
                    # Bug fix: previously `score` was overwritten per annotator,
                    # so only the last judge's aggregate survived. Collect each
                    # judge's aggregate and average below, matching the raw path.
                    collected_scores.append(self.rubric.aggregate(scores_dict))
                else:
                    # Fallback to using the raw score
                    for val in annotation_dict.values():
                        collected_scores.append(int(val["score"]))
        if collected_scores:
            score = sum(collected_scores) / len(collected_scores)
        return [
            Stat(MetricName(self.metric_name)).add(score),
        ]
@run_spec_function("note_summary")
def get_note_summary_spec(config_path: Optional[str] = None) -> RunSpec:
    """Build the RunSpec for the note_summary scenario.

    :param config_path: Path to a YAML config providing `data_path`, `rubric`,
        and `judges`. Defaults to the packaged note_summary_scenario.yaml.
    :return: The assembled RunSpec
    :raises FileNotFoundError: If the config path does not exist
    """
    if config_path is None:
        package = "helm.benchmark.scenarios"
        config_path = str(pkg_resources.files(package).joinpath("note_summary_scenario.yaml"))

    # Raise explicitly instead of asserting: asserts are stripped under `python -O`.
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Config path not found: {config_path}.")

    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.note_summary_scenario.NoteSummaryScenario",
        args={
            "data_path": config["data_path"],
        },
    )

    adapter_spec = get_generation_adapter_spec(
        instructions="",
        input_noun=None,
        newline_after_input_noun=False,
        output_noun=None,
        max_tokens=500,
        stop_sequences=[],
        max_train_instances=0,
    )

    # One AnnotatorModelInfo per judge declared in the config.
    annotator_models = {
        judge["name"]: AnnotatorModelInfo(
            model_name=judge["model"],
            model_deployment=judge["model_deployment"],
        )
        for judge in config["judges"]
    }

    annotator_specs = [
        AnnotatorSpec(
            class_name="helm.benchmark.annotation.note_summary_annotator.NoteSummaryAnnotator",
            args={"annotator_models": annotator_models},
        )
    ]

    # Jury metric aggregates judge scores using the rubric from the config.
    metric_specs = get_basic_metric_specs([]) + [
        MetricSpec(
            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
            args={
                "metric_name": "note_summary_accuracy",
                "scenario_name": "note_summary",
                "annotator_models": annotator_models,
                "rubric": Rubric.from_config(config["rubric"]),
            },
        )
    ]
    return RunSpec(
        name="note_summary",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["note_summary"],
    )
class NoteSummaryScenario(Scenario):
    """
    NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs.
    In this scenario, we only consider the discharge text as well as the radiology report text.
    We are using the phase I test set which is composed of 14,702 hospital admission instances.

    @inproceedings{Xu_2024,
    title={ Discharge me: Bionlp acl'24 shared task on streamlining discharge documentation.},
    url={https://doi.org/10.13026/4a0k-4360},
    DOI={10.13026/27pt-1259},
    booktitle={ Proceedings of the 23rd Workshop on Biomedical Natural Language Processing (BioNLP) at ACL 2024},
    publisher={Association for Computational Linguistics},
    author={Xu, Justin and Delbrouck, Jean-Benoit and Johnston, Andrew and Blankemeier, Louis and Langlotz, Curtis},
    year={2024}
    }
    """

    name = "note_summary"
    description = "NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs."
    tags = ["biomedical"]

    def __init__(self, data_path: str):
        super().__init__()
        # Directory containing the downloaded physionet.org files.
        self.data_path = data_path

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one Instance per hospital admission (hadm_id).

        Each instance's input is the Summarizer-built prompt over all of the
        admission's notes; the raw notes are kept in `extra_data` for the
        annotator to reference at judging time.
        """
        instances: List[Instance] = []
        df = file_preprocessing(self.data_path)
        # Group once instead of re-filtering the full frame for every unique
        # hadm_id (was O(n * admissions)); sort=False preserves the original
        # first-appearance order that `unique()` produced.
        for _, df_admission in df.groupby("hadm_id", sort=False):
            notes = df_admission["text"].tolist()
            summarizer = Summarizer(
                notes=notes,
                authors=df_admission["note_type"].tolist(),
                timestamps=df_admission["charttime"].tolist(),
                target_specialty="emergency medicine",
            )
            # anti_rules=0 / omit_rules=0: a clean prompt with no intentional mistakes.
            prompt_di, _ = summarizer.build_prompt(anti_rules=0, omit_rules=0)
            instances.append(
                Instance(
                    input=Input(text=prompt_di),
                    references=[],
                    split=TEST_SPLIT,
                    extra_data={"notes": notes},
                )
            )

        return instances

    def read_file(self, file_path: str) -> List[str]:
        """Return the stripped lines of a text file."""
        with open(file_path, "r") as file:
            lines = file.readlines()
        lines = [line.strip() for line in lines]
        return lines
/share/pi/nigam/data/physionet.org + +# The rubric criteria for evaluating the note summary scenario +rubric: + citation: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + accurate: + min: 1 + max: 5 + weight: 0.2 + higher_is_better: true + thorough: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + useful: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + organized: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + comprehensible: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + succinct: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + abstraction: + min: 0 + max: 1 + weight: 0.05 + higher_is_better: true + synthesized: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + voice_summ: + min: 0 + max: 1 + weight: 0.025 + higher_is_better: false + voice_note: + min: 0 + max: 1 + weight: 0.025 + higher_is_better: false + +# The judges to be used for evaluating the note summary scenario. +# name: The short name for the judge. +# model: The field value matching the 'model_name' field under model_deployments.yaml +# model_deployment: The field value matching the 'name' under model_deployments.yaml. +judges: + - name: "gpt_4o" + model: "openai/gpt-4o-2024-05-13" + model_deployment: "stanfordhealthcare/gpt-4o-2024-05-13" diff --git a/src/helm/benchmark/scenarios/note_summary_scenario_helper.py b/src/helm/benchmark/scenarios/note_summary_scenario_helper.py new file mode 100644 index 00000000000..84e2f8d954b --- /dev/null +++ b/src/helm/benchmark/scenarios/note_summary_scenario_helper.py @@ -0,0 +1,426 @@ +# The following code is copied verbatim from: +# https://pages.doit.wisc.edu/smph-public/dom/uw-icu-data-science-lab-public/pdsqi-9/-/blob/main/05_Summary_Generation/summarize_prompt.py?ref_type=heads +# under the following license: +# +# Apache License +# Version 2.0, January 2004 +# http://www.apache.org/licenses/ +# +# TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +# +# 1. Definitions. 
+# +# "License" shall mean the terms and conditions for use, reproduction, +# and distribution as defined by Sections 1 through 9 of this document. +# +# "Licensor" shall mean the copyright owner or entity authorized by +# the copyright owner that is granting the License. +# +# "Legal Entity" shall mean the union of the acting entity and all +# other entities that control, are controlled by, or are under common +# control with that entity. For the purposes of this definition, +# "control" means (i) the power, direct or indirect, to cause the +# direction or management of such entity, whether by contract or +# otherwise, or (ii) ownership of fifty percent (50%) or more of the +# outstanding shares, or (iii) beneficial ownership of such entity. +# +# "You" (or "Your") shall mean an individual or Legal Entity +# exercising permissions granted by this License. +# +# "Source" form shall mean the preferred form for making modifications, +# including but not limited to software source code, documentation +# source, and configuration files. +# +# "Object" form shall mean any form resulting from mechanical +# transformation or translation of a Source form, including but +# not limited to compiled object code, generated documentation, +# and conversions to other media types. +# +# "Work" shall mean the work of authorship, whether in Source or +# Object form, made available under the License, as indicated by a +# copyright notice that is included in or attached to the work +# (an example is provided in the Appendix below). +# +# "Derivative Works" shall mean any work, whether in Source or Object +# form, that is based on (or derived from) the Work and for which the +# editorial revisions, annotations, elaborations, or other modifications +# represent, as a whole, an original work of authorship. 
For the purposes +# of this License, Derivative Works shall not include works that remain +# separable from, or merely link (or bind by name) to the interfaces of, +# the Work and Derivative Works thereof. +# +# "Contribution" shall mean any work of authorship, including +# the original version of the Work and any modifications or additions +# to that Work or Derivative Works thereof, that is intentionally +# submitted to Licensor for inclusion in the Work by the copyright owner +# or by an individual or Legal Entity authorized to submit on behalf of +# the copyright owner. For the purposes of this definition, "submitted" +# means any form of electronic, verbal, or written communication sent +# to the Licensor or its representatives, including but not limited to +# communication on electronic mailing lists, source code control systems, +# and issue tracking systems that are managed by, or on behalf of, the +# Licensor for the purpose of discussing and improving the Work, but +# excluding communication that is conspicuously marked or otherwise +# designated in writing by the copyright owner as "Not a Contribution." +# +# "Contributor" shall mean Licensor and any individual or Legal Entity +# on behalf of whom a Contribution has been received by Licensor and +# subsequently incorporated within the Work. +# +# 2. Grant of Copyright License. Subject to the terms and conditions of +# this License, each Contributor hereby grants to You a perpetual, +# worldwide, non-exclusive, no-charge, royalty-free, irrevocable +# copyright license to reproduce, prepare Derivative Works of, +# publicly display, publicly perform, sublicense, and distribute the +# Work and such Derivative Works in Source or Object form. +# +# 3. Grant of Patent License. 
Subject to the terms and conditions of +# this License, each Contributor hereby grants to You a perpetual, +# worldwide, non-exclusive, no-charge, royalty-free, irrevocable +# (except as stated in this section) patent license to make, have made, +# use, offer to sell, sell, import, and otherwise transfer the Work, +# where such license applies only to those patent claims licensable +# by such Contributor that are necessarily infringed by their +# Contribution(s) alone or by combination of their Contribution(s) +# with the Work to which such Contribution(s) was submitted. If You +# institute patent litigation against any entity (including a +# cross-claim or counterclaim in a lawsuit) alleging that the Work +# or a Contribution incorporated within the Work constitutes direct +# or contributory patent infringement, then any patent licenses +# granted to You under this License for that Work shall terminate +# as of the date such litigation is filed. +# +# 4. Redistribution. You may reproduce and distribute copies of the +# Work or Derivative Works thereof in any medium, with or without +# modifications, and in Source or Object form, provided that You +# meet the following conditions: +# +# (a) You must give any other recipients of the Work or +# Derivative Works a copy of this License; and +# +# (b) You must cause any modified files to carry prominent notices +# stating that You changed the files; and +# +# (c) You must retain, in the Source form of any Derivative Works +# that You distribute, all copyright, patent, trademark, and +# attribution notices from the Source form of the Work, +# excluding those notices that do not pertain to any part of +# the Derivative Works; and +# +# (d) If the Work includes a "NOTICE" text file as part of its +# distribution, then any Derivative Works that You distribute must +# include a readable copy of the attribution notices contained +# within such NOTICE file, excluding those notices that do not +# pertain to any part of the 
Derivative Works, in at least one +# of the following places: within a NOTICE text file distributed +# as part of the Derivative Works; within the Source form or +# documentation, if provided along with the Derivative Works; or, +# within a display generated by the Derivative Works, if and +# wherever such third-party notices normally appear. The contents +# of the NOTICE file are for informational purposes only and +# do not modify the License. You may add Your own attribution +# notices within Derivative Works that You distribute, alongside +# or as an addendum to the NOTICE text from the Work, provided +# that such additional attribution notices cannot be construed +# as modifying the License. +# +# You may add Your own copyright statement to Your modifications and +# may provide additional or different license terms and conditions +# for use, reproduction, or distribution of Your modifications, or +# for any such Derivative Works as a whole, provided Your use, +# reproduction, and distribution of the Work otherwise complies with +# the conditions stated in this License. +# +# 5. Submission of Contributions. Unless You explicitly state otherwise, +# any Contribution intentionally submitted for inclusion in the Work +# by You to the Licensor shall be under the terms and conditions of +# this License, without any additional terms or conditions. +# Notwithstanding the above, nothing herein shall supersede or modify +# the terms of any separate license agreement you may have executed +# with Licensor regarding such Contributions. +# +# 6. Trademarks. This License does not grant permission to use the trade +# names, trademarks, service marks, or product names of the Licensor, +# except as required for reasonable and customary use in describing the +# origin of the Work and reproducing the content of the NOTICE file. +# +# 7. Disclaimer of Warranty. 
Unless required by applicable law or +# agreed to in writing, Licensor provides the Work (and each +# Contributor provides its Contributions) on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied, including, without limitation, any warranties or conditions +# of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +# PARTICULAR PURPOSE. You are solely responsible for determining the +# appropriateness of using or redistributing the Work and assume any +# risks associated with Your exercise of permissions under this License. +# +# 8. Limitation of Liability. In no event and under no legal theory, +# whether in tort (including negligence), contract, or otherwise, +# unless required by applicable law (such as deliberate and grossly +# negligent acts) or agreed to in writing, shall any Contributor be +# liable to You for damages, including any direct, indirect, special, +# incidental, or consequential damages of any character arising as a +# result of this License or out of the use or inability to use the +# Work (including but not limited to damages for loss of goodwill, +# work stoppage, computer failure or malfunction, or any and all +# other commercial damages or losses), even if such Contributor +# has been advised of the possibility of such damages. +# +# 9. Accepting Warranty or Additional Liability. While redistributing +# the Work or Derivative Works thereof, You may choose to offer, +# and charge a fee for, acceptance of support, warranty, indemnity, +# or other liability obligations and/or rights consistent with this +# License. 
However, in accepting such obligations, You may act only +# on Your own behalf and on Your sole responsibility, not on behalf +# of any other Contributor, and only if You agree to indemnify, +# defend, and hold each Contributor harmless for any liability +# incurred by, or claims asserted against, such Contributor by reason +# of your accepting any such warranty or additional liability. +# +# END OF TERMS AND CONDITIONS +# +# APPENDIX: How to apply the Apache License to your work. +# +# To apply the Apache License to your work, attach the following +# boilerplate notice, with the fields enclosed by brackets "[]" +# replaced with your own identifying information. (Don't include +# the brackets!) The text should be enclosed in the appropriate +# comment syntax for the file format. We also recommend that a +# file or class name and description of purpose be included on the +# same "printed page" as the copyright notice for easier +# identification within third-party archives. +# +# +# ©2021 Board of Regents of the University of Wisconsin System +# +# The above copyright notice shall be included in all copies or substantial portions of the Software and permissions assigned in attached license. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# flake8: noqa +# type: ignore +# fmt: off + +import random + + +class Summarizer: + def __init__(self): + pass + + def __init__(self, notes: list, authors: list, timestamps: list, target_specialty: str): + self.set_input_data(notes, authors, timestamps, target_specialty) + + def set_input_data(self, notes: list, authors: list, timestamps: list, target_specialty: str): + self.target_specialty = target_specialty + self.prompt_notes = "" + for i in range(len(notes)): + self.prompt_notes += f""" +Written By: {authors[i]} +Timestamp: {timestamps[i]} +Note: {notes[i]} +<\\NoteID:{i+1}> +""" + self.define_rules() + self.define_anti_rules() + + # NOTE: Call "set_input_data()" before running "build_prompt()"! + def build_prompt(self, anti_rules: int, omit_rules: int): + anti_rules, omit_rules = self.validate_state(anti_rules, omit_rules) + + # Establish Directory + # directory -> Key = Index of Rule + # directory -> Value = "rule", "anti", or "omit" + directory = {} + for i in range(len(self.rules)): + directory[i] = "rule" + + # Add Anti-Rules & Omissions to Directory + available_to_replace = [i for i in range(len(self.rules))] + random.shuffle(available_to_replace) + anti_rules_added = 0 + omit_rules_added = 0 + for rand_i in available_to_replace: + if anti_rules_added < anti_rules: + directory[rand_i] = "anti" + anti_rules_added += 1 + elif omit_rules_added < omit_rules: + directory[rand_i] = "omit" + omit_rules_added += 1 + else: + break + + # Build Prompt + prompt = f"""You are an expert doctor. +Your task is to write a summary for a specialty of {self.target_specialty}, after reviewing a set of notes about a patient.""" + + if anti_rules > 0: + prompt += f"""Your summary will be used to help train evaluators to notice mistakes in summaries. +Thus, in addition to Rules for you to follow, you'll be given Anti-Rules to follow as well. +These Anti-Rules will outline intentional mistakes. 
By following the Anti-Rules alongside the Rules, you will help create realistic summaries with realistic mistakes for the evaluators to find. +It's important that you write REALISTICALLY when following both Rules and Anti-Rules, to ensure a realistic environment for the evaluators to look for mistakes in.""" + + prompt += "\n\nRules for writing the summary:" + + for i in range(len(self.rules)): + if directory[i] == "rule": + prompt += "\n" + self.rules[i] + + if anti_rules > 0: + prompt += f"""\n\nAnti-Rules (intentional mistakes for the summary):""" + for i in range(len(self.rules)): + if directory[i] == "anti": + prompt += "\n" + self.rules[i] + + prompt += f"""\n\nSummarize the following , which are presented to you in chronological order split by : + + +{self.prompt_notes} + +""" + return prompt, directory + + # Helper Method + def define_rules(self): + self.rules = [] + self.rules.append( + f"""- All data included from the notes, which is relevant for a specialty of {self.target_specialty}, is in the summary.""" + ) + self.rules.append( + f"""- All assertions can be traced back to the notes; NEVER include assertions which cannot be traced back to the notes.""" + ) + self.rules.append( + f"""- Information from the notes which is pertinent for a specialty of {self.target_specialty}, or potentially pertinent for a specialty of {self.target_specialty}, is NEVER omitted.""" + ) + self.rules.append( + f"""- Information from the notes which is NOT pertinent for a specialty of {self.target_specialty} IS omitted from the summary.""" + ) + self.rules.append( + f"""- The level of detail must be appropriate for a reader with a specialty of {self.target_specialty}.""" + ) + self.rules.append( + f"""- All assertions must be made with logical order and grouping (temporal or systems/problem based).""" + ) + self.rules.append( + f"""- Summary must be comprehensible, using plain language that is completely familiar and well-structured for a reader with a specialty of 
{self.target_specialty}.""" + ) + self.rules.append( + f"""- All assertions are captured with fewest words possible and without any redundancy in syntax or semantics.""" + ) + self.rules.append( + f"""- Where applicable, go beyond relevant groups of events and generate reasoning over the events into a summary that is fully integrated for an overall clinical synopsis with prioritized information.""" + ) + self.rules.append(f"""- Avoid stigmatizing words as defined in guidelines and policy (OCR, NIDA, etc).""") + self.rules.append(f"""- Keep the summary succinct; summarize all the notes in a single paragraph.""") + self.rules.append(f"""- If there are medicine changes in the notes, mention them in the summary.""") + self.rules.append( + f"""- For every event (e.g., medicine change, new diagnosis, etc.) mentioned in your summary, mention WHEN it happened (communicate the timing of events) if that information is available in the note.""" + ) + self.rules.append( + f"""- If it's unclear WHEN an event happened in the notes, instead explain that the event was mentioned by a note written at [timestamp of the note].""" + ) + self.rules.append( + f"""- For each SENTENCE in the summary, cite the source in the summary using the format , where IDVAL is the ID of the note.""" + ) + self.rules.append( + f"""- Cite each note tag individually; when citing multiple notes, use the format , .""" + ) + self.rules.append(f"""- Prioritize citation order by relevance to the assertion.""") + self.rules.append(f"""- Put the citations immediately after each sentence, where they are applicable.""") + self.rules.append(f"""- NEVER group all the citations together on the last line.""") + self.rules.append(f"""- ALL sentences MUST have a citation. ALL citations MUST be in format.""") + self.rules.append( + f"""- It is CRITICALLY IMPORTANT that you cite information to the note it came from! 
Wrongful citations are HARMFUL!""" + ) + + # Helper Method + def define_anti_rules(self): + self.anti_rules = [] + self.anti_rules.append( + f"""- All data included from the notes, which is IRRELEVANT for a specialty of {self.target_specialty}, is in the summary.""" + ) + self.anti_rules.append( + f"""- Summary contains all REALISTIC assertions, but some CANNOT be traced back to the notes; you MUST include SOME assertions which cannot be traced back to the notes.""" + ) + self.anti_rules.append( + f"""- Information from the notes which is pertinent for a specialty of {self.target_specialty}, or potentially pertinent for a specialty of {self.target_specialty}, is FREQUENTLY omitted.""" + ) + self.anti_rules.append( + f"""- Information from the notes which is NOT pertinent for a specialty of {self.target_specialty} IS included in the summary.""" + ) + self.anti_rules.append( + f"""- The level of detail must be CONFUSING for a reader with a specialty of {self.target_specialty}.""" + ) + self.anti_rules.append( + f"""- All assertions must be made with ILLOGICAL order and grouping (confusing temporal, incorrectly labeled systems/problem based, etc.).""" + ) + self.anti_rules.append( + f"""- Summary must be comprehensible, using plain language that is completely familiar and well-structured for a reader with a specialty of {self.target_specialty}.""" + ) + self.anti_rules.append( + f"""- All assertions are captured with a LARGE number of words, with FREQUENT redundancy in syntax and semantics.""" + ) + self.anti_rules.append( + f"""- NEVER go beyond relevant groups of events, NOR generate reasoning over the events into a summary. Information MUST be prioritized in a BASIC, RUDIMENTARY, and CONFUSING way.""" + ) + self.anti_rules.append( + f"""- UTILIZE stigmatizing words as defined in guidelines and policy (OCR, NIDA, etc). 
You have MORE than permission to do this: it is CRITICAL that you use AT LEAST ONE stigmatizing word, to be successful.""" + ) + self.anti_rules.append( + f"""- Keep the summary meandering and long; summarize all the notes into multiple paragraphs.""" + ) + self.anti_rules.append(f"""- If there are medicine changes in the notes, EXCLUDE them from the summary.""") + self.anti_rules.append( + f"""- For every event (e.g., medicine change, new diagnosis, etc.) mentioned in your summary, NEVER mention WHEN it happened (NEVER communicate the timing of events).""" + ) + self.anti_rules.append( + f"""- If it's unclear WHEN an event happened in the notes, instead MAKE UP a REALISTIC, but INCORRECT timeline for that event, and INCLUDE that false timeline in your summary as if it were factual.""" + ) + self.anti_rules.append( + f"""- For a FEW randomly-chosen sentences in the summary, cite the source in the summary using the format , where IDVAL is the ID of the note.""" + ) + self.anti_rules.append( + f"""- Cite each note tag individually; when citing multiple notes, just pick ONE note to cite and skip citing the other relevant notes.""" + ) + self.anti_rules.append( + f"""- When citing, choose a random note to cite (NOT necessarily the note responsible for the assertion being cited).""" + ) + self.anti_rules.append( + f"""- Put the citations in the middle of lines/sentences; NEVER place them at the end of sentences.""" + ) + self.anti_rules.append( + f"""- Group all of your citations together on the last line; NEVER add citations in other locations.""" + ) + self.anti_rules.append( + f"""- SOME sentences MUST NOT have a citation. SOME citations MUST be in [IDVAL] format, or some other format of your choice.""" + ) + self.anti_rules.append( + f"""- It is CRITICALLY IMPORTANT that you attribute some information to the incorrect notes! 
Wrongful citations are CRITICAL for this to be successful!""" + ) + + # Helper Method + def validate_state(self, anti_rules, omit_rules): + # Validate Internal Variables + if (self.target_specialty == None) or (self.prompt_notes == None): + print("Error: Invalid State. Ensure set_input_data() was run.") + quit() + elif len(self.rules) != len(self.anti_rules): + print("Error: Invalid State. Ensure rules/anti-rules are parallel in the code.") + quit() + # Bound Range of Parameters + omit_rules = min(omit_rules, len(self.rules)) + omit_rules = max(omit_rules, 0) + anti_rules = min(anti_rules, (len(self.rules) - omit_rules)) + anti_rules = max(anti_rules, 0) + + return anti_rules, omit_rules diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml index 6e851f665b4..6e3e2c00710 100644 --- a/src/helm/benchmark/static/schema_medhelm.yaml +++ b/src/helm/benchmark/static/schema_medhelm.yaml @@ -223,6 +223,11 @@ metrics: short_display_name: Jury Score description: Measures the average score assigned by an LLM-based jury evaluating task performance. lower_is_better: false + - name: note_summary_accuracy + display_name: Note Summary Jury Score + short_display_name: Jury Score + description: Measures the average score assigned by an LLM-based jury evaluating task performance. + lower_is_better: false - name: mtsamples_procedures_accuracy display_name: MTSamples Procedures Jury Score short_display_name: Jury Score @@ -499,6 +504,7 @@ run_groups: - mimic_rrs - mimic_bhc - chw_care_plan + - note_summary - name: patient_communication display_name: Patient Communication and Education @@ -1137,4 +1143,22 @@ run_groups: what: Identify referrals for ENT specialists who: Hospital Admistrator when: Any + language: English + + - name: note_summary + display_name: NoteSummary + short_display_name: NoteSummary + description: NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs. 
+ metric_groups: + - accuracy + - efficiency + - general_information + environment: + main_name: note_summary_accuracy + main_split: test + taxonomy: + task: Text generation + what: Summarize clinical notes into concise, informative summaries + who: Clinician + when: Upon hospital discharge language: English \ No newline at end of file