diff --git a/setup.cfg b/setup.cfg index 79922f4be1b..f4c216d7e8c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -309,6 +309,7 @@ medhelm = openpyxl~=3.1 python-docx~=1.1 transformers~=4.45,<4.50 + evaluation-instruments @ git+https://github.com/epic-open-source/evaluation-instruments.git@1c4637e84fe4dc54f6695e438f3baca6b2cd4573 audiolm = crfm-helm[openai] diff --git a/src/helm/benchmark/annotation/model_as_judge.py b/src/helm/benchmark/annotation/model_as_judge.py index 64a9e0b3fa1..a3c22ee489a 100644 --- a/src/helm/benchmark/annotation/model_as_judge.py +++ b/src/helm/benchmark/annotation/model_as_judge.py @@ -220,7 +220,7 @@ def annotate(self, request_state: RequestState) -> Dict[str, Any]: annotations: Dict[str, Union[Optional[str], Optional[bool], Dict[str, Any]]] = {"prompt_text": annotator_prompt} # Track failed annotations for each model - failed_counts: Dict[str, int] = {name: 0 for name in self._annotator_models} + failed_annotators: Set[str] = set() # Annotate using multiple models for annotator_name, annotator_model_info in self._annotator_models.items(): @@ -230,13 +230,15 @@ def annotate(self, request_state: RequestState) -> Dict[str, Any]: if annotator_criteria is not None: annotations[annotator_name] = annotator_criteria else: - failed_counts[annotator_name] += 1 + failed_annotators.add(annotator_name) except Exception as e: hlog(f"ERROR annotating with {annotator_name}: {e}") - failed_counts[annotator_name] += 1 + failed_annotators.add(annotator_name) - hlog(f"Failed model annotations: {failed_counts}") + total_failed = len(failed_annotators) + if total_failed != 0: + hlog(f"Some model annotations failed: {failed_annotators}") return annotations def _annotate_with_model( diff --git a/src/helm/benchmark/annotation/note_summary_annotator.py b/src/helm/benchmark/annotation/note_summary_annotator.py new file mode 100644 index 00000000000..d2d5ce9d58e --- /dev/null +++ b/src/helm/benchmark/annotation/note_summary_annotator.py @@ -0,0 +1,51 @@ +import 
class NoteSummaryAnnotator(LLMAsJuryAnnotator):
    """The NoteSummary autograder.

    Scores model-generated clinical note summaries with a jury of LLM judges.
    The judging prompt is built per request by the PDSQI-9 instrument in
    `_interpolate_prompt`, so no static prompt template or criteria are used.
    """

    name = "note_summary"

    def __init__(
        self,
        auto_client: AutoClient,
        annotator_models: Dict[str, AnnotatorModelInfo],
        template_name: Optional[str] = None,
        target_specialty: str = "emergency medicine",
    ):
        """
        :param auto_client: Client used to issue judge requests
        :param annotator_models: Judge models keyed by short name
        :param template_name: Unused; accepted for interface compatibility
        :param target_specialty: Specialty the summary is evaluated for
            (previously hard-coded to "emergency medicine")
        """
        # Empty template/criteria: the prompt is produced dynamically by
        # pdsqi.resolve_prompt in the override below.
        super().__init__(
            auto_client=auto_client,
            prompt_template="",
            annotation_criteria={},
            annotator_models=annotator_models,
        )
        self._target_specialty = target_specialty

    def _interpolate_prompt(
        self, request_state: RequestState, custom_replacements: Optional[Dict[str, str]] = None
    ) -> str:
        """
        Build the PDSQI-9 judging prompt for the completion in `request_state`.

        :param request_state: The current request state
        :param custom_replacements: Unused; kept for interface compatibility
        :return: Content of the second resolved prompt message
        """
        notes = (request_state.instance.extra_data or {}).get("notes", [])
        # Fall back to an empty summary when the model produced no completions.
        summary_text = (
            request_state.result.completions[0].text
            if request_state.result and request_state.result.completions
            else ""
        )
        prompt = pdsqi.resolve_prompt(
            summary_to_evaluate=summary_text,
            notes=notes,
            target_specialty=self._target_specialty,
            output_mode=prep.OutputMode.EXPLAINED_SCORE,
        )
        # resolve_prompt returns a sequence of chat-style messages; we return the
        # second message's content (presumably the user turn — TODO confirm
        # against the pdsqi_prompt module).
        return prompt[1]["content"]
@dataclass
class RubricItem:
    """Configuration for a single rubric criterion."""

    name: str
    min: float
    max: float
    weight: float
    higher_is_better: bool


@dataclass
class Rubric:
    """A set of rubric items used to normalize and aggregate judge scores."""

    items: Dict[str, RubricItem]

    @classmethod
    def from_config(cls, rubric_config: Dict[str, Any]) -> "Rubric":
        """Build a Rubric from a mapping of item name -> attribute dict
        (keys: min, max, weight, higher_is_better)."""
        items = {
            name: RubricItem(
                name=name,
                min=attrs["min"],
                max=attrs["max"],
                weight=attrs["weight"],
                higher_is_better=attrs["higher_is_better"],
            )
            for name, attrs in rubric_config.items()
        }
        return cls(items)

    def normalize(self, name: str, score: float) -> float:
        """Normalize the score to [0, 1] according to the rubric item config.

        For lower-is-better items the scale is inverted so that 1.0 is
        always the best outcome.
        """
        item = self.items[name]
        raw = (score - item.min) / (item.max - item.min)
        return raw if item.higher_is_better else 1 - raw

    def aggregate(self, scores: Dict[str, float]) -> float:
        """Weighted aggregation of normalized scores.

        Non-numeric scores are skipped; their weight is redistributed evenly
        across the remaining valid items so the total weight is preserved.

        :param scores: Raw scores keyed by rubric item name
        :return: Weighted sum of normalized scores (0.0 if no valid scores)
        """
        invalid_scores = [name for name, value in scores.items() if not isinstance(value, (int, float))]
        n_valid_scores = len(scores) - len(invalid_scores)
        # Guard: previously this divided by zero when every score was invalid.
        if n_valid_scores == 0:
            if invalid_scores:
                hwarn(f"No valid scores; all entries invalid: {invalid_scores}. Returning 0.0.")
            return 0.0
        weight_offset = 0.0
        if invalid_scores:
            weight_offset = sum(self.items[name].weight for name in invalid_scores) / n_valid_scores
            hwarn(
                f"Invalid scores found for {invalid_scores}. "
                f"Using average weight offset of {weight_offset} to adjust the total score."
            )
        total = 0.0
        for name, score in scores.items():
            if not isinstance(score, (int, float)):
                hwarn(f"Skipping non-numeric score for {name}: {score}")
                continue
            total += self.normalize(name, score) * (self.items[name].weight + weight_offset)
        return total


class LLMJuryMetric(Metric):
    """Score metrics for LLM Jury.

    Averages judge scores across the configured annotator models, either raw
    (no rubric) or normalized/weighted via a Rubric.
    """

    def __init__(
        self,
        metric_name: str,
        scenario_name: str,
        annotator_models: Dict[str, AnnotatorModelInfo],
        default_score: float = 0.0,
        rubric: Optional[Rubric] = None,
    ):
        self.metric_name = metric_name
        self.scenario_name = scenario_name
        self.annotator_models = annotator_models
        # Score reported when no annotations are usable.
        self.default_score = default_score
        self.rubric = rubric

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """Compute the jury score for one instance.

        With a rubric, each annotator's scores are normalized and weighted by
        `Rubric.aggregate`, then averaged across annotators. Without a rubric,
        all raw criterion scores from all annotators are averaged.
        """
        assert request_state.annotations
        if self.rubric:
            hlog(f"Using rubric for {self.scenario_name} with items: {list(self.rubric.items.keys())}")
        else:
            hlog(f"No rubric defined for {self.scenario_name}, using raw scores.")
        annotations: Dict[str, Any] = request_state.annotations[self.scenario_name]
        collected_scores: List[float] = []
        score = self.default_score
        for annotation_key, annotation_dict in annotations.items():
            if annotation_key in self.annotator_models.keys() and annotation_dict is not None:
                if self.rubric:
                    scores_dict = {
                        item: annotation_dict[item]["score"]
                        for item in self.rubric.items.keys()
                        if item in annotation_dict
                    }
                    # Bug fix: previously `score` was overwritten per annotator,
                    # so only the last judge's aggregate survived. Collect each
                    # judge's aggregate and average below, matching the raw path.
                    collected_scores.append(self.rubric.aggregate(scores_dict))
                else:
                    # Fallback to using the raw score
                    for val in annotation_dict.values():
                        collected_scores.append(int(val["score"]))
        if collected_scores:
            score = sum(collected_scores) / len(collected_scores)
        return [
            Stat(MetricName(self.metric_name)).add(score),
        ]
@run_spec_function("note_summary")
def get_note_summary_spec(config_path: Optional[str] = None) -> RunSpec:
    """Build the RunSpec for the note_summary scenario.

    :param config_path: Path to a YAML config providing `data_path`, `rubric`,
        and `judges`. Defaults to the packaged note_summary_scenario.yaml.
    :return: The assembled RunSpec
    :raises FileNotFoundError: If the config path does not exist
    """
    if config_path is None:
        package = "helm.benchmark.scenarios"
        config_path = str(pkg_resources.files(package).joinpath("note_summary_scenario.yaml"))

    # Raise explicitly instead of asserting: asserts are stripped under `python -O`.
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Config path not found: {config_path}.")

    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.note_summary_scenario.NoteSummaryScenario",
        args={
            "data_path": config["data_path"],
        },
    )

    adapter_spec = get_generation_adapter_spec(
        instructions="",
        input_noun=None,
        newline_after_input_noun=False,
        output_noun=None,
        max_tokens=500,
        stop_sequences=[],
        max_train_instances=0,
    )

    # One AnnotatorModelInfo per judge declared in the config.
    annotator_models = {
        judge["name"]: AnnotatorModelInfo(
            model_name=judge["model"],
            model_deployment=judge["model_deployment"],
        )
        for judge in config["judges"]
    }

    annotator_specs = [
        AnnotatorSpec(
            class_name="helm.benchmark.annotation.note_summary_annotator.NoteSummaryAnnotator",
            args={"annotator_models": annotator_models},
        )
    ]

    # Jury metric aggregates judge scores using the rubric from the config.
    metric_specs = get_basic_metric_specs([]) + [
        MetricSpec(
            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
            args={
                "metric_name": "note_summary_accuracy",
                "scenario_name": "note_summary",
                "annotator_models": annotator_models,
                "rubric": Rubric.from_config(config["rubric"]),
            },
        )
    ]
    return RunSpec(
        name="note_summary",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["note_summary"],
    )
class NoteSummaryScenario(Scenario):
    """
    NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs.
    In this scenario, we only consider the discharge text as well as the radiology report text.
    We are using the phase I test set which is composed of 14,702 hospital admission instances.

    @inproceedings{Xu_2024,
    title={ Discharge me: Bionlp acl'24 shared task on streamlining discharge documentation.},
    url={https://doi.org/10.13026/4a0k-4360},
    DOI={10.13026/27pt-1259},
    booktitle={ Proceedings of the 23rd Workshop on Biomedical Natural Language Processing (BioNLP) at ACL 2024},
    publisher={Association for Computational Linguistics},
    author={Xu, Justin and Delbrouck, Jean-Benoit and Johnston, Andrew and Blankemeier, Louis and Langlotz, Curtis},
    year={2024}
    }
    """

    name = "note_summary"
    description = "NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs."
    tags = ["biomedical"]

    def __init__(self, data_path: str):
        super().__init__()
        # Directory containing the downloaded physionet.org files.
        self.data_path = data_path

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one Instance per hospital admission (hadm_id).

        Each instance's input is the Summarizer-built prompt over all of the
        admission's notes; the raw notes are kept in `extra_data` for the
        annotator to reference at judging time.
        """
        instances: List[Instance] = []
        df = file_preprocessing(self.data_path)
        # Group once instead of re-filtering the full frame for every unique
        # hadm_id (was O(n * admissions)); sort=False preserves the original
        # first-appearance order that `unique()` produced.
        for _, df_admission in df.groupby("hadm_id", sort=False):
            notes = df_admission["text"].tolist()
            summarizer = Summarizer(
                notes=notes,
                authors=df_admission["note_type"].tolist(),
                timestamps=df_admission["charttime"].tolist(),
                target_specialty="emergency medicine",
            )
            # anti_rules=0 / omit_rules=0: a clean prompt with no intentional mistakes.
            prompt_di, _ = summarizer.build_prompt(anti_rules=0, omit_rules=0)
            instances.append(
                Instance(
                    input=Input(text=prompt_di),
                    references=[],
                    split=TEST_SPLIT,
                    extra_data={"notes": notes},
                )
            )

        return instances

    def read_file(self, file_path: str) -> List[str]:
        """Return the stripped lines of a text file."""
        with open(file_path, "r") as file:
            lines = file.readlines()
        lines = [line.strip() for line in lines]
        return lines
/share/pi/nigam/data/physionet.org + +# The rubric criteria for evaluating the note summary scenario +rubric: + citation: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + accurate: + min: 1 + max: 5 + weight: 0.2 + higher_is_better: true + thorough: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + useful: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + organized: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + comprehensible: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + succinct: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + abstraction: + min: 0 + max: 1 + weight: 0.05 + higher_is_better: true + synthesized: + min: 1 + max: 5 + weight: 0.1 + higher_is_better: true + voice_summ: + min: 0 + max: 1 + weight: 0.025 + higher_is_better: false + voice_note: + min: 0 + max: 1 + weight: 0.025 + higher_is_better: false + +# The judges to be used for evaluating the note summary scenario. +# name: The short name for the judge. +# model: The field value matching the 'model_name' field under model_deployments.yaml +# model_deployment: The field value matching the 'name' under model_deployments.yaml. +judges: + - name: "gpt_4o" + model: "openai/gpt-4o-2024-05-13" + model_deployment: "stanfordhealthcare/gpt-4o-2024-05-13" diff --git a/src/helm/benchmark/scenarios/note_summary_scenario_helper.py b/src/helm/benchmark/scenarios/note_summary_scenario_helper.py new file mode 100644 index 00000000000..84e2f8d954b --- /dev/null +++ b/src/helm/benchmark/scenarios/note_summary_scenario_helper.py @@ -0,0 +1,426 @@ +# The following code is copied verbatim from: +# https://pages.doit.wisc.edu/smph-public/dom/uw-icu-data-science-lab-public/pdsqi-9/-/blob/main/05_Summary_Generation/summarize_prompt.py?ref_type=heads +# under the following license: +# +# Apache License +# Version 2.0, January 2004 +# http://www.apache.org/licenses/ +# +# TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +# +# 1. Definitions. 
+# +# "License" shall mean the terms and conditions for use, reproduction, +# and distribution as defined by Sections 1 through 9 of this document. +# +# "Licensor" shall mean the copyright owner or entity authorized by +# the copyright owner that is granting the License. +# +# "Legal Entity" shall mean the union of the acting entity and all +# other entities that control, are controlled by, or are under common +# control with that entity. For the purposes of this definition, +# "control" means (i) the power, direct or indirect, to cause the +# direction or management of such entity, whether by contract or +# otherwise, or (ii) ownership of fifty percent (50%) or more of the +# outstanding shares, or (iii) beneficial ownership of such entity. +# +# "You" (or "Your") shall mean an individual or Legal Entity +# exercising permissions granted by this License. +# +# "Source" form shall mean the preferred form for making modifications, +# including but not limited to software source code, documentation +# source, and configuration files. +# +# "Object" form shall mean any form resulting from mechanical +# transformation or translation of a Source form, including but +# not limited to compiled object code, generated documentation, +# and conversions to other media types. +# +# "Work" shall mean the work of authorship, whether in Source or +# Object form, made available under the License, as indicated by a +# copyright notice that is included in or attached to the work +# (an example is provided in the Appendix below). +# +# "Derivative Works" shall mean any work, whether in Source or Object +# form, that is based on (or derived from) the Work and for which the +# editorial revisions, annotations, elaborations, or other modifications +# represent, as a whole, an original work of authorship. 
For the purposes +# of this License, Derivative Works shall not include works that remain +# separable from, or merely link (or bind by name) to the interfaces of, +# the Work and Derivative Works thereof. +# +# "Contribution" shall mean any work of authorship, including +# the original version of the Work and any modifications or additions +# to that Work or Derivative Works thereof, that is intentionally +# submitted to Licensor for inclusion in the Work by the copyright owner +# or by an individual or Legal Entity authorized to submit on behalf of +# the copyright owner. For the purposes of this definition, "submitted" +# means any form of electronic, verbal, or written communication sent +# to the Licensor or its representatives, including but not limited to +# communication on electronic mailing lists, source code control systems, +# and issue tracking systems that are managed by, or on behalf of, the +# Licensor for the purpose of discussing and improving the Work, but +# excluding communication that is conspicuously marked or otherwise +# designated in writing by the copyright owner as "Not a Contribution." +# +# "Contributor" shall mean Licensor and any individual or Legal Entity +# on behalf of whom a Contribution has been received by Licensor and +# subsequently incorporated within the Work. +# +# 2. Grant of Copyright License. Subject to the terms and conditions of +# this License, each Contributor hereby grants to You a perpetual, +# worldwide, non-exclusive, no-charge, royalty-free, irrevocable +# copyright license to reproduce, prepare Derivative Works of, +# publicly display, publicly perform, sublicense, and distribute the +# Work and such Derivative Works in Source or Object form. +# +# 3. Grant of Patent License. 
Subject to the terms and conditions of +# this License, each Contributor hereby grants to You a perpetual, +# worldwide, non-exclusive, no-charge, royalty-free, irrevocable +# (except as stated in this section) patent license to make, have made, +# use, offer to sell, sell, import, and otherwise transfer the Work, +# where such license applies only to those patent claims licensable +# by such Contributor that are necessarily infringed by their +# Contribution(s) alone or by combination of their Contribution(s) +# with the Work to which such Contribution(s) was submitted. If You +# institute patent litigation against any entity (including a +# cross-claim or counterclaim in a lawsuit) alleging that the Work +# or a Contribution incorporated within the Work constitutes direct +# or contributory patent infringement, then any patent licenses +# granted to You under this License for that Work shall terminate +# as of the date such litigation is filed. +# +# 4. Redistribution. You may reproduce and distribute copies of the +# Work or Derivative Works thereof in any medium, with or without +# modifications, and in Source or Object form, provided that You +# meet the following conditions: +# +# (a) You must give any other recipients of the Work or +# Derivative Works a copy of this License; and +# +# (b) You must cause any modified files to carry prominent notices +# stating that You changed the files; and +# +# (c) You must retain, in the Source form of any Derivative Works +# that You distribute, all copyright, patent, trademark, and +# attribution notices from the Source form of the Work, +# excluding those notices that do not pertain to any part of +# the Derivative Works; and +# +# (d) If the Work includes a "NOTICE" text file as part of its +# distribution, then any Derivative Works that You distribute must +# include a readable copy of the attribution notices contained +# within such NOTICE file, excluding those notices that do not +# pertain to any part of the 
Derivative Works, in at least one +# of the following places: within a NOTICE text file distributed +# as part of the Derivative Works; within the Source form or +# documentation, if provided along with the Derivative Works; or, +# within a display generated by the Derivative Works, if and +# wherever such third-party notices normally appear. The contents +# of the NOTICE file are for informational purposes only and +# do not modify the License. You may add Your own attribution +# notices within Derivative Works that You distribute, alongside +# or as an addendum to the NOTICE text from the Work, provided +# that such additional attribution notices cannot be construed +# as modifying the License. +# +# You may add Your own copyright statement to Your modifications and +# may provide additional or different license terms and conditions +# for use, reproduction, or distribution of Your modifications, or +# for any such Derivative Works as a whole, provided Your use, +# reproduction, and distribution of the Work otherwise complies with +# the conditions stated in this License. +# +# 5. Submission of Contributions. Unless You explicitly state otherwise, +# any Contribution intentionally submitted for inclusion in the Work +# by You to the Licensor shall be under the terms and conditions of +# this License, without any additional terms or conditions. +# Notwithstanding the above, nothing herein shall supersede or modify +# the terms of any separate license agreement you may have executed +# with Licensor regarding such Contributions. +# +# 6. Trademarks. This License does not grant permission to use the trade +# names, trademarks, service marks, or product names of the Licensor, +# except as required for reasonable and customary use in describing the +# origin of the Work and reproducing the content of the NOTICE file. +# +# 7. Disclaimer of Warranty. 
Unless required by applicable law or +# agreed to in writing, Licensor provides the Work (and each +# Contributor provides its Contributions) on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied, including, without limitation, any warranties or conditions +# of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +# PARTICULAR PURPOSE. You are solely responsible for determining the +# appropriateness of using or redistributing the Work and assume any +# risks associated with Your exercise of permissions under this License. +# +# 8. Limitation of Liability. In no event and under no legal theory, +# whether in tort (including negligence), contract, or otherwise, +# unless required by applicable law (such as deliberate and grossly +# negligent acts) or agreed to in writing, shall any Contributor be +# liable to You for damages, including any direct, indirect, special, +# incidental, or consequential damages of any character arising as a +# result of this License or out of the use or inability to use the +# Work (including but not limited to damages for loss of goodwill, +# work stoppage, computer failure or malfunction, or any and all +# other commercial damages or losses), even if such Contributor +# has been advised of the possibility of such damages. +# +# 9. Accepting Warranty or Additional Liability. While redistributing +# the Work or Derivative Works thereof, You may choose to offer, +# and charge a fee for, acceptance of support, warranty, indemnity, +# or other liability obligations and/or rights consistent with this +# License. 
However, in accepting such obligations, You may act only +# on Your own behalf and on Your sole responsibility, not on behalf +# of any other Contributor, and only if You agree to indemnify, +# defend, and hold each Contributor harmless for any liability +# incurred by, or claims asserted against, such Contributor by reason +# of your accepting any such warranty or additional liability. +# +# END OF TERMS AND CONDITIONS +# +# APPENDIX: How to apply the Apache License to your work. +# +# To apply the Apache License to your work, attach the following +# boilerplate notice, with the fields enclosed by brackets "[]" +# replaced with your own identifying information. (Don't include +# the brackets!) The text should be enclosed in the appropriate +# comment syntax for the file format. We also recommend that a +# file or class name and description of purpose be included on the +# same "printed page" as the copyright notice for easier +# identification within third-party archives. +# +# +# ©2021 Board of Regents of the University of Wisconsin System +# +# The above copyright notice shall be included in all copies or substantial portions of the Software and permissions assigned in attached license. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# flake8: noqa +# type: ignore +# fmt: off + +import random + + +class Summarizer: + def __init__(self): + pass + + def __init__(self, notes: list, authors: list, timestamps: list, target_specialty: str): + self.set_input_data(notes, authors, timestamps, target_specialty) + + def set_input_data(self, notes: list, authors: list, timestamps: list, target_specialty: str): + self.target_specialty = target_specialty + self.prompt_notes = "" + for i in range(len(notes)): + self.prompt_notes += f""" +Written By: {authors[i]} +Timestamp: {timestamps[i]} +Note: {notes[i]} +<\\NoteID:{i+1}> +""" + self.define_rules() + self.define_anti_rules() + + # NOTE: Call "set_input_data()" before running "build_prompt()"! + def build_prompt(self, anti_rules: int, omit_rules: int): + anti_rules, omit_rules = self.validate_state(anti_rules, omit_rules) + + # Establish Directory + # directory -> Key = Index of Rule + # directory -> Value = "rule", "anti", or "omit" + directory = {} + for i in range(len(self.rules)): + directory[i] = "rule" + + # Add Anti-Rules & Omissions to Directory + available_to_replace = [i for i in range(len(self.rules))] + random.shuffle(available_to_replace) + anti_rules_added = 0 + omit_rules_added = 0 + for rand_i in available_to_replace: + if anti_rules_added < anti_rules: + directory[rand_i] = "anti" + anti_rules_added += 1 + elif omit_rules_added < omit_rules: + directory[rand_i] = "omit" + omit_rules_added += 1 + else: + break + + # Build Prompt + prompt = f"""You are an expert doctor. +Your task is to write a summary for a specialty of {self.target_specialty}, after reviewing a set of notes about a patient.""" + + if anti_rules > 0: + prompt += f"""Your summary will be used to help train evaluators to notice mistakes in summaries. +Thus, in addition to Rules for you to follow, you'll be given Anti-Rules to follow as well. +These Anti-Rules will outline intentional mistakes. 
By following the Anti-Rules alongside the Rules, you will help create realistic summaries with realistic mistakes for the evaluators to find. +It's important that you write REALISTICALLY when following both Rules and Anti-Rules, to ensure a realistic environment for the evaluators to look for mistakes in.""" + + prompt += "\n\nRules for writing the summary:" + + for i in range(len(self.rules)): + if directory[i] == "rule": + prompt += "\n" + self.rules[i] + + if anti_rules > 0: + prompt += f"""\n\nAnti-Rules (intentional mistakes for the summary):""" + for i in range(len(self.rules)): + if directory[i] == "anti": + prompt += "\n" + self.rules[i] + + prompt += f"""\n\nSummarize the following , which are presented to you in chronological order split by : + + +{self.prompt_notes} + +""" + return prompt, directory + + # Helper Method + def define_rules(self): + self.rules = [] + self.rules.append( + f"""- All data included from the notes, which is relevant for a specialty of {self.target_specialty}, is in the summary.""" + ) + self.rules.append( + f"""- All assertions can be traced back to the notes; NEVER include assertions which cannot be traced back to the notes.""" + ) + self.rules.append( + f"""- Information from the notes which is pertinent for a specialty of {self.target_specialty}, or potentially pertinent for a specialty of {self.target_specialty}, is NEVER omitted.""" + ) + self.rules.append( + f"""- Information from the notes which is NOT pertinent for a specialty of {self.target_specialty} IS omitted from the summary.""" + ) + self.rules.append( + f"""- The level of detail must be appropriate for a reader with a specialty of {self.target_specialty}.""" + ) + self.rules.append( + f"""- All assertions must be made with logical order and grouping (temporal or systems/problem based).""" + ) + self.rules.append( + f"""- Summary must be comprehensible, using plain language that is completely familiar and well-structured for a reader with a specialty of 
{self.target_specialty}.""" + ) + self.rules.append( + f"""- All assertions are captured with fewest words possible and without any redundancy in syntax or semantics.""" + ) + self.rules.append( + f"""- Where applicable, go beyond relevant groups of events and generate reasoning over the events into a summary that is fully integrated for an overall clinical synopsis with prioritized information.""" + ) + self.rules.append(f"""- Avoid stigmatizing words as defined in guidelines and policy (OCR, NIDA, etc).""") + self.rules.append(f"""- Keep the summary succinct; summarize all the notes in a single paragraph.""") + self.rules.append(f"""- If there are medicine changes in the notes, mention them in the summary.""") + self.rules.append( + f"""- For every event (e.g., medicine change, new diagnosis, etc.) mentioned in your summary, mention WHEN it happened (communicate the timing of events) if that information is available in the note.""" + ) + self.rules.append( + f"""- If it's unclear WHEN an event happened in the notes, instead explain that the event was mentioned by a note written at [timestamp of the note].""" + ) + self.rules.append( + f"""- For each SENTENCE in the summary, cite the source in the summary using the format , where IDVAL is the ID of the note.""" + ) + self.rules.append( + f"""- Cite each note tag individually; when citing multiple notes, use the format , .""" + ) + self.rules.append(f"""- Prioritize citation order by relevance to the assertion.""") + self.rules.append(f"""- Put the citations immediately after each sentence, where they are applicable.""") + self.rules.append(f"""- NEVER group all the citations together on the last line.""") + self.rules.append(f"""- ALL sentences MUST have a citation. ALL citations MUST be in format.""") + self.rules.append( + f"""- It is CRITICALLY IMPORTANT that you cite information to the note it came from! 
Wrongful citations are HARMFUL!""" + ) + + # Helper Method + def define_anti_rules(self): + self.anti_rules = [] + self.anti_rules.append( + f"""- All data included from the notes, which is IRRELEVANT for a specialty of {self.target_specialty}, is in the summary.""" + ) + self.anti_rules.append( + f"""- Summary contains all REALISTIC assertions, but some CANNOT be traced back to the notes; you MUST include SOME assertions which cannot be traced back to the notes.""" + ) + self.anti_rules.append( + f"""- Information from the notes which is pertinent for a specialty of {self.target_specialty}, or potentially pertinent for a specialty of {self.target_specialty}, is FREQUENTLY omitted.""" + ) + self.anti_rules.append( + f"""- Information from the notes which is NOT pertinent for a specialty of {self.target_specialty} IS included in the summary.""" + ) + self.anti_rules.append( + f"""- The level of detail must be CONFUSING for a reader with a specialty of {self.target_specialty}.""" + ) + self.anti_rules.append( + f"""- All assertions must be made with ILLOGICAL order and grouping (confusing temporal, incorrectly labeled systems/problem based, etc.).""" + ) + self.anti_rules.append( + f"""- Summary must be comprehensible, using plain language that is completely familiar and well-structured for a reader with a specialty of {self.target_specialty}.""" + ) + self.anti_rules.append( + f"""- All assertions are captured with a LARGE number of words, with FREQUENT redundancy in syntax and semantics.""" + ) + self.anti_rules.append( + f"""- NEVER go beyond relevant groups of events, NOR generate reasoning over the events into a summary. Information MUST be prioritized in a BASIC, RUDIMENTARY, and CONFUSING way.""" + ) + self.anti_rules.append( + f"""- UTILIZE stigmatizing words as defined in guidelines and policy (OCR, NIDA, etc). 
You have MORE than permission to do this: it is CRITICAL that you use AT LEAST ONE stigmatizing word, to be successful.""" + ) + self.anti_rules.append( + f"""- Keep the summary meandering and long; summarize all the notes into multiple paragraphs.""" + ) + self.anti_rules.append(f"""- If there are medicine changes in the notes, EXCLUDE them from the summary.""") + self.anti_rules.append( + f"""- For every event (e.g., medicine change, new diagnosis, etc.) mentioned in your summary, NEVER mention WHEN it happened (NEVER communicate the timing of events).""" + ) + self.anti_rules.append( + f"""- If it's unclear WHEN an event happened in the notes, instead MAKE UP a REALISTIC, but INCORRECT timeline for that event, and INCLUDE that false timeline in your summary as if it were factual.""" + ) + self.anti_rules.append( + f"""- For a FEW randomly-chosen sentences in the summary, cite the source in the summary using the format , where IDVAL is the ID of the note.""" + ) + self.anti_rules.append( + f"""- Cite each note tag individually; when citing multiple notes, just pick ONE note to cite and skip citing the other relevant notes.""" + ) + self.anti_rules.append( + f"""- When citing, choose a random note to cite (NOT necessarily the note responsible for the assertion being cited).""" + ) + self.anti_rules.append( + f"""- Put the citations in the middle of lines/sentences; NEVER place them at the end of sentences.""" + ) + self.anti_rules.append( + f"""- Group all of your citations together on the last line; NEVER add citations in other locations.""" + ) + self.anti_rules.append( + f"""- SOME sentences MUST NOT have a citation. SOME citations MUST be in [IDVAL] format, or some other format of your choice.""" + ) + self.anti_rules.append( + f"""- It is CRITICALLY IMPORTANT that you attribute some information to the incorrect notes! 
Wrongful citations are CRITICAL for this to be successful!""" + ) + + # Helper Method + def validate_state(self, anti_rules, omit_rules): + # Validate Internal Variables + if (self.target_specialty == None) or (self.prompt_notes == None): + print("Error: Invalid State. Ensure set_input_data() was run.") + quit() + elif len(self.rules) != len(self.anti_rules): + print("Error: Invalid State. Ensure rules/anti-rules are parallel in the code.") + quit() + # Bound Range of Parameters + omit_rules = min(omit_rules, len(self.rules)) + omit_rules = max(omit_rules, 0) + anti_rules = min(anti_rules, (len(self.rules) - omit_rules)) + anti_rules = max(anti_rules, 0) + + return anti_rules, omit_rules diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml index 6e851f665b4..6e3e2c00710 100644 --- a/src/helm/benchmark/static/schema_medhelm.yaml +++ b/src/helm/benchmark/static/schema_medhelm.yaml @@ -223,6 +223,11 @@ metrics: short_display_name: Jury Score description: Measures the average score assigned by an LLM-based jury evaluating task performance. lower_is_better: false + - name: note_summary_accuracy + display_name: Note Summary Jury Score + short_display_name: Jury Score + description: Measures the average score assigned by an LLM-based jury evaluating task performance. + lower_is_better: false - name: mtsamples_procedures_accuracy display_name: MTSamples Procedures Jury Score short_display_name: Jury Score @@ -499,6 +504,7 @@ run_groups: - mimic_rrs - mimic_bhc - chw_care_plan + - note_summary - name: patient_communication display_name: Patient Communication and Education @@ -1137,4 +1143,22 @@ run_groups: what: Identify referrals for ENT specialists who: Hospital Admistrator when: Any + language: English + + - name: note_summary + display_name: NoteSummary + short_display_name: NoteSummary + description: NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs. 
+ metric_groups: + - accuracy + - efficiency + - general_information + environment: + main_name: note_summary_accuracy + main_split: test + taxonomy: + task: Text generation + what: Summarize clinical notes into concise, informative summaries + who: Clinician + when: Upon hospital discharge language: English \ No newline at end of file