Medhelm Epic #3787
base: main
Changes from all commits
edc2b9b
ac4884e
08f1b4e
7229ad7
22f526d
a25d578
ae6ad94
7b57073
662b0ee
c9e8498
7ab8718
07ccbb5
94a3fe7
b7cf00b
d5e47e6
c0f7c35
bf8bda0
c9d3bf9
49e0b51
5f39a8e
679d4dc
fab6100
29607b7
@@ -0,0 +1,51 @@
import evaluation_instruments.instruments.pdsqi_9.pdsqi_prompt as pdsqi

from typing import Dict, Optional

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
from helm.clients.auto_client import AutoClient

from evaluation_instruments import prep
Collaborator
Display an error if this is not installed:

from helm.common.optional_dependencies import OptionalDependencyNotInstalled

try:
    from evaluation_instruments import prep
    import evaluation_instruments.instruments.pdsqi_9.pdsqi_prompt as pdsqi
except ModuleNotFoundError as e:
    # Provide manual instructions for installing evaluation-instruments from GitHub
    # because PyPI does not allow installing dependencies directly from GitHub.
    raise OptionalDependencyNotInstalled(
        f"Optional dependency {e.name} is not installed. "
        "Please run `pip install 'evaluation-instruments @ git+https://github.com/epic-open-source/evaluation-instruments.git@1c4637e84fe4dc54f6695e438f3baca6b2cd4573'` to install it."
    ) from e  # noqa: E501

class NoteSummaryAnnotator(LLMAsJuryAnnotator):
    """The NoteSummary autograder."""

    name = "note_summary"

    def __init__(
        self,
        auto_client: AutoClient,
        annotator_models: Dict[str, AnnotatorModelInfo],
        template_name: Optional[str] = None,
    ):
        super().__init__(
            auto_client=auto_client,
            prompt_template="",
            annotation_criteria={},
            annotator_models=annotator_models,
        )

    def _interpolate_prompt(
        self, request_state: RequestState, custom_replacements: Optional[Dict[str, str]] = None
    ) -> str:
        """
        Interpolate prompt template with request state information.

        :param request_state: The current request state
        :param custom_replacements: Optional dictionary of additional replacements
        :return: Interpolated prompt
        """
        notes = (request_state.instance.extra_data or {}).get("notes", [])
        prompt = pdsqi.resolve_prompt(
            summary_to_evaluate=(
                request_state.result.completions[0].text
                if request_state.result and request_state.result.completions
                else ""
            ),
            notes=notes,
            target_specialty="emergency medicine",
            output_mode=prep.OutputMode.EXPLAINED_SCORE,
        )
        return prompt[1]["content"]
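A note on the `prompt[1]["content"]` indexing at the end of `_interpolate_prompt`: the code appears to assume that `pdsqi.resolve_prompt` returns a chat-style message list whose second element is the user message. Below is a minimal sketch of that assumed shape; the keys and ordering are inferred from the indexing, not documented evaluation_instruments API.

    # Assumed return shape of pdsqi.resolve_prompt (inferred from prompt[1]["content"]):
    prompt = [
        {"role": "system", "content": "...PDSQI-9 grading instructions..."},
        {"role": "user", "content": "...the notes and the summary to evaluate..."},
    ]
    user_message = prompt[1]["content"]  # what _interpolate_prompt returns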
@@ -1,14 +1,70 @@
-from typing import Any, Dict, List
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.benchmark.metrics.metric import Metric
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
+
+
+@dataclass
+class RubricItem:
+    name: str
+    min: float
+    max: float
+    weight: float
+    higher_is_better: bool
+
+
+@dataclass
+class Rubric:
Collaborator
I did not look at the rubric logic too closely, but let me know if there's anything you want me to check.
+    items: Dict[str, RubricItem]
+
+    @classmethod
+    def from_config(cls, rubric_config: Dict[str, Any]) -> "Rubric":
+        items = {}
+        for name, attrs in rubric_config.items():
+            item = RubricItem(
+                name=name,
+                min=attrs["min"],
+                max=attrs["max"],
+                weight=attrs["weight"],
+                higher_is_better=attrs["higher_is_better"],
+            )
+            items[name] = item
+        return cls(items)
+
+    def normalize(self, name: str, score: float) -> float:
+        """Normalize the score according to the rubric item config."""
+        item = self.items[name]
+        raw = (score - item.min) / (item.max - item.min)
+        return raw if item.higher_is_better else 1 - raw
+
+    def aggregate(self, scores: Dict[str, float]) -> float:
+        """Weighted aggregation of normalized scores."""
+        total = 0.0
+        weight_offset = 0.0
+        invalid_scores = [name for name in scores.keys() if not isinstance(scores[name], (int, float))]
+        if invalid_scores:
+            n_valid_scores = len(scores) - len(invalid_scores)
+            weight_offset = sum(self.items[name].weight for name in invalid_scores) / n_valid_scores
+            hwarn(
+                f"Invalid scores found for {invalid_scores}. "
+                f"Using average weight offset of {weight_offset} to adjust the total score."
+            )
+        for name, score in scores.items():
+            if not isinstance(score, (int, float)):
+                hwarn(f"Skipping non-numeric score for {name}: {score}")
+                continue
+            norm = self.normalize(name, score)
+            total += norm * (self.items[name].weight + weight_offset)
+        return total
+
+
 class LLMJuryMetric(Metric):
     """Score metrics for LLM Jury."""

@@ -18,11 +74,13 @@ def __init__(
         scenario_name: str,
         annotator_models: Dict[str, AnnotatorModelInfo],
         default_score: float = 0.0,
+        rubric: Optional[Rubric] = None,
     ):
         self.metric_name = metric_name
         self.scenario_name = scenario_name
         self.annotator_models = annotator_models
         self.default_score = default_score
+        self.rubric = rubric

     def evaluate_generation(
         self,

@@ -32,15 +90,29 @@ def evaluate_generation(
         eval_cache_path: str,
     ) -> List[Stat]:
         assert request_state.annotations
+        if self.rubric:
+            hlog(f"Using rubric for {self.scenario_name} with items: {list(self.rubric.items.keys())}")
+        else:
+            hlog(f"No rubric defined for {self.scenario_name}, using raw scores.")
         annotations: Dict[str, Any] = request_state.annotations[self.scenario_name]
         scores: List[int] = []
         score = self.default_score
         for annotation_key, annotation_dict in annotations.items():
             if annotation_key in self.annotator_models.keys() and annotation_dict is not None:
-                for val in annotation_dict.values():
-                    scores.append(int(val["score"]))
-        if scores:
-            score = sum(scores) / len(scores)
+                if self.rubric:
+                    # Use rubric to normalize and aggregate scores
+                    scores_dict = {
+                        item: annotation_dict[item]["score"]
+                        for item in self.rubric.items.keys()
+                        if item in annotation_dict
+                    }
+                    score = self.rubric.aggregate(scores_dict)
+                else:
+                    # Fallback to using the raw score
+                    for val in annotation_dict.values():
+                        scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
         return [
             Stat(MetricName(self.metric_name)).add(score),
         ]
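To make the new rubric path concrete, here is a small, self-contained sketch of how Rubric.from_config, normalize, and aggregate fit together. The criterion names, ranges, and weights below are hypothetical, and the per-criterion annotation shape (criterion -> {"score": ...}) is inferred from how evaluate_generation indexes annotation_dict.

    from helm.benchmark.metrics.llm_jury_metrics import Rubric

    # Hypothetical rubric config, mirroring what Rubric.from_config expects.
    rubric = Rubric.from_config(
        {
            "accuracy": {"min": 1, "max": 5, "weight": 0.5, "higher_is_better": True},
            "hallucination": {"min": 1, "max": 5, "weight": 0.5, "higher_is_better": False},
        }
    )

    # Scores as one jury model might return them; in evaluate_generation the equivalent
    # dict is built from request_state.annotations[scenario_name][<annotator key>].
    scores = {"accuracy": 4, "hallucination": 2}

    # normalize() maps each raw score to [0, 1], flipping it when lower is better:
    #   accuracy:      (4 - 1) / (5 - 1) = 0.75
    #   hallucination: 1 - (2 - 1) / (5 - 1) = 0.75
    # aggregate() then takes the weighted sum: 0.75 * 0.5 + 0.75 * 0.5 = 0.75
    print(rubric.aggregate(scores))  # 0.75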
@@ -3,7 +3,11 @@
 Website: https://crfm.stanford.edu/helm/medhelm/
 """

-from typing import Union
+import importlib.resources as pkg_resources
+import os
+import yaml
+
+from typing import Optional, Union

 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -13,6 +17,7 @@
     get_multiple_choice_adapter_spec,
 )
 from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
     get_exact_match_metric_specs,
@@ -21,6 +26,7 @@
     get_generic_metric_specs,
 )
 from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.metrics.llm_jury_metrics import Rubric
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.common.gpu_utils import get_torch_device_name
@@ -1258,3 +1264,67 @@ def get_shc_proxy_spec(data_path: str) -> RunSpec:
         metric_specs=get_exact_match_metric_specs(),
         groups=["shc_proxy_med"],
     )
+
+
+@run_spec_function("note_summary")
+def get_note_summary_spec(config_path: Optional[str] = None) -> RunSpec:
+    if config_path is None:
+        package = "helm.benchmark.scenarios"
+        config_path = str(pkg_resources.files(package).joinpath("note_summary_scenario.yaml"))
Collaborator
You need to add *.yaml to the manifest, or this file will not actually get included in the package. (Line 3 in 89001e7)
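For context on the comment above, here is a hedged sketch of one way to ship the YAML inside the helm.benchmark.scenarios package using setuptools package_data. HELM's actual packaging may rely on a MANIFEST.in entry instead, and the distribution name and src/ layout below are assumptions, not taken from this PR.

    # Hypothetical setup.py fragment: include non-Python data files in the wheel.
    from setuptools import setup, find_packages

    setup(
        name="crfm-helm",  # assumed distribution name
        package_dir={"": "src"},  # assumed src/ layout
        packages=find_packages("src"),
        include_package_data=True,
        package_data={"helm.benchmark.scenarios": ["*.yaml"]},
    )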
+
+    assert os.path.exists(config_path), f"Config path not found: {config_path}."
+
+    with open(config_path, "r") as f:
+        config = yaml.safe_load(f)
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.note_summary_scenario.NoteSummaryScenario",
+        args={
+            "data_path": config["data_path"],
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="",
+        input_noun=None,
+        newline_after_input_noun=False,
+        output_noun=None,
+        max_tokens=500,
+        stop_sequences=[],
+        max_train_instances=0,
+    )
+
+    annotator_models = {
+        judge["name"]: AnnotatorModelInfo(
+            model_name=judge["model"],
+            model_deployment=judge["model_deployment"],
+        )
+        for judge in config["judges"]
+    }
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.note_summary_annotator.NoteSummaryAnnotator",
+            args={"annotator_models": annotator_models},
+        )
+    ]
+
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "note_summary_accuracy",
+                "scenario_name": "note_summary",
+                "annotator_models": annotator_models,
+                "rubric": Rubric.from_config(config["rubric"]),
+            },
+        )
+    ]
+    return RunSpec(
+        name="note_summary",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["note_summary"],
+    )
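Since the config file itself is not part of this diff, here is a hedged sketch of the structure get_note_summary_spec appears to expect from note_summary_scenario.yaml, written as the Python dict that yaml.safe_load would produce. Only the top-level keys (data_path, judges, rubric) and the per-item fields come from the code above; the judge names, models, and rubric criteria are placeholders.

    # Hypothetical contents of note_summary_scenario.yaml after yaml.safe_load(f):
    config = {
        "data_path": "/path/to/physionet.org",  # passed through to NoteSummaryScenario
        "judges": [
            {
                "name": "gpt-judge",  # placeholder judge key used in annotator_models
                "model": "openai/gpt-4o-2024-05-13",  # placeholder model name
                "model_deployment": "openai/gpt-4o-2024-05-13",
            },
        ],
        "rubric": {
            # criterion name -> fields consumed by Rubric.from_config
            "accuracy": {"min": 1, "max": 5, "weight": 0.5, "higher_is_better": True},
            "hallucination": {"min": 1, "max": 5, "weight": 0.5, "higher_is_better": False},
        },
    }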
@@ -0,0 +1,91 @@
import pandas as pd

from typing import List
from helm.common.general import check_file_exists
from helm.benchmark.scenarios.scenario import (
    Input,
    Scenario,
    Instance,
    TEST_SPLIT,
)
from helm.benchmark.scenarios.note_summary_scenario_helper import Summarizer  # type: ignore


def file_preprocessing(data_path: str) -> pd.DataFrame:
    """
    Preprocess the data files to create a DataFrame with the necessary columns.

    task_objective: 'brief_hospital_course' or 'discharge_instructions'
    Use this command to download the data:
        wget -r -N -c -np --user {PHYSIONET_USERNAME} --ask-password https://physionet.org/files/discharge-me/1.3/
    data_path is the directory that contains the downloaded files: '{base_dir}/physionet.org/'
    """
    # Load the first CSV file
    discharge_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge.csv.gz"
    check_file_exists(
        discharge_path, msg=f"[NoteSummaryScenario] Required discharge file not found: '{discharge_path}'"
    )
    radiology_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/radiology.csv.gz"
    check_file_exists(
        radiology_path, msg=f"[NoteSummaryScenario] Required radiology file not found: '{radiology_path}'"
    )
    df_discharge = pd.read_csv(discharge_path, compression="gzip", keep_default_na=False)
    df_radiology = pd.read_csv(radiology_path, compression="gzip", keep_default_na=False)

    final_df = pd.concat([df_discharge, df_radiology], ignore_index=True)
    return final_df


class NoteSummaryScenario(Scenario):
    """
    NoteSummary is a benchmark designed to evaluate the clinical note summarization capabilities of LLMs.
    In this scenario, we only consider the discharge text as well as the radiology report text.
    We use the phase I test set, which is composed of 14,702 hospital admission instances.

    @inproceedings{Xu_2024,
        title={Discharge me: BioNLP ACL'24 shared task on streamlining discharge documentation},
        url={https://doi.org/10.13026/4a0k-4360},
        DOI={10.13026/27pt-1259},
        booktitle={Proceedings of the 23rd Workshop on Biomedical Natural Language Processing (BioNLP) at ACL 2024},
        publisher={Association for Computational Linguistics},
        author={Xu, Justin and Delbrouck, Jean-Benoit and Johnston, Andrew and Blankemeier, Louis and Langlotz, Curtis},
        year={2024}
    }
    """

    name = "note_summary"
    description = "NoteSummary is a benchmark designed to evaluate clinical note summarization capabilities of LLMs."
    tags = ["biomedical"]

    def __init__(self, data_path: str):
        super().__init__()
        self.data_path = data_path

    def get_instances(self, output_path: str) -> List[Instance]:
        instances: List[Instance] = []
        df = file_preprocessing(self.data_path)
        admissions = df["hadm_id"].unique()
        for admission in admissions:
            df_admission = df[df["hadm_id"] == admission]
            summarizer = Summarizer(
                notes=df_admission["text"].tolist(),
                authors=df_admission["note_type"].tolist(),
                timestamps=df_admission["charttime"].tolist(),
                target_specialty="emergency medicine",
            )
            prompt_di, _ = summarizer.build_prompt(anti_rules=0, omit_rules=0)
            instances.append(
                Instance(
                    input=Input(text=prompt_di),
                    references=[],
                    split=TEST_SPLIT,
                    extra_data={"notes": df_admission["text"].tolist()},
                )
            )

        return instances

    def read_file(self, file_path: str) -> List[str]:
Collaborator
Delete unused method.
        with open(file_path, "r") as file:
            lines = file.readlines()
        lines = [line.strip() for line in lines]
        return lines
PyPI packages cannot depend on packages outside PyPI. You should instead provide instructions to users to manually install this package, either by printing the installation command in an error message, or by documenting it in ReadTheDocs.