From f575512a529b09af72f426655c07fc4b4973b86d Mon Sep 17 00:00:00 2001
From: Michael Terry <michael.terry@childrens.harvard.edu>
Date: Wed, 5 Jun 2024 11:38:23 -0400
Subject: [PATCH] feat: add `info` subcommand to show computed ranges & labels

`chart-review info` will now print some useful project stats to the
console. Namely, a list of annotators and their note count & ranges,
as well as a list of the final computed labels.

We can augment this over time or add flags to it. But this felt like
a reasonable start.
---
 chart_review/__init__.py      |  2 +-
 chart_review/cli.py           | 21 ++++++++++++
 chart_review/cohort.py        | 46 +++++++++++++++----------
 chart_review/commands/info.py | 64 +++++++++++++++++++++++++++++++++++
 chart_review/common.py        |  2 +-
 chart_review/config.py        |  2 +-
 chart_review/simplify.py      |  4 +--
 docs/info.md                  | 42 +++++++++++++++++++++++
 tests/test_cli.py             | 32 ++++++++++++------
 tests/test_cohort.py          | 45 ++++++++++++++++++++++++
 tests/test_external.py        |  6 ++--
 11 files changed, 230 insertions(+), 36 deletions(-)
 create mode 100644 chart_review/commands/info.py
 create mode 100644 docs/info.md
 create mode 100644 tests/test_cohort.py

diff --git a/chart_review/__init__.py b/chart_review/__init__.py
index 82504c9..f79d0ae 100644
--- a/chart_review/__init__.py
+++ b/chart_review/__init__.py
@@ -1,3 +1,3 @@
 """Chart Review public entry point"""
 
-__version__ = "1.1.0"
+__version__ = "1.2.0"
diff --git a/chart_review/cli.py b/chart_review/cli.py
index 1231bcb..4709639 100644
--- a/chart_review/cli.py
+++ b/chart_review/cli.py
@@ -5,6 +5,7 @@
 
 from chart_review import cohort, config
 from chart_review.commands.accuracy import accuracy
+from chart_review.commands.info import info
 
 
 ###############################################################################
@@ -35,6 +36,7 @@ def define_parser() -> argparse.ArgumentParser:
     subparsers = parser.add_subparsers(required=True)
 
     add_accuracy_subparser(subparsers)
+    add_info_subparser(subparsers)
 
     return parser
 
@@ -61,6 +63,25 @@ def run_accuracy(args: argparse.Namespace) -> None:
     accuracy(reader, args.truth_annotator, args.annotator, save=args.save)
 
 
+###############################################################################
+#
+# Info
+#
+###############################################################################
+
+
+def add_info_subparser(subparsers) -> None:
+    parser = subparsers.add_parser("info")
+    add_project_args(parser)
+    parser.set_defaults(func=run_info)
+
+
+def run_info(args: argparse.Namespace) -> None:
+    proj_config = config.ProjectConfig(args.project_dir, config_path=args.config)
+    reader = cohort.CohortReader(proj_config)
+    info(reader)
+
+
 ###############################################################################
 #
 # Main CLI entrypoints
diff --git a/chart_review/cohort.py b/chart_review/cohort.py
index e865d93..4ccc322 100644
--- a/chart_review/cohort.py
+++ b/chart_review/cohort.py
@@ -32,18 +32,6 @@ def __init__(self, proj_config: config.ProjectConfig):
         for name, value in self.config.external_annotations.items():
             external.merge_external(self.annotations, saved, self.project_dir, name, value)
 
-        # Parse ignored IDs (might be note IDs, might be external IDs)
-        self.ignored_notes: set[int] = set()
-        for ignore_id in self.config.ignore:
-            ls_id = external.external_id_to_label_studio_id(saved, str(ignore_id))
-            if ls_id is None:
-                if isinstance(ignore_id, int):
-                    ls_id = ignore_id  # must be direct note ID
-                else:
-                    # Must just be over-zealous excluding (like automatically from SQL)
-                    continue
-            self.ignored_notes.add(ls_id)
-
         # Consolidate/expand mentions based on config
         simplify.simplify_mentions(
             self.annotations,
@@ -51,12 +39,36 @@ def __init__(self, proj_config: config.ProjectConfig):
             grouped_labels=self.config.grouped_labels,
         )
 
+        # Calculate the final set of note ranges for each annotator
+        self.note_range = self._collect_note_ranges(saved)
+
+    def _collect_note_ranges(self, exported_json: list[dict]) -> dict[str, set[int]]:
         # Detect note ranges if they were not defined in the project config
         # (i.e. default to the full set of annotated notes)
-        self.note_range = self.config.note_ranges
+        note_ranges = {k: set(v) for k, v in self.config.note_ranges.items()}
         for annotator, annotator_mentions in self.annotations.mentions.items():
-            if annotator not in self.note_range:
-                self.note_range[annotator] = sorted(annotator_mentions.keys())
+            if annotator not in note_ranges:
+                note_ranges[annotator] = set(annotator_mentions.keys())
+
+        # Parse ignored IDs (might be note IDs, might be external IDs)
+        ignored_notes: set[int] = set()
+        for ignore_id in self.config.ignore:
+            ls_id = external.external_id_to_label_studio_id(exported_json, str(ignore_id))
+            if ls_id is None:
+                if isinstance(ignore_id, int):
+                    ls_id = ignore_id  # must be direct note ID
+                else:
+                    # Must just be over-zealous excluding (like automatically from SQL)
+                    continue
+            ignored_notes.add(ls_id)
+
+        # Remove any invalid (ignored, non-existent) notes from the range sets
+        all_ls_notes = {int(entry["id"]) for entry in exported_json if "id" in entry}
+        for note_ids in note_ranges.values():
+            note_ids.difference_update(ignored_notes)
+            note_ids.intersection_update(all_ls_notes)
+
+        return note_ranges
 
     @property
     def class_labels(self):
@@ -103,7 +115,7 @@ def confusion_matrix(
         :return: dict
         """
         labels = self._select_labels(label_pick)
-        note_range = set(guard_iter(note_range)) - self.ignored_notes
+        note_range = set(guard_iter(note_range))
         return agree.confusion_matrix(
             self.annotations,
             truth,
@@ -122,7 +134,7 @@ def score_reviewer(self, truth: str, annotator: str, note_range, label_pick: str
         :return: dict, keys f1, precision, recall and vals= %score
         """
         labels = self._select_labels(label_pick)
-        note_range = set(guard_iter(note_range)) - self.ignored_notes
+        note_range = set(guard_iter(note_range))
         return agree.score_reviewer(self.annotations, truth, annotator, note_range, labels=labels)
 
     def score_reviewer_table_csv(self, truth: str, annotator: str, note_range) -> str:
diff --git a/chart_review/commands/info.py b/chart_review/commands/info.py
new file mode 100644
index 0000000..47cc459
--- /dev/null
+++ b/chart_review/commands/info.py
@@ -0,0 +1,64 @@
+"""Methods for showing config & calculated setup info."""
+
+import rich
+import rich.box
+import rich.table
+
+from chart_review import cohort
+
+
+def info(reader: cohort.CohortReader) -> None:
+    """
+    Show project information on the console.
+
+    :param reader: the cohort configuration
+    """
+    console = rich.get_console()
+
+    # Charts
+    chart_table = rich.table.Table(
+        "Annotator",
+        "Chart Count",
+        "Chart IDs",
+        box=rich.box.ROUNDED,
+        pad_edge=False,
+        title="Annotations:",
+        title_justify="left",
+        title_style="bold",
+    )
+    for annotator in sorted(reader.note_range):
+        notes = reader.note_range[annotator]
+        chart_table.add_row(annotator, str(len(notes)), pretty_note_range(notes))
+    console.print(chart_table)
+    console.print()
+
+    # Labels
+    console.print("Labels:", style="bold")
+    if reader.class_labels:
+        console.print(", ".join(sorted(reader.class_labels, key=str.casefold)))
+    else:
+        console.print("None", style="italic", highlight=False)
+
+
+def pretty_note_range(notes: set[int]) -> str:
+    ranges = []
+    range_start = None
+    prev_note = None
+
+    def end_range() -> None:
+        if prev_note is None:
+            return
+        if range_start == prev_note:
+            ranges.append(str(prev_note))
+        else:
+            ranges.append(f"{range_start}–{prev_note}")  # en dash
+
+    for note in sorted(notes):
+        if prev_note is None or prev_note + 1 != note:
+            end_range()
+            range_start = note
+        prev_note = note
+
+    end_range()
+
+    return ", ".join(ranges)
diff --git a/chart_review/common.py b/chart_review/common.py
index 7ed202c..568805f 100644
--- a/chart_review/common.py
+++ b/chart_review/common.py
@@ -23,7 +23,7 @@ def read_json(path: str) -> Union[dict, list[dict]]:
         return json.load(f, strict=False)
 
 
-def write_json(path: str, data: dict, indent: Optional[int] = 4) -> None:
+def write_json(path: str, data: dict | list, indent: Optional[int] = 4) -> None:
     """
     Writes data to the given path, in json format
     :param path: filesystem path
diff --git a/chart_review/config.py b/chart_review/config.py
index f41042d..0e3d2c2 100644
--- a/chart_review/config.py
+++ b/chart_review/config.py
@@ -35,7 +35,7 @@ def __init__(self, project_dir: str, config_path: Optional[str] = None):
 
         # ** Note ranges **
         # Handle some extra syntax like 1-3 == [1, 2, 3]
-        self.note_ranges = self._data.get("ranges", {})
+        self.note_ranges: dict[str, list[int]] = self._data.get("ranges", {})
         for key, values in self.note_ranges.items():
             self.note_ranges[key] = list(self._parse_note_range(values))
 
diff --git a/chart_review/simplify.py b/chart_review/simplify.py
index fcf9334..1321a3e 100644
--- a/chart_review/simplify.py
+++ b/chart_review/simplify.py
@@ -22,7 +22,7 @@ def simplify_export(
     for entry in exported_json:
         note_id = int(entry.get("id"))
 
-        for annot in entry.get("annotations"):
+        for annot in entry.get("annotations", []):
             completed_by = annot.get("completed_by")
             if completed_by not in proj_config.annotators:
                 continue  # we don't know who this is!
@@ -30,7 +30,7 @@ def simplify_export(
             # Grab all valid mentions for this annotator & note
             labels = types.LabelSet()
             text_tags = []
-            for result in annot.get("result"):
+            for result in annot.get("result", []):
                 result_value = result.get("value", {})
                 result_text = result_value.get("text")
                 result_labels = set(result_value.get("labels", []))
diff --git a/docs/info.md b/docs/info.md
new file mode 100644
index 0000000..cb11797
--- /dev/null
+++ b/docs/info.md
@@ -0,0 +1,42 @@
+---
+title: Info Command
+parent: Chart Review
+nav_order: 6
+# audience: lightly technical folks
+# type: how-to
+---
+
+# The Info Command
+
+The `info` command will print information about your current project.
+
+This is helpful to examine the computed list of chart ID ranges or labels.
+
+## Example
+
+```shell
+$ chart-review info
+Annotations:                              
+╭──────────┬─────────────┬──────────╮
+│Annotator │ Chart Count │ Chart IDs│
+├──────────┼─────────────┼──────────┤
+│jane      │ 3           │ 1, 3–4   │
+│jill      │ 4           │ 1–4      │
+│john      │ 3           │ 1–2, 4   │
+╰──────────┴─────────────┴──────────╯
+
+Labels:
+Cough, Fatigue, Headache
+```
+
+## Options
+
+### `--config=PATH`
+
+Use this to point to a secondary (non-default) config file.
+Useful if you have multiple label setups (e.g. one grouped into a binary label and one not).
+
+### `--project-dir=DIR`
+
+Use this to run `chart-review` outside of your project dir.
+Config files, external annotations, etc will be looked for in that directory. 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index edccc54..f3c4f55 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,5 +1,7 @@
 """Tests for cli.py"""
 
+import contextlib
+import io
 import os
 import shutil
 import tempfile
@@ -86,18 +88,26 @@ def test_accuracy(self):
                 accuracy_csv,
             )
 
-    def test_ignored_ids(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            shutil.copytree(f"{DATA_DIR}/ignore", tmpdir, dirs_exist_ok=True)
-            cli.main_cli(["accuracy", "--project-dir", tmpdir, "--save", "allison", "adam"])
+    def test_info(self):
+        stdout = io.StringIO()
+        with contextlib.redirect_stdout(stdout):
+            cli.main_cli(["info", "--project-dir", f"{DATA_DIR}/cold"])
+
+        self.assertEqual(
+            """Annotations:                         
+╭──────────┬─────────────┬──────────╮
+│Annotator │ Chart Count │ Chart IDs│
+├──────────┼─────────────┼──────────┤
+│jane      │ 3           │ 1, 3–4   │
+│jill      │ 4           │ 1–4      │
+│john      │ 3           │ 1–2, 4   │
+╰──────────┴─────────────┴──────────╯
 
-            # Only two of the five notes should be considered, and we should have full agreement.
-            accuracy_json = common.read_json(f"{tmpdir}/accuracy-allison-adam.json")
-            self.assertEqual(1, accuracy_json["F1"])
-            self.assertEqual(2, accuracy_json["TP"])
-            self.assertEqual(0, accuracy_json["FN"])
-            self.assertEqual(2, accuracy_json["TN"])
-            self.assertEqual(0, accuracy_json["FP"])
+Labels:
+Cough, Fatigue, Headache
+""",  # noqa: W291
+            stdout.getvalue(),
+        )
 
     def test_custom_config(self):
         with tempfile.TemporaryDirectory() as tmpdir:
diff --git a/tests/test_cohort.py b/tests/test_cohort.py
new file mode 100644
index 0000000..b946dcb
--- /dev/null
+++ b/tests/test_cohort.py
@@ -0,0 +1,45 @@
+"""Tests for cohort.py"""
+
+import os
+import tempfile
+import unittest
+
+from chart_review import cohort, common, config
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+
+
+class TestCohort(unittest.TestCase):
+    """Test case for basic cohort management"""
+
+    def setUp(self):
+        super().setUp()
+        self.maxDiff = None
+
+    def test_ignored_ids(self):
+        reader = cohort.CohortReader(config.ProjectConfig(f"{DATA_DIR}/ignore"))
+
+        # Confirm 3, 4, and 5 got ignored
+        self.assertEqual(
+            {
+                "adam": {1, 2},
+                "allison": {1, 2},
+            },
+            reader.note_range,
+        )
+
+    def test_non_existent_ids(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            common.write_json(
+                f"{tmpdir}/config.json", {"annotators": {"bob": 1}, "ranges": {"bob": ["1-5"]}}
+            )
+            common.write_json(
+                f"{tmpdir}/labelstudio-export.json",
+                [
+                    {"id": 1, "annotations": [{"completed_by": 1}]},  # done by bob
+                    {"id": 3},  # not done by bob, but we are explicitly told it was
+                ],
+            )
+            reader = cohort.CohortReader(config.ProjectConfig(tmpdir))
+
+            self.assertEqual({"bob": {1, 3}}, reader.note_range)
diff --git a/tests/test_external.py b/tests/test_external.py
index 55ba939..b7d6816 100644
--- a/tests/test_external.py
+++ b/tests/test_external.py
@@ -43,9 +43,9 @@ def test_basic_read(self):
             # Confirm ranges got auto-detected for both human and icd10
             self.assertEqual(
                 {
-                    "human": [1, 2, 3],
-                    "icd10-doc": [1, 3],
-                    "icd10-enc": [1, 3],
+                    "human": {1, 2, 3},
+                    "icd10-doc": {1, 3},
+                    "icd10-enc": {1, 3},
                 },
                 reader.note_range,
             )