From d0805252ce5b0a5d787a40fa21272971f5231c1a Mon Sep 17 00:00:00 2001
From: Leon Derczynski <leonderczynski@gmail.com>
Date: Wed, 10 Dec 2025 12:12:01 +0100
Subject: [PATCH 1/4] add script to assess integrity of a garak report.jsonl
 file

---
 garak/analyze/check_report_integrity.py | 266 ++++++++++++++++++++++++
 1 file changed, 266 insertions(+)
 create mode 100644 garak/analyze/check_report_integrity.py

diff --git a/garak/analyze/check_report_integrity.py b/garak/analyze/check_report_integrity.py
new file mode 100644
index 000000000..a69798e2b
--- /dev/null
+++ b/garak/analyze/check_report_integrity.py
@@ -0,0 +1,266 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+""" Conduct a variety of checks and tests to assess the integrity of a garak report.jsonl file """
+
+"""
+inventory of tests:
+
+* ✔️ version mismatch between report and garak 
+* ✔️ report using dev version 
+* ✔️ current version is dev version
+* ✔️ probe_spec matches probes in attempts
+* ✔️ attempt status 1 has matching status 2
+* ✔️ attempts have enough unique generations
+* ✔️ attempt run ID in setup run IDs
+* ✔️ detection has correct cardinality in attempt status 2s
+* ✔️ summary object is present
+* at least one z-score is listed
+* summary matches probes requested
+* ✔️ run was completed
+* ✔️ run is <6 months old (calibration freshness)
+* ✔️ at least one eval statement per probe
+* ✔️ eval totals = num status 2 attempts
+* ✔️ eval passed+nones <= total prompts
+
+"""
+
+import argparse
+from collections import defaultdict
+import datetime
+import json
+import sys
+from typing import Set
+
+notes = []
+
+
+def add_note(note: str) -> None:
+    global notes
+    print("🔹", note)
+    notes.append(note)
+
+
+def _is_dev_version(version: str) -> bool:
+    return version.split(".")[-1].startswith("pre")
+
+
+def _compare_sets(set1: Set, set2: Set, item_name: str) -> None:
+    if len(set1) > len(set2):
+        add_note("spurious {item_name}: " + repr(set1.difference(set2)))
+    else:
+        add_note("not all {item_name} present, missing: " + repr(set1.difference(set2)))
+
+
+def main(argv=None) -> None:
+    if argv is None:
+        argv = sys.argv[1:]
+
+    import garak._config
+
+    garak._config.load_config()
+    print(
+        f"garak {garak.__description__} v{garak._config.version} ( https://github.com/NVIDIA/garak )"
+    )
+
+    p = argparse.ArgumentParser(
+        prog="python -m garak.analyze.check_report_integrity",
+        description="Check integrity of a garak report.jsonl file",
+        epilog="See https://github.com/NVIDIA/garak",
+        allow_abbrev=False,
+    )
+    p.add_argument("-r", "--report_path", help="Report to analyze", required=True)
+    a = p.parse_args(argv)
+
+    garak_version: str = None
+    report_garak_version: str = None
+    configured_probe_spec = set()
+    _probes_requested = set()
+    generations_requested: int = 0
+    setup_run_ids = set()
+    init_present = False
+    attempt_status_1_ids = set()
+    attempt_status_2_ids = set()
+    attempt_status_1_per_probe = defaultdict(int)
+    attempt_status_2_per_probe = defaultdict(int)
+    num_attempt_stats_2_per_probe = {}
+    probes_found_in_attempts_status_1 = set()
+    probes_found_in_attempts_status_2 = set()
+    probes_found_in_evals = set()
+    complete: bool = False
+    completion_id: str = None
+    digest_exists: bool = False
+
+    garak_version = garak._config.version
+    if _is_dev_version(garak_version):
+        add_note(
+            f"check running in development garak version {garak_version}, implementation will depend on branch+commit"
+        )
+
+    with open(a.report_path, encoding="utf-8") as reportfile:
+
+        for r in [json.loads(line.strip()) for line in reportfile if line.strip()]:
+            match r["entry_type"]:
+                case "start_run setup":
+                    report_garak_version = r["_config.version"]
+                    if _is_dev_version(report_garak_version):  # the report's version, not the running install's
+                        add_note(
+                            f"report generated under development garak version {report_garak_version}, implementation will depend on branch+commit"
+                        )
+                    if report_garak_version != garak_version:
+                        add_note(
+                            f"current and report garak version mismatch, {garak_version} vs. {report_garak_version}"
+                        )
+                    configured_probe_spec = r["plugins.probe_spec"]
+                    _probes_requested, __rejected = garak._config.parse_plugin_spec(
+                        configured_probe_spec, "probes"
+                    )
+                    _probes_requested = set(
+                        [
+                            _klassname.replace("probes.", "")
+                            for _klassname in _probes_requested
+                        ]
+                    )
+
+                    generations_requested = r["run.generations"]
+                    setup_run_ids = r["transient.run_id"]
+
+                case "init":
+                    init_present = True
+                    if r["run"] not in setup_run_ids:
+                        add_note(
+                            "init run uuid not in setup run uuid(s), did aggregation go wrong?"
+                        )
+                    _start = datetime.datetime.fromisoformat(r["start_time"])
+                    _now = datetime.datetime.now()
+                    _delta = _now - _start
+                    if _delta.days > 180:
+                        add_note(
+                            f"Run is old ({_delta.days} days), calibration may have shifted"
+                        )
+
+                case "attempt":
+                    _attempt_uuid = r["uuid"]
+                    _num_outputs = len(r["outputs"])
+                    _probe_name = r["probe_classname"]
+                    if _probe_name not in _probes_requested:
+                        add_note(
+                            f"attempt {_attempt_uuid} using probe {_probe_name} not requested in config"
+                        )
+                    if _num_outputs != generations_requested:
+                        add_note(
+                            f"probe {_probe_name} attempt {_attempt_uuid} status {r['status']} has {_num_outputs} outputs but {generations_requested} were requested"
+                        )
+
+                    match r["status"]:
+                        case 1:
+                            attempt_status_1_ids.add(_attempt_uuid)
+                            probes_found_in_attempts_status_1.add(_probe_name)
+                            attempt_status_1_per_probe[_probe_name] += 1
+                        case 2:
+                            attempt_status_2_ids.add(_attempt_uuid)
+                            probes_found_in_attempts_status_2.add(_probe_name)
+                            attempt_status_2_per_probe[_probe_name] += 1
+                            for _detectorname, _results in r[
+                                "detector_results"
+                            ].items():
+                                _resultcount = len(_results)
+                                if _resultcount != _num_outputs:
+                                    add_note(
+                                        f"attempt has incorrect detection results for {_detectorname}, {_resultcount} results vs. {_num_outputs} outputs"
+                                    )
+
+                        case _:
+                            add_note(
+                                f"attempt uuid {_attempt_uuid} found with unexpected status {r['status']}"
+                            )
+
+                case "completion":
+                    complete = True
+                    completion_id = r["run"]
+                    if completion_id not in setup_run_ids:
+                        add_note(
+                            "completion run uuid not in setup run uuid(s), did aggregation go wrong?"
+                        )
+
+                case "eval":
+                    _probename = r["probe"]
+                    _detectorname = r["detector"]
+                    probes_found_in_evals.add(_probename)
+                    if r["total"] != attempt_status_2_per_probe[_probe_name]:
+                        add_note(
+                            f"eval for {_probe_name} {_detectorname} gives {r['total']} instances but there were {attempt_status_2_per_probe[_probe_name]} status 2 attempts"
+                        )
+                    if r["passed"] + r["nones"] > r["total"]:
+                        add_note(
+                            f"More results than instances for {_probename} eval {r['detector']}"
+                            + repr(r)
+                        )
+                    if (
+                        attempt_status_1_per_probe[_probename]
+                        != attempt_status_2_per_probe[_probename]
+                    ):
+                        add_note(
+                            f"attempt 1/2 count mismatch for {_probename} on {_detectorname}: {attempt_status_1_per_probe[_probename]} @ status 1, but {attempt_status_2_per_probe[_probename]} @ status 2"
+                        )
+                        attempt_status_2_per_probe[_probe_name] = 0
+
+                case "digest":
+                    digest_exists = True
+
+                case _:
+                    continue
+
+    if not init_present:
+        add_note("no 'init' entry, run may not have started - invalid config?")
+    if not complete:
+        add_note("no 'completion' entry, run not complete or from very old version")
+    if not digest_exists:
+        add_note("no 'digest' entry, run may be incomplete or from old version")
+    if probes_found_in_evals != _probes_requested:
+        _compare_sets(
+            _probes_requested, probes_found_in_evals, "requested probes in eval entries"
+        )
+    if _probes_requested != probes_found_in_attempts_status_1:
+        _compare_sets(
+            _probes_requested,
+            probes_found_in_attempts_status_1,
+            "requested probes in status 1 entries",
+        )
+    if _probes_requested != probes_found_in_attempts_status_2:
+        _compare_sets(
+            _probes_requested,
+            probes_found_in_attempts_status_2,
+            "requested probes in status 2 entries",
+        )
+    if probes_found_in_attempts_status_1 != probes_found_in_evals:
+        _compare_sets(
+            probes_found_in_attempts_status_1,
+            probes_found_in_evals,
+            "probes in status 1 entries evaluated",
+        )
+    if probes_found_in_attempts_status_2 != probes_found_in_evals:
+        _compare_sets(
+            probes_found_in_attempts_status_2,
+            probes_found_in_evals,
+            "probes in status 1 entries evaluated",
+        )
+    if probes_found_in_attempts_status_1 != probes_found_in_attempts_status_2:
+        _compare_sets(
+            probes_found_in_attempts_status_1,
+            probes_found_in_attempts_status_2,
+            "probes in status 1 entries found in status 2 entries",
+        )
+    if attempt_status_1_ids != attempt_status_2_ids:
+        _compare_sets(
+            attempt_status_1_ids,
+            attempt_status_2_ids,
+            "attempt status 1 entries in status 2 entries",
+        )
+
+    print("done")
+    print(len(notes), "notes")
+
+
+if __name__ == "__main__":
+    main()

From da4faa70442ce25a98abf1743091f3f9dd4e4c04 Mon Sep 17 00:00:00 2001
From: Leon Derczynski <leonderczynski@gmail.com>
Date: Wed, 10 Dec 2025 12:43:52 +0100
Subject: [PATCH 2/4] add checks of digest object

---
 garak/analyze/check_report_integrity.py | 84 +++++++++++++++++++------
 1 file changed, 65 insertions(+), 19 deletions(-)

diff --git a/garak/analyze/check_report_integrity.py b/garak/analyze/check_report_integrity.py
index a69798e2b..594b2939b 100644
--- a/garak/analyze/check_report_integrity.py
+++ b/garak/analyze/check_report_integrity.py
@@ -75,7 +75,7 @@ def main(argv=None) -> None:
     garak_version: str = None
     report_garak_version: str = None
     configured_probe_spec = set()
-    _probes_requested = set()
+    probes_requested = set()
     generations_requested: int = 0
     setup_run_ids = set()
     init_present = False
@@ -112,13 +112,13 @@ def main(argv=None) -> None:
                             f"current and report garak version mismatch, {garak_version} vs. {report_garak_version}"
                         )
                     configured_probe_spec = r["plugins.probe_spec"]
-                    _probes_requested, __rejected = garak._config.parse_plugin_spec(
+                    probes_requested, __rejected = garak._config.parse_plugin_spec(
                         configured_probe_spec, "probes"
                     )
-                    _probes_requested = set(
+                    probes_requested = set(
                         [
                             _klassname.replace("probes.", "")
-                            for _klassname in _probes_requested
+                            for _klassname in probes_requested
                         ]
                     )
 
@@ -143,7 +143,7 @@ def main(argv=None) -> None:
                     _attempt_uuid = r["uuid"]
                     _num_outputs = len(r["outputs"])
                     _probe_name = r["probe_classname"]
-                    if _probe_name not in _probes_requested:
+                    if _probe_name not in probes_requested:
                         add_note(
                             f"attempt {_attempt_uuid} using probe {_probe_name} not requested in config"
                         )
@@ -175,14 +175,6 @@ def main(argv=None) -> None:
                                 f"attempt uuid {_attempt_uuid} found with unexpected status {r['status']}"
                             )
 
-                case "completion":
-                    complete = True
-                    completion_id = r["run"]
-                    if completion_id not in setup_run_ids:
-                        add_note(
-                            "completion run uuid not in setup run uuid(s), did aggregation go wrong?"
-                        )
-
                 case "eval":
                     _probename = r["probe"]
                     _detectorname = r["detector"]
@@ -205,8 +197,62 @@ def main(argv=None) -> None:
                         )
                         attempt_status_2_per_probe[_probe_name] = 0
 
+                case "completion":
+                    complete = True
+                    completion_id = r["run"]
+                    if completion_id not in setup_run_ids:
+                        add_note(
+                            "completion run uuid not in setup run uuid(s), did aggregation go wrong?"
+                        )
+
                 case "digest":
                     digest_exists = True
+                    if r["meta"]["garak_version"] != report_garak_version:
+                        add_note(
+                            f"digest was written with a different garak version ({r['meta']['garak_version']}) from the run ({report_garak_version})"
+                        )
+                    probes_in_digest = set()
+
+                    _z_score_values_found = set([])
+                    for groupname, group in r["eval"].items():
+                        group_probe_names = group.keys()
+                        probes_in_digest.update(group_probe_names)
+                        for probename, probe_summary in group.items():
+                            if probename == "_summary":
+                                continue
+                            for detectorname, detector_summary in probe_summary.items():
+                                if detectorname == "_summary":
+                                    continue
+                                try:
+                                    _z_score_values_found.add(
+                                        detector_summary["relative_score"]
+                                    )
+                                except KeyError:
+                                    add_note(
+                                        f"Missing 'relative_score' entry in digest for {probename} {detectorname}, old version?"
+                                    )
+
+                    _has_float_z_score = any(
+                        isinstance(_score, float) for _score in _z_score_values_found
+                    )
+                    if not _has_float_z_score:
+                        add_note(
+                            "No Z-scores/relative scores found. Maybe deliberate, maybe calibration broken"
+                        )
+
+                    probes_in_digest.discard("_summary")  # discard: a digest lacking "_summary" must not abort the check with KeyError
+                    if probes_in_digest != probes_requested:
+                        _compare_sets(
+                            probes_requested,
+                            probes_in_digest,
+                            "requested probes in digest",
+                        )
+                    if probes_in_digest != probes_found_in_evals:
+                        _compare_sets(
+                            probes_found_in_evals,
+                            probes_in_digest,
+                            "evaluated probes in digest",
+                        )
 
                 case _:
                     continue
@@ -217,19 +263,19 @@ def main(argv=None) -> None:
         add_note("no 'completion' entry, run not complete or from very old version")
     if not digest_exists:
         add_note("no 'digest' entry, run may be incomplete or from old version")
-    if probes_found_in_evals != _probes_requested:
+    if probes_found_in_evals != probes_requested:
         _compare_sets(
-            _probes_requested, probes_found_in_evals, "requested probes in eval entries"
+            probes_requested, probes_found_in_evals, "requested probes in eval entries"
         )
-    if _probes_requested != probes_found_in_attempts_status_1:
+    if probes_requested != probes_found_in_attempts_status_1:
         _compare_sets(
-            _probes_requested,
+            probes_requested,
             probes_found_in_attempts_status_1,
             "requested probes in status 1 entries",
         )
-    if _probes_requested != probes_found_in_attempts_status_2:
+    if probes_requested != probes_found_in_attempts_status_2:
         _compare_sets(
-            _probes_requested,
+            probes_requested,
             probes_found_in_attempts_status_2,
             "requested probes in status 2 entries",
         )

From dc6fbcf3697d0d37d3de47c1f4b7b4f3b3d290c8 Mon Sep 17 00:00:00 2001
From: Leon Derczynski <leonderczynski@gmail.com>
Date: Wed, 10 Dec 2025 12:47:10 +0100
Subject: [PATCH 3/4] update todo in descr

---
 garak/analyze/check_report_integrity.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/garak/analyze/check_report_integrity.py b/garak/analyze/check_report_integrity.py
index 594b2939b..97ab976e5 100644
--- a/garak/analyze/check_report_integrity.py
+++ b/garak/analyze/check_report_integrity.py
@@ -15,8 +15,8 @@
 * ✔️ attempt run ID in setup run IDs
 * ✔️ detection has correct cardinality in attempt status 2s
 * ✔️ summary object is present
-* at least one z-score is listed
-* summary matches probes requested
+* ✔️ at least one z-score is listed
+* ✔️ summary matches probes requested
 * ✔️ run was completed
 * ✔️ run is <6 months old (calibration freshness)
 * ✔️ at least one eval statement per probe

From 08503c9ecf14efc7c90e4165344b3daa28d350be Mon Sep 17 00:00:00 2001
From: Leon Derczynski <leonderczynski@gmail.com>
Date: Thu, 11 Dec 2025 15:26:32 +0100
Subject: [PATCH 4/4] clarify compare-based reporting and some note
 formulations; handle broken output pipe

---
 garak/analyze/check_report_integrity.py | 74 +++++++++++++++++--------
 1 file changed, 50 insertions(+), 24 deletions(-)

diff --git a/garak/analyze/check_report_integrity.py b/garak/analyze/check_report_integrity.py
index 97ab976e5..75335dc69 100644
--- a/garak/analyze/check_report_integrity.py
+++ b/garak/analyze/check_report_integrity.py
@@ -37,19 +37,28 @@
 
 def add_note(note: str) -> None:
     global notes
-    print("🔹", note)
     notes.append(note)
+    try:
+        print("🔹", note)
+    except BrokenPipeError:
+        pass
 
 
 def _is_dev_version(version: str) -> bool:
     return version.split(".")[-1].startswith("pre")
 
 
-def _compare_sets(set1: Set, set2: Set, item_name: str) -> None:
-    if len(set1) > len(set2):
-        add_note("spurious {item_name}: " + repr(set1.difference(set2)))
-    else:
-        add_note("not all {item_name} present, missing: " + repr(set1.difference(set2)))
+def _compare_sets(set1: Set, set2: Set, set1_name: str, set2_name: str) -> None:
+    if set1.difference(set2):  # items of set1 that set2 lacks
+        add_note(
+            f"not all {set1_name} present in {set2_name}, missing: "
+            + repr(set1.difference(set2))
+        )
+    if set2.difference(set1):  # reverse direction, so a gap on either side gets its own note
+        add_note(
+            f"not all {set2_name} present in {set1_name}, missing: "
+            + repr(set2.difference(set1))
+        )
 
 
 def main(argv=None) -> None:
@@ -91,6 +100,8 @@ def main(argv=None) -> None:
     completion_id: str = None
     digest_exists: bool = False
 
+    print(f"checking {a.report_path}")
+
     garak_version = garak._config.version
     if _is_dev_version(garak_version):
         add_note(
@@ -98,8 +109,12 @@ def main(argv=None) -> None:
         )
 
     with open(a.report_path, encoding="utf-8") as reportfile:
-
-        for r in [json.loads(line.strip()) for line in reportfile if line.strip()]:
+        for line in [line.strip() for line in reportfile if line.strip()]:
+            try:
+                r = json.loads(line)
+            except json.decoder.JSONDecodeError as jde:
+                add_note(f"invalid json entry starting '{line[:100]}' : " + repr(jde))
+                continue
             match r["entry_type"]:
                 case "start_run setup":
                     report_garak_version = r["_config.version"]
@@ -149,7 +164,7 @@ def main(argv=None) -> None:
                         )
                     if _num_outputs != generations_requested:
                         add_note(
-                            f"probe {_probe_name} attempt {_attempt_uuid} status {r['status']} has {_num_outputs} outputs but {generations_requested} were requested"
+                            f"probe {_probe_name} attempt {_attempt_uuid} status:{r['status']} has {_num_outputs} outputs but {generations_requested} were requested"
                         )
 
                     match r["status"]:
@@ -172,7 +187,7 @@ def main(argv=None) -> None:
 
                         case _:
                             add_note(
-                                f"attempt uuid {_attempt_uuid} found with unexpected status {r['status']}"
+                                f"attempt uuid {_attempt_uuid} found with unexpected status:{r['status']}"
                             )
 
                 case "eval":
@@ -181,11 +196,11 @@ def main(argv=None) -> None:
                     probes_found_in_evals.add(_probename)
                     if r["total"] != attempt_status_2_per_probe[_probe_name]:
                         add_note(
-                            f"eval for {_probe_name} {_detectorname} gives {r['total']} instances but there were {attempt_status_2_per_probe[_probe_name]} status 2 attempts"
+                            f"eval entry for {_probe_name} {_detectorname} indicates {r['total']} instances but there were {attempt_status_2_per_probe[_probe_name]} status:2 attempts"
                         )
                     if r["passed"] + r["nones"] > r["total"]:
                         add_note(
-                            f"More results than instances for {_probename} eval {r['detector']}"
+                            f"More results than instances for {_probename} eval with {r['detector']}"
                             + repr(r)
                         )
                     if (
@@ -193,7 +208,7 @@ def main(argv=None) -> None:
                         != attempt_status_2_per_probe[_probename]
                     ):
                         add_note(
-                            f"attempt 1/2 count mismatch for {_probename} on {_detectorname}: {attempt_status_1_per_probe[_probename]} @ status 1, but {attempt_status_2_per_probe[_probename]} @ status 2"
+                            f"attempt 1/2 count mismatch for {_probename} on {_detectorname}: {attempt_status_1_per_probe[_probename]} @ status:1, but {attempt_status_2_per_probe[_probename]} @ status:2"
                         )
                         attempt_status_2_per_probe[_probe_name] = 0
 
@@ -245,13 +260,15 @@ def main(argv=None) -> None:
                         _compare_sets(
                             probes_requested,
                             probes_in_digest,
-                            "requested probes in digest",
+                            "probes requested in config",
+                            "probes listed in digest",
                         )
                     if probes_in_digest != probes_found_in_evals:
                         _compare_sets(
                             probes_found_in_evals,
                             probes_in_digest,
-                            "evaluated probes in digest",
+                            "probes evaluated",
+                            "probes listed in digest",
                         )
 
                 case _:
@@ -260,48 +277,57 @@ def main(argv=None) -> None:
     if not init_present:
         add_note("no 'init' entry, run may not have started - invalid config?")
     if not complete:
-        add_note("no 'completion' entry, run not complete or from very old version")
+        add_note("no 'completion' entry, run incomplete or from very old version")
     if not digest_exists:
-        add_note("no 'digest' entry, run may be incomplete or from old version")
+        add_note("no 'digest' entry, run incomplete or from old version")
     if probes_found_in_evals != probes_requested:
         _compare_sets(
-            probes_requested, probes_found_in_evals, "requested probes in eval entries"
+            probes_requested,
+            probes_found_in_evals,
+            "probes requested in config",
+            "probes evaluated",
         )
     if probes_requested != probes_found_in_attempts_status_1:
         _compare_sets(
             probes_requested,
             probes_found_in_attempts_status_1,
-            "requested probes in status 1 entries",
+            "probes requested in config",
+            "probes in status:1 entries",
         )
     if probes_requested != probes_found_in_attempts_status_2:
         _compare_sets(
             probes_requested,
             probes_found_in_attempts_status_2,
-            "requested probes in status 2 entries",
+            "probes requested in config",
+            "probes in status:2 entries",
         )
     if probes_found_in_attempts_status_1 != probes_found_in_evals:
         _compare_sets(
             probes_found_in_attempts_status_1,
             probes_found_in_evals,
-            "probes in status 1 entries evaluated",
+            "probes in status:1 entries",
+            "probes evaluated",
         )
     if probes_found_in_attempts_status_2 != probes_found_in_evals:
         _compare_sets(
             probes_found_in_attempts_status_2,
             probes_found_in_evals,
-            "probes in status 1 entries evaluated",
+            "probes in status:2 entries",
+            "probes evaluated",
         )
     if probes_found_in_attempts_status_1 != probes_found_in_attempts_status_2:
         _compare_sets(
             probes_found_in_attempts_status_1,
             probes_found_in_attempts_status_2,
-            "probes in status 1 entries found in status 2 entries",
+            "probes in status:1 entries",
+            "probes in status:2 entries",
         )
     if attempt_status_1_ids != attempt_status_2_ids:
         _compare_sets(
             attempt_status_1_ids,
             attempt_status_2_ids,
-            "attempt status 1 entries in status 2 entries",
+            "attempt status:1 entries",
+            "attempt status:2 entries",
         )
 
     print("done")