From fc7674e3368ec71c193ee2698b138b607fbc26f4 Mon Sep 17 00:00:00 2001
From: Conor Bronsdon <120674402+conorbronsdon@users.noreply.github.com>
Date: Thu, 12 Mar 2026 22:48:01 -0700
Subject: [PATCH 1/2] Fix Action Completion metric broken by Galileo platform change

The `agentic_session_success` metric was moved from experiment-level to
trace-level only in a Galileo platform update, breaking the Action
Completion scoring for new evaluation runs. Update config.py to use the
custom "Action Completion - Agent Leaderboard" metric (duplicated at the
trace level) as recommended in #12.

Also make fetch_results.py resilient to the metric key name change by
trying multiple possible key formats, with a graceful fallback.

Closes #12

Co-Authored-By: Claude Opus 4.6
---
 v2/evaluate/config.py       |  2 +-
 v2/results/fetch_results.py | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/v2/evaluate/config.py b/v2/evaluate/config.py
index a9f7e9a..2ace876 100644
--- a/v2/evaluate/config.py
+++ b/v2/evaluate/config.py
@@ -14,7 +14,7 @@
 
 METRICS = [
     "tool_selection_quality",
-    "agentic_session_success",
+    "Action Completion - Agent Leaderboard",
 ]
 
 FILE_PATHS = {
diff --git a/v2/results/fetch_results.py b/v2/results/fetch_results.py
index 3b87882..0aae821 100644
--- a/v2/results/fetch_results.py
+++ b/v2/results/fetch_results.py
@@ -214,10 +214,24 @@ def process_experiment(exp, model):
     else:
         print(f"average_tool_selection_quality not found in experiment {exp.name}, proceeding without it")
 
+    # Look up the action completion metric by key.
+    # The key changed from 'average_agentic_session_success' to a
+    # custom metric name after a Galileo platform update (see #12).
+    action_completion = None
+    props = exp.aggregate_metrics.additional_properties
+    for key in ("average_action_completion___agent_leaderboard",
+                "average_Action Completion - Agent Leaderboard",
+                "average_agentic_session_success"):
+        if key in props:
+            action_completion = round(props[key], 2)
+            break
+    if action_completion is None:
+        print(f"Action completion metric not found in experiment {exp.name}, proceeding without it")
+
     result = {
         'experiment_name': exp.name,
         'total_responses': exp.aggregate_metrics.additional_properties['total_responses'],
-        'average_action_completion': round(exp.aggregate_metrics.additional_properties['average_agentic_session_success'], 2),
+        'average_action_completion': action_completion,
         'average_tool_selection_quality': tool_selection_quality,
         'model': final_model_name,
         'category': category

From f1412c78d1e1a445420890ec7fa2d0d1c20bc174 Mon Sep 17 00:00:00 2001
From: Conor Bronsdon <120674402+conorbronsdon@users.noreply.github.com>
Date: Fri, 13 Mar 2026 00:07:06 -0700
Subject: [PATCH 2/2] Align CLI runner metric defaults with config.METRICS

The --metrics defaults in run_experiment.py and run_parallel_experiments.py
were hardcoded to the old metric name, so the new Action Completion metric
was never requested unless --metrics was explicitly passed. Import from
config.METRICS so the runners stay in sync automatically.

Also updates simple_test.py to use the new metric name.

Co-Authored-By: Claude Opus 4.6
---
 v2/evaluate/run_experiment.py           | 3 ++-
 v2/evaluate/run_parallel_experiments.py | 3 ++-
 v2/evaluate/simple_test.py              | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/v2/evaluate/run_experiment.py b/v2/evaluate/run_experiment.py
index b38f441..081ed14 100644
--- a/v2/evaluate/run_experiment.py
+++ b/v2/evaluate/run_experiment.py
@@ -1,6 +1,7 @@
 import argparse
 from typing import List
 from simulation import run_simulation_experiments
+from config import METRICS
 from dotenv import load_dotenv
 
 load_dotenv("../.env")
@@ -56,7 +57,7 @@ def main():
     parser.add_argument(
         "--metrics",
         type=str,
-        default="tool_selection_quality,agentic_session_success",
+        default=",".join(METRICS),
         help="Comma-separated list of metrics to evaluate",
     )
 
diff --git a/v2/evaluate/run_parallel_experiments.py b/v2/evaluate/run_parallel_experiments.py
index 32f8697..96a1ca4 100644
--- a/v2/evaluate/run_parallel_experiments.py
+++ b/v2/evaluate/run_parallel_experiments.py
@@ -5,6 +5,7 @@
 import itertools
 from typing import List, Tuple
 from simulation import run_simulation_experiments
+from config import METRICS
 from dotenv import load_dotenv
 import time
 
@@ -108,7 +109,7 @@ def main():
     parser.add_argument(
         "--metrics",
         type=str,
-        default="tool_selection_quality,agentic_session_success",
+        default=",".join(METRICS),
         help="Comma-separated list of metrics to evaluate",
     )
 
diff --git a/v2/evaluate/simple_test.py b/v2/evaluate/simple_test.py
index 18b2049..b20d180 100644
--- a/v2/evaluate/simple_test.py
+++ b/v2/evaluate/simple_test.py
@@ -321,7 +321,7 @@ def generate_response(
 experiment_name = f"weather-conversation-experiment-{int(time.time() * 1000000)}"
 METRICS = [
     "tool_selection_quality",
-    "agentic_session_success",
+    "Action Completion - Agent Leaderboard",
 ]
 
 # No galileo_context here - we're creating new loggers for each turn