From fc7674e3368ec71c193ee2698b138b607fbc26f4 Mon Sep 17 00:00:00 2001
From: Conor Bronsdon <120674402+conorbronsdon@users.noreply.github.com>
Date: Thu, 12 Mar 2026 22:48:01 -0700
Subject: [PATCH 1/2] Fix Action Completion metric broken by Galileo platform change

The `agentic_session_success` metric was moved from experiment-level to
trace-level only in a Galileo platform update, breaking the Action
Completion scoring for new evaluation runs. Update config.py to use the
custom "Action Completion - Agent Leaderboard" metric (duplicated at the
trace level) as recommended in #12.

Also make fetch_results.py resilient to the metric key name change by
trying multiple possible key formats, with a graceful fallback.

Closes #12

Co-Authored-By: Claude Opus 4.6
---
 v2/evaluate/config.py       |  2 +-
 v2/results/fetch_results.py | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/v2/evaluate/config.py b/v2/evaluate/config.py
index a9f7e9a..2ace876 100644
--- a/v2/evaluate/config.py
+++ b/v2/evaluate/config.py
@@ -14,7 +14,7 @@
 
 METRICS = [
     "tool_selection_quality",
-    "agentic_session_success",
+    "Action Completion - Agent Leaderboard",
 ]
 
 FILE_PATHS = {
diff --git a/v2/results/fetch_results.py b/v2/results/fetch_results.py
index 3b87882..0aae821 100644
--- a/v2/results/fetch_results.py
+++ b/v2/results/fetch_results.py
@@ -214,10 +214,24 @@ def process_experiment(exp, model):
     else:
         print(f"average_tool_selection_quality not found in experiment {exp.name}, proceeding without it")
 
+    # Look up the action completion metric by key.
+    # The key changed from 'average_agentic_session_success' to a
+    # custom metric name after a Galileo platform update (see #12).
+    action_completion = None
+    props = exp.aggregate_metrics.additional_properties
+    for key in ("average_action_completion___agent_leaderboard",
+                "average_Action Completion - Agent Leaderboard",
+                "average_agentic_session_success"):
+        if key in props:
+            action_completion = round(props[key], 2)
+            break
+    if action_completion is None:
+        print(f"Action completion metric not found in experiment {exp.name}, proceeding without it")
+
     result = {
         'experiment_name': exp.name,
         'total_responses': exp.aggregate_metrics.additional_properties['total_responses'],
-        'average_action_completion': round(exp.aggregate_metrics.additional_properties['average_agentic_session_success'], 2),
+        'average_action_completion': action_completion,
         'average_tool_selection_quality': tool_selection_quality,
         'model': final_model_name,
         'category': category

From f1412c78d1e1a445420890ec7fa2d0d1c20bc174 Mon Sep 17 00:00:00 2001
From: Conor Bronsdon <120674402+conorbronsdon@users.noreply.github.com>
Date: Fri, 13 Mar 2026 00:07:06 -0700
Subject: [PATCH 2/2] Align CLI runner metric defaults with config.METRICS

The --metrics defaults in run_experiment.py and run_parallel_experiments.py
were hardcoded to the old metric name, so the new Action Completion metric
was never requested unless --metrics was explicitly passed. Import from
config.METRICS so the runners stay in sync automatically.

Also updates simple_test.py to use the new metric name.

Co-Authored-By: Claude Opus 4.6
---
 v2/evaluate/run_experiment.py           | 3 ++-
 v2/evaluate/run_parallel_experiments.py | 3 ++-
 v2/evaluate/simple_test.py              | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/v2/evaluate/run_experiment.py b/v2/evaluate/run_experiment.py
index b38f441..081ed14 100644
--- a/v2/evaluate/run_experiment.py
+++ b/v2/evaluate/run_experiment.py
@@ -1,6 +1,7 @@
 import argparse
 from typing import List
 from simulation import run_simulation_experiments
+from config import METRICS
 from dotenv import load_dotenv
 
 load_dotenv("../.env")
@@ -56,7 +57,7 @@ def main():
     parser.add_argument(
         "--metrics",
         type=str,
-        default="tool_selection_quality,agentic_session_success",
+        default=",".join(METRICS),
         help="Comma-separated list of metrics to evaluate",
     )
 
diff --git a/v2/evaluate/run_parallel_experiments.py b/v2/evaluate/run_parallel_experiments.py
index 32f8697..96a1ca4 100644
--- a/v2/evaluate/run_parallel_experiments.py
+++ b/v2/evaluate/run_parallel_experiments.py
@@ -5,6 +5,7 @@
 import itertools
 from typing import List, Tuple
 from simulation import run_simulation_experiments
+from config import METRICS
 from dotenv import load_dotenv
 import time
 
@@ -108,7 +109,7 @@ def main():
     parser.add_argument(
         "--metrics",
         type=str,
-        default="tool_selection_quality,agentic_session_success",
+        default=",".join(METRICS),
         help="Comma-separated list of metrics to evaluate",
     )
 
diff --git a/v2/evaluate/simple_test.py b/v2/evaluate/simple_test.py
index 18b2049..b20d180 100644
--- a/v2/evaluate/simple_test.py
+++ b/v2/evaluate/simple_test.py
@@ -321,7 +321,7 @@ def generate_response(
 experiment_name = f"weather-conversation-experiment-{int(time.time() * 1000000)}"
 METRICS = [
     "tool_selection_quality",
-    "agentic_session_success",
+    "Action Completion - Agent Leaderboard",
 ]
 
 # No galileo_context here - we're creating new loggers for each turn