2 changes: 1 addition & 1 deletion v2/evaluate/config.py
@@ -14,7 +14,7 @@
 METRICS = [
     "tool_selection_quality",
-    "agentic_session_success",
+    "Action Completion - Agent Leaderboard",
 ]

conorbronsdon marked this conversation as resolved.

 FILE_PATHS = {
3 changes: 2 additions & 1 deletion v2/evaluate/run_experiment.py
@@ -1,6 +1,7 @@
 import argparse
 from typing import List
 from simulation import run_simulation_experiments
+from config import METRICS
 from dotenv import load_dotenv

 load_dotenv("../.env")
@@ -56,7 +57,7 @@ def main():
     parser.add_argument(
         "--metrics",
         type=str,
-        default="tool_selection_quality,agentic_session_success",
+        default=",".join(METRICS),
         help="Comma-separated list of metrics to evaluate",
     )
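A quick aside on the pattern above: deriving the argparse default from config.METRICS keeps the CLI and the config in sync, so adding a metric to config.py automatically updates every entry point. A minimal sketch of the round-trip follows; the split-side handling is an assumption about how run_experiment.py consumes args.metrics, and it is safe here because the metric names contain no commas.

import argparse

METRICS = [  # stand-in for `from config import METRICS`
    "tool_selection_quality",
    "Action Completion - Agent Leaderboard",
]

parser = argparse.ArgumentParser()
parser.add_argument(
    "--metrics",
    type=str,
    default=",".join(METRICS),
    help="Comma-separated list of metrics to evaluate",
)
args = parser.parse_args([])  # empty argv: exercises the default

# Split the comma-separated string back into a list for the runner.
metrics = args.metrics.split(",")
assert metrics == METRICS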
3 changes: 2 additions & 1 deletion v2/evaluate/run_parallel_experiments.py
@@ -5,6 +5,7 @@
 import itertools
 from typing import List, Tuple
 from simulation import run_simulation_experiments
+from config import METRICS
 from dotenv import load_dotenv
 import time

@@ -108,7 +109,7 @@ def main():
     parser.add_argument(
         "--metrics",
         type=str,
-        default="tool_selection_quality,agentic_session_success",
+        default=",".join(METRICS),
         help="Comma-separated list of metrics to evaluate",
     )
2 changes: 1 addition & 1 deletion v2/evaluate/simple_test.py
@@ -321,7 +321,7 @@ def generate_response(
 experiment_name = f"weather-conversation-experiment-{int(time.time() * 1000000)}"
 METRICS = [
     "tool_selection_quality",
-    "agentic_session_success",
+    "Action Completion - Agent Leaderboard",
 ]

 # No galileo_context here - we're creating new loggers for each turn
16 changes: 15 additions & 1 deletion v2/results/fetch_results.py
@@ -214,10 +214,24 @@ def process_experiment(exp, model):
     else:
         print(f"average_tool_selection_quality not found in experiment {exp.name}, proceeding without it")

+    # Look up the action completion metric by key.
+    # The key changed from 'average_agentic_session_success' to a
+    # custom metric name after a Galileo platform update (see #12).
+    action_completion = None
+    props = exp.aggregate_metrics.additional_properties
+    for key in ("average_action_completion___agent_leaderboard",
+                "average_Action Completion - Agent Leaderboard",
+                "average_agentic_session_success"):
+        if key in props:
+            action_completion = round(props[key], 2)
+            break
+    if action_completion is None:
Comment on lines +217 to +228

Missing metric is reported only via print(f"Action completion metric not found in experiment {exp.name}, proceeding without it"), so the message bypasses our structured logging (LoggingConfig.get_logger()) and won't be captured or monitored. Can we replace this print (and any user-visible diagnostics in this block) with the configured logger, e.g. logger.warning(...)?

Finding type: Awesome Galileo Reviewer | Severity: 🟢 Low



Prompt for AI Agents:

In v2/results/fetch_results.py around lines 217 to 229, the block inside
process_experiment uses print(...) to log the missing action completion metric. Replace
print(f"Action completion metric not found in experiment {exp.name}, proceeding without
it") with a structured logger call (e.g., logger.warning(...)). If a module-level logger
variable is not available, obtain one via LoggingConfig.get_logger() (or the project’s
configured logger) at the top of the file and reuse it. Also ensure any other
user-visible diagnostics in this block use logger.warning or logger.info instead of
print so the absence of metrics is recorded in centralized logs.
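For illustration, the requested change might look like the sketch below. This is not the project's actual implementation: LoggingConfig.get_logger() is the reviewer's name for the project's logging helper and its exact API is assumed, so a stdlib logger stands in here.

import logging

# Stand-in for the project's LoggingConfig.get_logger() helper (assumed API).
logger = logging.getLogger("fetch_results")
logging.basicConfig(level=logging.INFO)

def report_missing_metric(experiment_name: str) -> None:
    # Unlike print(), this flows through whatever handlers and centralized
    # sinks the logging configuration wires up.
    logger.warning(
        "Action completion metric not found in experiment %s, proceeding without it",
        experiment_name,
    )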

Contributor (Author):


Skipping this one — the entire codebase uses print() for diagnostics, so switching just this spot to a logger would be inconsistent. Can be a separate cleanup.


Understood—keeping it consistent with the rest of the codebase makes sense for now, and deferring logger cleanup to a separate pass is reasonable.

print(f"Action completion metric not found in experiment {exp.name}, proceeding without it")

result = {
'experiment_name': exp.name,
'total_responses': exp.aggregate_metrics.additional_properties['total_responses'],
'average_action_completion': round(exp.aggregate_metrics.additional_properties['average_agentic_session_success'], 2),
'average_action_completion': action_completion,
'average_tool_selection_quality': tool_selection_quality,
'model': final_model_name,
'category': category
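On why the lookup loop tries three keys: the triple underscore in average_action_completion___agent_leaderboard suggests the platform derives the aggregate key by lowercasing the metric name and mapping each non-alphanumeric character to its own underscore, with an average_ prefix. This is a speculative reconstruction from the observed key, not documented Galileo behavior:

import re

def aggregate_key(metric_name: str) -> str:
    # Each non-alphanumeric character becomes its own underscore, which is
    # why "Action Completion - Agent Leaderboard" (space, dash, space)
    # yields three consecutive underscores.
    return "average_" + re.sub(r"[^a-z0-9]", "_", metric_name.lower())

assert aggregate_key("Action Completion - Agent Leaderboard") == \
    "average_action_completion___agent_leaderboard"

Trying the raw-name and legacy keys as fallbacks keeps the fetch working across platform versions, at the cost of a lookup that silently tolerates a renamed metric.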