diff --git a/v2/evaluate/config.py b/v2/evaluate/config.py
index a9f7e9a..2ace876 100644
--- a/v2/evaluate/config.py
+++ b/v2/evaluate/config.py
@@ -14,7 +14,7 @@
 
 METRICS = [
     "tool_selection_quality",
-    "agentic_session_success",
+    "Action Completion - Agent Leaderboard",
 ]
 
 FILE_PATHS = {
diff --git a/v2/evaluate/run_experiment.py b/v2/evaluate/run_experiment.py
index b38f441..081ed14 100644
--- a/v2/evaluate/run_experiment.py
+++ b/v2/evaluate/run_experiment.py
@@ -1,6 +1,7 @@
 import argparse
 from typing import List
 from simulation import run_simulation_experiments
+from config import METRICS
 from dotenv import load_dotenv
 
 load_dotenv("../.env")
@@ -56,7 +57,7 @@ def main():
     parser.add_argument(
         "--metrics",
         type=str,
-        default="tool_selection_quality,agentic_session_success",
+        default=",".join(METRICS),
         help="Comma-separated list of metrics to evaluate",
     )
 
diff --git a/v2/evaluate/run_parallel_experiments.py b/v2/evaluate/run_parallel_experiments.py
index 32f8697..96a1ca4 100644
--- a/v2/evaluate/run_parallel_experiments.py
+++ b/v2/evaluate/run_parallel_experiments.py
@@ -5,6 +5,7 @@
 import itertools
 from typing import List, Tuple
 from simulation import run_simulation_experiments
+from config import METRICS
 from dotenv import load_dotenv
 import time
 
@@ -108,7 +109,7 @@ def main():
     parser.add_argument(
         "--metrics",
         type=str,
-        default="tool_selection_quality,agentic_session_success",
+        default=",".join(METRICS),
         help="Comma-separated list of metrics to evaluate",
     )
 
diff --git a/v2/evaluate/simple_test.py b/v2/evaluate/simple_test.py
index 18b2049..b20d180 100644
--- a/v2/evaluate/simple_test.py
+++ b/v2/evaluate/simple_test.py
@@ -321,7 +321,7 @@ def generate_response(
 experiment_name = f"weather-conversation-experiment-{int(time.time() * 1000000)}"
 METRICS = [
     "tool_selection_quality",
-    "agentic_session_success",
+    "Action Completion - Agent Leaderboard",
 ]
 
 # No galileo_context here - we're creating new loggers for each turn
diff --git a/v2/results/fetch_results.py b/v2/results/fetch_results.py
index 3b87882..0aae821 100644
--- a/v2/results/fetch_results.py
+++ b/v2/results/fetch_results.py
@@ -214,10 +214,24 @@ def process_experiment(exp, model):
     else:
         print(f"average_tool_selection_quality not found in experiment {exp.name}, proceeding without it")
 
+    # Look up the action completion metric by key.
+    # The key changed from 'average_agentic_session_success' to a
+    # custom metric name after a Galileo platform update (see #12).
+    action_completion = None
+    props = exp.aggregate_metrics.additional_properties
+    for key in ("average_action_completion___agent_leaderboard",
+                "average_Action Completion - Agent Leaderboard",
+                "average_agentic_session_success"):
+        if key in props:
+            action_completion = round(props[key], 2)
+            break
+    if action_completion is None:
+        print(f"Action completion metric not found in experiment {exp.name}, proceeding without it")
+
     result = {
         'experiment_name': exp.name,
         'total_responses': exp.aggregate_metrics.additional_properties['total_responses'],
-        'average_action_completion': round(exp.aggregate_metrics.additional_properties['average_agentic_session_success'], 2),
+        'average_action_completion': action_completion,
         'average_tool_selection_quality': tool_selection_quality,
         'model': final_model_name,
         'category': category
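
A minimal standalone sketch of the fallback lookup this patch adds to process_experiment in fetch_results.py; the sample payloads below are hypothetical stand-ins for exp.aggregate_metrics.additional_properties, and the key tuple mirrors the one in the patch (normalized custom-metric key, raw custom-metric name, legacy key):

from typing import Optional

def resolve_action_completion(props: dict) -> Optional[float]:
    """Return the action-completion score under whichever key the platform exposes."""
    for key in ("average_action_completion___agent_leaderboard",    # normalized custom-metric key
                "average_Action Completion - Agent Leaderboard",    # raw custom-metric name
                "average_agentic_session_success"):                 # legacy key
        if key in props:
            return round(props[key], 2)
    return None

# Hypothetical payloads illustrating the three cases the patch handles.
print(resolve_action_completion({"average_agentic_session_success": 0.8712}))                # 0.87 (legacy key)
print(resolve_action_completion({"average_action_completion___agent_leaderboard": 0.9034}))  # 0.9  (renamed key)
print(resolve_action_completion({"total_responses": 42}))                                    # None (metric missing)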