Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
76e9b6a
feat: add BasePlugin and PluginContext for plugin system
clkao Nov 3, 2025
6db0724
style: fix type hints in base_plugin (any -> Any, remove unused Path …
clkao Nov 3, 2025
4e147f7
feat: add PluginRegistry for managing plugin execution
clkao Nov 3, 2025
1d03324
feat: add SuperpowersPlugin for Claude Code
clkao Nov 3, 2025
74fa315
fix: use _shared_path property in SuperpowersPlugin
clkao Nov 3, 2025
7782cb2
feat: add DbtMcpPlugin migrating existing dbt-mcp logic
clkao Nov 3, 2025
1ce455f
feat: integrate PluginRegistry into SetupOrchestrator
clkao Nov 3, 2025
dfc60a8
feat: add --plugins CLI argument to harness
clkao Nov 3, 2025
4313127
feat: add pre-agent and post-trial hook execution to harness
clkao Nov 3, 2025
40d33c4
fix: use NAME instead of name for agent, fix type for agent_name and …
clkao Nov 3, 2025
ad7c2d6
refactor: remove hard-coded dbt-mcp logic (now in DbtMcpPlugin)
clkao Nov 3, 2025
353c396
feat: add --plugins argument to 'ade run' CLI command
clkao Nov 5, 2025
8af2543
feat: add debug logging to plugins for troubleshooting copy issues
clkao Nov 3, 2025
480e03d
fix: use correct copy_to_container signature with container_dir and c…
clkao Nov 3, 2025
8e9ce2d
fix: use session.send_keys instead of terminal.run for command execution
clkao Nov 3, 2025
029eb2b
Allow Skill and other tools for Claude, and install claude for superp…
clkao Nov 4, 2025
daadab2
refactor: replace use_mcp flag with plugin-based detection
clkao Nov 4, 2025
411dc9a
fix: pass enabled_plugins to SetupOrchestrator in trial execution
clkao Nov 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 1 addition & 23 deletions ade_bench/agents/installed_agents/abstract_installed_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,9 @@
class AbstractInstalledAgent(BaseAgent, ABC):
NAME = AgentName.ABSTRACT_INSTALLED

def __init__(self, use_mcp: bool = False, model_name: str | None = None, **kwargs):
def __init__(self, model_name: str | None = None, **kwargs):
super().__init__(**kwargs)
self._variant_config = {}
self._use_mcp = use_mcp
self._model_name = model_name

@property
Expand Down Expand Up @@ -109,27 +108,6 @@ def perform_task(
max_timeout_sec=config.setup_timeout_sec, # Use setup timeout for installation
)

# Optionally setup dbt MCP server
if self._use_mcp:
dbt_mcp_script = Path(__file__).parent.parent.parent.parent / "shared" / "scripts" / "setup-dbt-mcp.sh"
session.copy_to_container(
dbt_mcp_script,
container_dir="/scripts",
container_filename="setup-dbt-mcp.sh",
)

# Pass db_type, project_type, and agent name
db_type = self._variant_config.get('db_type', 'unknown')
project_type = self._variant_config.get('project_type', 'unknown')
agent_name = self.NAME.value if hasattr(self.NAME, 'value') else str(self.NAME)
session.send_keys(
[
f"bash /scripts/setup-dbt-mcp.sh {db_type} {project_type} {agent_name}",
"Enter",
],
block=True,
max_timeout_sec=config.setup_timeout_sec,
)
except TimeoutError:
log_harness_info(
logger,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

class ClaudeCodeAgent(AbstractInstalledAgent):
NAME = AgentName.CLAUDE_CODE
ALLOWED_TOOLS = ["Bash", "Edit", "Write", "NotebookEdit", "WebFetch", "mcp__dbt"]
ALLOWED_TOOLS = ["Bash", "Edit", "Glob", "Grep", "Write", "NotebookEdit", "WebFetch", "Skill", "mcp__dbt"]

def __init__(self, **kwargs):
super().__init__(**kwargs)
Expand Down
17 changes: 10 additions & 7 deletions ade_bench/cli/ab/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,15 +116,15 @@ def run(
"--log-level",
help="Set the logging level"
),
use_mcp: bool = typer.Option(
False,
"--use-mcp",
help="Enable MCP (Model Context Protocol) for the agent"
),
with_profiling: bool = typer.Option(
False,
"--with-profiling",
help="Run the harness with a python profiler",
),
plugins: str = typer.Option(
"",
"--plugins",
help="Comma-separated list of plugins to enable (e.g., 'superpowers,dbt-mcp')"
)
):
"""
Expand Down Expand Up @@ -167,6 +167,9 @@ def run(
elif agent_args:
agent_kwargs["additional_args"] = agent_args

# Parse plugins
enabled_plugins = [p.strip() for p in plugins.split(",") if p.strip()]

# Create and run the harness
harness = Harness(
dataset_path=dataset_path,
Expand All @@ -189,8 +192,8 @@ def run(
db_type=db,
project_type=project_type,
keep_alive=persist,
use_mcp=use_mcp,
with_profiling=with_profiling
with_profiling=with_profiling,
enabled_plugins=enabled_plugins,
)

results = harness.run()
Expand Down
40 changes: 30 additions & 10 deletions ade_bench/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
TrialResults,
)
from ade_bench.setup.setup_orchestrator import SetupOrchestrator
from ade_bench.plugins.base_plugin import PluginContext
from ade_bench.llms.base_llm import ContextLengthExceededError, ParseError
from ade_bench.parsers.base_parser import UnitTestStatus, ParserResult
from ade_bench.terminal.docker_compose_manager import DockerComposeManager
Expand Down Expand Up @@ -64,8 +65,8 @@ def __init__(
db_type: str | None = None,
project_type: str | None = None,
keep_alive: bool = False,
use_mcp: bool = False,
with_profiling: bool = False,
enabled_plugins: list[str] | None = None,
):
"""
Runs the Terminal-Bench harness.
Expand Down Expand Up @@ -93,8 +94,8 @@ def __init__(
db_type: Database type to filter variants (e.g., duckdb, postgres, sqlite, snowflake).
project_type: Project type to filter variants (e.g., dbt, other).
keep_alive: If True, keep containers alive when tasks fail for debugging.
use_mcp: If True, start a dbt MCP server after setup completes.
with_profiling: If True, will enable the cProfiler.
enabled_plugins: List of plugin names to enable (from CLI --plugins).
"""
self._run_uuid = None
self._start_time = datetime.now(timezone.utc).isoformat()
Expand All @@ -107,11 +108,14 @@ def __init__(
self._db_filter = db_type
self._project_type_filter = project_type
self._keep_alive = keep_alive
self._use_mcp = use_mcp
self._with_profiling = with_profiling
self._enabled_plugins = enabled_plugins

# Initialize setup orchestrator for variant-specific setup
self._setup_orchestrator = SetupOrchestrator()
self._setup_orchestrator = SetupOrchestrator(enabled_plugins=enabled_plugins)

# Keep reference to registry for pre-agent/post-trial hooks
self.plugin_registry = self._setup_orchestrator.plugin_registry

self._output_path = output_path
self._agent_name = agent_name
Expand Down Expand Up @@ -180,9 +184,6 @@ def _create_agent_for_task(self, task_id: str) -> BaseAgent:
if self._model_name:
agent_kwargs["model_name"] = self._model_name

# Pass use_mcp flag to installed agents
agent_kwargs["use_mcp"] = self._use_mcp

return AgentFactory.get_agent(self._agent_name, **agent_kwargs)

def _init_dataset(self) -> None:
Expand Down Expand Up @@ -522,7 +523,8 @@ def _run_setup(
terminal=terminal,
session=session,
file_diff_handler=file_diff_handler,
trial_handler=trial_handler
trial_handler=trial_handler,
enabled_plugins=self._enabled_plugins
)

# Run setup with timeout using asyncio
Expand Down Expand Up @@ -569,7 +571,7 @@ def _run_trial(
model_name=self._model_name,
db_type=config.get("db_type"),
project_type=config.get("project_type"),
used_mcp=self._use_mcp,
used_mcp=self.plugin_registry.did_plugin_run("dbt-mcp"),
)

with spin_up_terminal(
Expand Down Expand Up @@ -651,6 +653,21 @@ def _run_trial(
if hasattr(task_agent, 'set_variant_config'):
task_agent.set_variant_config(config)

# Build context for agent-phase hooks
context = PluginContext(
terminal=terminal,
session=session,
trial_handler=trial_handler,
task_id=trial_handler.task_id,
variant=config,
agent_name=task_agent.NAME,
db_type=config.get("database", {}).get("type") if config.get("database") else config.get("db_type"),
project_type=config.get("project_type"),
)

# PRE-AGENT HOOKS
self.plugin_registry.run_hooks("pre_agent", context)

log_harness_info(self._logger, trial_handler.task_id, "agent", f"Starting agent...")
try:
agent_result, agent_failure_mode = self._run_agent(
Expand Down Expand Up @@ -716,6 +733,9 @@ def _run_trial(
results.runtime_ms = agent_result.runtime_ms
results.cost_usd = agent_result.cost_usd

# POST-TRIAL HOOKS
self.plugin_registry.run_hooks("post_trial", context)

# Always kill the agent session to ensure cleanup, regardless of success/failure
try:
session.kill_session()
Expand Down Expand Up @@ -1138,7 +1158,7 @@ def _execute_single_trial(
model_name=self._model_name,
db_type=config.get("db_type"),
project_type=config.get("project_type"),
used_mcp=self._use_mcp,
used_mcp=self.plugin_registry.did_plugin_run("dbt-mcp"),
)
return trial_results

Expand Down
Empty file added ade_bench/plugins/__init__.py
Empty file.
67 changes: 67 additions & 0 deletions ade_bench/plugins/base_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# ABOUTME: Base plugin interface and context for ADE-Bench plugin system
# ABOUTME: Plugins hook into setup/agent lifecycle phases with conditional execution

from abc import ABC
from dataclasses import dataclass
from typing import Any, Optional, TYPE_CHECKING

if TYPE_CHECKING:
from ade_bench.agents.agent_name import AgentName
from ade_bench.terminal.docker_compose_manager import DockerComposeManager
from ade_bench.handlers.trial_handler import TrialHandler


@dataclass
class PluginContext:
    """Per-trial context object passed to every plugin hook.

    NOTE: the dataclass is *not* declared ``frozen``, so instances are
    technically mutable. The harness hands the same instance to both the
    ``pre_agent`` and ``post_trial`` hooks, so plugins should treat it as
    read-only by convention.
    """

    # Required fields: supplied by the harness for every trial.
    terminal: "DockerComposeManager"  # container manager; plugins copy files through it
    session: Any  # libtmux Session; plugins send shell commands through it
    trial_handler: "TrialHandler"  # handler describing the trial being executed
    task_id: str  # identifier of the task being executed
    variant: dict  # variant configuration for this trial

    # Optional fields: may be absent depending on where the context is built.
    agent_name: Optional["AgentName"] = None
    db_type: Optional[str] = None  # e.g. "snowflake"
    project_type: Optional[str] = None  # e.g. "dbt", "dbt-fusion"


class BasePlugin(ABC):
    """Common parent class for ADE-Bench plugins.

    A plugin identifies itself through the ``name`` and ``description`` class
    attributes and customises behaviour by overriding any of the lifecycle
    hooks (``pre_setup``, ``post_setup``, ``pre_agent``, ``post_trial``).
    Every hook is a no-op here, so subclasses only implement the phases they
    need.
    """

    # Identification metadata; subclasses override these.
    name: str = ""
    description: str = ""

    def __init__(self):
        """Create the plugin; the base class stores no per-instance state."""

    def should_run(self, phase: str, context: PluginContext) -> bool:
        """Decide whether this plugin applies to the given phase and context.

        Args:
            phase: Hook phase name (pre_setup, post_setup, pre_agent, post_trial)
            context: Plugin execution context

        Returns:
            True when the plugin should run. The base implementation always
            returns True; override to make execution conditional.
        """
        return True

    def pre_setup(self, context: PluginContext) -> None:
        """Hook invoked before the task's setup.sh runs (no-op by default)."""

    def post_setup(self, context: PluginContext) -> None:
        """Hook invoked after the task's setup.sh runs (no-op by default)."""

    def pre_agent(self, context: PluginContext) -> None:
        """Hook invoked before the agent executes (no-op by default)."""

    def post_trial(self, context: PluginContext) -> None:
        """Hook invoked once the trial has completed (no-op by default)."""
54 changes: 54 additions & 0 deletions ade_bench/plugins/dbt_mcp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# ABOUTME: dbt-mcp plugin registers dbt MCP server with installed agents
# ABOUTME: Uses existing setup-dbt-mcp.sh script for Snowflake dbt projects

import logging
from pathlib import Path
from ade_bench.plugins.base_plugin import BasePlugin, PluginContext

logger = logging.getLogger(__name__)


class DbtMcpPlugin(BasePlugin):
    """Plugin that registers the dbt MCP server before the agent starts.

    The plugin copies the existing ``setup-dbt-mcp.sh`` script into the
    container and executes it there. It only applies to dbt projects
    (``dbt`` / ``dbt-fusion``) running against Snowflake.
    """

    name = "dbt-mcp"
    description = "Registers dbt MCP server with installed agents"

    # Script location relative to the shared directory, and its destination
    # inside the container.
    SCRIPT_RELATIVE_PATH = "scripts/setup-dbt-mcp.sh"
    CONTAINER_SCRIPT_DIR = "/scripts"
    CONTAINER_SCRIPT_NAME = "setup-dbt-mcp.sh"

    # Value passed as ``max_timeout_sec`` when running the setup script;
    # subclasses may override it.
    SETUP_TIMEOUT_SEC = 300

    def should_run(self, phase: str, context: PluginContext) -> bool:
        """Return True only for dbt projects on Snowflake.

        Args:
            phase: Hook phase name (unused; the same rule applies to all phases)
            context: Plugin execution context

        Returns:
            True when ``context.project_type`` is ``dbt`` or ``dbt-fusion`` and
            ``context.db_type`` is ``snowflake``.
        """
        is_dbt = context.project_type in ("dbt", "dbt-fusion")
        is_snowflake = context.db_type == "snowflake"
        return is_dbt and is_snowflake

    def pre_agent(self, context: PluginContext) -> None:
        """Copy the dbt-mcp setup script into the container and run it.

        Args:
            context: Plugin execution context; ``terminal``, ``session``,
                ``trial_handler``, ``db_type``, ``project_type`` and
                ``agent_name`` are read.

        Raises:
            FileNotFoundError: If the setup script is missing on the host.
        """
        # NOTE(review): this reaches into a private attribute of the trial
        # handler; switch to a public accessor if one exists.
        script_path = context.trial_handler._shared_path / self.SCRIPT_RELATIVE_PATH
        script_exists = script_path.exists()

        logger.info("[DbtMcpPlugin] Preparing to register dbt MCP server")
        logger.info("[DbtMcpPlugin] Source script path: %s", script_path)
        logger.info("[DbtMcpPlugin] Script exists: %s", script_exists)

        if not script_exists:
            logger.error("[DbtMcpPlugin] Script not found at %s", script_path)
            raise FileNotFoundError(f"dbt-mcp setup script not found at {script_path}")

        container_target = f"{self.CONTAINER_SCRIPT_DIR}/{self.CONTAINER_SCRIPT_NAME}"
        logger.info("[DbtMcpPlugin] Copying script to container at %s", container_target)
        context.terminal.copy_to_container(
            paths=script_path,
            container_dir=self.CONTAINER_SCRIPT_DIR,
            container_filename=self.CONTAINER_SCRIPT_NAME,
        )
        logger.info("[DbtMcpPlugin] Script copied successfully")

        # The script expects the agent name as a plain string. Accept either
        # an enum member (use its ``.value``) or an already-stringified name,
        # so a caller that passes a str does not hit AttributeError.
        if context.agent_name:
            agent_name = str(getattr(context.agent_name, "value", context.agent_name))
        else:
            agent_name = "unknown"

        logger.info("[DbtMcpPlugin] Executing dbt-mcp setup for %s", agent_name)
        context.session.send_keys(
            [
                f"bash {container_target} {context.db_type} {context.project_type} {agent_name}",
                "Enter",
            ],
            block=True,
            max_timeout_sec=self.SETUP_TIMEOUT_SEC,
        )
        logger.info("[DbtMcpPlugin] dbt-mcp setup completed")
Loading