diff --git a/ade_bench/agents/installed_agents/abstract_installed_agent.py b/ade_bench/agents/installed_agents/abstract_installed_agent.py index 7cc7c2b3..45b32b9e 100644 --- a/ade_bench/agents/installed_agents/abstract_installed_agent.py +++ b/ade_bench/agents/installed_agents/abstract_installed_agent.py @@ -26,10 +26,9 @@ class AbstractInstalledAgent(BaseAgent, ABC): NAME = AgentName.ABSTRACT_INSTALLED - def __init__(self, use_mcp: bool = False, model_name: str | None = None, **kwargs): + def __init__(self, model_name: str | None = None, **kwargs): super().__init__(**kwargs) self._variant_config = {} - self._use_mcp = use_mcp self._model_name = model_name @property @@ -109,27 +108,6 @@ def perform_task( max_timeout_sec=config.setup_timeout_sec, # Use setup timeout for installation ) - # Optionally setup dbt MCP server - if self._use_mcp: - dbt_mcp_script = Path(__file__).parent.parent.parent.parent / "shared" / "scripts" / "setup-dbt-mcp.sh" - session.copy_to_container( - dbt_mcp_script, - container_dir="/scripts", - container_filename="setup-dbt-mcp.sh", - ) - - # Pass db_type, project_type, and agent name - db_type = self._variant_config.get('db_type', 'unknown') - project_type = self._variant_config.get('project_type', 'unknown') - agent_name = self.NAME.value if hasattr(self.NAME, 'value') else str(self.NAME) - session.send_keys( - [ - f"bash /scripts/setup-dbt-mcp.sh {db_type} {project_type} {agent_name}", - "Enter", - ], - block=True, - max_timeout_sec=config.setup_timeout_sec, - ) except TimeoutError: log_harness_info( logger, diff --git a/ade_bench/agents/installed_agents/claude_code/claude_code_agent.py b/ade_bench/agents/installed_agents/claude_code/claude_code_agent.py index b4b77e70..bc5d092c 100644 --- a/ade_bench/agents/installed_agents/claude_code/claude_code_agent.py +++ b/ade_bench/agents/installed_agents/claude_code/claude_code_agent.py @@ -14,7 +14,7 @@ class ClaudeCodeAgent(AbstractInstalledAgent): NAME = AgentName.CLAUDE_CODE - ALLOWED_TOOLS 
= ["Bash", "Edit", "Write", "NotebookEdit", "WebFetch", "mcp__dbt"] + ALLOWED_TOOLS = ["Bash", "Edit", "Glob", "Grep", "Write", "NotebookEdit", "WebFetch", "Skill", "mcp__dbt"] def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/ade_bench/cli/ab/main.py b/ade_bench/cli/ab/main.py index 3f5b97a9..b9053de7 100644 --- a/ade_bench/cli/ab/main.py +++ b/ade_bench/cli/ab/main.py @@ -116,15 +116,15 @@ def run( "--log-level", help="Set the logging level" ), - use_mcp: bool = typer.Option( - False, - "--use-mcp", - help="Enable MCP (Model Context Protocol) for the agent" - ), with_profiling: bool = typer.Option( False, "--with-profiling", help="Run the harness with a python profiler", + ), + plugins: str = typer.Option( + "", + "--plugins", + help="Comma-separated list of plugins to enable (e.g., 'superpowers,dbt-mcp')" ) ): """ @@ -167,6 +167,9 @@ def run( elif agent_args: agent_kwargs["additional_args"] = agent_args + # Parse plugins + enabled_plugins = [p.strip() for p in plugins.split(",") if p.strip()] + # Create and run the harness harness = Harness( dataset_path=dataset_path, @@ -189,8 +192,8 @@ def run( db_type=db, project_type=project_type, keep_alive=persist, - use_mcp=use_mcp, - with_profiling=with_profiling + with_profiling=with_profiling, + enabled_plugins=enabled_plugins, ) results = harness.run() diff --git a/ade_bench/harness.py b/ade_bench/harness.py index a432bd18..c6a02e94 100644 --- a/ade_bench/harness.py +++ b/ade_bench/harness.py @@ -28,6 +28,7 @@ TrialResults, ) from ade_bench.setup.setup_orchestrator import SetupOrchestrator +from ade_bench.plugins.base_plugin import PluginContext from ade_bench.llms.base_llm import ContextLengthExceededError, ParseError from ade_bench.parsers.base_parser import UnitTestStatus, ParserResult from ade_bench.terminal.docker_compose_manager import DockerComposeManager @@ -64,8 +65,8 @@ def __init__( db_type: str | None = None, project_type: str | None = None, keep_alive: bool = False, - use_mcp: bool = 
False, with_profiling: bool = False, + enabled_plugins: list[str] | None = None, ): """ Runs the Terminal-Bench harness. @@ -93,8 +94,8 @@ def __init__( db_type: Database type to filter variants (e.g., duckdb, postgres, sqlite, snowflake). project_type: Project type to filter variants (e.g., dbt, other). keep_alive: If True, keep containers alive when tasks fail for debugging. - use_mcp: If True, start a dbt MCP server after setup completes. with_profiling: If True, will enable the cProfiler. + enabled_plugins: List of plugin names to enable (from CLI --plugins). """ self._run_uuid = None self._start_time = datetime.now(timezone.utc).isoformat() @@ -107,11 +108,14 @@ def __init__( self._db_filter = db_type self._project_type_filter = project_type self._keep_alive = keep_alive - self._use_mcp = use_mcp self._with_profiling = with_profiling + self._enabled_plugins = enabled_plugins # Initialize setup orchestrator for variant-specific setup - self._setup_orchestrator = SetupOrchestrator() + self._setup_orchestrator = SetupOrchestrator(enabled_plugins=enabled_plugins) + + # Keep reference to registry for pre-agent/post-trial hooks + self.plugin_registry = self._setup_orchestrator.plugin_registry self._output_path = output_path self._agent_name = agent_name @@ -180,9 +184,6 @@ def _create_agent_for_task(self, task_id: str) -> BaseAgent: if self._model_name: agent_kwargs["model_name"] = self._model_name - # Pass use_mcp flag to installed agents - agent_kwargs["use_mcp"] = self._use_mcp - return AgentFactory.get_agent(self._agent_name, **agent_kwargs) def _init_dataset(self) -> None: @@ -522,7 +523,8 @@ def _run_setup( terminal=terminal, session=session, file_diff_handler=file_diff_handler, - trial_handler=trial_handler + trial_handler=trial_handler, + enabled_plugins=self._enabled_plugins ) # Run setup with timeout using asyncio @@ -569,7 +571,7 @@ def _run_trial( model_name=self._model_name, db_type=config.get("db_type"), project_type=config.get("project_type"), - 
used_mcp=self._use_mcp, + used_mcp=self.plugin_registry.did_plugin_run("dbt-mcp"), ) with spin_up_terminal( @@ -651,6 +653,21 @@ def _run_trial( if hasattr(task_agent, 'set_variant_config'): task_agent.set_variant_config(config) + # Build context for agent-phase hooks + context = PluginContext( + terminal=terminal, + session=session, + trial_handler=trial_handler, + task_id=trial_handler.task_id, + variant=config, + agent_name=task_agent.NAME, + db_type=config.get("database", {}).get("type") if config.get("database") else config.get("db_type"), + project_type=config.get("project_type"), + ) + + # PRE-AGENT HOOKS + self.plugin_registry.run_hooks("pre_agent", context) + log_harness_info(self._logger, trial_handler.task_id, "agent", f"Starting agent...") try: agent_result, agent_failure_mode = self._run_agent( @@ -716,6 +733,9 @@ def _run_trial( results.runtime_ms = agent_result.runtime_ms results.cost_usd = agent_result.cost_usd + # POST-TRIAL HOOKS + self.plugin_registry.run_hooks("post_trial", context) + # Always kill the agent session to ensure cleanup, regardless of success/failure try: session.kill_session() @@ -1138,7 +1158,7 @@ def _execute_single_trial( model_name=self._model_name, db_type=config.get("db_type"), project_type=config.get("project_type"), - used_mcp=self._use_mcp, + used_mcp=self.plugin_registry.did_plugin_run("dbt-mcp"), ) return trial_results diff --git a/ade_bench/plugins/__init__.py b/ade_bench/plugins/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ade_bench/plugins/base_plugin.py b/ade_bench/plugins/base_plugin.py new file mode 100644 index 00000000..ac2c36e3 --- /dev/null +++ b/ade_bench/plugins/base_plugin.py @@ -0,0 +1,67 @@ +# ABOUTME: Base plugin interface and context for ADE-Bench plugin system +# ABOUTME: Plugins hook into setup/agent lifecycle phases with conditional execution + +from abc import ABC +from dataclasses import dataclass +from typing import Any, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from 
ade_bench.agents.agent_name import AgentName + from ade_bench.terminal.docker_compose_manager import DockerComposeManager + from ade_bench.handlers.trial_handler import TrialHandler + + +@dataclass +class PluginContext: + """Immutable context passed to plugin hooks""" + terminal: "DockerComposeManager" + session: Any # libtmux Session + trial_handler: "TrialHandler" + task_id: str + variant: dict + agent_name: Optional["AgentName"] = None + db_type: Optional[str] = None + project_type: Optional[str] = None + + +class BasePlugin(ABC): + """Base class for ADE-Bench plugins + + Subclasses override class attributes (name, description) and implement + lifecycle hook methods (pre_setup, post_setup, pre_agent, post_trial). + """ + + # Subclasses override these + name: str = "" + description: str = "" + + def __init__(self): + pass + + def should_run(self, phase: str, context: PluginContext) -> bool: + """Override to conditionally execute based on context + + Args: + phase: Hook phase name (pre_setup, post_setup, pre_agent, post_trial) + context: Plugin execution context + + Returns: + True if plugin should run for this phase/context + """ + return True + + def pre_setup(self, context: PluginContext) -> None: + """Hook before task's setup.sh execution""" + pass + + def post_setup(self, context: PluginContext) -> None: + """Hook after task's setup.sh execution""" + pass + + def pre_agent(self, context: PluginContext) -> None: + """Hook before agent execution""" + pass + + def post_trial(self, context: PluginContext) -> None: + """Hook after trial completion""" + pass diff --git a/ade_bench/plugins/dbt_artifacts.py b/ade_bench/plugins/dbt_artifacts.py new file mode 100644 index 00000000..523833ed --- /dev/null +++ b/ade_bench/plugins/dbt_artifacts.py @@ -0,0 +1,120 @@ +# ABOUTME: dbt artifacts plugin generates baseline manifest.json and catalog.json +# ABOUTME: Creates artifacts in target-base directory before agent execution + +import logging +import subprocess +from 
pathlib import Path +from ade_bench.plugins.base_plugin import BasePlugin, PluginContext + +logger = logging.getLogger(__name__) + + +class DbtArtifactsPlugin(BasePlugin): + name = "dbt-artifacts" + description = "Generates baseline dbt artifacts (manifest.json, catalog.json) in target-base directory" + + def should_run(self, phase: str, context: PluginContext) -> bool: + """Only run for dbt and dbt-fusion projects""" + is_dbt = context.project_type in ["dbt", "dbt-fusion"] + return is_dbt + + def pre_setup(self, context: PluginContext) -> None: + """Generate dbt artifacts before setup runs""" + script_path = context.trial_handler._shared_path / "plugins/dbt-artifacts/generate-artifacts.sh" + + logger.info(f"[DbtArtifactsPlugin] Preparing to generate baseline dbt artifacts") + logger.info(f"[DbtArtifactsPlugin] Source script path: {script_path}") + logger.info(f"[DbtArtifactsPlugin] Script exists: {script_path.exists()}") + + if not script_path.exists(): + logger.error(f"[DbtArtifactsPlugin] Script not found at {script_path}") + raise FileNotFoundError(f"dbt-artifacts script not found at {script_path}") + + logger.info(f"[DbtArtifactsPlugin] Copying script to container at /scripts/generate-dbt-artifacts.sh") + context.terminal.copy_to_container( + paths=script_path, + container_dir="/scripts", + container_filename="generate-dbt-artifacts.sh" + ) + logger.info(f"[DbtArtifactsPlugin] Script copied successfully") + + logger.info(f"[DbtArtifactsPlugin] Generating baseline dbt artifacts in target-base/") + context.session.send_keys( + [ + "bash /scripts/generate-dbt-artifacts.sh", + "Enter", + ], + block=True, + max_timeout_sec=300 + ) + logger.info(f"[DbtArtifactsPlugin] Baseline dbt artifacts generated successfully") + + def post_trial(self, context: PluginContext) -> None: + """Generate final artifacts and archive both baseline and final to experiments folder""" + logger.info(f"[DbtArtifactsPlugin] Generating final dbt artifacts after agent execution") + + # Generate 
final artifacts in /app/target/ + context.session.send_keys( + [ + "cd /app && dbt docs generate", + "Enter", + ], + block=True, + max_timeout_sec=300 + ) + logger.info(f"[DbtArtifactsPlugin] Final dbt artifacts generated in /app/target/") + + # Archive artifacts to experiments folder + trial_dir = context.trial_handler._task_output_path + artifacts_dir = trial_dir / "dbt_artifacts" + artifacts_dir.mkdir(exist_ok=True) + + logger.info(f"[DbtArtifactsPlugin] Archiving artifacts to {artifacts_dir}") + + # Copy baseline artifacts (target-base) + baseline_dir = artifacts_dir / "baseline" + baseline_dir.mkdir(exist_ok=True) + self._copy_from_container( + context.session.container, + "/app/target-base/manifest.json", + baseline_dir / "manifest.json" + ) + self._copy_from_container( + context.session.container, + "/app/target-base/catalog.json", + baseline_dir / "catalog.json" + ) + logger.info(f"[DbtArtifactsPlugin] Baseline artifacts archived to {baseline_dir}") + + # Copy final artifacts (target) + final_dir = artifacts_dir / "final" + final_dir.mkdir(exist_ok=True) + self._copy_from_container( + context.session.container, + "/app/target/manifest.json", + final_dir / "manifest.json" + ) + self._copy_from_container( + context.session.container, + "/app/target/catalog.json", + final_dir / "catalog.json" + ) + logger.info(f"[DbtArtifactsPlugin] Final artifacts archived to {final_dir}") + + logger.info(f"[DbtArtifactsPlugin] Artifact archival complete") + + def _copy_from_container(self, container, container_path: str, local_path: Path) -> None: + """Copy a file from container to local filesystem using docker cp""" + try: + # Use docker cp command + container_name = container.name + subprocess.run( + ["docker", "cp", f"{container_name}:{container_path}", str(local_path)], + check=True, + capture_output=True + ) + logger.info(f"[DbtArtifactsPlugin] Copied {container_path} to {local_path}") + except subprocess.CalledProcessError as e: + logger.warning(f"[DbtArtifactsPlugin] 
Failed to copy {container_path}: {e.stderr.decode()}") + except Exception as e: + logger.warning(f"[DbtArtifactsPlugin] Failed to copy {container_path}: {e}") diff --git a/ade_bench/plugins/dbt_mcp.py b/ade_bench/plugins/dbt_mcp.py new file mode 100644 index 00000000..2547b1d1 --- /dev/null +++ b/ade_bench/plugins/dbt_mcp.py @@ -0,0 +1,54 @@ +# ABOUTME: dbt-mcp plugin registers dbt MCP server with installed agents +# ABOUTME: Uses existing setup-dbt-mcp.sh script for Snowflake dbt projects + +import logging +from pathlib import Path +from ade_bench.plugins.base_plugin import BasePlugin, PluginContext + +logger = logging.getLogger(__name__) + + +class DbtMcpPlugin(BasePlugin): + name = "dbt-mcp" + description = "Registers dbt MCP server with installed agents" + + def should_run(self, phase: str, context: PluginContext) -> bool: + """Only run for dbt projects on Snowflake""" + is_dbt = context.project_type in ["dbt", "dbt-fusion"] + is_snowflake = context.db_type == "snowflake" + return is_dbt and is_snowflake + + def pre_agent(self, context: PluginContext) -> None: + """Register dbt MCP server before agent starts""" + # Use existing script at shared/scripts/setup-dbt-mcp.sh + script_path = context.trial_handler._shared_path / "scripts/setup-dbt-mcp.sh" + + logger.info(f"[DbtMcpPlugin] Preparing to register dbt MCP server") + logger.info(f"[DbtMcpPlugin] Source script path: {script_path}") + logger.info(f"[DbtMcpPlugin] Script exists: {script_path.exists()}") + + if not script_path.exists(): + logger.error(f"[DbtMcpPlugin] Script not found at {script_path}") + raise FileNotFoundError(f"dbt-mcp setup script not found at {script_path}") + + logger.info(f"[DbtMcpPlugin] Copying script to container at /scripts/setup-dbt-mcp.sh") + context.terminal.copy_to_container( + paths=script_path, + container_dir="/scripts", + container_filename="setup-dbt-mcp.sh" + ) + logger.info(f"[DbtMcpPlugin] Script copied successfully") + + # Convert agent enum to string for script + 
agent_name = context.agent_name.value if context.agent_name else "unknown" + + logger.info(f"[DbtMcpPlugin] Executing dbt-mcp setup for {agent_name}") + context.session.send_keys( + [ + f"bash /scripts/setup-dbt-mcp.sh {context.db_type} {context.project_type} {agent_name}", + "Enter", + ], + block=True, + max_timeout_sec=300 + ) + logger.info(f"[DbtMcpPlugin] dbt-mcp setup completed") diff --git a/ade_bench/plugins/registry.py b/ade_bench/plugins/registry.py new file mode 100644 index 00000000..f83716b9 --- /dev/null +++ b/ade_bench/plugins/registry.py @@ -0,0 +1,79 @@ +# ABOUTME: Plugin registry manages plugin instances and executes hooks at lifecycle phases +# ABOUTME: Loads plugins from AVAILABLE_PLUGINS list based on enabled names from CLI + +from typing import List, Set +from ade_bench.plugins.base_plugin import BasePlugin, PluginContext +from ade_bench.plugins.superpowers import SuperpowersPlugin +from ade_bench.plugins.dbt_mcp import DbtMcpPlugin +from ade_bench.plugins.dbt_artifacts import DbtArtifactsPlugin + + +# Static list of available plugins +AVAILABLE_PLUGINS: List[type[BasePlugin]] = [ + SuperpowersPlugin, + DbtMcpPlugin, + DbtArtifactsPlugin, +] + + +class PluginRegistry: + """Registry that loads and executes plugins at lifecycle phases""" + + def __init__(self, enabled_plugin_names: List[str]): + """Initialize registry with enabled plugins + + Args: + enabled_plugin_names: List of plugin names to enable (from CLI --plugins) + + Raises: + ValueError: If any requested plugin name is not found in AVAILABLE_PLUGINS + """ + self.plugins: List[BasePlugin] = [] + self.plugins_run: Set[str] = set() # Track which plugins actually ran + + # Build map of available plugin names + available_plugin_map = {} + for plugin_class in AVAILABLE_PLUGINS: + plugin = plugin_class() + available_plugin_map[plugin.name] = plugin_class + + # Validate all requested plugins exist + available_names = set(available_plugin_map.keys()) + requested_names = 
set(enabled_plugin_names) + unknown_plugins = requested_names - available_names + + if unknown_plugins: + raise ValueError( + f"Unknown plugin(s): {', '.join(sorted(unknown_plugins))}. " + f"Available plugins: {', '.join(sorted(available_names))}" + ) + + # Load requested plugins + for plugin_name in enabled_plugin_names: + plugin_class = available_plugin_map[plugin_name] + self.plugins.append(plugin_class()) + + def run_hooks(self, phase: str, context: PluginContext) -> None: + """Execute all enabled plugins for given phase + + Args: + phase: Hook phase name (pre_setup, post_setup, pre_agent, post_trial) + context: Plugin execution context + """ + for plugin in self.plugins: + if plugin.should_run(phase, context): + hook_method = getattr(plugin, phase) + hook_method(context) + # Track that this plugin actually ran + self.plugins_run.add(plugin.name) + + def did_plugin_run(self, plugin_name: str) -> bool: + """Check if a specific plugin ran during this trial + + Args: + plugin_name: Name of the plugin to check + + Returns: + True if the plugin ran, False otherwise + """ + return plugin_name in self.plugins_run diff --git a/ade_bench/plugins/superpowers.py b/ade_bench/plugins/superpowers.py new file mode 100644 index 00000000..99b0dfa4 --- /dev/null +++ b/ade_bench/plugins/superpowers.py @@ -0,0 +1,52 @@ +# ABOUTME: Superpowers plugin enables superpowers skills for Claude Code agent +# ABOUTME: Installs via Claude plugin marketplace in pre-agent phase + +import logging +from pathlib import Path +from ade_bench.plugins.base_plugin import BasePlugin, PluginContext +from ade_bench.agents.agent_name import AgentName + +logger = logging.getLogger(__name__) + + +class SuperpowersPlugin(BasePlugin): + name = "superpowers" + description = "Enables superpowers skills for Claude Code agent" + + def should_run(self, phase: str, context: PluginContext) -> bool: + """Only run for Claude Code agent""" + return context.agent_name == AgentName.CLAUDE_CODE + + def pre_agent(self, 
context: PluginContext) -> None: + """Install superpowers before agent starts""" + script_path = context.trial_handler._shared_path / "plugins/superpowers/install.sh" + + logger.info(f"[SuperpowersPlugin] Preparing to install superpowers") + logger.info(f"[SuperpowersPlugin] Source script path: {script_path}") + logger.info(f"[SuperpowersPlugin] Script exists: {script_path.exists()}") + + if not script_path.exists(): + logger.error(f"[SuperpowersPlugin] Script not found at {script_path}") + raise FileNotFoundError(f"Superpowers install script not found at {script_path}") + + # Copy script to container at /scripts/ + logger.info(f"[SuperpowersPlugin] Copying script to container at /scripts/install-superpowers.sh") + context.terminal.copy_to_container( + paths=script_path, + container_dir="/scripts", + container_filename="install-superpowers.sh" + ) + logger.info(f"[SuperpowersPlugin] Script copied successfully") + + # Execute installation + logger.info(f"[SuperpowersPlugin] Executing installation script") + context.session.send_keys( + [ + "bash /scripts/install-superpowers.sh", + "Enter", + ], + block=True, + max_timeout_sec=300 + ) + logger.info(f"[SuperpowersPlugin] Superpowers installation completed") + diff --git a/ade_bench/setup/setup_orchestrator.py b/ade_bench/setup/setup_orchestrator.py index 35ec3ce1..4b9fad9c 100644 --- a/ade_bench/setup/setup_orchestrator.py +++ b/ade_bench/setup/setup_orchestrator.py @@ -1,6 +1,5 @@ -""" -Simple setup orchestrator - just calls functions directly. 
-""" +# ABOUTME: Orchestrates task setup by coordinating setup functions and plugin hooks +# ABOUTME: Calls setup functions in sequence with plugin hooks at lifecycle phases from typing import Dict, Any from .base_setup import setup_base_files @@ -10,22 +9,46 @@ from .migration_setup import setup_migration from .agent_setup import setup_agent_config from ..utils.logger import log_harness_info +from ..plugins.registry import PluginRegistry +from ..plugins.base_plugin import PluginContext class SetupOrchestrator: - """Simple orchestrator that calls setup functions directly.""" - - def __init__(self, logger=None, terminal=None, session=None, file_diff_handler=None, trial_handler=None): + """Orchestrator that calls setup functions and plugin hooks at lifecycle phases.""" + + def __init__(self, logger=None, terminal=None, session=None, file_diff_handler=None, trial_handler=None, enabled_plugins=None): + """Initialize orchestrator with plugin registry + + Args: + logger: Logger instance for output + terminal: Terminal/container manager + session: Session for running commands + file_diff_handler: Handler for file diffing + trial_handler: Handler for trial operations + enabled_plugins: List of plugin names to enable (from CLI --plugins) + """ self.logger = logger self.terminal = terminal self.session = session self.file_diff_handler = file_diff_handler self.trial_handler = trial_handler + self.plugin_registry = PluginRegistry(enabled_plugins or []) def setup_task(self, task_id: str, variant: Dict[str, Any]) -> bool: - """Setup a task for the given variant.""" + """Setup a task with plugin hooks at lifecycle phases.""" log_harness_info(self.logger, task_id, "setup", f"Starting task setup...") + # Build context once for all hooks + context = PluginContext( + terminal=self.terminal, + session=self.session, + trial_handler=self.trial_handler, + task_id=task_id, + variant=variant, + agent_name=variant.get("agent_name"), + db_type=variant.get("database", {}).get("type") if 
variant.get("database") else None, + project_type=variant.get("project_type"), + ) # Set up the project project_type = variant.get('project_type') @@ -61,12 +84,16 @@ def setup_task(self, task_id: str, variant: Dict[str, Any]) -> bool: setup_migration(self.terminal, self.session, variant, self.trial_handler) log_harness_info(self.logger, task_id, "setup", "Migration script complete") + # PRE-SETUP HOOKS + self.plugin_registry.run_hooks("pre_setup", context) # Run main setup script. log_harness_info(self.logger, task_id, "setup", "Running setup script...") setup_base_files(self.terminal, self.session, task_id, variant, self.trial_handler) log_harness_info(self.logger, task_id, "setup", "Setup script complete.") + # POST-SETUP HOOKS + self.plugin_registry.run_hooks("post_setup", context) # Take final snapshot after setup script if self.file_diff_handler: diff --git a/docker/base/Dockerfile.duckdb-dbt b/docker/base/Dockerfile.duckdb-dbt index 0a3a3f01..42c94cd0 100644 --- a/docker/base/Dockerfile.duckdb-dbt +++ b/docker/base/Dockerfile.duckdb-dbt @@ -1,7 +1,7 @@ FROM python:3.11-slim RUN apt-get update && apt-get install -y \ - tmux asciinema \ + git tmux asciinema \ curl \ && curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \ && apt-get install -y nodejs \ diff --git a/scripts_python/run_harness.py b/scripts_python/run_harness.py index ad503dbd..00ae7498 100755 --- a/scripts_python/run_harness.py +++ b/scripts_python/run_harness.py @@ -133,16 +133,17 @@ action="store_true", help="Keep containers alive when tasks fail for debugging", ) - parser.add_argument( - "--use-mcp", - action="store_true", - help="Start a dbt MCP server after setup completes", - ) parser.add_argument( "--with-profiling", action="store_true", help="Run the harness with a python profiler", ) + parser.add_argument( + "--plugins", + type=str, + default="", + help="Comma-separated list of plugins to enable (e.g., 'superpowers,dbt-mcp')", + ) args = parser.parse_args() dataset_path = 
Path("tasks") @@ -160,6 +161,9 @@ if args.agent_args: agent_kwargs["additional_args"] = args.agent_args + # Parse plugins string into list + enabled_plugins = [p.strip() for p in args.plugins.split(",") if p.strip()] + harness = Harness( dataset_path=dataset_path, output_path=args.output_path, @@ -181,8 +185,8 @@ db_type=args.db, project_type=args.project_type, keep_alive=args.persist, - use_mcp=args.use_mcp, with_profiling=args.with_profiling, + enabled_plugins=enabled_plugins, ) results = harness.run() diff --git a/shared/plugins/dbt-artifacts/generate-artifacts.sh b/shared/plugins/dbt-artifacts/generate-artifacts.sh new file mode 100755 index 00000000..8f13bb5a --- /dev/null +++ b/shared/plugins/dbt-artifacts/generate-artifacts.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -e + +echo "Generating baseline dbt artifacts in target-base directory..." + +# Check if dbt project exists +if [ ! -f "/app/dbt_project.yml" ]; then + echo "Warning: No dbt_project.yml found at /app/dbt_project.yml" + echo "Skipping artifact generation" + exit 0 +fi + +# Navigate to app directory +cd /app + +# Create target-base directory if it doesn't exist +mkdir -p target-base + +# Install dbt dependencies if needed +if [ -f "packages.yml" ] || [ -f "dependencies.yml" ]; then + echo "Installing dbt dependencies..." + dbt deps +fi + +# Generate manifest.json via compile +echo "Running dbt compile to generate manifest.json..." +dbt compile --target-path target-base + +# Generate catalog.json via docs generate +echo "Running dbt docs generate to generate catalog.json..." 
+dbt docs generate --target-path target-base
+
+# Verify artifacts were created
+if [ -f "target-base/manifest.json" ]; then
+    echo "✓ manifest.json generated successfully"
+else
+    echo "✗ Warning: manifest.json not found in target-base/"
+fi
+
+if [ -f "target-base/catalog.json" ]; then
+    echo "✓ catalog.json generated successfully"
+else
+    echo "✗ Warning: catalog.json not found in target-base/"
+fi
+
+echo "Baseline dbt artifacts generation complete"
diff --git a/shared/plugins/superpowers/install.sh b/shared/plugins/superpowers/install.sh
new file mode 100755
index 00000000..262d33aa
--- /dev/null
+++ b/shared/plugins/superpowers/install.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+
+echo "Installing superpowers for Claude Code..."
+
+
+# XXX: right now there's no post-agent-install hook, so we are just installing claude-code here
+npm install -g @anthropic-ai/claude-code
+
+# Add superpowers marketplace
+claude plugin marketplace add obra/superpowers-marketplace
+
+# Install superpowers plugin
+claude plugin install superpowers@superpowers-marketplace
+
+echo "Superpowers installed successfully"