diff --git a/scripts/aidlc-evaluator/.gitleaks.toml b/scripts/aidlc-evaluator/.gitleaks.toml new file mode 100644 index 00000000..856ebb71 --- /dev/null +++ b/scripts/aidlc-evaluator/.gitleaks.toml @@ -0,0 +1,8 @@ +# Gitleaks configuration for aidlc-evaluator +# Suppress false positives from test fixtures that intentionally contain fake credentials. + +[allowlist] +description = "Fake credentials used in test_credential_scrubber.py test fixtures" +paths = [ + "packages/shared/tests/test_credential_scrubber.py", +] diff --git a/scripts/aidlc-evaluator/ARCHITECTURE.md b/scripts/aidlc-evaluator/ARCHITECTURE.md index 1628f402..45c2f42e 100644 --- a/scripts/aidlc-evaluator/ARCHITECTURE.md +++ b/scripts/aidlc-evaluator/ARCHITECTURE.md @@ -460,6 +460,36 @@ get_adapter(name) ← lazy import from registry Supported adapters: Cursor, Cline, Copilot, Kiro, Windsurf, Antigravity. +### 6.5 CLI Evaluation (`run_cli_evaluation.py`) + +Runs the AIDLC workflow through CLI-based AI assistants (Claude Code, Kiro CLI, etc.): + +```text +load_adapters_from_config(cfg_data) ← register any custom adapters from config.yaml + │ +get_adapter(name) ← lazy import from registry + │ + ├── check_prerequisites() + ├── HumanSimulator built once by orchestrator (vision + tech_env + openapi injected) + ├── adapter.run(config) ──► CLI-specific automation + simulator gate reviews + ├── normalize_output() ──► standard run folder layout + └── run_evaluation.py --evaluate-only ──► stages 2-6 +``` + +**Adapter pattern**: Each CLI tool is implemented as a subclass of `CLIAdapter` (`packages/cli-harness/src/cli_harness/adapter.py`) with three methods: + +- `name` — human-readable identifier (e.g. `"kiro-cli"`) +- `check_prerequisites()` — verify the CLI tool is installed and credentials are valid +- `run(config: AdapterConfig) -> AdapterResult` — execute the AIDLC workflow and return results + +**HumanSimulator injection**: The orchestrator constructs a single `HumanSimulator` with the full document context (vision, tech-env, OpenAPI spec) before calling the adapter. It is passed in as `config.simulator`. Adapters access it via `config.simulator.respond(message)` — they do not construct it themselves. + +**Simulator gates**: Adapters use `config.simulator` to inject human-reviewer feedback at key workflow stages. The kiro-cli adapter uses 4 stage gates (requirements → design → code-gen plan → construction); the claude-code-sdk adapter intercepts `handoff_to_simulator` tool calls inline. + +**Plugin registration**: Custom adapters can be added without modifying framework code — see [Adding a New CLI Adapter](#adding-a-new-cli-adapter) below. + +Supported built-in adapters: `claude-code`, `claude-code-sdk`, `kiro-cli`. + --- ## 7. Data Flow: YAML Artifact Graph @@ -633,6 +663,94 @@ The default test case is `sci-calc` (a scientific calculator API). All CLI defau 1. Create `config/.yaml` with `models.executor.model_id` set to the Bedrock model ID 2. The batch runner will automatically discover it +### Adding a New CLI Adapter + +CLI adapters live in `packages/cli-harness` and follow a plugin pattern — no framework code changes are needed. + +**Step 1 — Implement the adapter** + +Create a module anywhere importable (e.g. 
`packages/cli-harness/src/cli_harness/adapters/my_tool.py`): + +```python +from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter + +class MyToolAdapter(CLIAdapter): + @property + def name(self) -> str: + return "my-tool" + + def check_prerequisites(self) -> tuple[bool, str]: + import shutil + if not shutil.which("my-tool"): + return False, "'my-tool' not found in PATH" + return True, "my-tool found" + + def run(self, config: AdapterConfig) -> AdapterResult: + import time, shutil + from cli_harness.normalizer import normalize_output + + start = time.monotonic() + workspace = config.output_dir / "workspace" + workspace.mkdir(parents=True, exist_ok=True) + + # Copy inputs, inject rules, run the CLI tool... + # Use config.simulator.respond(message) at review gates. + simulator = config.simulator # pre-built with vision/tech_env/openapi context + if simulator is None: + raise RuntimeError("my-tool requires a simulator (set --simulator-model)") + + # ... run CLI tool stages, call simulator.respond() between stages ... + + elapsed = time.monotonic() - start + normalize_output( + source_dir=workspace, + output_dir=config.output_dir, + adapter_name=self.name, + elapsed_seconds=elapsed, + ) + dst_docs = config.output_dir / "aidlc-docs" + return AdapterResult( + success=dst_docs.is_dir(), + output_dir=config.output_dir, + aidlc_docs_dir=dst_docs if dst_docs.is_dir() else None, + workspace_dir=workspace, + elapsed_seconds=elapsed, + ) +``` + +**Step 2 — Register in config** (no framework edits needed) + +Add one line to `config/default.yaml` (or your own config file): + +```yaml +cli: + adapters: + my-tool: "cli_harness.adapters.my_tool.MyToolAdapter" +``` + +**Step 3 — Verify** + +```bash +# Confirm it appears +uv run python run.py cli --list + +# Check prerequisites +uv run python run.py cli --cli my-tool --check-only + +# Run evaluation +uv run python run.py cli --cli my-tool --scenario sci-calc +``` + +**Key contracts for adapter implementors:** + +| What | Where | Notes | +| ---------------- | ----------------------------------------------------------------------- | ------------------------------------------------------------------------- | +| Abstract base | `cli_harness/adapter.py` - `CLIAdapter` | Implement `name`, `check_prerequisites`, `run` | +| Simulator | `config.simulator` (`HumanSimulator`) | Call `.respond(message)` at review gates; never construct it yourself | +| Output layout | `cli_harness/normalizer.py` (`normalize_output()`) | Call at end of `run()` to write `run-meta.yaml` / `run-metrics.yaml` | +| Post-run tests | `aidlc_runner.post_run.run_post_evaluation()` | Optional; call after `normalize_output()` to run generated project tests | +| Document context | `config.vision_path`, `config.tech_env_path`, `config.openapi_content` | Available if needed; simulator already has this context | + ### Adding a New IDE Adapter 1. 
Create `packages/ide-harness/src/ide_harness/adapters/.py` diff --git a/scripts/aidlc-evaluator/CONTRIBUTING.md b/scripts/aidlc-evaluator/CONTRIBUTING.md index c8b1fb7f..e72e3120 100644 --- a/scripts/aidlc-evaluator/CONTRIBUTING.md +++ b/scripts/aidlc-evaluator/CONTRIBUTING.md @@ -36,11 +36,14 @@ git checkout -b feature/your-feature-name Work in the appropriate package: -- `aidlc-runner/` - Execution Framework (two-agent AIDLC workflow runner) +- `packages/execution/` - Execution Framework (two-agent AIDLC workflow runner) +- `packages/cli-harness/` - CLI Adapter Framework (Claude Code, Kiro CLI, custom tools) +- `packages/ide-harness/` - IDE Adapter Framework (Cursor, Cline, Kiro, etc.) - `packages/qualitative/` - Semantic Evaluation (intent & design similarity scoring) - `packages/quantitative/` - Code Evaluation (linting, security, organization) - `packages/nonfunctional/` - NFR Evaluation (tokens, timing, consistency) - `packages/reporting/` - Report generation +- `packages/trend-reports/` - Cross-release trend reporting - `packages/shared/` - Common utilities Or contribute to other work streams: @@ -96,14 +99,17 @@ git commit -m "Add token tracking to nonfunctional package" The project is organized around six big rocks. Your changes will typically fall into one or more of these: -| Work Stream | Description | Package / Area | -| ----------------------- | --------------------------------------------- | ------------------------- | -| **Golden Test Case** | Curated baseline test inputs | `test_cases/` | -| **Execution Framework** | Two-agent AIDLC workflow runner (Owner: Jeff) | `aidlc-runner/` | -| **Semantic Evaluation** | Intent & design similarity scoring | `packages/qualitative/` | -| **Code Evaluation** | Linting, security, organization | `packages/quantitative/` | -| **NFR Evaluation** | Tokens, timing, consistency | `packages/nonfunctional/` | -| **GitHub CI/CD** | Pipeline integration & management | `.github/workflows/` | +| Work Stream | Description | Package / Area | +| ----------------------- | --------------------------------------------- | ---------------------------- | +| **Golden Test Case** | Curated baseline test inputs | `test_cases/` | +| **Execution Framework** | Two-agent AIDLC workflow runner | `packages/execution/` | +| **CLI Adapters** | CLI tool integrations (Claude Code, Kiro CLI) | `packages/cli-harness/` | +| **IDE Adapters** | IDE tool integrations (Cursor, Cline, etc.) 
| `packages/ide-harness/` | +| **Semantic Evaluation** | Intent & design similarity scoring | `packages/qualitative/` | +| **Code Evaluation** | Linting, security, organization | `packages/quantitative/` | +| **NFR Evaluation** | Tokens, timing, consistency | `packages/nonfunctional/` | +| **Trend Reporting** | Cross-release metric tracking | `packages/trend-reports/` | +| **GitHub CI/CD** | Pipeline integration & management | `.github/workflows/` | ## Code Standards diff --git a/scripts/aidlc-evaluator/config/default.yaml b/scripts/aidlc-evaluator/config/default.yaml index 28992087..e0e31bb2 100644 --- a/scripts/aidlc-evaluator/config/default.yaml +++ b/scripts/aidlc-evaluator/config/default.yaml @@ -41,3 +41,6 @@ execution: tools: pmd_path: null # Path to PMD executable; if null, looks for 'pmd' on PATH + +cli: + adapters: {} # Register custom CLI adapters: name: "mypackage.MyAdapter" diff --git a/scripts/aidlc-evaluator/docker/sandbox/Dockerfile b/scripts/aidlc-evaluator/docker/sandbox/Dockerfile index d45d61d4..b42e4444 100644 --- a/scripts/aidlc-evaluator/docker/sandbox/Dockerfile +++ b/scripts/aidlc-evaluator/docker/sandbox/Dockerfile @@ -1,6 +1,6 @@ # Multi-language sandbox image for running AI-generated code in isolation. # -# Includes Python 3.14 + uv, Node.js 22 + npm, and common build tools. +# Includes Python 3.13 + uv, Node.js 22 + npm, and common build tools. # Runs as a non-root user with no credentials or host tools. # # Security notes: @@ -9,7 +9,7 @@ # checkov:skip=CKV_DOCKER_2:HEALTHCHECK not needed for ephemeral test sandbox # nosemgrep: dockerfile-source-not-pinned -FROM public.ecr.aws/docker/library/python:3.14-slim@sha256:3989a23fd2c28a34c7be819e488b958a10601d421ac25bea1e7a5d757365e2d5 AS base +FROM public.ecr.aws/docker/library/python:3.13-slim@sha256:8922791069fdfdd6056cf7f418a8655d970862d1972570d4c0e78dfc43afacd6 AS base # Install system dependencies and Node.js 22 # nosemgrep: set-pipefail diff --git a/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml b/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml index c5e30217..ad9acf0c 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml +++ b/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml @@ -5,6 +5,8 @@ description = "CLI-based harness for testing AIDLC workflows via kiro-cli" requires-python = ">=3.13" dependencies = [ "pyyaml>=6.0", + "anthropic[bedrock]>=0.40", + "boto3>=1.42.47", ] [project.optional-dependencies] diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py index 95f0f0c0..e015f3d4 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py @@ -5,6 +5,10 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cli_harness.simulator import HumanSimulator @dataclass @@ -17,7 +21,11 @@ class AdapterConfig: tech_env_path: Path | None = None prompt_template: str | None = None model: str | None = None + simulator_model: str | None = None # kept for backwards compat; prefer simulator field aws_profile: str | None = None + aws_region: str | None = None + openapi_content: str | None = None # injected into prompt/simulator for contract validation + simulator: "HumanSimulator | None" = None # pre-built by orchestrator; shared across adapters 
timeout_seconds: int = 7200 # 2 hours max diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py index 1bb25d94..247b74e2 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py @@ -128,8 +128,11 @@ def run(self, config: AdapterConfig) -> AdapterResult: shutil.copy2(rules_path, rules_dir / rules_path.name) _log(f"Copied AIDLC rules file: {rules_path.name}") - # Build the prompt - prompt = config.prompt_template or render_prompt() + # Build the prompt — inject OpenAPI spec so the self-approving executor + # has the full contract in view during design and code review. + prompt = config.prompt_template or render_prompt( + openapi_content=config.openapi_content, + ) # Build command — claude -p for non-interactive print mode cmd = [ diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code_sdk.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code_sdk.py new file mode 100644 index 00000000..a08842d4 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code_sdk.py @@ -0,0 +1,588 @@ +"""Claude Code SDK adapter — drives AIDLC workflows via Anthropic SDK with Bedrock. + +Unlike the subprocess-based ClaudeCodeAdapter (which runs ``claude -p`` as a +one-shot process), this adapter uses ``anthropic.AnthropicBedrock`` to drive +the executor turn-by-turn. It intercepts ``handoff_to_simulator`` tool calls +and injects Human Simulator responses using the same system prompt as the +Strands two-agent swarm in ``packages/execution``. + +This faithfully recreates the interactive executor↔simulator loop that the CLI +subprocess approach cannot support. 
+""" + +from __future__ import annotations + +import json +import logging +import os +import shlex +import subprocess +import time +from dataclasses import dataclass, field +from pathlib import Path + +import anthropic +import boto3 + +from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter +from cli_harness.normalizer import normalize_output +from cli_harness.simulator import HumanSimulator + +# Execution package imports (system prompts + post-run tests) +import sys as _sys +_EXEC_SRC = Path(__file__).resolve().parents[6] / "execution" / "src" +if str(_EXEC_SRC) not in _sys.path: + _sys.path.insert(0, str(_EXEC_SRC)) +from aidlc_runner.agents.executor import EXECUTOR_SYSTEM_PROMPT # noqa: E402 +from aidlc_runner.post_run import run_post_evaluation # noqa: E402 +from aidlc_runner.config import ExecutionConfig, SandboxConfig, RunnerConfig # noqa: E402 + +_SHARED_SRC = Path(__file__).resolve().parents[6] / "shared" / "src" +if str(_SHARED_SRC) not in _sys.path: + _sys.path.insert(0, str(_SHARED_SRC)) +from shared.sandbox import _get_container_cli # noqa: E402 + +logger = logging.getLogger(__name__) + +_MAX_ITERATIONS = 300 +_MAX_OUTPUT_CHARS = 50_000 + + +def _log(msg: str) -> None: + print(f" [claude-sdk] {msg}", file=_sys.stderr, flush=True) + + +# ── Tool schemas ────────────────────────────────────────────────────────────── + +_TOOL_READ_FILE: dict = { + "name": "read_file", + "description": "Read the contents of a file in the run folder.", + "input_schema": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "File path relative to the run folder (e.g. 'aidlc-docs/aidlc-state.md').", + } + }, + "required": ["path"], + }, +} + +_TOOL_WRITE_FILE: dict = { + "name": "write_file", + "description": "Write content to a file in the run folder. Creates parent directories if needed.", + "input_schema": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Relative to run folder (e.g. 'aidlc-docs/inception/requirements.md').", + }, + "content": { + "type": "string", + "description": "The text content to write to the file.", + }, + }, + "required": ["path", "content"], + }, +} + +_TOOL_LIST_FILES: dict = { + "name": "list_files", + "description": "List files and directories within a path in the run folder.", + "input_schema": { + "type": "object", + "properties": { + "directory": { + "type": "string", + "description": "Directory path relative to the run folder. Defaults to '.'.", + "default": ".", + } + }, + "required": [], + }, +} + +_TOOL_LOAD_RULE: dict = { + "name": "load_rule", + "description": ( + "Load an AIDLC rule file by path. " + "Use this to read AIDLC workflow rules as you progress through stages." + ), + "input_schema": { + "type": "object", + "properties": { + "rule_path": { + "type": "string", + "description": ( + "Path relative to the rules directory. Examples: " + "'core-workflow', 'common/process-overview.md', " + "'inception/requirements-analysis.md', 'construction/code-generation.md'." + ), + } + }, + "required": ["rule_path"], + }, +} + +_TOOL_RUN_COMMAND: dict = { + "name": "run_command", + "description": ( + "Execute a shell command in the run folder. " + "Use during Build and Test to install dependencies, run tests, and fix issues." 
+ ), + "input_schema": { + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "The shell command to execute.", + }, + "working_directory": { + "type": "string", + "description": "Directory relative to the run folder to run in (default: workspace/).", + "default": "workspace", + }, + }, + "required": ["command"], + }, +} + +_TOOL_HANDOFF_TO_SIMULATOR: dict = { + "name": "handoff_to_simulator", + "description": ( + "Hand off to the Human Simulator agent for answers, approvals, or reviews. " + "The simulator will respond and hand control back to you." + ), + "input_schema": { + "type": "object", + "properties": { + "message": { + "type": "string", + "description": ( + "Message to the simulator — describe what input you need " + "(answer questions / approve document / review) and include the " + "file path they need to read." + ), + } + }, + "required": ["message"], + }, +} + +_EXECUTOR_TOOLS = [ + _TOOL_READ_FILE, + _TOOL_WRITE_FILE, + _TOOL_LIST_FILES, + _TOOL_LOAD_RULE, + _TOOL_RUN_COMMAND, + _TOOL_HANDOFF_TO_SIMULATOR, +] + +# ── Token accumulator ───────────────────────────────────────────────────────── + +@dataclass +class _TokenBucket: + input_tokens: int = 0 + output_tokens: int = 0 + cache_read_tokens: int = 0 + cache_write_tokens: int = 0 + + def add(self, usage: anthropic.types.Usage) -> None: + self.input_tokens += getattr(usage, "input_tokens", 0) + self.output_tokens += getattr(usage, "output_tokens", 0) + self.cache_read_tokens += getattr(usage, "cache_read_input_tokens", 0) + self.cache_write_tokens += getattr(usage, "cache_creation_input_tokens", 0) + + @property + def total(self) -> int: + return self.input_tokens + self.output_tokens + self.cache_read_tokens + self.cache_write_tokens + + +@dataclass +class _UsageTracker: + executor: _TokenBucket = field(default_factory=_TokenBucket) + simulator: _TokenBucket = field(default_factory=_TokenBucket) + handoff_count: int = 0 + + def to_dict(self) -> dict: + e, s = self.executor, self.simulator + return { + "input_tokens": e.input_tokens + s.input_tokens, + "output_tokens": e.output_tokens + s.output_tokens, + "total_tokens": e.total + s.total, + "cache_read_tokens": e.cache_read_tokens + s.cache_read_tokens, + "cache_write_tokens": e.cache_write_tokens + s.cache_write_tokens, + "executor_input_tokens": e.input_tokens, + "executor_output_tokens": e.output_tokens, + "executor_total_tokens": e.total, + "simulator_input_tokens": s.input_tokens, + "simulator_output_tokens": s.output_tokens, + "simulator_total_tokens": s.total, + "handoffs": self.handoff_count, + "num_turns": self.handoff_count, + } + + +# ── Tool execution ──────────────────────────────────────────────────────────── + +def _resolve_safe(base: Path, relative: str) -> Path: + resolved = (base / relative).resolve() + if not str(resolved).startswith(str(base.resolve())): + raise ValueError(f"Path traversal denied: {relative}") + return resolved + + +def _exec_tool(name: str, tool_input: dict, run_folder: Path, rules_dir: Path) -> str: + """Execute a tool call and return its string result.""" + try: + if name == "read_file": + path = tool_input["path"] + target = _resolve_safe(run_folder, path) + if not target.exists(): + return f"Error: File not found: {path}" + if not target.is_file(): + return f"Error: Not a file: {path}" + return target.read_text(encoding="utf-8") + + elif name == "write_file": + path, content = tool_input["path"], tool_input.get("content", "") + target = _resolve_safe(run_folder, path) + 
target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + return f"Written: {path} ({len(content)} chars)" + + elif name == "list_files": + directory = tool_input.get("directory", ".") + target = _resolve_safe(run_folder, directory) + if not target.exists(): + return f"Error: Directory not found: {directory}" + if not target.is_dir(): + return f"Error: Not a directory: {directory}" + entries = sorted(target.iterdir()) + lines = [ + f" {e.relative_to(run_folder)}{'/' if e.is_dir() else ''}" + for e in entries + ] + return "\n".join(lines) if lines else f"(empty: {directory})" + + elif name == "load_rule": + rule_path = tool_input["rule_path"] + if rule_path in ("core-workflow", "core-workflow.md"): + target = rules_dir / "aws-aidlc-rules" / "core-workflow.md" + else: + target = rules_dir / "aws-aidlc-rule-details" / rule_path + if not target.suffix: + target = target.with_suffix(".md") + resolved = target.resolve() + if not str(resolved).startswith(str(rules_dir.resolve())): + return f"Error: Path traversal denied: {rule_path}" + if not resolved.exists(): + return f"Error: Rule not found: {rule_path}" + return resolved.read_text(encoding="utf-8") + + elif name == "run_command": + command = tool_input["command"] + working_dir = tool_input.get("working_directory", "workspace") + cwd = _resolve_safe(run_folder, working_dir) + if not cwd.is_dir(): + return f"[error: working directory not found: {working_dir}]" + env = { + "PATH": os.environ.get("PATH", "/usr/bin:/bin"), + "HOME": str(run_folder), + "LANG": os.environ.get("LANG", "C.UTF-8"), + "TERM": "dumb", + } + for var in ("UV_CACHE_DIR", "UV_PYTHON", "NODE_PATH", "NPM_CONFIG_CACHE", + "VIRTUAL_ENV", "PYTHONPATH"): + if (val := os.environ.get(var)): + env[var] = val + try: + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run( # nosec B603 + shlex.split(command), + shell=False, + cwd=str(cwd), + capture_output=True, + text=True, + timeout=120, + env=env, + ) + output = result.stdout + result.stderr + if len(output) > _MAX_OUTPUT_CHARS: + output = output[:_MAX_OUTPUT_CHARS] + "\n... 
(output truncated)" + return f"[exit code: {result.returncode}]\n{output}" + except subprocess.TimeoutExpired: + return "[error: command timed out after 120s]" + except OSError as e: + return f"[error: {e}]" + + else: + return f"[error: unknown tool: {name}]" + + except ValueError as e: + return f"Error: {e}" + except Exception as e: + logger.exception("Tool %r failed", name) + return f"[error: {e}]" + + +# ── Agent loops ─────────────────────────────────────────────────────────────── + +def _run_executor_loop( + client: anthropic.AnthropicBedrock, + executor_model: str, + simulator: HumanSimulator, + initial_prompt: str, + run_folder: Path, + rules_dir: Path, + usage: _UsageTracker, +) -> None: + """Run the executor agent loop, injecting simulator turns on handoff calls.""" + messages: list[dict] = [{"role": "user", "content": initial_prompt}] + + for iteration in range(_MAX_ITERATIONS): + response = client.messages.create( + model=executor_model, + max_tokens=8192, + system=EXECUTOR_SYSTEM_PROMPT, + tools=_EXECUTOR_TOOLS, + messages=messages, + ) + usage.executor.add(response.usage) + + tool_uses = [b for b in response.content if b.type == "tool_use"] + + if response.stop_reason == "end_turn" and not tool_uses: + _log(f"Executor finished after {iteration + 1} iterations") + return + + messages.append({"role": "assistant", "content": response.content}) + tool_results = [] + + for tu in tool_uses: + if tu.name == "handoff_to_simulator": + usage.handoff_count += 1 + _log(f" → simulator turn (handoff #{usage.handoff_count})") + sim_response = simulator.respond(tu.input.get("message", "")) + _log(f" ← simulator responded ({len(sim_response)} chars)") + tool_results.append({ + "type": "tool_result", + "tool_use_id": tu.id, + "content": sim_response, + }) + else: + result_text = _exec_tool(tu.name, tu.input, run_folder, rules_dir) + tool_results.append({ + "type": "tool_result", + "tool_use_id": tu.id, + "content": result_text, + }) + + messages.append({"role": "user", "content": tool_results}) + + _log(f"[WARN] Executor hit max iterations ({_MAX_ITERATIONS})") + + +# ── Adapter ─────────────────────────────────────────────────────────────────── + +class ClaudeCodeSDKAdapter(CLIAdapter): + """Adapter that drives AIDLC workflows via the Anthropic SDK with an embedded simulator. + + Uses ``anthropic.AnthropicBedrock`` to run an executor agent that can + interactively hand off to a Human Simulator agent mid-workflow, matching + the two-agent Strands Swarm in ``packages/execution``. + """ + + def __init__(self, verbose: bool = False): + self.verbose = verbose + + @property + def name(self) -> str: + return "claude-code-sdk" + + def check_prerequisites(self) -> tuple[bool, str]: + """Verify AWS credentials are resolvable via boto3.""" + try: + session = boto3.Session() + creds = session.get_credentials() + if creds is None: + return False, "No AWS credentials found. Configure via profile, env vars, or IAM role." 
+ return True, "AWS credentials available" + except Exception as e: + return False, f"AWS credential check failed: {e}" + + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the full AIDLC workflow through the Anthropic SDK with an embedded simulator.""" + ok, msg = self.check_prerequisites() + if not ok: + return AdapterResult(success=False, output_dir=config.output_dir, error=msg) + + start_time = time.monotonic() + config.output_dir.mkdir(parents=True, exist_ok=True) + workspace = config.output_dir / "workspace" + workspace.mkdir(exist_ok=True) + _log(f"Run folder: {config.output_dir}") + + import shutil + + try: + # Copy input documents into the run folder (matching execution runner layout) + shutil.copy2(config.vision_path, config.output_dir / "vision.md") + vision_content = config.vision_path.read_text(encoding="utf-8") + + tech_env_content: str | None = None + if config.tech_env_path and config.tech_env_path.is_file(): + shutil.copy2(config.tech_env_path, config.output_dir / "tech-env.md") + tech_env_content = config.tech_env_path.read_text(encoding="utf-8") + + # Also place vision.md in workspace for the executor to find + shutil.copy2(config.vision_path, workspace / "vision.md") + if tech_env_content: + shutil.copy2(config.tech_env_path, workspace / "tech-env.md") + + # rules_path is already set up by the orchestrator (output_dir/aidlc-rules); + # use it directly rather than copying again. + rules_dir = config.rules_path + + # Build initial prompt (mirrors runner.py) + initial_prompt = ( + "Begin the AIDLC workflow and execute it TO COMPLETION through ALL phases. " + "The project vision is available at vision.md in the run folder. " + ) + if tech_env_content: + initial_prompt += ( + "The technical environment document is available at tech-env.md " + "in the run folder. It defines the required languages, frameworks, " + "cloud services, security controls, testing standards, and prohibited " + "technologies. Follow it as a binding reference during all Construction stages. " + ) + initial_prompt += ( + "Start by loading the core workflow rules and the process overview, then " + "execute every stage of the Inception phase followed by every stage of the " + "Construction phase. The workspace directory is 'workspace/' (currently empty — " + "this is a greenfield project). You MUST generate all application code in " + "workspace/ before the workflow is complete. Do NOT stop after requirements — " + "continue through application design, code generation, and build-and-test." + ) + + # Retrieve the pre-built HumanSimulator injected by the orchestrator. 
+ simulator = config.simulator + if simulator is None: + raise RuntimeError( + "claude-code-sdk adapter requires a HumanSimulator — " + "ensure --simulator-model is set or models.simulator.model_id is in config.yaml" + ) + + # Resolve executor model and region + executor_model = config.model or "global.anthropic.claude-opus-4-6-v1" + aws_region = getattr(config, "aws_region", None) or os.environ.get("AWS_DEFAULT_REGION", "us-east-1") + + # Build Bedrock client for executor loop only + session_kwargs: dict = {} + if config.aws_profile: + session_kwargs["profile_name"] = config.aws_profile + boto_session = boto3.Session(**session_kwargs) + frozen = boto_session.get_credentials().get_frozen_credentials() + client = anthropic.AnthropicBedrock( + aws_access_key=frozen.access_key, + aws_secret_key=frozen.secret_key, + aws_session_token=frozen.token, + aws_region=aws_region, + ) + + _log(f"Executor model: {executor_model}") + _log(f"Simulator model: {simulator._model}") + + # Run the executor↔simulator loop + usage = _UsageTracker() + _run_executor_loop( + client=client, + executor_model=executor_model, + simulator=simulator, + initial_prompt=initial_prompt, + run_folder=config.output_dir, + rules_dir=rules_dir, + usage=usage, + ) + + elapsed_seconds = time.monotonic() - start_time + usage_extra = usage.to_dict() + usage_extra["duration_ms"] = int(elapsed_seconds * 1000) + usage_extra["model"] = executor_model + + _log( + f"Completed in {elapsed_seconds:.0f}s — " + f"{usage_extra['total_tokens']:,} total tokens, " + f"{usage_extra['handoffs']} handoffs" + ) + + # Move aidlc-docs from workspace/ up to run_folder/ if the executor placed them there + src_docs = workspace / "aidlc-docs" + dst_docs = config.output_dir / "aidlc-docs" + if src_docs.is_dir() and not dst_docs.exists(): + shutil.move(str(src_docs), str(dst_docs)) + + # Write run metadata + normalize_output( + source_dir=workspace, + output_dir=config.output_dir, + adapter_name=self.name, + model_hint=executor_model, + elapsed_seconds=elapsed_seconds, + token_usage=usage_extra, + ) + + # Stage 2: post-run tests — same logic as the Strands runner + _log("Running post-run test evaluation...") + sandbox_enabled = _get_container_cli() is not None + runner_cfg = RunnerConfig() + runner_cfg.execution = ExecutionConfig( + post_run_tests=True, + post_run_timeout=300, + sandbox=SandboxConfig(enabled=sandbox_enabled), + ) + test_results_path = run_post_evaluation(config.output_dir, runner_cfg) + if test_results_path: + _log(f"Test results: {test_results_path}") + else: + _log("No testable project detected — post-run tests skipped.") + + has_docs = dst_docs.is_dir() and any(dst_docs.iterdir()) + return AdapterResult( + success=has_docs, + output_dir=config.output_dir, + aidlc_docs_dir=dst_docs if has_docs else None, + workspace_dir=workspace, + elapsed_seconds=elapsed_seconds, + extra=usage_extra, + error=None if has_docs else "No aidlc-docs produced", + ) + + except Exception as exc: + elapsed_seconds = time.monotonic() - start_time + logger.exception("claude-code-sdk adapter run failed") + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace, + error=f"claude-code-sdk adapter error: {exc}", + elapsed_seconds=elapsed_seconds, + ) + + +def _setup_rules(rules_dir: Path, rules_path: Path) -> None: + """Copy or link the AIDLC rules into the run folder.""" + import shutil + rules_dir.mkdir(parents=True, exist_ok=True) + if rules_path.is_dir(): + for rule_file in sorted(rules_path.rglob("*.md")): + rel = 
rule_file.relative_to(rules_path) + dst = rules_dir / rel + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(rule_file, dst) + _log(f"Copied AIDLC rules ({sum(1 for _ in rules_dir.rglob('*.md'))} files)") + else: + shutil.copy2(rules_path, rules_dir / rules_path.name) + _log(f"Copied AIDLC rules file: {rules_path.name}") diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py index ae0fc23e..e5861b81 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py @@ -20,6 +20,19 @@ from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter from cli_harness.normalizer import normalize_output from cli_harness.prompt_template import render_prompt +from cli_harness.simulator import HumanSimulator + +import sys as _sys +_EXEC_SRC = Path(__file__).resolve().parents[6] / "execution" / "src" +if str(_EXEC_SRC) not in _sys.path: + _sys.path.insert(0, str(_EXEC_SRC)) +from aidlc_runner.post_run import run_post_evaluation # noqa: E402 +from aidlc_runner.config import ExecutionConfig, SandboxConfig, RunnerConfig # noqa: E402 + +_SHARED_SRC = Path(__file__).resolve().parents[6] / "shared" / "src" +if str(_SHARED_SRC) not in _sys.path: + _sys.path.insert(0, str(_SHARED_SRC)) +from shared.sandbox import _get_container_cli # noqa: E402 logger = logging.getLogger(__name__) @@ -110,95 +123,188 @@ def run(self, config: AdapterConfig) -> AdapterResult: ) _log(f"Injected AIDLC rules ({len(rules_content)} chars)") - # Build the prompt - prompt = config.prompt_template or render_prompt() + # Build executor prompt — instructs kiro to pause at review gates + # so the human simulator can respond rather than self-approving. + prompt = config.prompt_template or render_prompt( + openapi_content=config.openapi_content, + with_simulator=True, + ) - # Base command flags - base_flags = [ - "--no-interactive", - "--trust-all-tools", - ] + # Retrieve the pre-built HumanSimulator injected by the orchestrator. + # All document context (vision, tech_env, openapi) is already embedded. + simulator = config.simulator + if simulator is None: + raise RuntimeError( + "kiro-cli adapter requires a HumanSimulator — " + "ensure --simulator-model is set or models.simulator.model_id is in config.yaml" + ) + _log(f"Simulator model: {simulator._model}") + + # Per-stage gate approach using kiro's --no-interactive + --resume. + # + # Each stage produces a sentinel file. We run kiro to that sentinel, + # have the simulator review the output, then resume with feedback. + # Stages map to the AIDLC workflow as tracked in aidlc-state.md. + # + # Gate schedule (sentinel → simulator focus): + # 1. requirements.md → answer verification questions, approve requirements + # 2. execution-plan.md → approve workflow plan and application design + # 3. code-gen-plan → approve code generation plan before code is written + # 4. build-and-test-summary → review final output + base_flags = ["--no-interactive", "--trust-all-tools"] if config.model: base_flags += ["--model", config.model] - # Run kiro-cli in a loop to handle AIDLC review gates. - # The workflow pauses at gates (e.g. "Approve & Continue"). - # With --no-interactive, kiro-cli exits at each gate. - # We resume the session with an approval message each time. 
log_path = config.output_dir / "kiro-session.log" _log(f"Session log: {log_path}") - turn = 0 - max_turns = 20 # safety limit + gate_count = 0 total_rc = 0 - with open(log_path, "w", encoding="utf-8") as log_file: - while turn < max_turns: - turn += 1 - - if turn == 1: - cmd = [_KIRO_CLI, "chat"] + base_flags + [prompt] - _log(f"Turn {turn}: initial prompt ({len(prompt)} chars)") - else: - approval = "Approve & Continue. Proceed to the next phase." - cmd = [_KIRO_CLI, "chat"] + base_flags + ["--resume", approval] - _log(f"Turn {turn}: resuming with approval") - - log_file.write(f"\n{'='*60}\n") - log_file.write(f"TURN {turn}\n") - log_file.write(f"{'='*60}\n") - log_file.flush() - - # nosec B603 - Executing user's Kiro CLI with validated configuration - # nosemgrep: dangerous-subprocess-use-audit - process = subprocess.Popen( - cmd, - cwd=str(workspace), - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - ) - - for line in process.stdout: - log_file.write(_strip_ansi(line)) - log_file.flush() + def _run_kiro_stage(stage_prompt: str, stage_name: str, is_first: bool) -> tuple[str, int]: + """Run one kiro stage segment and return (output, exit_code).""" + cmd = [_KIRO_CLI, "chat"] + base_flags + if is_first: + cmd.append(stage_prompt) + else: + cmd += ["--resume", stage_prompt] + + _log(f"{stage_name}: launching kiro ({len(stage_prompt)} chars)") + + # nosemgrep: dangerous-subprocess-use-audit + proc = subprocess.Popen( # nosec B603 + cmd, + cwd=str(workspace), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + + output_lines: list[str] = [] + last_printed = [""] + line_buf = [""] + + _SKIP = ("⠀","⠋","⠙","⠹","⠸","⠼","⠴","⠦","⠧","⠇","⠏", + "⣴","⣿","⠿","╭","╰","│","▸ Credits:","Credits:", + "Model:","Plan:","All tools are","Agents can", + "Learn more","https://","Did you know","Jump into", + "Use /","38;","5;","[0m","[1m") + + def _print_line(line: str) -> None: + s = line.strip() + if not s or len(s) < 8: + return + if any(s.startswith(p) for p in _SKIP): + return + if s == last_printed[0]: + return + last_printed[0] = s + print(f" [kiro] {s}", file=sys.stderr, flush=True) + + with open(log_path, "a", encoding="utf-8") as lf: + lf.write(f"\n{'='*60}\n{stage_name.upper()}\n{'='*60}\n") + lf.flush() + while True: + chunk = proc.stdout.read(4096) + if not chunk: + break + text = chunk.decode("utf-8", errors="replace") + clean = _strip_ansi(text) + lf.write(clean) + lf.flush() + output_lines.append(clean) + for ch in clean: + if ch == "\n": + _print_line(line_buf[0]) + line_buf[0] = "" + else: + line_buf[0] += ch if self.verbose: - sys.stderr.write(line) + sys.stderr.write(text) sys.stderr.flush() - remaining = config.timeout_seconds - (time.monotonic() - start_time) - if remaining <= 0: - process.kill() - _log(f"Timeout reached at turn {turn}") - break - process.wait(timeout=max(remaining, 10)) - total_rc = process.returncode - - _log(f"Turn {turn} exited with code {process.returncode}") - - # Check if aidlc-docs looks complete (has construction phase files) - aidlc_docs_dir = workspace / "aidlc-docs" - if aidlc_docs_dir.is_dir(): - has_construction = any( - (aidlc_docs_dir / "construction").rglob("*.md") - ) if (aidlc_docs_dir / "construction").is_dir() else False - file_count = sum(1 for _ in aidlc_docs_dir.rglob("*") if _.is_file()) - _log(f" aidlc-docs: {file_count} files, construction={'yes' if has_construction else 'no'}") - - if has_construction: - _log("Construction phase detected — workflow complete") - break - else: - _log(" aidlc-docs/ not yet 
created") + proc.wait() + return "".join(output_lines), proc.returncode + + def _sim_review(sentinel_glob: str, focus: str) -> str: + """Run simulator review after a stage completes.""" + nonlocal gate_count + gate_count += 1 + _log(f"Gate #{gate_count}: simulator reviewing ({focus})...") + response = simulator.respond( + f"The AIDLC executor has just completed: {focus}.\n\n" + f"Please read the relevant files in aidlc-docs/ ({sentinel_glob}) " + f"and any supporting documents. " + f"Answer any open questions, approve or request changes, " + f"and give clear direction for the next stage. Be concise." + ) + _log(f"Gate #{gate_count}: simulator responded ({len(response)} chars)") + return response + + # ── Stage 1: Requirements Analysis ─────────────────────────────── + _log("Stage 1: Requirements Analysis...") + _, rc = _run_kiro_stage( + prompt + ( + "\n\nIMPORTANT: Execute ONLY these stages in order: " + "Workspace Detection, Requirements Analysis. " + "Stop after writing aidlc-docs/inception/requirements/requirements.md " + "and aidlc-docs/inception/requirements/requirement-verification-questions.md. " + "Do NOT proceed further. End your response when these files are written." + ), + "stage-1-requirements", + is_first=True, + ) + feedback = _sim_review( + "inception/requirements/*.md", + "Requirements Analysis — requirements.md and requirement-verification-questions.md", + ) - elapsed = time.monotonic() - start_time - if elapsed >= config.timeout_seconds: - _log("Timeout reached") - break + # ── Stage 2: Workflow Planning + Application Design ─────────────── + _log("Stage 2: Workflow Planning + Application Design...") + _, rc = _run_kiro_stage( + f"Human reviewer feedback on requirements:\n\n{feedback}\n\n" + "Now execute: Workflow Planning, then Application Design. " + "Stop after writing aidlc-docs/inception/plans/execution-plan.md " + "and all application-design artifacts (components.md, component-methods.md, " + "component-dependency.md, services.md). " + "Do NOT proceed to Construction.", + "stage-2-design", + is_first=False, + ) + feedback = _sim_review( + "inception/plans/*.md, inception/application-design/*.md", + "Workflow Planning and Application Design", + ) + + # ── Stage 3: Code Generation Plan ──────────────────────────────── + _log("Stage 3: Code Generation Plan...") + _, rc = _run_kiro_stage( + f"Human reviewer feedback on design:\n\n{feedback}\n\n" + "Now execute the Code Generation PLAN only — write the detailed code generation plan " + "in aidlc-docs/construction/plans/ with exact file paths and implementation steps. " + "Do NOT write any application code yet. Stop after the plan document is complete.", + "stage-3-codegen-plan", + is_first=False, + ) + feedback = _sim_review( + "construction/plans/*.md", + "Code Generation Plan", + ) + + # ── Stage 4: Code Generation + Build and Test ───────────────────── + _log("Stage 4: Code Generation + Build and Test...") + _, rc = _run_kiro_stage( + f"Human reviewer has approved the code generation plan:\n\n{feedback}\n\n" + "Now execute: generate ALL application code per the plan, then run Build and Test. " + "Install dependencies, run tests, fix failures, and write the build-and-test-summary.md. 
" + "Complete the full Construction phase.", + "stage-4-construction", + is_first=False, + ) + total_rc = rc + _log(f"Stage 4 complete (exit {rc})") elapsed_seconds = time.monotonic() - start_time - _log(f"Completed {turn} turn(s) in {elapsed_seconds:.0f}s") + _log(f"Completed in {elapsed_seconds:.0f}s ({gate_count} simulator gate(s))") # List workspace contents for debugging _log("Workspace contents:") @@ -223,11 +329,26 @@ def run(self, config: AdapterConfig) -> AdapterResult: adapter_name=self.name, elapsed_seconds=elapsed_seconds, token_usage={ - "num_turns": turn, + "num_turns": gate_count, "model": config.model or "", }, ) + # Stage 2: post-run tests — same logic as the Strands runner + _log("Running post-run test evaluation...") + sandbox_enabled = _get_container_cli() is not None + runner_cfg = RunnerConfig() + runner_cfg.execution = ExecutionConfig( + post_run_tests=True, + post_run_timeout=300, + sandbox=SandboxConfig(enabled=sandbox_enabled), + ) + test_results_path = run_post_evaluation(config.output_dir, runner_cfg) + if test_results_path: + _log(f"Test results: {test_results_path}") + else: + _log("No testable project detected — post-run tests skipped.") + has_docs = dst_docs.is_dir() and any(dst_docs.iterdir()) if total_rc == 0 and has_docs: diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py index a837a4f8..61c9573f 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py @@ -11,6 +11,7 @@ from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter from cli_harness.normalizer import normalize_output, _count_workspace_files, _count_doc_files +from cli_harness.simulator import HumanSimulator REPO_ROOT = Path(__file__).resolve().parents[4] # packages/cli-harness/src/cli_harness -> repo root @@ -114,6 +115,7 @@ def run_cli_evaluation( profile: str | None = None, region: str | None = None, scorer_model: str = "us.anthropic.claude-sonnet-4-5-20250929-v1:0", + simulator_model: str | None = None, report_format: str = "both", prompt_template: str | None = None, model: str | None = None, @@ -145,7 +147,32 @@ def run_cli_evaluation( print(f"[OK] {adapter.name} prerequisites met: {msg}") - # 2. Run the adapter + # 2. Build the shared HumanSimulator and run the adapter. + # The simulator is constructed once here with the full document context + # (vision, tech_env, openapi) and injected into AdapterConfig so every + # adapter uses the same instance — no per-adapter construction needed. 
+ openapi_content: str | None = None + if openapi_path and openapi_path.is_file(): + openapi_content = openapi_path.read_text(encoding="utf-8") + + simulator: HumanSimulator | None = None + if simulator_model: + vision_content = vision_path.read_text(encoding="utf-8") + tech_env_content = ( + tech_env_path.read_text(encoding="utf-8") + if tech_env_path and tech_env_path.is_file() + else None + ) + simulator = HumanSimulator.from_adapter_config( + run_folder=output_dir, + vision_content=vision_content, + tech_env_content=tech_env_content, + openapi_content=openapi_content, + aws_profile=profile, + aws_region=region or "us-east-1", + model=simulator_model, + ) + config = AdapterConfig( vision_path=vision_path, tech_env_path=tech_env_path, @@ -153,7 +180,11 @@ def run_cli_evaluation( output_dir=output_dir, prompt_template=prompt_template, model=model, + simulator_model=simulator_model, aws_profile=profile, + aws_region=region, + openapi_content=openapi_content, + simulator=simulator, timeout_seconds=timeout_seconds, ) @@ -194,7 +225,7 @@ def run_cli_evaluation( # 5. Run evaluation pipeline (stages 2-6) eval_cmd = [ - sys.executable, str(REPO_ROOT / "run_evaluation.py"), + sys.executable, str(REPO_ROOT / "scripts" / "run_evaluation.py"), "--evaluate-only", str(aidlc_docs), "--golden", str(golden_docs), "--results", str(output_dir / "qualitative-comparison.yaml"), diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py index 3ef18884..cb5a03e0 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py @@ -98,26 +98,59 @@ ## Important rules -- Since you are running autonomously without a human reviewer, self-approve all stages \ -and continue immediately to the next one. Do NOT pause or wait for approval. -- Read the relevant rule file BEFORE starting each stage. +{approval_rule}- Read the relevant rule file BEFORE starting each stage. - Read common rules as needed (e.g. `aidlc-rules/common/content-validation.md` before \ writing files, `aidlc-rules/common/question-format-guide.md` before creating questions). - For CONDITIONAL stages, evaluate based on project scope and skip with justification if \ not needed, but always continue to the next stage. - When generating code, write COMPLETE, WORKING files — not stubs or placeholders. - Generate complete, working code with full test coverage. -""" - - -def render_prompt(vision_path: str = "vision.md", tech_env_path: str = "tech-env.md") -> str: - r"""Render the AIDLC prompt with customized file paths. - - Only replaces backtick-delimited references (``\`vision.md\```) so that - prose mentions like "alongside vision.md" are left intact. +{openapi_section}""" + +_SELF_APPROVE_RULE = ( + "- Since you are running autonomously without a human reviewer, self-approve all stages " + "and continue immediately to the next one. Do NOT pause or wait for approval.\n" +) + +_SIMULATOR_HANDOFF_RULE = ( + "- At each stage gate (questions, approval requests, document reviews, code reviews), " + "PAUSE and end your turn. A human reviewer will read your output and respond. 
" + "Resume work only after receiving their response — do not self-approve.\n" +) + + +def render_prompt( + vision_path: str = "vision.md", + tech_env_path: str = "tech-env.md", + openapi_content: str | None = None, + with_simulator: bool = False, +) -> str: + r"""Render the AIDLC executor prompt. + + Args: + vision_path: Path to vision doc (replaces backtick references only). + tech_env_path: Path to tech-env doc. + openapi_content: Full OpenAPI spec text — injected as a binding contract section. + with_simulator: When True, instructs the executor to pause at review gates + for a human reviewer instead of self-approving. Use for kiro-cli when + a HumanSimulator will be providing responses between turns. """ + openapi_section = ( + "\n## The API contract (OpenAPI specification)\n\n" + "The following is the OpenAPI specification that defines the exact API contract " + "this project must implement. Ensure all generated endpoints, request/response " + "schemas, status codes, and error shapes match this specification exactly.\n\n" + "---\n" + f"{openapi_content}\n" + "---\n" + ) if openapi_content else "" + + approval_rule = _SIMULATOR_HANDOFF_RULE if with_simulator else _SELF_APPROVE_RULE + return ( EXECUTOR_SYSTEM_PROMPT .replace("`vision.md`", f"`{vision_path}`") .replace("`tech-env.md`", f"`{tech_env_path}`") + .replace("{openapi_section}", openapi_section) + .replace("{approval_rule}", approval_rule) ) diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py index 025d6374..146f0f9a 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py @@ -5,13 +5,43 @@ from cli_harness.adapter import CLIAdapter -# Lazy imports to avoid pulling in adapter-specific deps at import time +# Built-in adapters — always available _ADAPTER_MAP: dict[str, str] = { "kiro-cli": "cli_harness.adapters.kiro_cli.KiroCLIAdapter", "claude-code": "cli_harness.adapters.claude_code.ClaudeCodeAdapter", + "claude-code-sdk": "cli_harness.adapters.claude_code_sdk.ClaudeCodeSDKAdapter", } +def register_adapter(name: str, fqn: str) -> None: + """Register an adapter by name and fully-qualified class path. + + Allows external code (config loaders, plugins) to add adapters without + modifying framework code. Built-in adapters can be overridden by name. + + Args: + name: Adapter name as used on the CLI (e.g. 'my-tool'). + fqn: Fully-qualified class path (e.g. 'mypackage.adapters.MyAdapter'). + """ + _ADAPTER_MAP[name.lower().strip()] = fqn + + +def load_adapters_from_config(cfg_data: dict) -> None: + """Register adapters declared under ``cli.adapters`` in a config dict. + + Config shape:: + + cli: + adapters: + my-tool: "mypackage.adapters.MyToolAdapter" + + Each entry calls :func:`register_adapter` so the adapter is available + for the current process without any framework code changes. 
+ """ + for adapter_name, fqn in cfg_data.get("cli", {}).get("adapters", {}).items(): + register_adapter(adapter_name, fqn) + + def list_adapters() -> list[str]: """Return sorted list of registered adapter names.""" return sorted(_ADAPTER_MAP.keys()) diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/simulator.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/simulator.py new file mode 100644 index 00000000..a820c37a --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/simulator.py @@ -0,0 +1,229 @@ +"""Shared Human Simulator — Anthropic SDK-based reviewer for CLI adapter workflows. + +Used by both the kiro-cli adapter (after each kiro turn) and the claude-code-sdk +adapter (on each handoff_to_simulator tool call). Backed by the same system prompt +as the Strands two-agent swarm via build_simulator_system_prompt(). +""" + +from __future__ import annotations + +import logging +import sys +from pathlib import Path + +import anthropic +import boto3 + +# Import shared prompt builder from execution package +_EXEC_SRC = Path(__file__).resolve().parents[5] / "execution" / "src" +if str(_EXEC_SRC) not in sys.path: + sys.path.insert(0, str(_EXEC_SRC)) +from aidlc_runner.agents.simulator import build_simulator_system_prompt # noqa: E402 + +logger = logging.getLogger(__name__) + +_SIMULATOR_TOOLS = [ + { + "name": "read_file", + "description": "Read the contents of a file in the run folder.", + "input_schema": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "File path relative to the run folder.", + } + }, + "required": ["path"], + }, + }, + { + "name": "write_file", + "description": "Write content to a file in the run folder.", + "input_schema": { + "type": "object", + "properties": { + "path": {"type": "string"}, + "content": {"type": "string"}, + }, + "required": ["path", "content"], + }, + }, + { + "name": "list_files", + "description": "List files and directories within a path in the run folder.", + "input_schema": { + "type": "object", + "properties": { + "directory": {"type": "string", "default": "."}, + }, + "required": [], + }, + }, +] + + +def _resolve_safe(base: Path, relative: str) -> Path: + resolved = (base / relative).resolve() + if not str(resolved).startswith(str(base.resolve())): + raise ValueError(f"Path traversal denied: {relative}") + return resolved + + +def _exec_file_tool(name: str, tool_input: dict, run_folder: Path) -> str: + try: + if name == "read_file": + path = tool_input["path"] + target = _resolve_safe(run_folder, path) + if not target.exists(): + return f"Error: File not found: {path}" + return target.read_text(encoding="utf-8") + + elif name == "write_file": + path = tool_input["path"] + content = tool_input.get("content", "") + target = _resolve_safe(run_folder, path) + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + return f"Written: {path} ({len(content)} chars)" + + elif name == "list_files": + directory = tool_input.get("directory", ".") + target = _resolve_safe(run_folder, directory) + if not target.is_dir(): + return f"Error: Not a directory: {directory}" + entries = sorted(target.iterdir()) + lines = [ + f" {e.relative_to(run_folder)}{'/' if e.is_dir() else ''}" + for e in entries + ] + return "\n".join(lines) if lines else f"(empty: {directory})" + + return f"[error: unknown tool: {name}]" + except ValueError as e: + return f"Error: {e}" + except Exception as e: + logger.exception("File tool %r failed", 
name) + return f"[error: {e}]" + + +class HumanSimulator: + """Anthropic SDK-based human simulator for CLI adapter review gates. + + Wraps a single stateless call: given a message from the executor (e.g. + the output of a kiro turn, or a handoff_to_simulator tool call), runs + a short simulator conversation and returns the simulator's text response. + + Token usage is accumulated separately per respond() call so callers can + report simulator and executor tokens independently. + """ + + def __init__( + self, + client: anthropic.AnthropicBedrock, + model: str, + system_prompt: str, + run_folder: Path, + ): + self._client = client + self._model = model + self._system_prompt = system_prompt + self._run_folder = run_folder + # Accumulated token counts across all respond() calls + self._input_tokens: int = 0 + self._output_tokens: int = 0 + self._cache_read_tokens: int = 0 + self._cache_write_tokens: int = 0 + + @property + def accumulated_usage(self) -> dict[str, int]: + """Token totals across all respond() calls, keyed by snake_case names + matching MetricsCollector's expected format.""" + total = self._input_tokens + self._output_tokens + return { + "inputTokens": self._input_tokens, + "outputTokens": self._output_tokens, + "totalTokens": total, + "cacheReadInputTokens": self._cache_read_tokens, + "cacheWriteInputTokens": self._cache_write_tokens, + } + + @classmethod + def from_adapter_config( + cls, + run_folder: Path, + vision_content: str, + tech_env_content: str | None, + openapi_content: str | None, + aws_profile: str | None, + aws_region: str | None, + model: str, + ) -> "HumanSimulator": + """Construct a HumanSimulator from the pieces available in an AdapterConfig.""" + system_prompt = build_simulator_system_prompt( + vision_content=vision_content, + tech_env_content=tech_env_content, + openapi_content=openapi_content, + ) + + session_kwargs: dict = {} + if aws_profile: + session_kwargs["profile_name"] = aws_profile + boto_session = boto3.Session(**session_kwargs) + frozen = boto_session.get_credentials().get_frozen_credentials() + client = anthropic.AnthropicBedrock( + aws_access_key=frozen.access_key, + aws_secret_key=frozen.secret_key, + aws_session_token=frozen.token, + aws_region=aws_region or "us-east-1", + ) + + return cls( + client=client, + model=model, + system_prompt=system_prompt, + run_folder=run_folder, + ) + + def respond(self, message: str, max_iterations: int = 50) -> str: + """Run one simulator turn and return the final text response. + + The simulator may make file tool calls before responding — this loop + handles those transparently. 
+ """ + messages: list[dict] = [{"role": "user", "content": message}] + + for _ in range(max_iterations): + response = self._client.messages.create( + model=self._model, + max_tokens=8192, + system=self._system_prompt, + tools=_SIMULATOR_TOOLS, + messages=messages, + ) + + # Accumulate token usage from this API call + u = response.usage + self._input_tokens += getattr(u, "input_tokens", 0) + self._output_tokens += getattr(u, "output_tokens", 0) + self._cache_read_tokens += getattr(u, "cache_read_input_tokens", 0) + self._cache_write_tokens += getattr(u, "cache_creation_input_tokens", 0) + + tool_uses = [b for b in response.content if b.type == "tool_use"] + text_blocks = [b for b in response.content if b.type == "text"] + + if not tool_uses: + return "\n".join(b.text for b in text_blocks).strip() or "(no response)" + + messages.append({"role": "assistant", "content": response.content}) + tool_results = [] + for tu in tool_uses: + result_text = _exec_file_tool(tu.name, tu.input, self._run_folder) + tool_results.append({ + "type": "tool_result", + "tool_use_id": tu.id, + "content": result_text, + }) + messages.append({"role": "user", "content": tool_results}) + + return "[error: simulator exceeded max iterations]" diff --git a/scripts/aidlc-evaluator/packages/execution/pyproject.toml b/scripts/aidlc-evaluator/packages/execution/pyproject.toml index b383dedb..3586fdf0 100644 --- a/scripts/aidlc-evaluator/packages/execution/pyproject.toml +++ b/scripts/aidlc-evaluator/packages/execution/pyproject.toml @@ -7,6 +7,8 @@ dependencies = [ "strands-agents>=0.1.0", "strands-agents-tools>=0.1.0", "pyyaml>=6.0", + "anthropic[bedrock]>=0.40", + "boto3>=1.42.47", ] [project.scripts] diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py index 32c29f05..1d4c5a0d 100644 --- a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py @@ -177,6 +177,7 @@ def create_executor( aws_region: str | None = None, callback_handler: Callable[..., Any] | None = None, execution_config: ExecutionConfig | None = None, + simulator_tool: Any | None = None, ) -> Agent: """Create the AIDLC Executor agent. @@ -188,6 +189,9 @@ def create_executor( aws_region: AWS region for Bedrock. callback_handler: Optional callback handler for progress reporting. execution_config: Optional execution config controlling run_command availability. + simulator_tool: Optional Strands @tool wrapping a HumanSimulator. When + provided it is added to the executor's tool list so handoff_to_simulator + calls are handled inline rather than via a separate Swarm agent. Returns: Configured Strands Agent instance. 
@@ -206,6 +210,9 @@ def create_executor( else: system_prompt = _EXECUTOR_PROMPT_NO_EXEC + if simulator_tool is not None: + tools.append(simulator_tool) + session_kwargs: dict = {} if aws_profile: session_kwargs["profile_name"] = aws_profile diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py index 8355400c..b1e63963 100644 --- a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py @@ -40,8 +40,7 @@ --- {vision_content} --- -{tech_env_section} -## How you work +{tech_env_section}{openapi_section}## How you work 1. When you receive a handoff from the "executor" agent, read the file path mentioned \ in the handoff message. @@ -57,8 +56,10 @@ describe what needs to change. - **Review requests**: Read the document, provide brief feedback, and approve. Only \ request revisions for significant issues that contradict the vision. - - **Code review**: Review generated code for correctness against the vision spec. \ -Approve if it implements the required functionality. Do not block on style issues. + - **Code review**: Review generated code for correctness against the vision spec \ +and the API contract above. Verify that all required endpoints, request/response shapes, \ +and error codes match the specification. Reject if critical endpoints are missing or \ +the contract is violated; approve otherwise. 3. Write your response to the same file (appending) or to a response file as directed \ by the question format. @@ -79,6 +80,52 @@ """ +def build_simulator_system_prompt( + vision_content: str, + tech_env_content: str | None = None, + openapi_content: str | None = None, +) -> str: + """Build the simulator system prompt string from project inputs. + + Extracted so other adapters (SDK, kiro) can construct the same prompt + without calling the full Strands-specific create_simulator(). + """ + if tech_env_content: + tech_env_section = ( + "\n## The technical environment\n\n" + "The following is the technical environment document that defines HOW the project " + "must be built — languages, frameworks, cloud services, security controls, testing " + "standards, and prohibited technologies. Use this as a binding reference when " + "answering technical questions and reviewing designs and code:\n\n" + "---\n" + f"{tech_env_content}\n" + "---\n" + ) + else: + tech_env_section = "" + + if openapi_content: + openapi_section = ( + "\n## The API contract (OpenAPI specification)\n\n" + "The following is the OpenAPI specification that defines the exact API contract " + "this project must implement — all required endpoints, request/response schemas, " + "status codes, and error shapes. Use this as a binding reference when reviewing " + "API design documents and generated code. Reject any design or code that violates " + "this contract.\n\n" + "---\n" + f"{openapi_content}\n" + "---\n\n" + ) + else: + openapi_section = "" + + return SIMULATOR_SYSTEM_PROMPT_TEMPLATE.format( + vision_content=vision_content, + tech_env_section=tech_env_section, + openapi_section=openapi_section, + ) + + def create_simulator( run_folder: Path, vision_content: str, @@ -87,6 +134,7 @@ def create_simulator( aws_region: str | None = None, callback_handler: Callable[..., Any] | None = None, tech_env_content: str | None = None, + openapi_content: str | None = None, ) -> Agent: """Create the Human Simulator agent. 
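For adapters that call Bedrock directly rather than through Strands, the extracted prompt builder can be used on its own. A minimal sketch, assuming the scenario documents have already been read into strings (the file paths shown are illustrative, not fixed framework paths):

```python
from pathlib import Path

from aidlc_runner.agents.simulator import build_simulator_system_prompt

vision_text = Path("test_cases/sci-calc/vision.md").read_text(encoding="utf-8")
openapi_text = Path("test_cases/sci-calc/openapi.yaml").read_text(encoding="utf-8")

# Same prompt text create_simulator() uses, but reusable by the Anthropic
# SDK-backed HumanSimulator and the kiro-cli adapter.
system_prompt = build_simulator_system_prompt(
    vision_content=vision_text,
    tech_env_content=None,          # optional; omitted in this sketch
    openapi_content=openapi_text,   # enables API-contract checks in reviews
)
```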
@@ -98,29 +146,17 @@ def create_simulator( aws_region: AWS region for Bedrock. callback_handler: Optional callback handler for progress reporting. tech_env_content: Optional full text of the technical environment file. + openapi_content: Optional full text of the OpenAPI spec (test contract). Returns: Configured Strands Agent instance. """ file_tools = make_file_tools(run_folder) - if tech_env_content: - tech_env_section = ( - "\n## The technical environment\n\n" - "The following is the technical environment document that defines HOW the project " - "must be built — languages, frameworks, cloud services, security controls, testing " - "standards, and prohibited technologies. Use this as a binding reference when " - "answering technical questions and reviewing designs and code:\n\n" - "---\n" - f"{tech_env_content}\n" - "---\n" - ) - else: - tech_env_section = "" - - system_prompt = SIMULATOR_SYSTEM_PROMPT_TEMPLATE.format( + system_prompt = build_simulator_system_prompt( vision_content=vision_content, - tech_env_section=tech_env_section, + tech_env_content=tech_env_content, + openapi_content=openapi_content, ) session_kwargs: dict = {} diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py index de49dc75..918b17c0 100644 --- a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py @@ -27,6 +27,12 @@ def build_parser() -> argparse.ArgumentParser: default=None, help="Path to the technical environment markdown file (optional).", ) + parser.add_argument( + "--openapi", + type=Path, + default=None, + help="Path to OpenAPI spec — injected into the simulator's system prompt for contract validation (optional).", + ) parser.add_argument( "--config", type=Path, @@ -143,4 +149,4 @@ def main(argv: list[str] | None = None) -> None: config = load_config(config_path=config_path, cli_overrides=cli_overrides) # Run the workflow - run(config=config, vision_path=args.vision, tech_env_path=args.tech_env) + run(config=config, vision_path=args.vision, tech_env_path=args.tech_env, openapi_path=args.openapi) diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py index aefd21be..836cb0ac 100644 --- a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py @@ -168,6 +168,7 @@ def __init__(self, config: RunnerConfig) -> None: self._handoffs: list[dict[str, Any]] = [] self._errors: list[dict[str, str]] = [] self._context_samples: list[dict[str, Any]] = [] + self._simulator_usage: dict[str, int] | None = None # -- Live recording (called during execution) -- @@ -187,6 +188,15 @@ def record_error(self, error_type: str, message: str) -> None: "message": message, }) + def record_simulator_usage(self, usage: dict[str, int]) -> None: + """Record accumulated token usage from the HumanSimulator (Anthropic SDK). + + Called after the swarm completes. The usage dict must use the same + camelCase key format as Strands accumulated_usage so _usage_to_dict + can normalise it uniformly. + """ + self._simulator_usage = usage + def record_context_sample(self, agent_name: str, input_tokens: int) -> None: """Record the input token count from a single model invocation. 
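The camelCase key requirement for `record_simulator_usage()` mirrors exactly what `HumanSimulator.accumulated_usage` returns. A short sketch of the expected shape (the token counts are made up for illustration):

```python
from aidlc_runner.metrics import MetricsCollector


def report_simulator_tokens(collector: MetricsCollector) -> None:
    # In the real flow this dict comes straight from
    # simulator_instance.accumulated_usage after the swarm finishes.
    collector.record_simulator_usage({
        "inputTokens": 1200,
        "outputTokens": 340,
        "totalTokens": 1540,
        "cacheReadInputTokens": 0,
        "cacheWriteInputTokens": 0,
    })
```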
@@ -227,11 +237,16 @@ def build_metrics(self, result: MultiAgentResult, run_folder: Path) -> dict[str, metrics: dict[str, Any] = {} # --- Tokens --- - # Extract per-agent token counts (unique tokens per agent) + # Extract per-agent token counts (unique tokens per agent). + # Strands nodes cover the executor. The simulator is tracked separately + # via record_simulator_usage() since it runs outside the Strands swarm. per_agent: dict[str, dict[str, int]] = {} for node_id, node_result in result.results.items(): per_agent[node_id] = _usage_to_dict(node_result.accumulated_usage) + if self._simulator_usage is not None: + per_agent["simulator"] = _usage_to_dict(self._simulator_usage) + # Calculate sum of per-agent tokens (unique tokens across all agents) unique_total = { "input_tokens": sum(agent["input_tokens"] for agent in per_agent.values()), diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py index 6b5f8829..a3d507f4 100644 --- a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py @@ -16,13 +16,58 @@ from shared.io import atomic_yaml_dump from strands.multiagent import Swarm +from strands import tool as strands_tool + from aidlc_runner.agents.executor import create_executor -from aidlc_runner.agents.simulator import create_simulator +from aidlc_runner.agents.simulator import build_simulator_system_prompt from aidlc_runner.config import AidlcConfig, RunnerConfig from aidlc_runner.metrics import MetricsCollector from aidlc_runner.post_run import run_post_evaluation from aidlc_runner.progress import AgentProgressHandler, SwarmProgressHook + +def _make_simulator_tool( + run_folder: Path, + vision_content: str, + model_id: str, + aws_profile: str | None, + aws_region: str | None, + tech_env_content: str | None = None, + openapi_content: str | None = None, +): + """Create a Strands @tool that delegates to HumanSimulator. + + Returns (tool, simulator) so the caller can harvest accumulated_usage + after the swarm completes and record it separately in MetricsCollector, + keeping executor and simulator token counts in distinct buckets. + """ + import sys as _sys + _CLI_HARNESS = Path(__file__).resolve().parents[4] / "cli-harness" / "src" + if str(_CLI_HARNESS) not in _sys.path: + _sys.path.insert(0, str(_CLI_HARNESS)) + from cli_harness.simulator import HumanSimulator # noqa: E402 + + simulator = HumanSimulator.from_adapter_config( + run_folder=run_folder, + vision_content=vision_content, + tech_env_content=tech_env_content, + openapi_content=openapi_content, + aws_profile=aws_profile, + aws_region=aws_region, + model=model_id, + ) + + @strands_tool + def handoff_to_simulator(message: str) -> str: + """Hand off to the Human Simulator for answers, approvals, or reviews. + + Args: + message: Message describing what input is needed and which file to read. + """ + return simulator.respond(message) + + return handoff_to_simulator, simulator + _SLUG_MAX_LEN = 80 @@ -45,34 +90,41 @@ def _rules_slug(aidlc: AidlcConfig) -> str: def create_run_folder(output_dir: str | Path, config: RunnerConfig) -> Path: - """Create a timestamped run folder named after the rules source. + """Create or use the specified run folder. - Format: {ISO8601_compact}-{rules_slug} - Example: 20260224T214917-aidlc-workflows_v0.1.0 + Two modes: + 1. 
If output_dir itself looks like a timestamped folder (name starts with + a digit and contains "T"), use it directly — the orchestrator pre-allocated + the exact path for deterministic, parallel-safe execution. + 2. Otherwise treat output_dir as a parent and create a timestamped subfolder. + Format: {ISO8601_compact}-{rules_slug} + Example: 20260224T214917-aidlc-workflows_v0.1.0 - Also writes a sentinel file (``{output_dir}/.last_run_folder``) containing - the absolute path of the new run folder so that parent orchestrators can - discover the folder without racy before/after directory listing. + Also writes a sentinel file (``{output_dir.parent}/.last_run_folder``) in + Mode 2 so legacy orchestrators can discover the folder. Returns: Path to the created run folder. """ output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") - slug = _rules_slug(config.aidlc) - folder_name = f"{timestamp}-{slug}" - run_folder = output_dir / folder_name - run_folder.mkdir() - (run_folder / "aidlc-docs" / "inception").mkdir(parents=True) - (run_folder / "aidlc-docs" / "construction").mkdir(parents=True) - (run_folder / "workspace").mkdir() - - # Write sentinel for orchestrator discovery (atomic via os.replace) - sentinel = output_dir / _SENTINEL_NAME - sentinel.write_text(str(run_folder.resolve()), encoding="utf-8") + folder_name = output_dir.name + if folder_name and folder_name[0].isdigit() and "T" in folder_name: + # Mode 1: orchestrator specified exact folder name + run_folder = output_dir + else: + # Mode 2: generate a timestamped subfolder + output_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + slug = _rules_slug(config.aidlc) + run_folder = output_dir / f"{timestamp}-{slug}" + # Write sentinel for legacy orchestrator discovery + sentinel = output_dir / _SENTINEL_NAME + sentinel.write_text(str(run_folder.resolve()), encoding="utf-8") + + (run_folder / "aidlc-docs" / "inception").mkdir(parents=True, exist_ok=True) + (run_folder / "aidlc-docs" / "construction").mkdir(parents=True, exist_ok=True) + (run_folder / "workspace").mkdir(exist_ok=True) return run_folder @@ -172,13 +224,21 @@ def write_run_meta( atomic_yaml_dump(meta, run_folder / "run-meta.yaml") -def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = None) -> None: +def run( + config: RunnerConfig, + vision_path: Path, + tech_env_path: Path | None = None, + openapi_path: Path | None = None, +) -> None: """Execute a full AIDLC workflow run. Args: config: Fully resolved runner configuration. vision_path: Path to the vision/constraints markdown file. tech_env_path: Optional path to the technical environment markdown file. + openapi_path: Optional path to the OpenAPI spec — injected into the + simulator's system prompt so it can validate the API contract + during design reviews and code review handoffs. """ # 1. Create run folder run_folder = create_run_folder(config.runs.output_dir, config) @@ -194,6 +254,11 @@ def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = No tech_env_content = tech_env_path.read_text(encoding="utf-8") (run_folder / "tech-env.md").write_text(tech_env_content, encoding="utf-8") + # 2c. 
Read OpenAPI spec if provided (not copied to run folder — used for simulator prompt only) + openapi_content: str | None = None + if openapi_path is not None and openapi_path.is_file(): + openapi_content = openapi_path.read_text(encoding="utf-8") + # 3. Set up AIDLC rules print("Setting up AIDLC rules...") rules_dir = setup_rules(run_folder, config) @@ -202,11 +267,25 @@ def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = No # 4. Write run metadata write_run_meta(run_folder, config, vision_path, tech_env_path=tech_env_path) - # 5. Create metrics collector and agents with progress handlers + # 5. Create metrics collector and executor with progress handler print("Creating agents...") collector = MetricsCollector(config) executor_handler = AgentProgressHandler("executor", collector=collector) - simulator_handler = AgentProgressHandler("simulator", collector=collector) + + # Build the HumanSimulator tool — same implementation as kiro-cli and + # claude-code-sdk, backed by build_simulator_system_prompt(). + # The simulator instance is kept so we can harvest its token usage after + # the swarm completes and inject it into MetricsCollector as a separate + # "simulator" bucket — keeping executor and simulator tokens distinct. + simulator_tool, simulator_instance = _make_simulator_tool( + run_folder=run_folder, + vision_content=vision_content, + model_id=config.models.simulator.model_id, + aws_profile=config.aws.profile, + aws_region=config.aws.region, + tech_env_content=tech_env_content, + openapi_content=openapi_content, + ) executor = create_executor( run_folder=run_folder, @@ -216,19 +295,11 @@ def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = No aws_region=config.aws.region, callback_handler=executor_handler, execution_config=config.execution, - ) - simulator = create_simulator( - run_folder=run_folder, - vision_content=vision_content, - model_config=config.models.simulator, - aws_profile=config.aws.profile, - aws_region=config.aws.region, - callback_handler=simulator_handler, - tech_env_content=tech_env_content, + simulator_tool=simulator_tool, ) - # 6. Create and run the Swarm - print("Starting AIDLC workflow swarm...") + # 6. Run the executor (single-agent; simulator is a tool call) + print("Starting AIDLC workflow executor...") initial_prompt = ( "Begin the AIDLC workflow and execute it TO COMPLETION through ALL phases. " "The project vision is available at vision.md in the run folder. " @@ -250,14 +321,12 @@ def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = No ) swarm = Swarm( - [executor, simulator], + [executor], entry_point=executor, max_handoffs=config.swarm.max_handoffs, max_iterations=config.swarm.max_iterations, execution_timeout=config.swarm.execution_timeout, node_timeout=config.swarm.node_timeout, - repetitive_handoff_detection_window=5, - repetitive_handoff_min_unique_agents=2, ) # Register progress hook for node-level events @@ -271,7 +340,11 @@ def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = No print(f"Execution time: {result.execution_time}ms") print(f"Total handoffs: {len(result.node_history)}") - # 8. Write run metrics + # 8. Record simulator token usage separately so metrics keep executor + # and simulator buckets distinct (simulator runs outside the Strands swarm). + collector.record_simulator_usage(simulator_instance.accumulated_usage) + + # 9. 
Write run metrics metrics_path = collector.write(result, run_folder) print(f"Metrics written to: {metrics_path}") diff --git a/scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py b/scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py index 625591db..6464d514 100644 --- a/scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py +++ b/scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py @@ -1,6 +1,7 @@ -"""Docker sandbox for running untrusted commands in an isolated container. +"""Container sandbox for running untrusted commands in an isolated container. -Provides a thin wrapper around ``docker run`` so that generated code +Supports Docker, Podman, and Finch — whichever is available on PATH. +Provides a thin wrapper around `` run`` so that generated code (post-run tests, contract-test servers) can be executed without granting access to the host filesystem, network credentials, or environment. @@ -10,6 +11,7 @@ from __future__ import annotations import os +import shutil import subprocess from dataclasses import dataclass from pathlib import Path @@ -28,16 +30,27 @@ class SandboxResult: _DOCKER_AVAILABLE: bool | None = None +_CONTAINER_CLI: str | None = None -def is_docker_available() -> bool: - """Check whether Docker can actually run containers. +def _get_container_cli() -> str | None: + """Return the first available container CLI: docker, podman, or finch.""" + global _CONTAINER_CLI + if _CONTAINER_CLI is not None: + return _CONTAINER_CLI + for cli in ("docker", "podman", "finch"): + if shutil.which(cli): + _CONTAINER_CLI = cli + return _CONTAINER_CLI + return None + - Goes beyond ``docker info`` by spawning a trivial container, which - catches cgroup v2 / OCI runtime errors that ``docker info`` misses. +def is_docker_available() -> bool: + """Check whether a container runtime can actually run containers. - Goes beyond ``docker info`` by spawning a trivial container, which - catches cgroup v2 / OCI runtime errors that ``docker info`` misses. + Tries docker, podman, and finch in that order. Goes beyond `` info`` + by spawning a trivial container, which catches cgroup v2 / OCI runtime + errors that plain info checks miss. The result is cached for the lifetime of the process. """ @@ -45,10 +58,15 @@ def is_docker_available() -> bool: if _DOCKER_AVAILABLE is not None: return _DOCKER_AVAILABLE + cli = _get_container_cli() + if cli is None: + _DOCKER_AVAILABLE = False + return _DOCKER_AVAILABLE + try: - # nosec B603, B607 - Static docker command for availability check + # nosec B603 - Static container CLI info command for availability check result = subprocess.run( - ["docker", "info"], + [cli, "info"], capture_output=True, timeout=10, ) @@ -57,10 +75,10 @@ def is_docker_available() -> bool: return _DOCKER_AVAILABLE # Verify containers can actually start *with resource limits* - # (catches cgroup v2 / OCI runtime errors that plain `docker run` misses) - # nosec B603, B607 - Static docker command for runtime verification + # (catches cgroup v2 / OCI runtime errors that plain info misses) + # nosec B603 - Static container CLI run command for runtime verification result = subprocess.run( - ["docker", "run", "--rm", "--memory=6m", "--cpus=1", "alpine", "true"], + [cli, "run", "--rm", "--memory=6m", "--cpus=1", "alpine", "true"], capture_output=True, timeout=30, ) @@ -108,8 +126,9 @@ def sandbox_run( cpus: Container CPU limit. 
""" + cli = _get_container_cli() or "docker" docker_cmd: list[str] = [ - "docker", "run", + cli, "run", "--rm", f"--memory={memory}", f"--cpus={cpus}", @@ -121,7 +140,7 @@ def sandbox_run( # no entry in the container's /etc/passwd. "-e", "HOME=/tmp", "-e", "UV_CACHE_DIR=/tmp/.cache/uv", - "-e", "NPM_CONFIG_CACHE=/tmp/.cache/npm", + "-e", "NPM_CONFIG_CACHE=/tmp/.cache/npm", ] if not network: @@ -185,8 +204,9 @@ def sandbox_run_detached( Raises ``RuntimeError`` if the container fails to start. """ + cli = _get_container_cli() or "docker" docker_cmd: list[str] = [ - "docker", "run", + cli, "run", "-d", "--rm", f"--memory={memory}", f"--cpus={cpus}", @@ -198,7 +218,7 @@ def sandbox_run_detached( # no entry in the container's /etc/passwd. "-e", "HOME=/tmp", "-e", "UV_CACHE_DIR=/tmp/.cache/uv", - "-e", "NPM_CONFIG_CACHE=/tmp/.cache/npm", + "-e", "NPM_CONFIG_CACHE=/tmp/.cache/npm", ] if not network: @@ -231,18 +251,19 @@ def sandbox_run_detached( def sandbox_stop(container_id: str, timeout: int = 10) -> None: """Stop a running container by ID.""" + cli = _get_container_cli() or "docker" try: - # nosec B603, B607 - Static docker stop command with container ID parameter + # nosec B603 - Static container stop command with container ID parameter subprocess.run( - ["docker", "stop", "-t", str(timeout), container_id], + [cli, "stop", "-t", str(timeout), container_id], capture_output=True, timeout=timeout + 5, ) except (subprocess.TimeoutExpired, OSError): # Force kill if graceful stop fails - # nosec B603, B607 - Static docker kill command with container ID parameter + # nosec B603 - Static container kill command with container ID parameter subprocess.run( - ["docker", "kill", container_id], + [cli, "kill", container_id], capture_output=True, timeout=5, ) @@ -250,10 +271,11 @@ def sandbox_stop(container_id: str, timeout: int = 10) -> None: def sandbox_is_running(container_id: str) -> bool: """Check whether a container is still running.""" + cli = _get_container_cli() or "docker" try: - # nosec B603, B607 - Static docker inspect command with container ID parameter + # nosec B603 - Static container inspect command with container ID parameter result = subprocess.run( - ["docker", "inspect", "-f", "{{.State.Running}}", container_id], + [cli, "inspect", "-f", "{{.State.Running}}", container_id], capture_output=True, text=True, timeout=5, @@ -265,10 +287,11 @@ def sandbox_is_running(container_id: str) -> bool: def sandbox_logs(container_id: str) -> tuple[str, str]: """Return (stdout, stderr) from a running or stopped container.""" + cli = _get_container_cli() or "docker" try: - # nosec B603, B607 - Static docker logs command with container ID parameter + # nosec B603 - Static container logs command with container ID parameter result = subprocess.run( - ["docker", "logs", container_id], + [cli, "logs", container_id], capture_output=True, text=True, timeout=10, diff --git a/scripts/aidlc-evaluator/pyproject.toml b/scripts/aidlc-evaluator/pyproject.toml index 2387e7d2..98c56e2c 100644 --- a/scripts/aidlc-evaluator/pyproject.toml +++ b/scripts/aidlc-evaluator/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "aidlc-reporting", "aidlc-shared", "aidlc-trend-reports", + "aidlc-cli-harness", ] [tool.uv.workspace] @@ -39,6 +40,7 @@ aidlc-nonfunctional = { workspace = true } aidlc-reporting = { workspace = true } aidlc-shared = { workspace = true } aidlc-trend-reports = { workspace = true } +aidlc-cli-harness = { workspace = true } [dependency-groups] dev = [ diff --git a/scripts/aidlc-evaluator/run.py 
b/scripts/aidlc-evaluator/run.py index 3190e7ca..c9ac74aa 100644 --- a/scripts/aidlc-evaluator/run.py +++ b/scripts/aidlc-evaluator/run.py @@ -5,15 +5,17 @@ It dispatches to specialized runner scripts in the scripts/ directory. Available modes: - - full Full evaluation (execute workflow + score outputs) - - cli Evaluation through a CLI AI assistant (kiro-cli, claude-code, etc.) - - ide Evaluation through an IDE AI assistant (cursor, cline, kiro) - - batch Batch evaluation across multiple models - - compare Generate cross-model comparison report - - ext-test Test extension hooks with different opt-in configurations - - ext-report Regenerate extension test comparison report - - trend Generate trend report across AIDLC rules releases - - test Run unit tests for all packages + - full Full evaluation (execute workflow + score outputs) + - cli Evaluation through a CLI AI assistant (kiro-cli, claude-code, etc.) + - ide Evaluation through an IDE AI assistant (cursor, cline, kiro) + - batch Batch evaluation across multiple models + - compare Generate cross-model comparison report + - ext-test Test extension hooks with different opt-in configurations + - ext-report Regenerate extension test comparison report + - git-compare Compare multiple git refs across scenarios with repeated runs + - git-compare-report Regenerate git comparison reports from existing runs + - trend Generate trend report across AIDLC rules releases + - test Run unit tests for all packages Usage: # Full pipeline evaluation @@ -37,6 +39,12 @@ # Regenerate extension comparison report python run.py ext-report --runs-dir runs/sci-calc/extension-test + # Compare git refs (branches, tags, commits) + python run.py git-compare --refs main,feat/my-feature --scenarios sci-calc --runs-per-ref 3 + + # Regenerate git comparison reports from existing runs + python run.py git-compare-report --runs-dir runs/sci-calc/git-compare + # Generate trend report across releases python run.py trend --baseline test_cases/sci-calc/golden.yaml @@ -47,6 +55,7 @@ python run.py full --help python run.py cli --help python run.py ext-test --help + python run.py git-compare --help """ from __future__ import annotations @@ -59,6 +68,52 @@ REPO_ROOT = Path(__file__).resolve().parent SCRIPTS_DIR = REPO_ROOT / "scripts" +# Modes that require Docker sandbox +DOCKER_DEPENDENT_MODES = {"full", "cli", "ide", "batch", "git-compare", "ext-test"} + + +def _container_cli() -> str | None: + """Return the first available container CLI: docker or podman.""" + import shutil + for cli in ("docker", "podman"): + if shutil.which(cli): + return cli + return None + + +def check_docker_sandbox() -> bool: + """Check if a container runtime is available and the sandbox image exists. + + Supports both Docker and Podman (checks in that order). 
+ + Returns: + True if a container runtime and sandbox image are available, False otherwise + """ + cli = _container_cli() + if cli is None: + return False + try: + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run( # nosec B603 + [cli, "info"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=5, + ) + if result.returncode != 0: + return False + + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run( # nosec B603 + [cli, "images", "-q", "aidlc-sandbox:latest"], + capture_output=True, + text=True, + timeout=5, + ) + return bool(result.stdout.strip()) + except (subprocess.TimeoutExpired, FileNotFoundError): + return False + def main() -> None: parser = argparse.ArgumentParser( @@ -124,6 +179,20 @@ def main() -> None: add_help=False, ) + # Git compare mode + subparsers.add_parser( + "git-compare", + help="Compare multiple git refs across scenarios with repeated runs", + add_help=False, + ) + + # Git compare report regeneration mode + subparsers.add_parser( + "git-compare-report", + help="Regenerate git comparison reports from existing runs", + add_help=False, + ) + # Trend report mode subparsers.add_parser( "trend", @@ -154,6 +223,8 @@ def main() -> None: "compare": SCRIPTS_DIR / "run_comparison_report.py", "ext-test": SCRIPTS_DIR / "run_extension_test.py", "ext-report": SCRIPTS_DIR / "regenerate_extension_report.py", + "git-compare": SCRIPTS_DIR / "run_git_compare.py", + "git-compare-report": SCRIPTS_DIR / "regenerate_git_compare_report.py", "trend": SCRIPTS_DIR / "run_trend_report.py", "test": SCRIPTS_DIR / "run_evaluation.py", # test mode is in run_evaluation.py } @@ -174,6 +245,29 @@ def main() -> None: # Forward all remaining arguments cmd.extend(remaining) + # Check container sandbox availability for modes that need it. + # Skip when --no-sandbox is explicitly passed (sandbox disabled by user). 
+ sandbox_disabled = "--no-sandbox" in remaining + if args.mode in DOCKER_DEPENDENT_MODES and not sandbox_disabled: + if not check_docker_sandbox(): + print("=" * 70, file=sys.stderr) + print("ERROR: Docker sandbox image not found", file=sys.stderr) + print("=" * 70, file=sys.stderr) + print(file=sys.stderr) + print("The evaluation framework requires the Docker sandbox image", file=sys.stderr) + print("'aidlc-sandbox:latest' to run generated code safely.", file=sys.stderr) + print(file=sys.stderr) + print("To build the image, run:", file=sys.stderr) + print(" ./docker/sandbox/build.sh", file=sys.stderr) + print(file=sys.stderr) + print("Or manually:", file=sys.stderr) + print(" docker build -t aidlc-sandbox:latest docker/sandbox/", file=sys.stderr) + print(file=sys.stderr) + print("To run without Docker (not recommended for untrusted code),", file=sys.stderr) + print("set 'execution.sandbox.enabled: false' in config/default.yaml", file=sys.stderr) + print("=" * 70, file=sys.stderr) + sys.exit(1) + # Execute the script try: # nosec B603 - Executing trusted framework scripts from scripts/ directory diff --git a/scripts/aidlc-evaluator/scripts/generate_html_report.py b/scripts/aidlc-evaluator/scripts/generate_html_report.py new file mode 100644 index 00000000..39b5b2c9 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/generate_html_report.py @@ -0,0 +1,931 @@ +#!/usr/bin/env python3 +"""Generate interactive HTML report with charts for git-compare results.""" + +import json +from pathlib import Path + + +def generate_interactive_html_report( + scenarios: list[str], + version_names: list[str], + all_results: list[dict], + generated_at: str, + runs_dir: Path, +) -> str: + """Generate an interactive HTML report with charts and navigation. + + Args: + scenarios: List of scenario names + version_names: List of version names in order + all_results: List of run result dicts with version_name, scenario, output_dir, etc. + generated_at: ISO timestamp of report generation + runs_dir: Path to runs directory for loading metrics + + Returns: + HTML string + """ + from run_git_compare import ( + load_run_metrics, + get_metric_value, + METRIC_ROWS, + ) + + # Collect metrics per version per scenario + scenario_data = {} + for scenario_name in scenarios: + scenario_results = [r for r in all_results if r["scenario"] == scenario_name] + if not scenario_results: + continue + + # Group by version + version_metrics = {vn: [] for vn in version_names} + for result in scenario_results: + vn = result["version_name"] + folder = Path(result["output_dir"]) + if folder.is_dir(): + metrics = load_run_metrics(folder) + if metrics: + version_metrics[vn].append(metrics) + + scenario_data[scenario_name] = version_metrics + + # Compute aggregated metrics for charts + chart_data = _prepare_chart_data(version_names, scenario_data, scenarios) + + # Generate HTML + html = f""" + + + + + Git Version Comparison Report + + + + +
+

🚀 Git Version Comparison Report

+
+
Generated: {generated_at}
+
Versions: {', '.join(version_names)}
+
Scenarios: {', '.join(scenarios)}
+
+
+ +
+
+ + + + + + +
+ +
+ {_generate_overview_section(version_names, chart_data, scenarios)} +
+ +
+ {_generate_performance_section(version_names, chart_data)} +
+ +
+ {_generate_quality_section(version_names, chart_data)} +
+ +
+ {_generate_tests_section(version_names, chart_data)} +
+ +
+ {_generate_artifacts_section(version_names, chart_data)} +
+ +
+ {_generate_raw_data_section(version_names, scenario_data, scenarios, all_results)} +
+
+ + + +""" + + return html + + +def _get_t_critical(n: int, confidence: float = 0.95) -> float: + """Get t-critical value for confidence interval. + + Uses t-distribution for small samples, z for large samples. + For 95% CI (two-tailed). + """ + if n < 2: + return 1.0 + + # t-critical values for 95% CI (two-tailed, α=0.05) + t_table = { + 2: 12.706, + 3: 4.303, + 4: 3.182, + 5: 2.776, + 6: 2.571, + 7: 2.447, + 8: 2.365, + 9: 2.306, + 10: 2.262, + 15: 2.145, + 20: 2.086, + 30: 2.045, + } + + # Use lookup table or approximate for large n + if n in t_table: + return t_table[n] + elif n > 30: + return 1.96 # z-value for 95% CI with large samples + else: + # Interpolate or use closest value + return 2.0 + + +def _prepare_chart_data(version_names: list[str], scenario_data: dict, scenarios: list[str]) -> dict: + """Prepare chart data structure for all metrics.""" + from run_git_compare import get_metric_value, METRIC_ROWS, _mean, _stdev + import math + + chart_data = {} + + # Key metrics to chart + chart_metrics = [ + ("tests_pass_pct", "Unit Test Pass %", True), + ("contract_passed", "Contract Tests Passed", True), + ("qualitative_score", "Qualitative Score", True), + ("wall_clock_min", "Execution Time (min)", False), + ("total_tokens", "Total Tokens", False), + ("lint_total", "Lint Findings", False), + ("security_total", "Security Findings", False), + ("lines_of_code", "Lines of Code", True), + ] + + for metric_key, metric_name, higher_is_better in chart_metrics: + chart_data[metric_key] = { + "name": metric_name, + "higher_is_better": higher_is_better, + "versions": version_names, + "scenarios": scenarios, + "values": [], # One entry per version + } + + for vn in version_names: + version_data = [] + for scenario in scenarios: + if scenario not in scenario_data: + version_data.append({"avg": None, "std": None}) + continue + + mlist = scenario_data[scenario].get(vn, []) + vals = [v for v in (get_metric_value(m, metric_key) for m in mlist) if v is not None] + + if not vals: + version_data.append({"avg": None, "ci": None, "n": 0}) + elif len(vals) == 1: + version_data.append({"avg": vals[0], "ci": None, "n": 1}) + else: + n = len(vals) + avg = _mean(vals) + std = _stdev(vals) + # Calculate 95% confidence interval: t * (std / sqrt(n)) + t_crit = _get_t_critical(n) + sem = std / math.sqrt(n) + ci_half_width = t_crit * sem + version_data.append({"avg": avg, "ci": ci_half_width, "n": n}) + + chart_data[metric_key]["values"].append(version_data) + + return chart_data + + +def _generate_overview_section(version_names: list[str], chart_data: dict, scenarios: list[str]) -> str: + """Generate overview section HTML.""" + from run_git_compare import get_metric_value, _mean, _stdev + + # Calculate key metrics + baseline = version_names[0] if version_names else None + + html = '
' + html += '

📊 Overview

' + html += '
' + + # Show key metrics for each version + for idx, vn in enumerate(version_names): + qualitative = chart_data.get("qualitative_score", {}) + if qualitative and qualitative["values"]: + scores = [v["avg"] for v in qualitative["values"][idx] if v["avg"] is not None] + avg_score = sum(scores) / len(scores) if scores else 0 + + delta_html = "" + if idx > 0 and baseline: + baseline_scores = [v["avg"] for v in qualitative["values"][0] if v["avg"] is not None] + baseline_avg = sum(baseline_scores) / len(baseline_scores) if baseline_scores else 0 + delta = avg_score - baseline_avg + if abs(delta) > 0.001: + delta_class = "better" if delta > 0 else "worse" + delta_html = f'
{delta:+.3f} vs {baseline}
' + + html += f''' +
+

{vn}

+
{avg_score:.3f}
+
Qualitative Score
+ {delta_html} +
+ ''' + + html += '
' + + # Add summary table + html += '

Key Metrics Summary

' + html += '' + for vn in version_names: + html += f'' + html += '' + + # Key metrics to show + summary_metrics = [ + ("qualitative_score", "Qualitative Score", 3), + ("tests_pass_pct", "Unit Test Pass %", 1), + ("contract_passed", "Contract Tests Passed", 0), + ("wall_clock_min", "Execution Time (min)", 1), + ("total_tokens", "Total Tokens", 0), + ("lines_of_code", "Lines of Code", 0), + ] + + for metric_key, metric_name, decimals in summary_metrics: + html += f'' + metric_data = chart_data.get(metric_key, {}) + if metric_data and metric_data.get("values"): + for idx in range(len(version_names)): + # Get the first scenario's data for this version (usually only one scenario) + version_data = metric_data["values"][idx] + if version_data and len(version_data) > 0: + point = version_data[0] # First scenario + if point["avg"] is None: + html += '' + elif point["ci"] is None or point["ci"] == 0: + html += f'' + else: + lower = point["avg"] - point["ci"] + upper = point["avg"] + point["ci"] + html += f'' + else: + html += '' + else: + for _ in version_names: + html += '' + html += '' + + html += '
Metric{vn}
{metric_name}{point["avg"]:.{decimals}f}{point["avg"]:.{decimals}f}
(95% CI: {lower:.{decimals}f}-{upper:.{decimals}f})
' + + # Key metrics charts + html += '
' + html += '
' + + html += '
' + return html + + +def _generate_performance_section(version_names: list[str], chart_data: dict) -> str: + """Generate performance section HTML.""" + html = '
' + html += '

⚡ Performance Metrics

' + html += '
' + html += '
' + html += '
' + html += '
' + return html + + +def _generate_quality_section(version_names: list[str], chart_data: dict) -> str: + """Generate code quality section HTML.""" + html = '
' + html += '

🔍 Code Quality

' + html += '
' + html += '
' + html += '
' + html += '
' + return html + + +def _generate_tests_section(version_names: list[str], chart_data: dict) -> str: + """Generate testing section HTML.""" + html = '
' + html += '

✅ Testing Metrics

' + html += '
' + html += '
' + html += '
' + return html + + +def _generate_artifacts_section(version_names: list[str], chart_data: dict) -> str: + """Generate artifacts section HTML.""" + html = '
' + html += '

📦 Generated Artifacts

' + html += '
' + html += '
' + html += '
' + return html + + +def _generate_raw_data_section(version_names: list[str], scenario_data: dict, scenarios: list[str], all_results: list[dict]) -> str: + """Generate raw data section HTML.""" + from run_git_compare import get_metric_value, _mean, _stdev + + html = '
' + html += '

📋 Raw Data

' + + for scenario in scenarios: + html += f'

{scenario}

' + html += '' + + for vn in version_names: + html += f'' + + html += '' + + # Key metrics + metrics_to_show = [ + ("tests_pass_pct", "Unit Test Pass %", 1), + ("tests_passed", "Unit Tests Passed", 0), + ("contract_passed", "Contract Tests Passed", 0), + ("qualitative_score", "Qualitative Score", 3), + ("wall_clock_min", "Execution Time (min)", 1), + ("total_tokens", "Total Tokens", 0), + ("lines_of_code", "Lines of Code", 0), + ] + + for metric_key, metric_name, decimals in metrics_to_show: + html += f'' + + for vn in version_names: + if scenario not in scenario_data: + html += '' + continue + + mlist = scenario_data[scenario].get(vn, []) + vals = [v for v in (get_metric_value(m, metric_key) for m in mlist) if v is not None] + + if not vals: + html += '' + elif len(vals) == 1: + html += f'' + else: + avg = _mean(vals) + std = _stdev(vals) + html += f'' + + html += '' + + html += '
Metric{vn}
{metric_name}{vals[0]:.{decimals}f}{avg:.{decimals}f} ± {std:.{decimals}f}
' + + # Run status table + html += '

Run Status

' + html += '' + + for result in sorted(all_results, key=lambda x: (x["version_name"], x["scenario"], x["run_index"])): + status_class = "status-pass" if result["status"] == "success" else "status-fail" + duration = result.get("elapsed_seconds", 0) / 60 + html += f''' + + + + + + + + + ''' + + html += '
VersionScenarioRunStatusDurationOutput
{result["version_name"]}{result["scenario"]}{result["run_index"]}{result["status"].upper()}{duration:.1f} min{result["output_dir"]}
' + html += '
' + return html + + +def _generate_chart_init_calls() -> str: + """Generate JavaScript calls to initialize all charts.""" + return """ + createLineChart('chart-overview-quality', 'Qualitative Score', 'qualitative_score', true); + createLineChart('chart-overview-performance', 'Execution Time (min)', 'wall_clock_min', false); + createLineChart('chart-perf-time', 'Execution Time (min)', 'wall_clock_min', false); + createLineChart('chart-perf-tokens', 'Total Tokens', 'total_tokens', false); + createLineChart('chart-quality-lint', 'Lint Findings', 'lint_total', false); + createLineChart('chart-quality-security', 'Security Findings', 'security_total', false); + createLineChart('chart-quality-qualitative', 'Qualitative Score', 'qualitative_score', true); + createLineChart('chart-tests-unit', 'Unit Test Pass %', 'tests_pass_pct', true); + createLineChart('chart-tests-contract', 'Contract Tests Passed', 'contract_passed', true); + createLineChart('chart-artifacts-loc', 'Lines of Code', 'lines_of_code', true); + """ diff --git a/scripts/aidlc-evaluator/scripts/regenerate_git_compare_report.py b/scripts/aidlc-evaluator/scripts/regenerate_git_compare_report.py new file mode 100644 index 00000000..32fa44e9 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/regenerate_git_compare_report.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""Regenerate git comparison reports from completed runs. + +Scans a git-compare runs directory for its git-compare-summary.yaml, groups +run folders by (version, scenario), and regenerates all per-scenario detail +reports and the rollup report without re-running any evaluations. + +Usage: + python run.py git-compare-report --runs-dir runs/sci-calc/git-compare + python run.py git-compare-report --runs-dir runs/git-compare +""" + +from __future__ import annotations + +import argparse +import sys +from datetime import UTC, datetime +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent + +# Add scripts dir so we can import shared report logic from run_git_compare +sys.path.insert(0, str(REPO_ROOT / "scripts")) +# Add packages needed by run_git_compare imports +sys.path.insert(0, str(REPO_ROOT / "packages" / "shared" / "src")) +sys.path.insert(0, str(REPO_ROOT / "packages" / "reporting" / "src")) + +from run_git_compare import write_reports # noqa: E402 + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="regenerate_git_compare_report", + description="Regenerate git comparison reports from completed runs", + ) + parser.add_argument( + "--runs-dir", type=Path, required=True, + help="Git compare runs directory containing git-compare-summary.yaml", + ) + args = parser.parse_args() + + summary_path = args.runs_dir / "git-compare-summary.yaml" + if not summary_path.exists(): + print(f"Error: {summary_path} not found", file=sys.stderr) + print( + "Make sure --runs-dir points to the git-compare output directory " + "that contains git-compare-summary.yaml.", + file=sys.stderr, + ) + sys.exit(1) + + with open(summary_path, encoding="utf-8") as f: + summary = yaml.safe_load(f) or {} + + version_names: list[str] = summary.get("version_names", []) + scenarios: list[str] = summary.get("scenarios", []) + all_results: list[dict] = summary.get("runs", []) + + if not version_names or not scenarios or not all_results: + print( + "Error: git-compare-summary.yaml is missing version_names, scenarios, or runs.", + file=sys.stderr, + ) + sys.exit(1) + + print( + f"Loaded summary: {len(all_results)} run(s) across " + f"{len(version_names)} 
version(s) and {len(scenarios)} scenario(s)" + ) + print(f" Versions: {', '.join(version_names)}") + print(f" Scenarios: {', '.join(scenarios)}") + + generated_at = datetime.now(UTC).isoformat(timespec="seconds") + + write_reports( + runs_dir=args.runs_dir, + scenarios=scenarios, + version_names=version_names, + all_results=all_results, + generated_at=generated_at, + ) + + print(f"\nReports regenerated in: {args.runs_dir / 'comparison'}") + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py b/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py index c5129f51..150372f5 100644 --- a/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py +++ b/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py @@ -41,7 +41,7 @@ # Add cli-harness to path sys.path.insert(0, str(PACKAGES / "cli-harness" / "src")) -from cli_harness.registry import get_adapter, list_adapters # noqa: E402 +from cli_harness.registry import get_adapter, list_adapters, load_adapters_from_config # noqa: E402 from cli_harness.orchestrator import run_cli_evaluation # noqa: E402 _SLUG_MAX_LEN = 80 @@ -176,6 +176,7 @@ def main() -> None: parser.add_argument("--profile", default=None, help="AWS profile (default: from config YAML)") parser.add_argument("--region", default=None, help="AWS region (default: from config YAML)") parser.add_argument("--scorer-model", default=None, help="Bedrock model for scoring (default: from config YAML)") + parser.add_argument("--simulator-model", default=None, help="Bedrock model for human simulator (default: from config YAML models.simulator.model_id)") parser.add_argument("--model", default=None, help="Model to use with the CLI adapter (e.g., claude-sonnet-4)") parser.add_argument( "--verbose", "-v", action="store_true", @@ -220,6 +221,9 @@ def main() -> None: with open(args.config, encoding="utf-8") as f: cfg_data = yaml.safe_load(f) or {} + # Register any custom adapters declared in config before resolving --cli + load_adapters_from_config(cfg_data) + if args.profile is None: args.profile = cfg_data.get("aws", {}).get("profile") if args.region is None: @@ -232,6 +236,10 @@ def main() -> None: parser.error( "--scorer-model is required (or set models.scorer.model_id in config YAML)" ) + if args.simulator_model is None: + args.simulator_model = ( + cfg_data.get("models", {}).get("simulator", {}).get("model_id") + ) # ── Resolve AIDLC rules config ──────────────────────────────────────── aidlc_cfg = cfg_data.get("aidlc", {}) @@ -281,6 +289,7 @@ def main() -> None: profile=args.profile, region=args.region, scorer_model=args.scorer_model, + simulator_model=args.simulator_model, model=args.model, rules_source=rules_source, rules_ref=rules_ref, diff --git a/scripts/aidlc-evaluator/scripts/run_evaluation.py b/scripts/aidlc-evaluator/scripts/run_evaluation.py index d57a08ae..03329707 100644 --- a/scripts/aidlc-evaluator/scripts/run_evaluation.py +++ b/scripts/aidlc-evaluator/scripts/run_evaluation.py @@ -38,10 +38,12 @@ import argparse import os +import re import subprocess import sys from datetime import datetime, timezone from pathlib import Path +from urllib.parse import urlparse import yaml @@ -198,6 +200,7 @@ def _rel(p: Path | None) -> str | None: "scorer_model": args.scorer_model, "executor_model": args.executor_model, "rules_ref": args.rules_ref, + "rules_repo": args.rules_repo, "output_dir": _rel(args.output_dir), "sandbox": args.sandbox, "report_format": args.report_format, @@ -225,59 +228,6 @@ def _rel(p: Path | None) -> str | None: 
atomic_yaml_dump(meta, meta_path) -_SENTINEL_NAME = ".last_run_folder" - - -def _read_run_sentinel(output_dir: Path) -> Path | None: - """Read the sentinel file written by create_run_folder(). - - Returns the run folder path if the sentinel exists and the directory - is valid, otherwise None. The sentinel is removed after reading so - it does not confuse subsequent runs. - """ - sentinel = output_dir / _SENTINEL_NAME - if not sentinel.is_file(): - return None - try: - run_folder = Path(sentinel.read_text(encoding="utf-8").strip()) - sentinel.unlink(missing_ok=True) - if run_folder.is_dir(): - return run_folder - except OSError: - pass - return None - - -def _list_run_folders(output_dir: Path | None = None) -> set[Path]: - """Return the current set of run folders under runs/. - - Args: - output_dir: Directory to search for run folders. Defaults to REPO_ROOT / "runs". - """ - runs_dir = output_dir if output_dir else REPO_ROOT / "runs" - if not runs_dir.is_dir(): - return set() - return {d for d in runs_dir.iterdir() if d.is_dir() and not d.name.startswith(".")} - - -def _find_new_run(before: set[Path], output_dir: Path | None = None) -> Path | None: - """Find the single new run folder created since *before* was captured. - - Falls back to the newest folder if multiple appeared (shouldn't happen - in normal single-run usage). - - Args: - before: Set of run folders that existed before execution. - output_dir: Directory to search for new run folders. Defaults to REPO_ROOT / "runs". - - .. deprecated:: - Prefer :func:`_read_run_sentinel` which avoids the TOCTOU race - condition inherent in before/after directory listing. - """ - after = _list_run_folders(output_dir) - new = sorted(after - before, reverse=True) - return new[0] if new else None - def _find_latest_run(scenario_name: str | None = None) -> Path | None: """Find the most recent timestamped run folder under runs/. @@ -304,17 +254,59 @@ def _find_latest_run(scenario_name: str | None = None) -> Path | None: # ── stages ─────────────────────────────────────────────────────────────────── -def stage_execute(args: argparse.Namespace) -> Path | None: +_SLUG_MAX_LEN = 80 + + +def _rules_slug(cfg_data: dict, args: argparse.Namespace) -> str: + """Derive a filesystem-safe slug matching runner.py's _rules_slug().""" + aidlc = cfg_data.get("aidlc", {}) + rules_source = aidlc.get("rules_source", "git") + rules_local_path = aidlc.get("rules_local_path") + rules_repo = args.rules_repo or aidlc.get("rules_repo", "") + rules_ref = args.rules_ref or aidlc.get("rules_ref", "main") + + if rules_source == "local" and rules_local_path: + raw = f"local_{Path(rules_local_path).name}" + else: + path = urlparse(rules_repo).path.rstrip("/") + repo_name = Path(path).stem + raw = f"{repo_name}_{rules_ref}" + + slug = raw.replace(" ", "-") + slug = re.sub(r"[^a-zA-Z0-9._-]", "", slug) + return slug[:_SLUG_MAX_LEN] + + +def stage_execute(args: argparse.Namespace, cfg_data: dict) -> Path | None: """Stage 1: Run the AIDLC workflow via packages/execution. + The run folder is pre-allocated here with the same timestamp+slug format + used by runner.py, then passed as the exact --output-dir. This makes the + folder deterministic and eliminates all post-hoc discovery, which is + required for safe parallel execution. + Returns the run folder even if the runner exits non-zero, as long as aidlc-docs were produced (the swarm may fail on a late handoff after all documents are already written). """ + # Pre-allocate the run folder with the same naming convention as runner.py. 
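+    # e.g. runs/sci-calc/20260224T214917-aidlc-workflows_main (illustrative path)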
+ # Passing a timestamped path as --output-dir triggers runner.py's Mode 1 + # (use the path directly rather than creating a new timestamped subfolder). + parent_dir = args.output_dir + if not parent_dir and hasattr(args, "_scenario_name"): + parent_dir = REPO_ROOT / "runs" / args._scenario_name + parent_dir = parent_dir or (REPO_ROOT / "runs") + parent_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + slug = _rules_slug(cfg_data, args) + run_folder = parent_dir / f"{timestamp}-{slug}" + cmd = [ sys.executable, "-m", "aidlc_runner", "--vision", str(args.vision), "--config", str(args.config), + "--output-dir", str(run_folder), ] if args.tech_env: cmd += ["--tech-env", str(args.tech_env)] @@ -326,12 +318,10 @@ def stage_execute(args: argparse.Namespace) -> Path | None: cmd += ["--executor-model", args.executor_model] if args.rules_ref: cmd += ["--rules-ref", args.rules_ref] - # Route output under runs// by default - output_dir = args.output_dir - if not output_dir and hasattr(args, "_scenario_name"): - output_dir = REPO_ROOT / "runs" / args._scenario_name - if output_dir: - cmd += ["--output-dir", str(output_dir)] + if args.rules_repo: + cmd += ["--rules-repo", args.rules_repo] + if args.openapi and args.openapi.is_file(): + cmd += ["--openapi", str(args.openapi)] env_pythonpath = os.pathsep.join([ str(PACKAGES / "execution" / "src"), @@ -339,22 +329,9 @@ def stage_execute(args: argparse.Namespace) -> Path | None: ]) env = {**os.environ, "PYTHONPATH": env_pythonpath} - # Determine the output directory so we can read the sentinel file after. - effective_output_dir = output_dir or (REPO_ROOT / "runs") - - # Snapshot for the legacy fallback (in case the runner doesn't write - # the sentinel, e.g. older runner versions). - existing_runs = _list_run_folders(output_dir) - result = _run_cmd(cmd, "Stage 1: AIDLC Workflow Execution", env=env) - # Prefer the sentinel file written by create_run_folder() — it avoids - # the TOCTOU race inherent in before/after directory listing. - run_folder = _read_run_sentinel(effective_output_dir) - if run_folder is None: - # Fall back to directory-diff for backwards compatibility. - run_folder = _find_new_run(existing_runs, output_dir) - if run_folder is None: + if not run_folder.is_dir(): return None docs_dir = run_folder / "aidlc-docs" @@ -652,6 +629,10 @@ def build_parser() -> argparse.ArgumentParser: "--rules-ref", default=None, help="Git ref (branch/tag/commit) for AIDLC rules (overrides config value)", ) + parser.add_argument( + "--rules-repo", default=None, + help="Git repository URL for AIDLC rules (overrides config aidlc.rules_repo)", + ) parser.add_argument( "--executor-model", default=None, help="Override executor model ID", @@ -789,7 +770,7 @@ def main() -> None: print(f" Sandbox: {'enabled' if args.sandbox else 'disabled'}") # Stage 1: Execute the AIDLC workflow - run_folder = stage_execute(args) + run_folder = stage_execute(args, cfg_data) if run_folder is None: print("\n[ABORT] Execution stage failed.", file=sys.stderr) sys.exit(1) diff --git a/scripts/aidlc-evaluator/scripts/run_git_compare.py b/scripts/aidlc-evaluator/scripts/run_git_compare.py new file mode 100644 index 00000000..74b7a09f --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/run_git_compare.py @@ -0,0 +1,1398 @@ +#!/usr/bin/env python3 +"""Git Version Comparison Runner — compare multiple versions of AIDLC rules. 
+ +Runs the AIDLC evaluation pipeline against multiple versions, where each +version specifies a git ref and optionally its own repository URL (GitHub, +GitLab, any git host), executor model, and base config. Supports repeated +runs per version for non-determinism analysis. + +Generates per-scenario detail reports (raw numbers per run) and a rollup +report with avg +/- std dev aggregated across repeated runs. + +Usage: + # Simple ref comparison (all refs share the repo URL from config) + python run.py git-compare \\ + --refs main,feat/my-feature \\ + --scenarios sci-calc \\ + --runs-per-ref 3 + + # Per-version sources via a versions file (different repos, models, etc.) + python run.py git-compare \\ + --versions-file versions.yaml \\ + --scenarios sci-calc,all-stages \\ + --runs-per-ref 2 + + # Incremental mode: add new versions to existing comparison + python run.py git-compare \\ + --versions-file versions-expanded.yaml \\ + --scenarios sci-calc \\ + --runs-per-ref 2 \\ + --runs-dir runs/sci-calc/git-compare \\ + --incremental + + # Parallel execution: run up to 3 evaluations concurrently + python run.py git-compare \\ + --versions-file versions.yaml \\ + --scenarios sci-calc \\ + --runs-per-ref 3 \\ + --max-parallel 3 + + # Regenerate reports from existing runs + python run.py git-compare-report \\ + --runs-dir runs/sci-calc/git-compare + +Versions file format (versions.yaml): + versions: + - name: main-github + ref: main + repo: https://github.com/awslabs/aidlc-workflows.git + + - name: my-feature-gitlab + ref: feat/new-rules + repo: https://gitlab.com/myorg/aidlc-fork.git + executor_model: global.anthropic.claude-sonnet-4-6-v1 # optional + config: config/sonnet-4-6.yaml # optional + +Incremental mode: + In incremental mode (--incremental), the script: + 1. Loads existing git-compare-summary.yaml from --runs-dir + 2. Identifies which versions have already been tested + 3. Runs evaluations ONLY for new versions not in the existing summary + 4. Merges new results with existing data + 5. Regenerates all reports with the complete dataset + + Example workflow: + # Week 1: Test 2 versions + python run.py git-compare --versions-file v1-2.yaml --scenarios sci-calc --runs-per-ref 3 + + # Week 2: Add 3rd version (only runs 1 new version, ~30 min vs ~90 min) + python run.py git-compare --versions-file v1-3.yaml --scenarios sci-calc --runs-per-ref 3 \\ + --runs-dir runs/sci-calc/git-compare --incremental + + Use --force-rerun to re-run versions that already exist in the summary. 
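+
+Output layout (see write_reports() and main() below):
+    <runs-dir>/git-compare-summary.yaml             # top-level summary of all runs
+    <runs-dir>/comparison/<scenario>-report.md      # per-scenario detail (plus .yaml)
+    <runs-dir>/comparison/rollup-report.md          # avg +/- std dev rollup
+    <runs-dir>/comparison/rollup-data.yaml
+    <runs-dir>/comparison/interactive-report.html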
+""" + +from __future__ import annotations + +import argparse +import math +import os +import shutil +import subprocess +import sys +import threading +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field +from datetime import UTC, datetime +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent +CONFIG_DIR = REPO_ROOT / "config" +DEFAULT_CONFIG = CONFIG_DIR / "default.yaml" +TEST_CASES_DIR = REPO_ROOT / "test_cases" +SCRIPTS_DIR = REPO_ROOT / "scripts" + +# Add shared and reporting packages to path +sys.path.insert(0, str(REPO_ROOT / "packages" / "shared" / "src")) +sys.path.insert(0, str(REPO_ROOT / "packages" / "reporting" / "src")) + +from shared.scenario import resolve_scenario, Scenario # noqa: E402 +from reporting.baseline import BaselineMetrics, extract_baseline # noqa: E402 +from reporting.collector import collect # noqa: E402 + + +# ── Version spec ─────────────────────────────────────────────────────────────── + + +@dataclass +class Version: + """A single version to compare — a named (repo, ref) pair with optional overrides.""" + + name: str + """Display label used in report column headers and run folder names.""" + + ref: str + """Git ref: branch name, tag, or commit SHA.""" + + repo: str | None = None + """Git repository URL. None means use the value from the base config YAML.""" + + executor_model: str | None = None + """Per-version executor model override. None means use the global default.""" + + config: Path | None = None + """Per-version base config YAML. None means use the global --config value.""" + + +def parse_versions_file(path: Path) -> list[Version]: + """Load a versions YAML file and return a list of Version objects. + + Expected format:: + + versions: + - name: main-github + ref: main + repo: https://github.com/awslabs/aidlc-workflows.git + - name: my-feature + ref: feat/my-feature + repo: https://gitlab.com/myorg/fork.git + executor_model: global.anthropic.claude-sonnet-4-6-v1 + config: config/sonnet-4-6.yaml # resolved relative to versions file + + ``repo``, ``executor_model``, and ``config`` are all optional. + """ + with open(path, encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + + raw = data.get("versions", []) + if not raw: + raise ValueError(f"versions file {path} contains no 'versions' list") + + versions: list[Version] = [] + for i, v in enumerate(raw): + name = v.get("name", "").strip() + if not name: + raise ValueError(f"version entry {i + 1} in {path} is missing 'name'") + ref = v.get("ref", "").strip() + if not ref: + raise ValueError(f"version '{name}' in {path} is missing 'ref'") + + cfg_override: Path | None = None + if v.get("config"): + cfg_path = Path(v["config"]) + if not cfg_path.is_absolute(): + cfg_path = path.parent / cfg_path + cfg_override = cfg_path + + versions.append(Version( + name=name, + ref=ref, + repo=v.get("repo") or None, + executor_model=v.get("executor_model") or None, + config=cfg_override, + )) + + return versions + + +def versions_from_refs(refs: list[str]) -> list[Version]: + """Build a list of Versions from a plain list of git refs. + + The version name is derived from the ref by replacing '/' with '_' and + truncating to 40 characters (same slug logic used for folder names). + The repo field is left None so each run inherits the repo URL from config. 
+ """ + return [Version(name=ref_to_slug(ref), ref=ref) for ref in refs] + + +# ── Metrics and formatting ───────────────────────────────────────────────────── + + +# Metric rows used in all reports: (display_name, attr_name, higher_is_better) +# attr_name="" marks a section-header row (no data cell). +# "wall_clock_min" is a computed alias for wall_clock_ms / 60000. +METRIC_ROWS: list[tuple[str, str, bool]] = [ + ("**Unit Tests**", "", True), + ("Pass %", "tests_pass_pct", True), + ("Passed", "tests_passed", True), + ("Failed", "tests_failed", False), + ("Total", "tests_total", True), + ("Coverage %", "coverage_pct", True), + ("**Contract Tests**", "", True), + ("Passed", "contract_passed", True), + ("Failed", "contract_failed", False), + ("Total", "contract_total", True), + ("**Code Quality**", "", True), + ("Lint Errors", "lint_errors", False), + ("Lint Warnings", "lint_warnings", False), + ("Lint Total", "lint_total", False), + ("Security Findings", "security_total", False), + ("Security High", "security_high", False), + ("Duplication Blocks", "duplication_blocks", False), + ("**Qualitative**", "", True), + ("Overall Score", "qualitative_score", True), + ("Inception Score", "inception_score", True), + ("Construction Score", "construction_score", True), + ("**Artifacts**", "", True), + ("Source Files", "source_files", True), + ("Test Files", "test_files", True), + ("Total Files", "total_files", True), + ("Lines of Code", "lines_of_code", True), + ("Doc Files", "doc_files", True), + ("**Execution**", "", True), + ("Total Tokens", "total_tokens", False), + ("Executor Tokens", "executor_total_tokens", False), + ("Simulator Tokens", "simulator_total_tokens", False), + ("Wall Clock (min)", "wall_clock_min", False), + ("Handoffs", "handoffs", False), + ("**Context Size**", "", True), + ("Max Tokens", "context_size_max", False), + ("Avg Tokens", "context_size_avg", False), + ("Median Tokens", "context_size_median", False), +] + + +def ref_to_slug(ref: str, max_len: int = 40) -> str: + """Convert a git ref or version name to a filesystem-safe slug. + + Replaces '/' with '_' and truncates to max_len characters. 
+ """ + return ref.replace("/", "_")[:max_len] + + +def get_metric_value(metrics: BaselineMetrics, attr: str) -> float | None: + """Extract a metric value from BaselineMetrics, handling the wall_clock_min alias.""" + if attr == "wall_clock_min": + return metrics.wall_clock_ms / 60000 if metrics.wall_clock_ms else None + return getattr(metrics, attr, None) + + +def format_num(val: float | int | None, decimals: int = 1) -> str: + """Format a number for display, returning em-dash for None.""" + if val is None: + return "\u2014" + if isinstance(val, float): + return f"{val:.{decimals}f}" + return str(val) + + +def _mean(values: list[float]) -> float: + return sum(values) / len(values) + + +def _stdev(values: list[float]) -> float: + if len(values) < 2: + return 0.0 + m = _mean(values) + return math.sqrt(sum((v - m) ** 2 for v in values) / (len(values) - 1)) + + +def load_run_metrics(run_folder: Path) -> BaselineMetrics | None: + """Load evaluation metrics from a run folder.""" + try: + data = collect(run_folder) + return extract_baseline(data) + except Exception as e: + print(f" [WARN] Failed to collect metrics from {run_folder}: {e}", file=sys.stderr) + return None + + +# ── Execution ────────────────────────────────────────────────────────────────── + + +def run_single_evaluation( + version: Version, + scenario: Scenario, + run_index: int, + runs_per_ref: int, + runs_dir: Path, + base_config: Path, + profile: str, + region: str, + scorer_model: str, + default_executor_model: str | None, + use_sandbox: bool, +) -> dict: + """Run a single evaluation for one (version, scenario, run_index) combination. + + The effective config, executor model, and rules repo/ref are resolved + by layering version-level overrides on top of the global defaults. + + Returns a summary dict describing the run result. 
+ """ + effective_config = version.config or base_config + effective_executor = version.executor_model or default_executor_model + folder_slug = ref_to_slug(version.name) + + # Generate folder name upfront - orchestrator controls the output location + timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%S-%f") + folder_name = f"{timestamp}-{folder_slug}" + run_folder = runs_dir / folder_name + + _safe_print(f"\n{'=' * 70}") + _safe_print(f" Version: {version.name}") + _safe_print(f" Ref: {version.ref}") + if version.repo: + _safe_print(f" Repo: {version.repo}") + _safe_print(f" Scenario: {scenario.name}") + _safe_print(f" Run: {run_index}/{runs_per_ref}") + _safe_print(f"{'=' * 70}\n") + + cmd = [ + sys.executable, str(SCRIPTS_DIR / "run_evaluation.py"), + "--config", str(effective_config), + "--vision", str(scenario.vision_path), + "--golden", str(scenario.golden_aidlc_docs_path), + "--profile", profile, + "--region", region, + "--scorer-model", scorer_model, + "--rules-ref", version.ref, + "--report-format", "both", + "--output-dir", str(run_folder), # Pass full folder path, not parent dir + ] + + if version.repo: + cmd += ["--rules-repo", version.repo] + if scenario.tech_env_path.is_file(): + cmd += ["--tech-env", str(scenario.tech_env_path)] + if scenario.openapi_path.is_file(): + cmd += ["--openapi", str(scenario.openapi_path)] + if scenario.golden_baseline_path.is_file(): + cmd += ["--baseline", str(scenario.golden_baseline_path)] + if effective_executor: + cmd += ["--executor-model", effective_executor] + cmd.append("--sandbox" if use_sandbox else "--no-sandbox") + + # Create log directory + log_dir = runs_dir / ".git-compare-logs" + log_dir.mkdir(parents=True, exist_ok=True) + log_path = log_dir / f"{timestamp}-{folder_slug}-{scenario.name}-run{run_index}.log" + + start_monotonic = time.monotonic() # Track elapsed time + started_at = datetime.now(UTC).isoformat(timespec="seconds") + runs_dir.mkdir(parents=True, exist_ok=True) + + with open(log_path, "w", encoding="utf-8") as log_file: + # Write header to identify this run in the log + log_file.write(f"=== Git-Compare Run Log ===\n") + log_file.write(f"Version: {version.name}\n") + log_file.write(f"Ref: {version.ref}\n") + log_file.write(f"Repo: {version.repo or '(from config)'}\n") + log_file.write(f"Scenario: {scenario.name}\n") + log_file.write(f"Run: {run_index}/{runs_per_ref}\n") + log_file.write(f"Started: {started_at}\n") + log_file.write(f"{'=' * 70}\n\n") + log_file.flush() + + result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT) # nosec B603 + + elapsed_s = time.monotonic() - start_monotonic + status = "success" if result.returncode == 0 else "failed" + _safe_print( + f" [{status.upper()}] version={version.name}, scenario={scenario.name}, " + f"run={run_index} \u2014 {elapsed_s / 60:.1f} min (exit {result.returncode})" + ) + + # We told run_evaluation.py exactly where to write, so use that folder + if run_folder.is_dir(): + output_dir = run_folder + _safe_print(f" Output: {output_dir.name}") + + # Move log file with descriptive name + final_log_name = f"git-compare-{folder_slug}-{scenario.name}-run{run_index}.log" + final_log_path = output_dir / final_log_name + shutil.move(str(log_path), str(final_log_path)) + + # Write metadata to identify this run + meta = { + "git_compare_version_name": version.name, + "git_compare_ref": version.ref, + "git_compare_repo": version.repo, + "git_compare_scenario": scenario.name, + "git_compare_run_index": run_index, + "git_compare_runs_per_version": runs_per_ref, + } + 
with open(output_dir / "git-compare-meta.yaml", "w", encoding="utf-8") as f: + yaml.safe_dump(meta, f, default_flow_style=False, sort_keys=False) + else: + _safe_print(f" [WARN] Run folder not created: {run_folder}") + output_dir = run_folder # Use the expected path even if it doesn't exist + output_dir.mkdir(parents=True, exist_ok=True) + shutil.move(str(log_path), str(output_dir / "git-compare-run.log")) + + # Clean temp log dir if empty + if log_dir.exists() and not any(log_dir.iterdir()): + log_dir.rmdir() + + return { + "version_name": version.name, + "ref": version.ref, + "repo": version.repo, + "scenario": scenario.name, + "run_index": run_index, + "started_at": started_at, + "elapsed_seconds": round(elapsed_s, 1), + "exit_code": result.returncode, + "status": status, + "output_dir": str(output_dir), + } + + +# ── Report generation ────────────────────────────────────────────────────────── + + +def _run_label(res: dict) -> str: + """Column label for an individual run in the detail report.""" + return f"{res['version_name']} run-{res['run_index']}" + + +def generate_scenario_detail_report( + scenario_name: str, + version_names: list[str], + run_results: list[dict], + generated_at: str, +) -> str: + """Generate a per-scenario detail report with one column per individual run. + + Columns are ordered by version (preserving the order in version_names) + then by run index. Each cell contains the raw numeric value for that run. + """ + lines: list[str] = [ + f"# Git Version Comparison \u2014 {scenario_name}", + "", + f"**Scenario:** {scenario_name}", + f"**Generated:** {generated_at}", + "", + "## Run Detail (Raw Numbers)", + "", + "Each column is one individual run. " + "Runs are grouped by version (in the order specified) then sorted by run index.", + "", + ] + + version_order = {n: i for i, n in enumerate(version_names)} + sorted_results = sorted( + run_results, + key=lambda r: (version_order.get(r["version_name"], 999), r["run_index"]), + ) + + col_labels: list[str] = [] + col_metrics: list[BaselineMetrics | None] = [] + for res in sorted_results: + col_labels.append(_run_label(res)) + folder = Path(res["output_dir"]) + col_metrics.append(load_run_metrics(folder) if folder.is_dir() else None) + + header = "| Metric |" + separator = "|--------|" + for label in col_labels: + header += f" {label} |" + separator += "---------|" + lines.append(header) + lines.append(separator) + + for display_name, attr, _ in METRIC_ROWS: + if not attr: + row = f"| {display_name} |" + for _ in col_labels: + row += " |" + lines.append(row) + continue + + row = f"| {display_name} |" + for metrics in col_metrics: + if metrics is None: + row += " \u2014 |" + else: + val = get_metric_value(metrics, attr) + row += f" {format_num(val)} |" + lines.append(row) + + lines.append("") + + # Run status table + lines.extend([ + "", + "## Run Status", + "", + "| Version | Ref | Repo | Run | Status | Duration (min) | Output |", + "|---------|-----|------|-----|--------|----------------|--------|", + ]) + for res in sorted_results: + marker = "PASS" if res["status"] == "success" else "FAIL" + duration = res.get("elapsed_seconds", 0) / 60 + repo_display = res.get("repo") or "*(from config)*" + lines.append( + f"| {res['version_name']} | {res['ref']} | {repo_display} " + f"| {res['run_index']} | {marker} | {duration:.1f} " + f"| `{res['output_dir']}` |" + ) + lines.append("") + + return "\n".join(lines) + + +def _build_rollup_section( + scenario_name: str, + version_names: list[str], + run_results: list[dict], +) -> list[str]: 
+ """Build markdown lines for one scenario's rollup table (avg +/- std dev).""" + lines: list[str] = [ + f"## Scenario: {scenario_name}", + "", + ] + + # Group loaded metrics by version name + version_metrics: dict[str, list[BaselineMetrics]] = {n: [] for n in version_names} + for res in run_results: + vn = res["version_name"] + folder = Path(res["output_dir"]) + if folder.is_dir(): + m = load_run_metrics(folder) + if m is not None: + version_metrics.setdefault(vn, []).append(m) + + # Build column descriptors: (header_label, version_name, metrics_list) + columns: list[tuple[str, str, list[BaselineMetrics]]] = [] + for vn in version_names: + mlist = version_metrics.get(vn, []) + columns.append((f"{vn} (n={len(mlist)})", vn, mlist)) + + if not any(mlist for _, _, mlist in columns): + lines.append("_No metrics available for this scenario._") + return lines + + baseline_name = version_names[0] if version_names else None + + header = "| Metric |" + separator = "|--------|" + for label, _, _ in columns: + header += f" {label} |" + separator += "---------|" + lines.append(header) + lines.append(separator) + + for display_name, attr, higher_is_better in METRIC_ROWS: + if not attr: + row = f"| {display_name} |" + for _ in columns: + row += " |" + lines.append(row) + continue + + # Compute per-version (avg, stdev) + version_stats: list[tuple[float | None, float | None]] = [] + for _, vn, mlist in columns: + vals = [v for v in (get_metric_value(m, attr) for m in mlist) if v is not None] + if not vals: + version_stats.append((None, None)) + elif len(vals) == 1: + version_stats.append((vals[0], None)) + else: + version_stats.append((_mean(vals), _stdev(vals))) + + baseline_avg = version_stats[0][0] if version_stats else None + + row = f"| {display_name} |" + for i, (_label, _vn, _mlist) in enumerate(columns): + avg, std = version_stats[i] + if avg is None: + row += " \u2014 |" + continue + + cell = format_num(avg) + if std is not None and std > 0: + cell += f" \u00b1 {format_num(std)}" + + # Delta indicator vs baseline version (skip for the baseline column itself) + if i > 0 and baseline_avg is not None: + delta = avg - baseline_avg + if abs(delta) > 0.001: + cell += (" ^" if delta > 0 else " v") if higher_is_better \ + else (" v" if delta > 0 else " ^") + + row += f" {cell} |" + lines.append(row) + + lines.append("") + lines.append( + f"**Legend:** ^ = better than `{baseline_name}` (baseline version), " + f"v = worse. \u00b1 = sample std dev across repeated runs." + ) + + return lines + + +def generate_rollup_report( + scenarios: list[str], + version_names: list[str], + all_results: list[dict], + generated_at: str, +) -> str: + """Generate the multi-scenario rollup report (avg +/- std dev per version). + + One section per scenario; delta indicators vs the first version listed. 
+ """ + lines: list[str] = [ + "# Git Version Comparison \u2014 Rollup Report", + "", + f"**Generated:** {generated_at}", + f"**Versions:** {', '.join(version_names)}", + f"**Scenarios:** {', '.join(scenarios)}", + "", + "> Values shown as `avg \u00b1 std_dev` when multiple runs were performed.", + "> ^ = better than baseline version (first version listed), v = worse.", + "", + ] + + for scenario_name in scenarios: + scenario_results = [r for r in all_results if r["scenario"] == scenario_name] + lines.extend(_build_rollup_section(scenario_name, version_names, scenario_results)) + lines.append("") + + return "\n".join(lines) + + +def write_reports( + runs_dir: Path, + scenarios: list[str], + version_names: list[str], + all_results: list[dict], + generated_at: str, +) -> None: + """Write all per-scenario detail reports and the rollup report to disk. + + Outputs are written to /comparison/: + - -report.md / -report.yaml (one per scenario) + - rollup-report.md / rollup-data.yaml + """ + comparison_dir = runs_dir / "comparison" + comparison_dir.mkdir(parents=True, exist_ok=True) + + for scenario_name in scenarios: + scenario_results = [r for r in all_results if r["scenario"] == scenario_name] + if not scenario_results: + continue + + print(f" Writing detail report: {scenario_name}...") + md = generate_scenario_detail_report( + scenario_name=scenario_name, + version_names=version_names, + run_results=scenario_results, + generated_at=generated_at, + ) + md_path = comparison_dir / f"{scenario_name}-report.md" + md_path.write_text(md, encoding="utf-8") + print(f" {md_path}") + + yaml_data: dict = { + "generated_at": generated_at, + "scenario": scenario_name, + "version_names": version_names, + "runs": [ + { + "version_name": r["version_name"], + "ref": r["ref"], + "repo": r.get("repo"), + "run_index": r["run_index"], + "status": r["status"], + "elapsed_seconds": r.get("elapsed_seconds"), + "output_dir": r["output_dir"], + } + for r in sorted(scenario_results, key=lambda x: (x["version_name"], x["run_index"])) + ], + } + yaml_path = comparison_dir / f"{scenario_name}-report.yaml" + with open(yaml_path, "w", encoding="utf-8") as f: + yaml.safe_dump(yaml_data, f, default_flow_style=False, sort_keys=False) + print(f" {yaml_path}") + + print(" Writing rollup report...") + rollup_md = generate_rollup_report( + scenarios=scenarios, + version_names=version_names, + all_results=all_results, + generated_at=generated_at, + ) + rollup_md_path = comparison_dir / "rollup-report.md" + rollup_md_path.write_text(rollup_md, encoding="utf-8") + print(f" {rollup_md_path}") + + rollup_yaml: dict = { + "generated_at": generated_at, + "version_names": version_names, + "scenarios": scenarios, + "runs": all_results, + } + rollup_yaml_path = comparison_dir / "rollup-data.yaml" + with open(rollup_yaml_path, "w", encoding="utf-8") as f: + yaml.safe_dump(rollup_yaml, f, default_flow_style=False, sort_keys=False) + print(f" {rollup_yaml_path}") + + print(" Writing interactive HTML report...") + from generate_html_report import generate_interactive_html_report + html_report = generate_interactive_html_report( + scenarios=scenarios, + version_names=version_names, + all_results=all_results, + generated_at=generated_at, + runs_dir=runs_dir, + ) + html_path = comparison_dir / "interactive-report.html" + html_path.write_text(html_report, encoding="utf-8") + print(f" {html_path}") + + +# ── Parallel execution ──────────────────────────────────────────────────────── + + +# Global lock for thread-safe printing +_print_lock = 
threading.Lock() + + +def _safe_print(*args, **kwargs): + """Thread-safe print for parallel execution.""" + with _print_lock: + print(*args, **kwargs) + + +@dataclass +class WorkItem: + """A single evaluation work item for parallel execution.""" + version: Version + scenario: "Scenario" + run_index: int + runs_per_ref: int + runs_dir: Path + base_config: Path + profile: str + region: str + scorer_model: str + default_executor_model: str | None + use_sandbox: bool + + +def execute_work_item(item: WorkItem) -> dict: + """Execute a single evaluation work item (thread-safe wrapper). + + This is called by ThreadPoolExecutor and wraps run_single_evaluation + with thread-safe output handling. + """ + return run_single_evaluation( + version=item.version, + scenario=item.scenario, + run_index=item.run_index, + runs_per_ref=item.runs_per_ref, + runs_dir=item.runs_dir, + base_config=item.base_config, + profile=item.profile, + region=item.region, + scorer_model=item.scorer_model, + default_executor_model=item.default_executor_model, + use_sandbox=item.use_sandbox, + ) + + +def run_parallel_evaluations( + work_items: list[WorkItem], + max_workers: int, +) -> list[dict]: + """Run evaluations in parallel with progress tracking. + + Args: + work_items: List of work items to execute + max_workers: Maximum number of concurrent workers + + Returns: + List of result dicts in original submission order + """ + all_results = [] + total = len(work_items) + completed = 0 + + _safe_print(f"\n{'=' * 70}") + _safe_print(f" Parallel Execution: {total} runs, max {max_workers} concurrent") + _safe_print(f"{'=' * 70}\n") + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all work with tracking + future_to_item = { + executor.submit(execute_work_item, item): (i, item) + for i, item in enumerate(work_items) + } + + # Collect results as they complete + for future in as_completed(future_to_item): + idx, item = future_to_item[future] + completed += 1 + + try: + result = future.result() + all_results.append((idx, result)) + + status = "✓" if result.get("status") == "success" else "✗" + duration = result.get("elapsed_seconds", 0) / 60 + _safe_print( + f" [{completed:2d}/{total}] {status} {item.version.name:30s} " + f"{item.scenario.name:15s} run-{item.run_index} ({duration:.1f} min)" + ) + + except Exception as e: + _safe_print( + f" [{completed:2d}/{total}] ✗ {item.version.name:30s} " + f"{item.scenario.name:15s} run-{item.run_index} ERROR: {e}" + ) + # Create error result + error_result = { + "version_name": item.version.name, + "ref": item.version.ref, + "repo": item.version.repo, + "scenario": item.scenario.name, + "run_index": item.run_index, + "status": "error", + "error": str(e), + "output_dir": str( + item.runs_dir + / f"failed-{ref_to_slug(item.version.name)}-{item.scenario.name}-run{item.run_index}" + ), + } + all_results.append((idx, error_result)) + + _safe_print(f"\n{'=' * 70}") + _safe_print(f" Parallel execution complete: {completed}/{total} finished") + _safe_print(f"{'=' * 70}\n") + + # Sort by original submission order + all_results.sort(key=lambda x: x[0]) + return [result for _, result in all_results] + + +# ── Incremental mode helpers ────────────────────────────────────────────────── + + +def load_existing_summary(runs_dir: Path) -> dict | None: + """Load existing git-compare-summary.yaml if it exists. + + Returns: + Summary dict with keys: version_names, scenarios, runs_per_version, runs + Returns None if summary doesn't exist. 
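+        Entries in 'runs' are the per-run summary dicts written by earlier
+        git-compare runs (the format produced by run_single_evaluation()).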
+ + Raises: + ValueError: If summary exists but is malformed. + """ + summary_path = runs_dir / "git-compare-summary.yaml" + if not summary_path.exists(): + return None + + with open(summary_path, encoding="utf-8") as f: + summary = yaml.safe_load(f) or {} + + required_keys = ["version_names", "scenarios", "runs_per_version", "runs"] + missing = [k for k in required_keys if k not in summary] + if missing: + raise ValueError( + f"Existing summary at {summary_path} is missing required keys: {missing}" + ) + + return summary + + +def filter_new_versions( + versions: list[Version], + existing_version_names: set[str], + force_rerun: bool, +) -> tuple[list[Version], list[Version]]: + """Separate versions into new vs. already-tested. + + Args: + versions: All versions from versions file + existing_version_names: Version names from existing summary + force_rerun: If True, treat all versions as new + + Returns: + (new_versions, skipped_versions) + """ + if force_rerun: + return versions, [] + + new_versions = [v for v in versions if v.name not in existing_version_names] + skipped_versions = [v for v in versions if v.name in existing_version_names] + + return new_versions, skipped_versions + + +def merge_summaries( + existing_summary: dict, + new_results: list[dict], + new_versions: list[Version], + new_elapsed_seconds: float, +) -> dict: + """Merge new run results into existing summary. + + Args: + existing_summary: Loaded from git-compare-summary.yaml + new_results: Run results from newly executed versions + new_versions: Version objects for newly tested versions + new_elapsed_seconds: Elapsed time for new runs + + Returns: + Updated summary dict with merged data + """ + new_version_names = [v.name for v in new_versions] + + # Merge version names (preserve order: existing + new) + all_version_names = existing_summary["version_names"] + new_version_names + + # Merge version specs + existing_version_specs = existing_summary.get("versions", []) + new_version_specs = [ + { + "name": v.name, + "ref": v.ref, + "repo": v.repo, + "executor_model": v.executor_model, + } + for v in new_versions + ] + all_version_specs = existing_version_specs + new_version_specs + + # Merge run results + all_runs = existing_summary["runs"] + new_results + + # Update counts + runs_succeeded = sum(1 for r in all_runs if r.get("status") == "success") + runs_failed = sum(1 for r in all_runs if r.get("status") != "success") + + # Track incremental runs + incremental_runs = existing_summary.get("incremental_runs", []) + incremental_runs.append({ + "added_at": datetime.now(UTC).isoformat(timespec="seconds"), + "versions_added": new_version_names, + "runs_added": len(new_results), + "elapsed_seconds": round(new_elapsed_seconds, 1), + }) + + return { + "started_at": existing_summary["started_at"], # Keep original start time + "generated_at": datetime.now(UTC).isoformat(timespec="seconds"), # Update + "total_elapsed_seconds": existing_summary["total_elapsed_seconds"], # Original only + "incremental_runs": incremental_runs, # Track all incremental additions + "version_names": all_version_names, + "versions": all_version_specs, + "scenarios": existing_summary["scenarios"], + "runs_per_version": existing_summary["runs_per_version"], + "total_runs": len(all_runs), + "runs_succeeded": runs_succeeded, + "runs_failed": runs_failed, + "runs": all_runs, + } + + +# ── CLI ──────────────────────────────────────────────────────────────────────── + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="run_git_compare", + 
description=(
+            "Compare multiple versions of AIDLC rules across scenarios and repeated runs. "
+            "Each version can target a different git repository (GitHub, GitLab, etc.), "
+            "ref, executor model, and base config."
+        ),
+    )
+
+    # Version specification — mutually exclusive
+    version_group = parser.add_mutually_exclusive_group(required=True)
+    version_group.add_argument(
+        "--refs", type=str, default=None,
+        help=(
+            "Comma-separated git refs to compare. "
+            "All refs share the repository URL from the base config YAML. "
+            "Use --versions-file when different repos or per-version settings are needed."
+        ),
+    )
+    version_group.add_argument(
+        "--versions-file", type=Path, default=None,
+        help=(
+            "Path to a YAML file defining named versions with per-version repo URL, "
+            "ref, executor model, and config overrides. "
+            "Mutually exclusive with --refs."
+        ),
+    )
+
+    parser.add_argument(
+        "--scenarios", type=str, default="sci-calc",
+        help="Comma-separated scenario names (default: sci-calc)",
+    )
+    parser.add_argument(
+        "--runs-per-ref", type=int, default=1,
+        help="Number of evaluation runs per (version, scenario) pair (default: 1)",
+    )
+
+    # Global config (can be overridden per-version via versions file)
+    parser.add_argument(
+        "--config", type=Path, default=DEFAULT_CONFIG,
+        help="Base config YAML (default: config/default.yaml)",
+    )
+    parser.add_argument("--profile", default=None, help="AWS profile")
+    parser.add_argument("--region", default=None, help="AWS region")
+    parser.add_argument(
+        "--executor-model", default=None,
+        help="Default executor model ID (can be overridden per-version in versions file)",
+    )
+    parser.add_argument("--scorer-model", default=None, help="Override scorer model ID")
+
+    # Output
+    parser.add_argument(
+        "--runs-dir", type=Path, default=None,
+        help=(
+            "Base directory for all run outputs. "
+            "Defaults to runs/<scenario>/git-compare/ for a single scenario "
+            "or runs/git-compare/ when multiple scenarios are specified."
+        ),
+    )
+
+    # Sandbox
+    sandbox_group = parser.add_mutually_exclusive_group()
+    sandbox_group.add_argument(
+        "--sandbox", action="store_true", default=True,
+        help="Run generated code in Docker sandbox (default)",
+    )
+    sandbox_group.add_argument(
+        "--no-sandbox", action="store_false", dest="sandbox",
+        help="Run generated code directly on host (no isolation)",
+    )
+
+    # Incremental mode
+    parser.add_argument(
+        "--incremental", action="store_true", default=False,
+        help=(
+            "Incremental mode: only run evaluations for versions not present in "
+            "existing git-compare-summary.yaml, then merge results and regenerate "
+            "reports. Requires --runs-dir to point to an existing git-compare output."
+        ),
+    )
+    parser.add_argument(
+        "--force-rerun", action="store_true", default=False,
+        help=(
+            "With --incremental, re-run evaluations for versions that already exist "
+            "in the summary (default: skip existing versions)."
+        ),
+    )
+
+    # Parallel execution
+    parser.add_argument(
+        "--max-parallel", type=int, default=1,
+        help=(
+            "Maximum number of evaluations to run in parallel (default: 1). "
+            "Recommended: 2-4 depending on system resources. Each parallel run "
+            "consumes ~2GB RAM and spawns a Docker container in sandbox mode. "
+            "Higher values may hit Bedrock API rate limits."
+ ), + ) + + args = parser.parse_args() + + # Build version list + versions: list[Version] + if args.versions_file: + if not args.versions_file.exists(): + parser.error(f"versions file not found: {args.versions_file}") + try: + versions = parse_versions_file(args.versions_file) + except (ValueError, yaml.YAMLError) as e: + parser.error(str(e)) + else: + refs = [r.strip() for r in args.refs.split(",") if r.strip()] + if not refs: + parser.error("--refs must specify at least one git ref") + versions = versions_from_refs(refs) + + if not versions: + parser.error("No versions to compare") + + # Parse scenarios + scenario_names = [s.strip() for s in args.scenarios.split(",") if s.strip()] + if not scenario_names: + parser.error("--scenarios must specify at least one scenario name") + + resolved_scenarios: list[Scenario] = [] + for name in scenario_names: + try: + resolved_scenarios.append(resolve_scenario(name, TEST_CASES_DIR)) + except FileNotFoundError as e: + parser.error(str(e)) + + # Default runs_dir + if args.runs_dir is None: + if len(resolved_scenarios) == 1: + args.runs_dir = REPO_ROOT / "runs" / resolved_scenarios[0].name / "git-compare" + else: + args.runs_dir = REPO_ROOT / "runs" / "git-compare" + + # Load base config for credential/model defaults + base_cfg: dict = {} + if args.config and args.config.exists(): + with open(args.config, encoding="utf-8") as f: + base_cfg = yaml.safe_load(f) or {} + + if args.profile is None: + args.profile = base_cfg.get("aws", {}).get("profile") + # Allow None profile to use default credentials (e.g., EC2 instance role) + # Just ensure it's explicitly set to something (even if None) + + if args.region is None: + args.region = base_cfg.get("aws", {}).get("region") + if args.region is None: + parser.error("--region is required (or set aws.region in config YAML)") + + if args.scorer_model is None: + args.scorer_model = base_cfg.get("models", {}).get("scorer", {}).get("model_id") + if args.scorer_model is None: + parser.error("--scorer-model is required (or set models.scorer.model_id in config YAML)") + + # Validate parallel execution settings + if args.max_parallel < 1: + parser.error("--max-parallel must be >= 1") + + if args.max_parallel > 8: + print( + f"WARNING: --max-parallel {args.max_parallel} is quite high. " + f"Each parallel run consumes ~2GB RAM and may hit Bedrock rate limits.", + file=sys.stderr + ) + + # Suggest optimal settings based on system resources + cpu_count = os.cpu_count() or 1 + if args.max_parallel > cpu_count: + print( + f"INFO: --max-parallel {args.max_parallel} exceeds CPU count ({cpu_count}). 
" + f"Consider using --max-parallel {min(cpu_count, 4)} for optimal performance.", + file=sys.stderr + ) + + # Handle incremental mode + existing_summary = None + skipped_versions: list[Version] = [] + all_versions = versions # Keep reference to all versions for final version_names + + if args.incremental: + if not args.runs_dir: + parser.error("--incremental requires --runs-dir to be specified") + if not args.runs_dir.exists(): + parser.error(f"--runs-dir does not exist: {args.runs_dir}") + + try: + existing_summary = load_existing_summary(args.runs_dir) + except ValueError as e: + parser.error(str(e)) + + if existing_summary is None: + parser.error( + f"--incremental requires existing git-compare-summary.yaml in {args.runs_dir}" + ) + + # Validate consistency + existing_scenarios = existing_summary["scenarios"] + if set(scenario_names) != set(existing_scenarios): + parser.error( + f"Scenarios mismatch: new={scenario_names}, existing={existing_scenarios}" + ) + if args.runs_per_ref != existing_summary["runs_per_version"]: + parser.error( + f"--runs-per-ref mismatch: new={args.runs_per_ref}, " + f"existing={existing_summary['runs_per_version']}" + ) + + # Filter versions + existing_version_names = set(existing_summary["version_names"]) + new_versions, skipped_versions = filter_new_versions( + versions, existing_version_names, args.force_rerun + ) + + if skipped_versions: + print("Git Version Comparison (Incremental Mode)") + print(f" Skipping {len(skipped_versions)} already-tested versions:") + for v in skipped_versions: + print(f" - {v.name}") + print() + + if not new_versions: + print("No new versions to test. Regenerating reports from existing data...\n") + write_reports( + runs_dir=args.runs_dir, + scenarios=scenario_names, + version_names=existing_summary["version_names"], + all_results=existing_summary["runs"], + generated_at=datetime.now(UTC).isoformat(timespec="seconds"), + ) + print(f"\n Results: {args.runs_dir}") + sys.exit(0) + + versions = new_versions # Only run new versions + + version_names = [v.name for v in versions] + total_runs = len(versions) * len(resolved_scenarios) * args.runs_per_ref + + mode_str = "Git Version Comparison (Incremental Mode)" if args.incremental else "Git Version Comparison" + print(mode_str) + if args.incremental and existing_summary: + print(f" Existing vers: {len(existing_summary['version_names'])} ({', '.join(existing_summary['version_names'])})") + print(f" New versions: {len(version_names)} ({', '.join(version_names)})") + else: + print(f" Versions: {', '.join(version_names)}") + print(f" Scenarios: {', '.join(s.name for s in resolved_scenarios)}") + print(f" Runs per ver: {args.runs_per_ref}") + print(f" Total runs: {total_runs}") + print(f" Max parallel: {args.max_parallel}") + print(f" Profile: {args.profile}") + print(f" Region: {args.region}") + print(f" Scorer: {args.scorer_model}") + print(f" Output: {args.runs_dir}") + for v in versions: + repo_display = v.repo or "*(from config)*" + model_display = v.executor_model or args.executor_model or "*(from config)*" + print(f" [{v.name}] ref={v.ref} repo={repo_display} model={model_display}") + + overall_start = time.monotonic() + overall_started_at = datetime.now(UTC).isoformat(timespec="seconds") + + # Choose execution mode based on --max-parallel + if args.max_parallel == 1: + # Sequential execution (original behavior) + all_results: list[dict] = [] + for version in versions: + for scenario in resolved_scenarios: + for run_idx in range(1, args.runs_per_ref + 1): + try: + summary = 
run_single_evaluation( + version=version, + scenario=scenario, + run_index=run_idx, + runs_per_ref=args.runs_per_ref, + runs_dir=args.runs_dir, + base_config=args.config, + profile=args.profile, + region=args.region, + scorer_model=args.scorer_model, + default_executor_model=args.executor_model, + use_sandbox=args.sandbox, + ) + all_results.append(summary) + except Exception as e: + print( + f"\n[ERROR] Failed version={version.name}, " + f"scenario={scenario.name}, run={run_idx}: {e}", + file=sys.stderr, + ) + all_results.append({ + "version_name": version.name, + "ref": version.ref, + "repo": version.repo, + "scenario": scenario.name, + "run_index": run_idx, + "status": "error", + "error": str(e), + "output_dir": str( + args.runs_dir + / f"failed-{ref_to_slug(version.name)}-{scenario.name}-run{run_idx}" + ), + }) + else: + # Parallel execution + work_items = [] + for version in versions: + for scenario in resolved_scenarios: + for run_idx in range(1, args.runs_per_ref + 1): + work_items.append(WorkItem( + version=version, + scenario=scenario, + run_index=run_idx, + runs_per_ref=args.runs_per_ref, + runs_dir=args.runs_dir, + base_config=args.config, + profile=args.profile, + region=args.region, + scorer_model=args.scorer_model, + default_executor_model=args.executor_model, + use_sandbox=args.sandbox, + )) + + all_results = run_parallel_evaluations(work_items, args.max_parallel) + + overall_elapsed = time.monotonic() - overall_start + generated_at = datetime.now(UTC).isoformat(timespec="seconds") + + # Write top-level summary + args.runs_dir.mkdir(parents=True, exist_ok=True) + + # Merge results if incremental mode + report_version_names = version_names + report_all_results = all_results + if existing_summary: + print("\nMerging results with existing runs...") + summary_data = merge_summaries( + existing_summary=existing_summary, + new_results=all_results, + new_versions=versions, + new_elapsed_seconds=overall_elapsed, + ) + report_version_names = summary_data["version_names"] + report_all_results = summary_data["runs"] + else: + version_specs = [ + { + "name": v.name, + "ref": v.ref, + "repo": v.repo, + "executor_model": v.executor_model, + } + for v in versions + ] + summary_data = { + "started_at": overall_started_at, + "generated_at": generated_at, + "total_elapsed_seconds": round(overall_elapsed, 1), + "version_names": version_names, + "versions": version_specs, + "scenarios": [s.name for s in resolved_scenarios], + "runs_per_version": args.runs_per_ref, + "total_runs": total_runs, + "runs_succeeded": sum(1 for r in all_results if r.get("status") == "success"), + "runs_failed": sum(1 for r in all_results if r.get("status") != "success"), + "runs": all_results, + } + + summary_path = args.runs_dir / "git-compare-summary.yaml" + with open(summary_path, "w", encoding="utf-8") as f: + yaml.safe_dump(summary_data, f, default_flow_style=False, sort_keys=False) + print(f"\n Summary: {summary_path}") + + # Generate all reports (with merged data if incremental) + print("\nGenerating reports...") + write_reports( + runs_dir=args.runs_dir, + scenarios=[s.name for s in resolved_scenarios], + version_names=report_version_names, + all_results=report_all_results, + generated_at=generated_at, + ) + + # Final summary + print(f"\n{'=' * 70}") + print(" Git Compare Complete") + print(f"{'=' * 70}") + if existing_summary: + print(f" New runs time: {overall_elapsed / 60:.1f} min") + print(f" New runs: {len(all_results)}") + print(f" Total versions: {len(report_version_names)} 
({len(existing_summary['version_names'])} existing + {len(version_names)} new)") + print(f" Total runs: {len(report_all_results)} ({len(existing_summary['runs'])} existing + {len(all_results)} new)") + print(f" Succeeded: {sum(1 for r in report_all_results if r.get('status') == 'success')}") + print(f" Failed: {sum(1 for r in report_all_results if r.get('status') != 'success')}") + else: + print(f" Total time: {overall_elapsed / 60:.1f} min") + print(f" Total runs: {len(all_results)}") + print(f" Succeeded: {sum(1 for r in all_results if r.get('status') == 'success')}") + print(f" Failed: {sum(1 for r in all_results if r.get('status') != 'success')}") + + # Show run details (only new runs in incremental mode) + for r in all_results: + marker = "PASS" if r.get("status") == "success" else "FAIL" + duration = r.get("elapsed_seconds", 0) / 60 + print( + f" [{marker}] {r['version_name']:30s} {r['scenario']:15s} " + f"run-{r['run_index']} {duration:.1f} min" + ) + print(f"\n Results: {args.runs_dir}") + + # Exit status based on all results (including existing in incremental mode) + failed = sum(1 for r in report_all_results if r.get("status") != "success") + sys.exit(1 if failed > 0 else 0) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/uv.lock b/scripts/aidlc-evaluator/uv.lock index 0dfeca26..fe96c77e 100644 --- a/scripts/aidlc-evaluator/uv.lock +++ b/scripts/aidlc-evaluator/uv.lock @@ -22,6 +22,8 @@ name = "aidlc-cli-harness" version = "0.1.0" source = { editable = "packages/cli-harness" } dependencies = [ + { name = "anthropic", extra = ["bedrock"] }, + { name = "boto3" }, { name = "pyyaml" }, ] @@ -32,6 +34,8 @@ dev = [ [package.metadata] requires-dist = [ + { name = "anthropic", extras = ["bedrock"], specifier = ">=0.40" }, + { name = "boto3", specifier = ">=1.42.47" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" }, { name = "pyyaml", specifier = ">=6.0" }, ] @@ -57,6 +61,7 @@ name = "aidlc-evaluation-framework" version = "0.1.0" source = { virtual = "." 
} dependencies = [ + { name = "aidlc-cli-harness" }, { name = "aidlc-contracttest" }, { name = "aidlc-nonfunctional" }, { name = "aidlc-qualitative" }, @@ -80,6 +85,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "aidlc-cli-harness", editable = "packages/cli-harness" }, { name = "aidlc-contracttest", editable = "packages/contracttest" }, { name = "aidlc-nonfunctional", editable = "packages/nonfunctional" }, { name = "aidlc-qualitative", editable = "packages/qualitative" }, @@ -193,6 +199,8 @@ name = "aidlc-runner" version = "0.1.0" source = { editable = "packages/execution" } dependencies = [ + { name = "anthropic", extra = ["bedrock"] }, + { name = "boto3" }, { name = "pyyaml" }, { name = "strands-agents" }, { name = "strands-agents-tools" }, @@ -200,6 +208,8 @@ dependencies = [ [package.metadata] requires-dist = [ + { name = "anthropic", extras = ["bedrock"], specifier = ">=0.40" }, + { name = "boto3", specifier = ">=1.42.47" }, { name = "pyyaml", specifier = ">=6.0" }, { name = "strands-agents", specifier = ">=0.1.0" }, { name = "strands-agents-tools", specifier = ">=0.1.0" }, @@ -325,6 +335,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "anthropic" +version = "0.97.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "docstring-parser" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/14/93/f66ea8bfe39f2e6bb9da8e27fa5457ad2520e8f7612dfc547b17fad55c4d/anthropic-0.97.0.tar.gz", hash = "sha256:021e79fd8e21e90ad94dc5ba2bbbd8b1599f424f5b1fab6c06204009cab764be", size = 669502, upload-time = "2026-04-23T20:52:34.445Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/b6/8e851369fa661ad0fef2ae6266bf3b7d52b78ccf011720058f4adaca59e2/anthropic-0.97.0-py3-none-any.whl", hash = "sha256:8a1a472dfabcfc0c52ff6a3eecf724ac7e07107a2f6e2367be55ceb42f5d5613", size = 662126, upload-time = "2026-04-23T20:52:32.377Z" }, +] + +[package.optional-dependencies] +bedrock = [ + { name = "boto3" }, + { name = "botocore" }, +] + [[package]] name = "anyio" version = "4.12.1" @@ -631,6 +666,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = "2026-01-19T02:36:55.663Z" }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] + [[package]] name = 
"docstring-parser" version = "0.17.0" @@ -836,6 +880,60 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "jiter" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6e/c1/0cddc6eb17d4c53a99840953f95dd3accdc5cfc7a337b0e9b26476276be9/jiter-0.14.0.tar.gz", hash = "sha256:e8a39e66dac7153cf3f964a12aad515afa8d74938ec5cc0018adcdae5367c79e", size = 165725, upload-time = "2026-04-10T14:28:42.01Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/97/2a/09f70020898507a89279659a1afe3364d57fc1b2c89949081975d135f6f5/jiter-0.14.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:af72f204cf4d44258e5b4c1745130ac45ddab0e71a06333b01de660ab4187a94", size = 315502, upload-time = "2026-04-10T14:26:47.697Z" }, + { url = "https://files.pythonhosted.org/packages/d6/be/080c96a45cd74f9fce5db4fd68510b88087fb37ffe2541ff73c12db92535/jiter-0.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4b77da71f6e819be5fbcec11a453fde5b1d0267ef6ed487e2a392fd8e14e4e3a", size = 314870, upload-time = "2026-04-10T14:26:49.149Z" }, + { url = "https://files.pythonhosted.org/packages/7d/5e/2d0fee155826a968a832cc32438de5e2a193292c8721ca70d0b53e58245b/jiter-0.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f4ea612fe8b84b8b04e51d0e78029ecf3466348e25973f953de6e6a59aa4c1", size = 343406, upload-time = "2026-04-10T14:26:50.762Z" }, + { url = "https://files.pythonhosted.org/packages/70/af/bf9ee0d3a4f8dc0d679fc1337f874fe60cdbf841ebbb304b374e1c9aaceb/jiter-0.14.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:62fe2451f8fcc0240261e6a4df18ecbcd58327857e61e625b2393ea3b468aac9", size = 369415, upload-time = "2026-04-10T14:26:52.188Z" }, + { url = "https://files.pythonhosted.org/packages/0f/83/8e8561eadba31f4d3948a5b712fb0447ec71c3560b57a855449e7b8ddc98/jiter-0.14.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6112f26f5afc75bcb475787d29da3aa92f9d09c7858f632f4be6ffe607be82e9", size = 461456, upload-time = "2026-04-10T14:26:53.611Z" }, + { url = "https://files.pythonhosted.org/packages/f6/c9/c5299e826a5fe6108d172b344033f61c69b1bb979dd8d9ddd4278a160971/jiter-0.14.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:215a6cb8fb7dc702aa35d475cc00ddc7f970e5c0b1417fb4b4ac5d82fa2a29db", size = 378488, upload-time = "2026-04-10T14:26:55.211Z" }, + { url = "https://files.pythonhosted.org/packages/5d/37/c16d9d15c0a471b8644b1abe3c82668092a707d9bedcf076f24ff2e380cd/jiter-0.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4ab96a30fb3cb2c7e0cd33f7616c8860da5f5674438988a54ac717caccdbaa", size = 353242, upload-time = "2026-04-10T14:26:56.705Z" }, + { url = "https://files.pythonhosted.org/packages/58/ea/8050cb0dc654e728e1bfacbc0c640772f2181af5dedd13ae70145743a439/jiter-0.14.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:3a99c1387b1f2928f799a9de899193484d66206a50e98233b6b088a7f0c1edb2", size = 356823, upload-time = "2026-04-10T14:26:58.281Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3b/cf71506d270e5f84d97326bf220e47aed9b95e9a4a060758fb07772170ab/jiter-0.14.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:ab18d11074485438695f8d34a1b6da61db9754248f96d51341956607a8f39985", size = 392564, upload-time = "2026-04-10T14:27:00.018Z" }, + { url = "https://files.pythonhosted.org/packages/b0/cc/8c6c74a3efb5bd671bfd14f51e8a73375464ca914b1551bc3b40e26ac2c9/jiter-0.14.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:801028dcfc26ac0895e4964cbc0fd62c73be9fd4a7d7b1aaf6e5790033a719b7", size = 520322, upload-time = "2026-04-10T14:27:01.664Z" }, + { url = "https://files.pythonhosted.org/packages/41/24/68d7b883ec959884ddf00d019b2e0e82ba81b167e1253684fa90519ce33c/jiter-0.14.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ad425b087aafb4a1c7e1e98a279200743b9aaf30c3e0ba723aec93f061bd9bc8", size = 552619, upload-time = "2026-04-10T14:27:03.316Z" }, + { url = "https://files.pythonhosted.org/packages/b6/89/b1a0985223bbf3150ff9e8f46f98fc9360c1de94f48abe271bbe1b465682/jiter-0.14.0-cp313-cp313-win32.whl", hash = "sha256:882bcb9b334318e233950b8be366fe5f92c86b66a7e449e76975dfd6d776a01f", size = 205699, upload-time = "2026-04-10T14:27:04.662Z" }, + { url = "https://files.pythonhosted.org/packages/4c/19/3f339a5a7f14a11730e67f6be34f9d5105751d547b615ef593fa122a5ded/jiter-0.14.0-cp313-cp313-win_amd64.whl", hash = "sha256:9b8c571a5dba09b98bd3462b5a53f27209a5cbbe85670391692ede71974e979f", size = 201323, upload-time = "2026-04-10T14:27:06.139Z" }, + { url = "https://files.pythonhosted.org/packages/50/56/752dd89c84be0e022a8ea3720bcfa0a8431db79a962578544812ce061739/jiter-0.14.0-cp313-cp313-win_arm64.whl", hash = "sha256:34f19dcc35cb1abe7c369b3756babf8c7f04595c0807a848df8f26ef8298ef92", size = 191099, upload-time = "2026-04-10T14:27:07.564Z" }, + { url = "https://files.pythonhosted.org/packages/91/28/292916f354f25a1fe8cf2c918d1415c699a4a659ae00be0430e1c5d9ffea/jiter-0.14.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e89bcd7d426a75bb4952c696b267075790d854a07aad4c9894551a82c5b574ab", size = 320880, upload-time = "2026-04-10T14:27:09.326Z" }, + { url = "https://files.pythonhosted.org/packages/ad/c7/b002a7d8b8957ac3d469bd59c18ef4b1595a5216ae0de639a287b9816023/jiter-0.14.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b25beaa0d4447ea8c7ae0c18c688905d34840d7d0b937f2f7bdd52162c98a40", size = 346563, upload-time = "2026-04-10T14:27:11.287Z" }, + { url = "https://files.pythonhosted.org/packages/f9/3b/f8d07580d8706021d255a6356b8fab13ee4c869412995550ce6ed4ddf97d/jiter-0.14.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:651a8758dd413c51e3b7f6557cdc6921faf70b14106f45f969f091f5cda990ea", size = 357928, upload-time = "2026-04-10T14:27:12.729Z" }, + { url = "https://files.pythonhosted.org/packages/47/5b/ac1a974da29e35507230383110ffec59998b290a8732585d04e19a9eb5ba/jiter-0.14.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e1a7eead856a5038a8d291f1447176ab0b525c77a279a058121b5fccee257f6f", size = 203519, upload-time = "2026-04-10T14:27:14.125Z" }, + { url = "https://files.pythonhosted.org/packages/96/6d/9fc8433d667d2454271378a79747d8c76c10b51b482b454e6190e511f244/jiter-0.14.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e692633a12cda97e352fdcd1c4acc971b1c28707e1e33aeef782b0cbf051975", size = 190113, upload-time = "2026-04-10T14:27:16.638Z" }, + { url = "https://files.pythonhosted.org/packages/4f/1e/354ed92461b165bd581f9ef5150971a572c873ec3b68a916d5aa91da3cc2/jiter-0.14.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:6f396837fc7577871ca8c12edaf239ed9ccef3bbe39904ae9b8b63ce0a48b140", size = 315277, upload-time = "2026-04-10T14:27:18.109Z" }, + { url = 
"https://files.pythonhosted.org/packages/a6/95/8c7c7028aa8636ac21b7a55faef3e34215e6ed0cbf5ae58258427f621aa3/jiter-0.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a4d50ea3d8ba4176f79754333bd35f1bbcd28e91adc13eb9b7ca91bc52a6cef9", size = 315923, upload-time = "2026-04-10T14:27:19.603Z" }, + { url = "https://files.pythonhosted.org/packages/47/40/e2a852a44c4a089f2681a16611b7ce113224a80fd8504c46d78491b47220/jiter-0.14.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce17f8a050447d1b4153bda4fb7d26e6a9e74eb4f4a41913f30934c5075bf615", size = 344943, upload-time = "2026-04-10T14:27:21.262Z" }, + { url = "https://files.pythonhosted.org/packages/fc/1f/670f92adee1e9895eac41e8a4d623b6da68c4d46249d8b556b60b63f949e/jiter-0.14.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f4f1c4b125e1652aefbc2e2c1617b60a160ab789d180e3d423c41439e5f32850", size = 369725, upload-time = "2026-04-10T14:27:22.766Z" }, + { url = "https://files.pythonhosted.org/packages/01/2f/541c9ba567d05de1c4874a0f8f8c5e3fd78e2b874266623da9a775cf46e0/jiter-0.14.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be808176a6a3a14321d18c603f2d40741858a7c4fc982f83232842689fe86dd9", size = 461210, upload-time = "2026-04-10T14:27:24.315Z" }, + { url = "https://files.pythonhosted.org/packages/ce/a9/c31cbec09627e0d5de7aeaec7690dba03e090caa808fefd8133137cf45bc/jiter-0.14.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26679d58ba816f88c3849306dd58cb863a90a1cf352cdd4ef67e30ccf8a77994", size = 380002, upload-time = "2026-04-10T14:27:26.155Z" }, + { url = "https://files.pythonhosted.org/packages/50/02/3c05c1666c41904a2f607475a73e7a4763d1cbde2d18229c4f85b22dc253/jiter-0.14.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80381f5a19af8fa9aef743f080e34f6b25ebd89656475f8cf0470ec6157052aa", size = 354678, upload-time = "2026-04-10T14:27:27.701Z" }, + { url = "https://files.pythonhosted.org/packages/7d/97/e15b33545c2b13518f560d695f974b9891b311641bdcf178d63177e8801e/jiter-0.14.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:004df5fdb8ecbd6d99f3227df18ba1a259254c4359736a2e6f036c944e02d7c5", size = 358920, upload-time = "2026-04-10T14:27:29.256Z" }, + { url = "https://files.pythonhosted.org/packages/ad/d2/8b1461def6b96ba44530df20d07ef7a1c7da22f3f9bf1727e2d611077bf1/jiter-0.14.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cff5708f7ed0fa098f2b53446c6fa74c48469118e5cd7497b4f1cd569ab06928", size = 394512, upload-time = "2026-04-10T14:27:31.344Z" }, + { url = "https://files.pythonhosted.org/packages/e3/88/837566dd6ed6e452e8d3205355afd484ce44b2533edfa4ed73a298ea893e/jiter-0.14.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:2492e5f06c36a976d25c7cc347a60e26d5470178d44cde1b9b75e60b4e519f28", size = 521120, upload-time = "2026-04-10T14:27:33.299Z" }, + { url = "https://files.pythonhosted.org/packages/89/6b/b00b45c4d1b4c031777fe161d620b755b5b02cdade1e316dcb46e4471d63/jiter-0.14.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:7609cfbe3a03d37bfdbf5052012d5a879e72b83168a363deae7b3a26564d57de", size = 553668, upload-time = "2026-04-10T14:27:34.868Z" }, + { url = "https://files.pythonhosted.org/packages/ad/d8/6fe5b42011d19397433d345716eac16728ac241862a2aac9c91923c7509a/jiter-0.14.0-cp314-cp314-win32.whl", hash = "sha256:7282342d32e357543565286b6450378c3cd402eea333fc1ebe146f1fabb306fc", size = 207001, upload-time = "2026-04-10T14:27:36.455Z" }, + { url = 
"https://files.pythonhosted.org/packages/e5/43/5c2e08da1efad5e410f0eaaabeadd954812612c33fbbd8fd5328b489139d/jiter-0.14.0-cp314-cp314-win_amd64.whl", hash = "sha256:bd77945f38866a448e73b0b7637366afa814d4617790ecd88a18ca74377e6c02", size = 202187, upload-time = "2026-04-10T14:27:38Z" }, + { url = "https://files.pythonhosted.org/packages/aa/1f/6e39ac0b4cdfa23e606af5b245df5f9adaa76f35e0c5096790da430ca506/jiter-0.14.0-cp314-cp314-win_arm64.whl", hash = "sha256:f2d4c61da0821ee42e0cdf5489da60a6d074306313a377c2b35af464955a3611", size = 192257, upload-time = "2026-04-10T14:27:39.504Z" }, + { url = "https://files.pythonhosted.org/packages/05/57/7dbc0ffbbb5176a27e3518716608aa464aee2e2887dc938f0b900a120449/jiter-0.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1bf7ff85517dd2f20a5750081d2b75083c1b269cf75afc7511bdf1f9548beb3b", size = 323441, upload-time = "2026-04-10T14:27:41.039Z" }, + { url = "https://files.pythonhosted.org/packages/83/6e/7b3314398d8983f06b557aa21b670511ec72d3b79a68ee5e4d9bff972286/jiter-0.14.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8ef8791c3e78d6c6b157c6d360fbb5c715bebb8113bc6a9303c5caff012754a", size = 348109, upload-time = "2026-04-10T14:27:42.552Z" }, + { url = "https://files.pythonhosted.org/packages/ae/4f/8dc674bcd7db6dba566de73c08c763c337058baff1dbeb34567045b27cdc/jiter-0.14.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e74663b8b10da1fe0f4e4703fd7980d24ad17174b6bb35d8498d6e3ebce2ae6a", size = 368328, upload-time = "2026-04-10T14:27:44.574Z" }, + { url = "https://files.pythonhosted.org/packages/3b/5f/188e09a1f20906f98bbdec44ed820e19f4e8eb8aff88b9d1a5a497587ff3/jiter-0.14.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1aca29ba52913f78362ec9c2da62f22cdc4c3083313403f90c15460979b84d9b", size = 463301, upload-time = "2026-04-10T14:27:46.717Z" }, + { url = "https://files.pythonhosted.org/packages/ac/f0/19046ef965ed8f349e8554775bb12ff4352f443fbe12b95d31f575891256/jiter-0.14.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b39b7d87a952b79949af5fef44d2544e58c21a28da7f1bae3ef166455c61746", size = 378891, upload-time = "2026-04-10T14:27:48.32Z" }, + { url = "https://files.pythonhosted.org/packages/c4/c3/da43bd8431ee175695777ee78cf0e93eacbb47393ff493f18c45231b427d/jiter-0.14.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d918a68b26e9fab068c2b5453577ef04943ab2807b9a6275df2a812599a310", size = 360749, upload-time = "2026-04-10T14:27:49.88Z" }, + { url = "https://files.pythonhosted.org/packages/72/26/e054771be889707c6161dbdec9c23d33a9ec70945395d70f07cfea1e9a6f/jiter-0.14.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:b08997c35aee1201c1a5361466a8fb9162d03ae7bf6568df70b6c859f1e654a4", size = 358526, upload-time = "2026-04-10T14:27:51.504Z" }, + { url = "https://files.pythonhosted.org/packages/c3/0f/7bea65ea2a6d91f2bf989ff11a18136644392bf2b0497a1fa50934c30a9c/jiter-0.14.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:260bf7ca20704d58d41f669e5e9fe7fe2fa72901a6b324e79056f5d52e9c9be2", size = 393926, upload-time = "2026-04-10T14:27:53.368Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a1/b1ff7d70deef61ac0b7c6c2f12d2ace950cdeecb4fdc94500a0926802857/jiter-0.14.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:37826e3df29e60f30a382f9294348d0238ef127f4b5d7f5f8da78b5b9e050560", size = 521052, upload-time = "2026-04-10T14:27:55.058Z" }, + { url = 
"https://files.pythonhosted.org/packages/0b/7b/3b0649983cbaf15eda26a414b5b1982e910c67bd6f7b1b490f3cfc76896a/jiter-0.14.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:645be49c46f2900937ba0eaf871ad5183c96858c0af74b6becc7f4e367e36e06", size = 553716, upload-time = "2026-04-10T14:27:57.269Z" }, + { url = "https://files.pythonhosted.org/packages/97/f8/33d78c83bd93ae0c0af05293a6660f88a1977caef39a6d72a84afab94ce0/jiter-0.14.0-cp314-cp314t-win32.whl", hash = "sha256:2f7877ed45118de283786178eceaf877110abacd04fde31efff3940ae9672674", size = 207957, upload-time = "2026-04-10T14:27:59.285Z" }, + { url = "https://files.pythonhosted.org/packages/d6/ac/2b760516c03e2227826d1f7025d89bf6bf6357a28fe75c2a2800873c50bf/jiter-0.14.0-cp314-cp314t-win_amd64.whl", hash = "sha256:14c0cb10337c49f5eafe8e7364daca5e29a020ea03580b8f8e6c597fed4e1588", size = 204690, upload-time = "2026-04-10T14:28:00.962Z" }, + { url = "https://files.pythonhosted.org/packages/dc/2e/a44c20c58aeed0355f2d326969a181696aeb551a25195f47563908a815be/jiter-0.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:5419d4aa2024961da9fe12a9cfe7484996735dca99e8e090b5c88595ef1951ff", size = 191338, upload-time = "2026-04-10T14:28:02.853Z" }, +] + [[package]] name = "jmespath" version = "1.1.0" @@ -1818,6 +1916,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl", hash = "sha256:f2bada5ed3adb10a01e154e90db01d6d8938d0461b5790c12bcb807b2d28bbe2", size = 312786, upload-time = "2026-02-10T22:12:11.258Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + [[package]] name = "soupsieve" version = "2.8.3"