diff --git a/scripts/aidlc-evaluator/.gitleaks.toml b/scripts/aidlc-evaluator/.gitleaks.toml new file mode 100644 index 00000000..856ebb71 --- /dev/null +++ b/scripts/aidlc-evaluator/.gitleaks.toml @@ -0,0 +1,8 @@ +# Gitleaks configuration for aidlc-evaluator +# Suppress false positives from test fixtures that intentionally contain fake credentials. + +[allowlist] +description = "Fake credentials used in test_credential_scrubber.py test fixtures" +paths = [ + "packages/shared/tests/test_credential_scrubber.py", +] diff --git a/scripts/aidlc-evaluator/ARCHITECTURE.md b/scripts/aidlc-evaluator/ARCHITECTURE.md index 1628f402..45c2f42e 100644 --- a/scripts/aidlc-evaluator/ARCHITECTURE.md +++ b/scripts/aidlc-evaluator/ARCHITECTURE.md @@ -460,6 +460,36 @@ get_adapter(name) ← lazy import from registry Supported adapters: Cursor, Cline, Copilot, Kiro, Windsurf, Antigravity. +### 6.5 CLI Evaluation (`run_cli_evaluation.py`) + +Runs the AIDLC workflow through CLI-based AI assistants (Claude Code, Kiro CLI, etc.): + +```text +load_adapters_from_config(cfg_data) ← register any custom adapters from config.yaml + │ +get_adapter(name) ← lazy import from registry + │ + ├── check_prerequisites() + ├── HumanSimulator built once by orchestrator (vision + tech_env + openapi injected) + ├── adapter.run(config) ──► CLI-specific automation + simulator gate reviews + ├── normalize_output() ──► standard run folder layout + └── run_evaluation.py --evaluate-only ──► stages 2-6 +``` + +**Adapter pattern**: Each CLI tool is implemented as a subclass of `CLIAdapter` (`packages/cli-harness/src/cli_harness/adapter.py`) with three methods: + +- `name` — human-readable identifier (e.g. `"kiro-cli"`) +- `check_prerequisites()` — verify the CLI tool is installed and credentials are valid +- `run(config: AdapterConfig) -> AdapterResult` — execute the AIDLC workflow and return results + +**HumanSimulator injection**: The orchestrator constructs a single `HumanSimulator` with the full document context (vision, tech-env, OpenAPI spec) before calling the adapter. It is passed in as `config.simulator`. Adapters access it via `config.simulator.respond(message)` — they do not construct it themselves. + +**Simulator gates**: Adapters use `config.simulator` to inject human-reviewer feedback at key workflow stages. The kiro-cli adapter uses 4 stage gates (requirements → design → code-gen plan → construction); the claude-code-sdk adapter intercepts `handoff_to_simulator` tool calls inline. + +**Plugin registration**: Custom adapters can be added without modifying framework code — see [Adding a New CLI Adapter](#adding-a-new-cli-adapter) below. + +Supported built-in adapters: `claude-code`, `claude-code-sdk`, `kiro-cli`. + --- ## 7. Data Flow: YAML Artifact Graph @@ -633,6 +663,94 @@ The default test case is `sci-calc` (a scientific calculator API). All CLI defau 1. Create `config/.yaml` with `models.executor.model_id` set to the Bedrock model ID 2. The batch runner will automatically discover it +### Adding a New CLI Adapter + +CLI adapters live in `packages/cli-harness` and follow a plugin pattern — no framework code changes are needed. + +**Step 1 — Implement the adapter** + +Create a module anywhere importable (e.g. 
`packages/cli-harness/src/cli_harness/adapters/my_tool.py`): + +```python +from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter + +class MyToolAdapter(CLIAdapter): + @property + def name(self) -> str: + return "my-tool" + + def check_prerequisites(self) -> tuple[bool, str]: + import shutil + if not shutil.which("my-tool"): + return False, "'my-tool' not found in PATH" + return True, "my-tool found" + + def run(self, config: AdapterConfig) -> AdapterResult: + import time, shutil + from cli_harness.normalizer import normalize_output + + start = time.monotonic() + workspace = config.output_dir / "workspace" + workspace.mkdir(parents=True, exist_ok=True) + + # Copy inputs, inject rules, run the CLI tool... + # Use config.simulator.respond(message) at review gates. + simulator = config.simulator # pre-built with vision/tech_env/openapi context + if simulator is None: + raise RuntimeError("my-tool requires a simulator (set --simulator-model)") + + # ... run CLI tool stages, call simulator.respond() between stages ... + + elapsed = time.monotonic() - start + normalize_output( + source_dir=workspace, + output_dir=config.output_dir, + adapter_name=self.name, + elapsed_seconds=elapsed, + ) + dst_docs = config.output_dir / "aidlc-docs" + return AdapterResult( + success=dst_docs.is_dir(), + output_dir=config.output_dir, + aidlc_docs_dir=dst_docs if dst_docs.is_dir() else None, + workspace_dir=workspace, + elapsed_seconds=elapsed, + ) +``` + +**Step 2 — Register in config** (no framework edits needed) + +Add one line to `config/default.yaml` (or your own config file): + +```yaml +cli: + adapters: + my-tool: "cli_harness.adapters.my_tool.MyToolAdapter" +``` + +**Step 3 — Verify** + +```bash +# Confirm it appears +uv run python run.py cli --list + +# Check prerequisites +uv run python run.py cli --cli my-tool --check-only + +# Run evaluation +uv run python run.py cli --cli my-tool --scenario sci-calc +``` + +**Key contracts for adapter implementors:** + +| What | Where | Notes | +| ---------------- | ----------------------------------------------------------------------- | ------------------------------------------------------------------------- | +| Abstract base | `cli_harness/adapter.py` - `CLIAdapter` | Implement `name`, `check_prerequisites`, `run` | +| Simulator | `config.simulator` (`HumanSimulator`) | Call `.respond(message)` at review gates; never construct it yourself | +| Output layout | `cli_harness/normalizer.py` (`normalize_output()`) | Call at end of `run()` to write `run-meta.yaml` / `run-metrics.yaml` | +| Post-run tests | `aidlc_runner.post_run.run_post_evaluation()` | Optional; call after `normalize_output()` to run generated project tests | +| Document context | `config.vision_path`, `config.tech_env_path`, `config.openapi_content` | Available if needed; simulator already has this context | + ### Adding a New IDE Adapter 1. 
Create `packages/ide-harness/src/ide_harness/adapters/.py` diff --git a/scripts/aidlc-evaluator/CONTRIBUTING.md b/scripts/aidlc-evaluator/CONTRIBUTING.md index c8b1fb7f..e72e3120 100644 --- a/scripts/aidlc-evaluator/CONTRIBUTING.md +++ b/scripts/aidlc-evaluator/CONTRIBUTING.md @@ -36,11 +36,14 @@ git checkout -b feature/your-feature-name Work in the appropriate package: -- `aidlc-runner/` - Execution Framework (two-agent AIDLC workflow runner) +- `packages/execution/` - Execution Framework (two-agent AIDLC workflow runner) +- `packages/cli-harness/` - CLI Adapter Framework (Claude Code, Kiro CLI, custom tools) +- `packages/ide-harness/` - IDE Adapter Framework (Cursor, Cline, Kiro, etc.) - `packages/qualitative/` - Semantic Evaluation (intent & design similarity scoring) - `packages/quantitative/` - Code Evaluation (linting, security, organization) - `packages/nonfunctional/` - NFR Evaluation (tokens, timing, consistency) - `packages/reporting/` - Report generation +- `packages/trend-reports/` - Cross-release trend reporting - `packages/shared/` - Common utilities Or contribute to other work streams: @@ -96,14 +99,17 @@ git commit -m "Add token tracking to nonfunctional package" The project is organized around six big rocks. Your changes will typically fall into one or more of these: -| Work Stream | Description | Package / Area | -| ----------------------- | --------------------------------------------- | ------------------------- | -| **Golden Test Case** | Curated baseline test inputs | `test_cases/` | -| **Execution Framework** | Two-agent AIDLC workflow runner (Owner: Jeff) | `aidlc-runner/` | -| **Semantic Evaluation** | Intent & design similarity scoring | `packages/qualitative/` | -| **Code Evaluation** | Linting, security, organization | `packages/quantitative/` | -| **NFR Evaluation** | Tokens, timing, consistency | `packages/nonfunctional/` | -| **GitHub CI/CD** | Pipeline integration & management | `.github/workflows/` | +| Work Stream | Description | Package / Area | +| ----------------------- | --------------------------------------------- | ---------------------------- | +| **Golden Test Case** | Curated baseline test inputs | `test_cases/` | +| **Execution Framework** | Two-agent AIDLC workflow runner | `packages/execution/` | +| **CLI Adapters** | CLI tool integrations (Claude Code, Kiro CLI) | `packages/cli-harness/` | +| **IDE Adapters** | IDE tool integrations (Cursor, Cline, etc.) 
| `packages/ide-harness/` | +| **Semantic Evaluation** | Intent & design similarity scoring | `packages/qualitative/` | +| **Code Evaluation** | Linting, security, organization | `packages/quantitative/` | +| **NFR Evaluation** | Tokens, timing, consistency | `packages/nonfunctional/` | +| **Trend Reporting** | Cross-release metric tracking | `packages/trend-reports/` | +| **GitHub CI/CD** | Pipeline integration & management | `.github/workflows/` | ## Code Standards diff --git a/scripts/aidlc-evaluator/config/default.yaml b/scripts/aidlc-evaluator/config/default.yaml index 28992087..e0e31bb2 100644 --- a/scripts/aidlc-evaluator/config/default.yaml +++ b/scripts/aidlc-evaluator/config/default.yaml @@ -41,3 +41,6 @@ execution: tools: pmd_path: null # Path to PMD executable; if null, looks for 'pmd' on PATH + +cli: + adapters: {} # Register custom CLI adapters: name: "mypackage.MyAdapter" diff --git a/scripts/aidlc-evaluator/docker/sandbox/Dockerfile b/scripts/aidlc-evaluator/docker/sandbox/Dockerfile index d45d61d4..b42e4444 100644 --- a/scripts/aidlc-evaluator/docker/sandbox/Dockerfile +++ b/scripts/aidlc-evaluator/docker/sandbox/Dockerfile @@ -1,6 +1,6 @@ # Multi-language sandbox image for running AI-generated code in isolation. # -# Includes Python 3.14 + uv, Node.js 22 + npm, and common build tools. +# Includes Python 3.13 + uv, Node.js 22 + npm, and common build tools. # Runs as a non-root user with no credentials or host tools. # # Security notes: @@ -9,7 +9,7 @@ # checkov:skip=CKV_DOCKER_2:HEALTHCHECK not needed for ephemeral test sandbox # nosemgrep: dockerfile-source-not-pinned -FROM public.ecr.aws/docker/library/python:3.14-slim@sha256:3989a23fd2c28a34c7be819e488b958a10601d421ac25bea1e7a5d757365e2d5 AS base +FROM public.ecr.aws/docker/library/python:3.13-slim@sha256:8922791069fdfdd6056cf7f418a8655d970862d1972570d4c0e78dfc43afacd6 AS base # Install system dependencies and Node.js 22 # nosemgrep: set-pipefail diff --git a/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml b/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml index c5e30217..ad9acf0c 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml +++ b/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml @@ -5,6 +5,8 @@ description = "CLI-based harness for testing AIDLC workflows via kiro-cli" requires-python = ">=3.13" dependencies = [ "pyyaml>=6.0", + "anthropic[bedrock]>=0.40", + "boto3>=1.42.47", ] [project.optional-dependencies] diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py index 95f0f0c0..e015f3d4 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py @@ -5,6 +5,10 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cli_harness.simulator import HumanSimulator @dataclass @@ -17,7 +21,11 @@ class AdapterConfig: tech_env_path: Path | None = None prompt_template: str | None = None model: str | None = None + simulator_model: str | None = None # kept for backwards compat; prefer simulator field aws_profile: str | None = None + aws_region: str | None = None + openapi_content: str | None = None # injected into prompt/simulator for contract validation + simulator: "HumanSimulator | None" = None # pre-built by orchestrator; shared across adapters 
timeout_seconds: int = 7200 # 2 hours max diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py index 1bb25d94..247b74e2 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py @@ -128,8 +128,11 @@ def run(self, config: AdapterConfig) -> AdapterResult: shutil.copy2(rules_path, rules_dir / rules_path.name) _log(f"Copied AIDLC rules file: {rules_path.name}") - # Build the prompt - prompt = config.prompt_template or render_prompt() + # Build the prompt — inject OpenAPI spec so the self-approving executor + # has the full contract in view during design and code review. + prompt = config.prompt_template or render_prompt( + openapi_content=config.openapi_content, + ) # Build command — claude -p for non-interactive print mode cmd = [ diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code_sdk.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code_sdk.py new file mode 100644 index 00000000..a08842d4 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code_sdk.py @@ -0,0 +1,588 @@ +"""Claude Code SDK adapter — drives AIDLC workflows via Anthropic SDK with Bedrock. + +Unlike the subprocess-based ClaudeCodeAdapter (which runs ``claude -p`` as a +one-shot process), this adapter uses ``anthropic.AnthropicBedrock`` to drive +the executor turn-by-turn. It intercepts ``handoff_to_simulator`` tool calls +and injects Human Simulator responses using the same system prompt as the +Strands two-agent swarm in ``packages/execution``. + +This faithfully recreates the interactive executor↔simulator loop that the CLI +subprocess approach cannot support. 
+""" + +from __future__ import annotations + +import json +import logging +import os +import shlex +import subprocess +import time +from dataclasses import dataclass, field +from pathlib import Path + +import anthropic +import boto3 + +from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter +from cli_harness.normalizer import normalize_output +from cli_harness.simulator import HumanSimulator + +# Execution package imports (system prompts + post-run tests) +import sys as _sys +_EXEC_SRC = Path(__file__).resolve().parents[6] / "execution" / "src" +if str(_EXEC_SRC) not in _sys.path: + _sys.path.insert(0, str(_EXEC_SRC)) +from aidlc_runner.agents.executor import EXECUTOR_SYSTEM_PROMPT # noqa: E402 +from aidlc_runner.post_run import run_post_evaluation # noqa: E402 +from aidlc_runner.config import ExecutionConfig, SandboxConfig, RunnerConfig # noqa: E402 + +_SHARED_SRC = Path(__file__).resolve().parents[6] / "shared" / "src" +if str(_SHARED_SRC) not in _sys.path: + _sys.path.insert(0, str(_SHARED_SRC)) +from shared.sandbox import _get_container_cli # noqa: E402 + +logger = logging.getLogger(__name__) + +_MAX_ITERATIONS = 300 +_MAX_OUTPUT_CHARS = 50_000 + + +def _log(msg: str) -> None: + print(f" [claude-sdk] {msg}", file=_sys.stderr, flush=True) + + +# ── Tool schemas ────────────────────────────────────────────────────────────── + +_TOOL_READ_FILE: dict = { + "name": "read_file", + "description": "Read the contents of a file in the run folder.", + "input_schema": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "File path relative to the run folder (e.g. 'aidlc-docs/aidlc-state.md').", + } + }, + "required": ["path"], + }, +} + +_TOOL_WRITE_FILE: dict = { + "name": "write_file", + "description": "Write content to a file in the run folder. Creates parent directories if needed.", + "input_schema": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Relative to run folder (e.g. 'aidlc-docs/inception/requirements.md').", + }, + "content": { + "type": "string", + "description": "The text content to write to the file.", + }, + }, + "required": ["path", "content"], + }, +} + +_TOOL_LIST_FILES: dict = { + "name": "list_files", + "description": "List files and directories within a path in the run folder.", + "input_schema": { + "type": "object", + "properties": { + "directory": { + "type": "string", + "description": "Directory path relative to the run folder. Defaults to '.'.", + "default": ".", + } + }, + "required": [], + }, +} + +_TOOL_LOAD_RULE: dict = { + "name": "load_rule", + "description": ( + "Load an AIDLC rule file by path. " + "Use this to read AIDLC workflow rules as you progress through stages." + ), + "input_schema": { + "type": "object", + "properties": { + "rule_path": { + "type": "string", + "description": ( + "Path relative to the rules directory. Examples: " + "'core-workflow', 'common/process-overview.md', " + "'inception/requirements-analysis.md', 'construction/code-generation.md'." + ), + } + }, + "required": ["rule_path"], + }, +} + +_TOOL_RUN_COMMAND: dict = { + "name": "run_command", + "description": ( + "Execute a shell command in the run folder. " + "Use during Build and Test to install dependencies, run tests, and fix issues." 
+ ), + "input_schema": { + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "The shell command to execute.", + }, + "working_directory": { + "type": "string", + "description": "Directory relative to the run folder to run in (default: workspace/).", + "default": "workspace", + }, + }, + "required": ["command"], + }, +} + +_TOOL_HANDOFF_TO_SIMULATOR: dict = { + "name": "handoff_to_simulator", + "description": ( + "Hand off to the Human Simulator agent for answers, approvals, or reviews. " + "The simulator will respond and hand control back to you." + ), + "input_schema": { + "type": "object", + "properties": { + "message": { + "type": "string", + "description": ( + "Message to the simulator — describe what input you need " + "(answer questions / approve document / review) and include the " + "file path they need to read." + ), + } + }, + "required": ["message"], + }, +} + +_EXECUTOR_TOOLS = [ + _TOOL_READ_FILE, + _TOOL_WRITE_FILE, + _TOOL_LIST_FILES, + _TOOL_LOAD_RULE, + _TOOL_RUN_COMMAND, + _TOOL_HANDOFF_TO_SIMULATOR, +] + +# ── Token accumulator ───────────────────────────────────────────────────────── + +@dataclass +class _TokenBucket: + input_tokens: int = 0 + output_tokens: int = 0 + cache_read_tokens: int = 0 + cache_write_tokens: int = 0 + + def add(self, usage: anthropic.types.Usage) -> None: + self.input_tokens += getattr(usage, "input_tokens", 0) + self.output_tokens += getattr(usage, "output_tokens", 0) + self.cache_read_tokens += getattr(usage, "cache_read_input_tokens", 0) + self.cache_write_tokens += getattr(usage, "cache_creation_input_tokens", 0) + + @property + def total(self) -> int: + return self.input_tokens + self.output_tokens + self.cache_read_tokens + self.cache_write_tokens + + +@dataclass +class _UsageTracker: + executor: _TokenBucket = field(default_factory=_TokenBucket) + simulator: _TokenBucket = field(default_factory=_TokenBucket) + handoff_count: int = 0 + + def to_dict(self) -> dict: + e, s = self.executor, self.simulator + return { + "input_tokens": e.input_tokens + s.input_tokens, + "output_tokens": e.output_tokens + s.output_tokens, + "total_tokens": e.total + s.total, + "cache_read_tokens": e.cache_read_tokens + s.cache_read_tokens, + "cache_write_tokens": e.cache_write_tokens + s.cache_write_tokens, + "executor_input_tokens": e.input_tokens, + "executor_output_tokens": e.output_tokens, + "executor_total_tokens": e.total, + "simulator_input_tokens": s.input_tokens, + "simulator_output_tokens": s.output_tokens, + "simulator_total_tokens": s.total, + "handoffs": self.handoff_count, + "num_turns": self.handoff_count, + } + + +# ── Tool execution ──────────────────────────────────────────────────────────── + +def _resolve_safe(base: Path, relative: str) -> Path: + resolved = (base / relative).resolve() + if not str(resolved).startswith(str(base.resolve())): + raise ValueError(f"Path traversal denied: {relative}") + return resolved + + +def _exec_tool(name: str, tool_input: dict, run_folder: Path, rules_dir: Path) -> str: + """Execute a tool call and return its string result.""" + try: + if name == "read_file": + path = tool_input["path"] + target = _resolve_safe(run_folder, path) + if not target.exists(): + return f"Error: File not found: {path}" + if not target.is_file(): + return f"Error: Not a file: {path}" + return target.read_text(encoding="utf-8") + + elif name == "write_file": + path, content = tool_input["path"], tool_input.get("content", "") + target = _resolve_safe(run_folder, path) + 
target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + return f"Written: {path} ({len(content)} chars)" + + elif name == "list_files": + directory = tool_input.get("directory", ".") + target = _resolve_safe(run_folder, directory) + if not target.exists(): + return f"Error: Directory not found: {directory}" + if not target.is_dir(): + return f"Error: Not a directory: {directory}" + entries = sorted(target.iterdir()) + lines = [ + f" {e.relative_to(run_folder)}{'/' if e.is_dir() else ''}" + for e in entries + ] + return "\n".join(lines) if lines else f"(empty: {directory})" + + elif name == "load_rule": + rule_path = tool_input["rule_path"] + if rule_path in ("core-workflow", "core-workflow.md"): + target = rules_dir / "aws-aidlc-rules" / "core-workflow.md" + else: + target = rules_dir / "aws-aidlc-rule-details" / rule_path + if not target.suffix: + target = target.with_suffix(".md") + resolved = target.resolve() + if not str(resolved).startswith(str(rules_dir.resolve())): + return f"Error: Path traversal denied: {rule_path}" + if not resolved.exists(): + return f"Error: Rule not found: {rule_path}" + return resolved.read_text(encoding="utf-8") + + elif name == "run_command": + command = tool_input["command"] + working_dir = tool_input.get("working_directory", "workspace") + cwd = _resolve_safe(run_folder, working_dir) + if not cwd.is_dir(): + return f"[error: working directory not found: {working_dir}]" + env = { + "PATH": os.environ.get("PATH", "/usr/bin:/bin"), + "HOME": str(run_folder), + "LANG": os.environ.get("LANG", "C.UTF-8"), + "TERM": "dumb", + } + for var in ("UV_CACHE_DIR", "UV_PYTHON", "NODE_PATH", "NPM_CONFIG_CACHE", + "VIRTUAL_ENV", "PYTHONPATH"): + if (val := os.environ.get(var)): + env[var] = val + try: + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run( # nosec B603 + shlex.split(command), + shell=False, + cwd=str(cwd), + capture_output=True, + text=True, + timeout=120, + env=env, + ) + output = result.stdout + result.stderr + if len(output) > _MAX_OUTPUT_CHARS: + output = output[:_MAX_OUTPUT_CHARS] + "\n... 
(output truncated)" + return f"[exit code: {result.returncode}]\n{output}" + except subprocess.TimeoutExpired: + return "[error: command timed out after 120s]" + except OSError as e: + return f"[error: {e}]" + + else: + return f"[error: unknown tool: {name}]" + + except ValueError as e: + return f"Error: {e}" + except Exception as e: + logger.exception("Tool %r failed", name) + return f"[error: {e}]" + + +# ── Agent loops ─────────────────────────────────────────────────────────────── + +def _run_executor_loop( + client: anthropic.AnthropicBedrock, + executor_model: str, + simulator: HumanSimulator, + initial_prompt: str, + run_folder: Path, + rules_dir: Path, + usage: _UsageTracker, +) -> None: + """Run the executor agent loop, injecting simulator turns on handoff calls.""" + messages: list[dict] = [{"role": "user", "content": initial_prompt}] + + for iteration in range(_MAX_ITERATIONS): + response = client.messages.create( + model=executor_model, + max_tokens=8192, + system=EXECUTOR_SYSTEM_PROMPT, + tools=_EXECUTOR_TOOLS, + messages=messages, + ) + usage.executor.add(response.usage) + + tool_uses = [b for b in response.content if b.type == "tool_use"] + + if response.stop_reason == "end_turn" and not tool_uses: + _log(f"Executor finished after {iteration + 1} iterations") + return + + messages.append({"role": "assistant", "content": response.content}) + tool_results = [] + + for tu in tool_uses: + if tu.name == "handoff_to_simulator": + usage.handoff_count += 1 + _log(f" → simulator turn (handoff #{usage.handoff_count})") + sim_response = simulator.respond(tu.input.get("message", "")) + _log(f" ← simulator responded ({len(sim_response)} chars)") + tool_results.append({ + "type": "tool_result", + "tool_use_id": tu.id, + "content": sim_response, + }) + else: + result_text = _exec_tool(tu.name, tu.input, run_folder, rules_dir) + tool_results.append({ + "type": "tool_result", + "tool_use_id": tu.id, + "content": result_text, + }) + + messages.append({"role": "user", "content": tool_results}) + + _log(f"[WARN] Executor hit max iterations ({_MAX_ITERATIONS})") + + +# ── Adapter ─────────────────────────────────────────────────────────────────── + +class ClaudeCodeSDKAdapter(CLIAdapter): + """Adapter that drives AIDLC workflows via the Anthropic SDK with an embedded simulator. + + Uses ``anthropic.AnthropicBedrock`` to run an executor agent that can + interactively hand off to a Human Simulator agent mid-workflow, matching + the two-agent Strands Swarm in ``packages/execution``. + """ + + def __init__(self, verbose: bool = False): + self.verbose = verbose + + @property + def name(self) -> str: + return "claude-code-sdk" + + def check_prerequisites(self) -> tuple[bool, str]: + """Verify AWS credentials are resolvable via boto3.""" + try: + session = boto3.Session() + creds = session.get_credentials() + if creds is None: + return False, "No AWS credentials found. Configure via profile, env vars, or IAM role." 
+ return True, "AWS credentials available" + except Exception as e: + return False, f"AWS credential check failed: {e}" + + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the full AIDLC workflow through the Anthropic SDK with an embedded simulator.""" + ok, msg = self.check_prerequisites() + if not ok: + return AdapterResult(success=False, output_dir=config.output_dir, error=msg) + + start_time = time.monotonic() + config.output_dir.mkdir(parents=True, exist_ok=True) + workspace = config.output_dir / "workspace" + workspace.mkdir(exist_ok=True) + _log(f"Run folder: {config.output_dir}") + + import shutil + + try: + # Copy input documents into the run folder (matching execution runner layout) + shutil.copy2(config.vision_path, config.output_dir / "vision.md") + vision_content = config.vision_path.read_text(encoding="utf-8") + + tech_env_content: str | None = None + if config.tech_env_path and config.tech_env_path.is_file(): + shutil.copy2(config.tech_env_path, config.output_dir / "tech-env.md") + tech_env_content = config.tech_env_path.read_text(encoding="utf-8") + + # Also place vision.md in workspace for the executor to find + shutil.copy2(config.vision_path, workspace / "vision.md") + if tech_env_content: + shutil.copy2(config.tech_env_path, workspace / "tech-env.md") + + # rules_path is already set up by the orchestrator (output_dir/aidlc-rules); + # use it directly rather than copying again. + rules_dir = config.rules_path + + # Build initial prompt (mirrors runner.py) + initial_prompt = ( + "Begin the AIDLC workflow and execute it TO COMPLETION through ALL phases. " + "The project vision is available at vision.md in the run folder. " + ) + if tech_env_content: + initial_prompt += ( + "The technical environment document is available at tech-env.md " + "in the run folder. It defines the required languages, frameworks, " + "cloud services, security controls, testing standards, and prohibited " + "technologies. Follow it as a binding reference during all Construction stages. " + ) + initial_prompt += ( + "Start by loading the core workflow rules and the process overview, then " + "execute every stage of the Inception phase followed by every stage of the " + "Construction phase. The workspace directory is 'workspace/' (currently empty — " + "this is a greenfield project). You MUST generate all application code in " + "workspace/ before the workflow is complete. Do NOT stop after requirements — " + "continue through application design, code generation, and build-and-test." + ) + + # Retrieve the pre-built HumanSimulator injected by the orchestrator. 
+ simulator = config.simulator + if simulator is None: + raise RuntimeError( + "claude-code-sdk adapter requires a HumanSimulator — " + "ensure --simulator-model is set or models.simulator.model_id is in config.yaml" + ) + + # Resolve executor model and region + executor_model = config.model or "global.anthropic.claude-opus-4-6-v1" + aws_region = getattr(config, "aws_region", None) or os.environ.get("AWS_DEFAULT_REGION", "us-east-1") + + # Build Bedrock client for executor loop only + session_kwargs: dict = {} + if config.aws_profile: + session_kwargs["profile_name"] = config.aws_profile + boto_session = boto3.Session(**session_kwargs) + frozen = boto_session.get_credentials().get_frozen_credentials() + client = anthropic.AnthropicBedrock( + aws_access_key=frozen.access_key, + aws_secret_key=frozen.secret_key, + aws_session_token=frozen.token, + aws_region=aws_region, + ) + + _log(f"Executor model: {executor_model}") + _log(f"Simulator model: {simulator._model}") + + # Run the executor↔simulator loop + usage = _UsageTracker() + _run_executor_loop( + client=client, + executor_model=executor_model, + simulator=simulator, + initial_prompt=initial_prompt, + run_folder=config.output_dir, + rules_dir=rules_dir, + usage=usage, + ) + + elapsed_seconds = time.monotonic() - start_time + usage_extra = usage.to_dict() + usage_extra["duration_ms"] = int(elapsed_seconds * 1000) + usage_extra["model"] = executor_model + + _log( + f"Completed in {elapsed_seconds:.0f}s — " + f"{usage_extra['total_tokens']:,} total tokens, " + f"{usage_extra['handoffs']} handoffs" + ) + + # Move aidlc-docs from workspace/ up to run_folder/ if the executor placed them there + src_docs = workspace / "aidlc-docs" + dst_docs = config.output_dir / "aidlc-docs" + if src_docs.is_dir() and not dst_docs.exists(): + shutil.move(str(src_docs), str(dst_docs)) + + # Write run metadata + normalize_output( + source_dir=workspace, + output_dir=config.output_dir, + adapter_name=self.name, + model_hint=executor_model, + elapsed_seconds=elapsed_seconds, + token_usage=usage_extra, + ) + + # Stage 2: post-run tests — same logic as the Strands runner + _log("Running post-run test evaluation...") + sandbox_enabled = _get_container_cli() is not None + runner_cfg = RunnerConfig() + runner_cfg.execution = ExecutionConfig( + post_run_tests=True, + post_run_timeout=300, + sandbox=SandboxConfig(enabled=sandbox_enabled), + ) + test_results_path = run_post_evaluation(config.output_dir, runner_cfg) + if test_results_path: + _log(f"Test results: {test_results_path}") + else: + _log("No testable project detected — post-run tests skipped.") + + has_docs = dst_docs.is_dir() and any(dst_docs.iterdir()) + return AdapterResult( + success=has_docs, + output_dir=config.output_dir, + aidlc_docs_dir=dst_docs if has_docs else None, + workspace_dir=workspace, + elapsed_seconds=elapsed_seconds, + extra=usage_extra, + error=None if has_docs else "No aidlc-docs produced", + ) + + except Exception as exc: + elapsed_seconds = time.monotonic() - start_time + logger.exception("claude-code-sdk adapter run failed") + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace, + error=f"claude-code-sdk adapter error: {exc}", + elapsed_seconds=elapsed_seconds, + ) + + +def _setup_rules(rules_dir: Path, rules_path: Path) -> None: + """Copy or link the AIDLC rules into the run folder.""" + import shutil + rules_dir.mkdir(parents=True, exist_ok=True) + if rules_path.is_dir(): + for rule_file in sorted(rules_path.rglob("*.md")): + rel = 
rule_file.relative_to(rules_path) + dst = rules_dir / rel + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(rule_file, dst) + _log(f"Copied AIDLC rules ({sum(1 for _ in rules_dir.rglob('*.md'))} files)") + else: + shutil.copy2(rules_path, rules_dir / rules_path.name) + _log(f"Copied AIDLC rules file: {rules_path.name}") diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py index ae0fc23e..e5861b81 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py @@ -20,6 +20,19 @@ from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter from cli_harness.normalizer import normalize_output from cli_harness.prompt_template import render_prompt +from cli_harness.simulator import HumanSimulator + +import sys as _sys +_EXEC_SRC = Path(__file__).resolve().parents[6] / "execution" / "src" +if str(_EXEC_SRC) not in _sys.path: + _sys.path.insert(0, str(_EXEC_SRC)) +from aidlc_runner.post_run import run_post_evaluation # noqa: E402 +from aidlc_runner.config import ExecutionConfig, SandboxConfig, RunnerConfig # noqa: E402 + +_SHARED_SRC = Path(__file__).resolve().parents[6] / "shared" / "src" +if str(_SHARED_SRC) not in _sys.path: + _sys.path.insert(0, str(_SHARED_SRC)) +from shared.sandbox import _get_container_cli # noqa: E402 logger = logging.getLogger(__name__) @@ -110,95 +123,188 @@ def run(self, config: AdapterConfig) -> AdapterResult: ) _log(f"Injected AIDLC rules ({len(rules_content)} chars)") - # Build the prompt - prompt = config.prompt_template or render_prompt() + # Build executor prompt — instructs kiro to pause at review gates + # so the human simulator can respond rather than self-approving. + prompt = config.prompt_template or render_prompt( + openapi_content=config.openapi_content, + with_simulator=True, + ) - # Base command flags - base_flags = [ - "--no-interactive", - "--trust-all-tools", - ] + # Retrieve the pre-built HumanSimulator injected by the orchestrator. + # All document context (vision, tech_env, openapi) is already embedded. + simulator = config.simulator + if simulator is None: + raise RuntimeError( + "kiro-cli adapter requires a HumanSimulator — " + "ensure --simulator-model is set or models.simulator.model_id is in config.yaml" + ) + _log(f"Simulator model: {simulator._model}") + + # Per-stage gate approach using kiro's --no-interactive + --resume. + # + # Each stage produces a sentinel file. We run kiro to that sentinel, + # have the simulator review the output, then resume with feedback. + # Stages map to the AIDLC workflow as tracked in aidlc-state.md. + # + # Gate schedule (sentinel → simulator focus): + # 1. requirements.md → answer verification questions, approve requirements + # 2. execution-plan.md → approve workflow plan and application design + # 3. code-gen-plan → approve code generation plan before code is written + # 4. build-and-test-summary → review final output + base_flags = ["--no-interactive", "--trust-all-tools"] if config.model: base_flags += ["--model", config.model] - # Run kiro-cli in a loop to handle AIDLC review gates. - # The workflow pauses at gates (e.g. "Approve & Continue"). - # With --no-interactive, kiro-cli exits at each gate. - # We resume the session with an approval message each time. 
log_path = config.output_dir / "kiro-session.log" _log(f"Session log: {log_path}") - turn = 0 - max_turns = 20 # safety limit + gate_count = 0 total_rc = 0 - with open(log_path, "w", encoding="utf-8") as log_file: - while turn < max_turns: - turn += 1 - - if turn == 1: - cmd = [_KIRO_CLI, "chat"] + base_flags + [prompt] - _log(f"Turn {turn}: initial prompt ({len(prompt)} chars)") - else: - approval = "Approve & Continue. Proceed to the next phase." - cmd = [_KIRO_CLI, "chat"] + base_flags + ["--resume", approval] - _log(f"Turn {turn}: resuming with approval") - - log_file.write(f"\n{'='*60}\n") - log_file.write(f"TURN {turn}\n") - log_file.write(f"{'='*60}\n") - log_file.flush() - - # nosec B603 - Executing user's Kiro CLI with validated configuration - # nosemgrep: dangerous-subprocess-use-audit - process = subprocess.Popen( - cmd, - cwd=str(workspace), - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - ) - - for line in process.stdout: - log_file.write(_strip_ansi(line)) - log_file.flush() + def _run_kiro_stage(stage_prompt: str, stage_name: str, is_first: bool) -> tuple[str, int]: + """Run one kiro stage segment and return (output, exit_code).""" + cmd = [_KIRO_CLI, "chat"] + base_flags + if is_first: + cmd.append(stage_prompt) + else: + cmd += ["--resume", stage_prompt] + + _log(f"{stage_name}: launching kiro ({len(stage_prompt)} chars)") + + # nosemgrep: dangerous-subprocess-use-audit + proc = subprocess.Popen( # nosec B603 + cmd, + cwd=str(workspace), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + + output_lines: list[str] = [] + last_printed = [""] + line_buf = [""] + + _SKIP = ("⠀","⠋","⠙","⠹","⠸","⠼","⠴","⠦","⠧","⠇","⠏", + "⣴","⣿","⠿","╭","╰","│","▸ Credits:","Credits:", + "Model:","Plan:","All tools are","Agents can", + "Learn more","https://","Did you know","Jump into", + "Use /","38;","5;","[0m","[1m") + + def _print_line(line: str) -> None: + s = line.strip() + if not s or len(s) < 8: + return + if any(s.startswith(p) for p in _SKIP): + return + if s == last_printed[0]: + return + last_printed[0] = s + print(f" [kiro] {s}", file=sys.stderr, flush=True) + + with open(log_path, "a", encoding="utf-8") as lf: + lf.write(f"\n{'='*60}\n{stage_name.upper()}\n{'='*60}\n") + lf.flush() + while True: + chunk = proc.stdout.read(4096) + if not chunk: + break + text = chunk.decode("utf-8", errors="replace") + clean = _strip_ansi(text) + lf.write(clean) + lf.flush() + output_lines.append(clean) + for ch in clean: + if ch == "\n": + _print_line(line_buf[0]) + line_buf[0] = "" + else: + line_buf[0] += ch if self.verbose: - sys.stderr.write(line) + sys.stderr.write(text) sys.stderr.flush() - remaining = config.timeout_seconds - (time.monotonic() - start_time) - if remaining <= 0: - process.kill() - _log(f"Timeout reached at turn {turn}") - break - process.wait(timeout=max(remaining, 10)) - total_rc = process.returncode - - _log(f"Turn {turn} exited with code {process.returncode}") - - # Check if aidlc-docs looks complete (has construction phase files) - aidlc_docs_dir = workspace / "aidlc-docs" - if aidlc_docs_dir.is_dir(): - has_construction = any( - (aidlc_docs_dir / "construction").rglob("*.md") - ) if (aidlc_docs_dir / "construction").is_dir() else False - file_count = sum(1 for _ in aidlc_docs_dir.rglob("*") if _.is_file()) - _log(f" aidlc-docs: {file_count} files, construction={'yes' if has_construction else 'no'}") - - if has_construction: - _log("Construction phase detected — workflow complete") - break - else: - _log(" aidlc-docs/ not yet 
created") + proc.wait() + return "".join(output_lines), proc.returncode + + def _sim_review(sentinel_glob: str, focus: str) -> str: + """Run simulator review after a stage completes.""" + nonlocal gate_count + gate_count += 1 + _log(f"Gate #{gate_count}: simulator reviewing ({focus})...") + response = simulator.respond( + f"The AIDLC executor has just completed: {focus}.\n\n" + f"Please read the relevant files in aidlc-docs/ ({sentinel_glob}) " + f"and any supporting documents. " + f"Answer any open questions, approve or request changes, " + f"and give clear direction for the next stage. Be concise." + ) + _log(f"Gate #{gate_count}: simulator responded ({len(response)} chars)") + return response + + # ── Stage 1: Requirements Analysis ─────────────────────────────── + _log("Stage 1: Requirements Analysis...") + _, rc = _run_kiro_stage( + prompt + ( + "\n\nIMPORTANT: Execute ONLY these stages in order: " + "Workspace Detection, Requirements Analysis. " + "Stop after writing aidlc-docs/inception/requirements/requirements.md " + "and aidlc-docs/inception/requirements/requirement-verification-questions.md. " + "Do NOT proceed further. End your response when these files are written." + ), + "stage-1-requirements", + is_first=True, + ) + feedback = _sim_review( + "inception/requirements/*.md", + "Requirements Analysis — requirements.md and requirement-verification-questions.md", + ) - elapsed = time.monotonic() - start_time - if elapsed >= config.timeout_seconds: - _log("Timeout reached") - break + # ── Stage 2: Workflow Planning + Application Design ─────────────── + _log("Stage 2: Workflow Planning + Application Design...") + _, rc = _run_kiro_stage( + f"Human reviewer feedback on requirements:\n\n{feedback}\n\n" + "Now execute: Workflow Planning, then Application Design. " + "Stop after writing aidlc-docs/inception/plans/execution-plan.md " + "and all application-design artifacts (components.md, component-methods.md, " + "component-dependency.md, services.md). " + "Do NOT proceed to Construction.", + "stage-2-design", + is_first=False, + ) + feedback = _sim_review( + "inception/plans/*.md, inception/application-design/*.md", + "Workflow Planning and Application Design", + ) + + # ── Stage 3: Code Generation Plan ──────────────────────────────── + _log("Stage 3: Code Generation Plan...") + _, rc = _run_kiro_stage( + f"Human reviewer feedback on design:\n\n{feedback}\n\n" + "Now execute the Code Generation PLAN only — write the detailed code generation plan " + "in aidlc-docs/construction/plans/ with exact file paths and implementation steps. " + "Do NOT write any application code yet. Stop after the plan document is complete.", + "stage-3-codegen-plan", + is_first=False, + ) + feedback = _sim_review( + "construction/plans/*.md", + "Code Generation Plan", + ) + + # ── Stage 4: Code Generation + Build and Test ───────────────────── + _log("Stage 4: Code Generation + Build and Test...") + _, rc = _run_kiro_stage( + f"Human reviewer has approved the code generation plan:\n\n{feedback}\n\n" + "Now execute: generate ALL application code per the plan, then run Build and Test. " + "Install dependencies, run tests, fix failures, and write the build-and-test-summary.md. 
" + "Complete the full Construction phase.", + "stage-4-construction", + is_first=False, + ) + total_rc = rc + _log(f"Stage 4 complete (exit {rc})") elapsed_seconds = time.monotonic() - start_time - _log(f"Completed {turn} turn(s) in {elapsed_seconds:.0f}s") + _log(f"Completed in {elapsed_seconds:.0f}s ({gate_count} simulator gate(s))") # List workspace contents for debugging _log("Workspace contents:") @@ -223,11 +329,26 @@ def run(self, config: AdapterConfig) -> AdapterResult: adapter_name=self.name, elapsed_seconds=elapsed_seconds, token_usage={ - "num_turns": turn, + "num_turns": gate_count, "model": config.model or "", }, ) + # Stage 2: post-run tests — same logic as the Strands runner + _log("Running post-run test evaluation...") + sandbox_enabled = _get_container_cli() is not None + runner_cfg = RunnerConfig() + runner_cfg.execution = ExecutionConfig( + post_run_tests=True, + post_run_timeout=300, + sandbox=SandboxConfig(enabled=sandbox_enabled), + ) + test_results_path = run_post_evaluation(config.output_dir, runner_cfg) + if test_results_path: + _log(f"Test results: {test_results_path}") + else: + _log("No testable project detected — post-run tests skipped.") + has_docs = dst_docs.is_dir() and any(dst_docs.iterdir()) if total_rc == 0 and has_docs: diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py index a837a4f8..61c9573f 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py @@ -11,6 +11,7 @@ from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter from cli_harness.normalizer import normalize_output, _count_workspace_files, _count_doc_files +from cli_harness.simulator import HumanSimulator REPO_ROOT = Path(__file__).resolve().parents[4] # packages/cli-harness/src/cli_harness -> repo root @@ -114,6 +115,7 @@ def run_cli_evaluation( profile: str | None = None, region: str | None = None, scorer_model: str = "us.anthropic.claude-sonnet-4-5-20250929-v1:0", + simulator_model: str | None = None, report_format: str = "both", prompt_template: str | None = None, model: str | None = None, @@ -145,7 +147,32 @@ def run_cli_evaluation( print(f"[OK] {adapter.name} prerequisites met: {msg}") - # 2. Run the adapter + # 2. Build the shared HumanSimulator and run the adapter. + # The simulator is constructed once here with the full document context + # (vision, tech_env, openapi) and injected into AdapterConfig so every + # adapter uses the same instance — no per-adapter construction needed. 
+ openapi_content: str | None = None + if openapi_path and openapi_path.is_file(): + openapi_content = openapi_path.read_text(encoding="utf-8") + + simulator: HumanSimulator | None = None + if simulator_model: + vision_content = vision_path.read_text(encoding="utf-8") + tech_env_content = ( + tech_env_path.read_text(encoding="utf-8") + if tech_env_path and tech_env_path.is_file() + else None + ) + simulator = HumanSimulator.from_adapter_config( + run_folder=output_dir, + vision_content=vision_content, + tech_env_content=tech_env_content, + openapi_content=openapi_content, + aws_profile=profile, + aws_region=region or "us-east-1", + model=simulator_model, + ) + config = AdapterConfig( vision_path=vision_path, tech_env_path=tech_env_path, @@ -153,7 +180,11 @@ def run_cli_evaluation( output_dir=output_dir, prompt_template=prompt_template, model=model, + simulator_model=simulator_model, aws_profile=profile, + aws_region=region, + openapi_content=openapi_content, + simulator=simulator, timeout_seconds=timeout_seconds, ) @@ -194,7 +225,7 @@ def run_cli_evaluation( # 5. Run evaluation pipeline (stages 2-6) eval_cmd = [ - sys.executable, str(REPO_ROOT / "run_evaluation.py"), + sys.executable, str(REPO_ROOT / "scripts" / "run_evaluation.py"), "--evaluate-only", str(aidlc_docs), "--golden", str(golden_docs), "--results", str(output_dir / "qualitative-comparison.yaml"), diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py index 3ef18884..cb5a03e0 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py @@ -98,26 +98,59 @@ ## Important rules -- Since you are running autonomously without a human reviewer, self-approve all stages \ -and continue immediately to the next one. Do NOT pause or wait for approval. -- Read the relevant rule file BEFORE starting each stage. +{approval_rule}- Read the relevant rule file BEFORE starting each stage. - Read common rules as needed (e.g. `aidlc-rules/common/content-validation.md` before \ writing files, `aidlc-rules/common/question-format-guide.md` before creating questions). - For CONDITIONAL stages, evaluate based on project scope and skip with justification if \ not needed, but always continue to the next stage. - When generating code, write COMPLETE, WORKING files — not stubs or placeholders. - Generate complete, working code with full test coverage. -""" - - -def render_prompt(vision_path: str = "vision.md", tech_env_path: str = "tech-env.md") -> str: - r"""Render the AIDLC prompt with customized file paths. - - Only replaces backtick-delimited references (``\`vision.md\```) so that - prose mentions like "alongside vision.md" are left intact. +{openapi_section}""" + +_SELF_APPROVE_RULE = ( + "- Since you are running autonomously without a human reviewer, self-approve all stages " + "and continue immediately to the next one. Do NOT pause or wait for approval.\n" +) + +_SIMULATOR_HANDOFF_RULE = ( + "- At each stage gate (questions, approval requests, document reviews, code reviews), " + "PAUSE and end your turn. A human reviewer will read your output and respond. 
" + "Resume work only after receiving their response — do not self-approve.\n" +) + + +def render_prompt( + vision_path: str = "vision.md", + tech_env_path: str = "tech-env.md", + openapi_content: str | None = None, + with_simulator: bool = False, +) -> str: + r"""Render the AIDLC executor prompt. + + Args: + vision_path: Path to vision doc (replaces backtick references only). + tech_env_path: Path to tech-env doc. + openapi_content: Full OpenAPI spec text — injected as a binding contract section. + with_simulator: When True, instructs the executor to pause at review gates + for a human reviewer instead of self-approving. Use for kiro-cli when + a HumanSimulator will be providing responses between turns. """ + openapi_section = ( + "\n## The API contract (OpenAPI specification)\n\n" + "The following is the OpenAPI specification that defines the exact API contract " + "this project must implement. Ensure all generated endpoints, request/response " + "schemas, status codes, and error shapes match this specification exactly.\n\n" + "---\n" + f"{openapi_content}\n" + "---\n" + ) if openapi_content else "" + + approval_rule = _SIMULATOR_HANDOFF_RULE if with_simulator else _SELF_APPROVE_RULE + return ( EXECUTOR_SYSTEM_PROMPT .replace("`vision.md`", f"`{vision_path}`") .replace("`tech-env.md`", f"`{tech_env_path}`") + .replace("{openapi_section}", openapi_section) + .replace("{approval_rule}", approval_rule) ) diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py index 025d6374..146f0f9a 100644 --- a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py @@ -5,13 +5,43 @@ from cli_harness.adapter import CLIAdapter -# Lazy imports to avoid pulling in adapter-specific deps at import time +# Built-in adapters — always available _ADAPTER_MAP: dict[str, str] = { "kiro-cli": "cli_harness.adapters.kiro_cli.KiroCLIAdapter", "claude-code": "cli_harness.adapters.claude_code.ClaudeCodeAdapter", + "claude-code-sdk": "cli_harness.adapters.claude_code_sdk.ClaudeCodeSDKAdapter", } +def register_adapter(name: str, fqn: str) -> None: + """Register an adapter by name and fully-qualified class path. + + Allows external code (config loaders, plugins) to add adapters without + modifying framework code. Built-in adapters can be overridden by name. + + Args: + name: Adapter name as used on the CLI (e.g. 'my-tool'). + fqn: Fully-qualified class path (e.g. 'mypackage.adapters.MyAdapter'). + """ + _ADAPTER_MAP[name.lower().strip()] = fqn + + +def load_adapters_from_config(cfg_data: dict) -> None: + """Register adapters declared under ``cli.adapters`` in a config dict. + + Config shape:: + + cli: + adapters: + my-tool: "mypackage.adapters.MyToolAdapter" + + Each entry calls :func:`register_adapter` so the adapter is available + for the current process without any framework code changes. 
+ """ + for adapter_name, fqn in cfg_data.get("cli", {}).get("adapters", {}).items(): + register_adapter(adapter_name, fqn) + + def list_adapters() -> list[str]: """Return sorted list of registered adapter names.""" return sorted(_ADAPTER_MAP.keys()) diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/simulator.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/simulator.py new file mode 100644 index 00000000..a820c37a --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/simulator.py @@ -0,0 +1,229 @@ +"""Shared Human Simulator — Anthropic SDK-based reviewer for CLI adapter workflows. + +Used by both the kiro-cli adapter (after each kiro turn) and the claude-code-sdk +adapter (on each handoff_to_simulator tool call). Backed by the same system prompt +as the Strands two-agent swarm via build_simulator_system_prompt(). +""" + +from __future__ import annotations + +import logging +import sys +from pathlib import Path + +import anthropic +import boto3 + +# Import shared prompt builder from execution package +_EXEC_SRC = Path(__file__).resolve().parents[5] / "execution" / "src" +if str(_EXEC_SRC) not in sys.path: + sys.path.insert(0, str(_EXEC_SRC)) +from aidlc_runner.agents.simulator import build_simulator_system_prompt # noqa: E402 + +logger = logging.getLogger(__name__) + +_SIMULATOR_TOOLS = [ + { + "name": "read_file", + "description": "Read the contents of a file in the run folder.", + "input_schema": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "File path relative to the run folder.", + } + }, + "required": ["path"], + }, + }, + { + "name": "write_file", + "description": "Write content to a file in the run folder.", + "input_schema": { + "type": "object", + "properties": { + "path": {"type": "string"}, + "content": {"type": "string"}, + }, + "required": ["path", "content"], + }, + }, + { + "name": "list_files", + "description": "List files and directories within a path in the run folder.", + "input_schema": { + "type": "object", + "properties": { + "directory": {"type": "string", "default": "."}, + }, + "required": [], + }, + }, +] + + +def _resolve_safe(base: Path, relative: str) -> Path: + resolved = (base / relative).resolve() + if not str(resolved).startswith(str(base.resolve())): + raise ValueError(f"Path traversal denied: {relative}") + return resolved + + +def _exec_file_tool(name: str, tool_input: dict, run_folder: Path) -> str: + try: + if name == "read_file": + path = tool_input["path"] + target = _resolve_safe(run_folder, path) + if not target.exists(): + return f"Error: File not found: {path}" + return target.read_text(encoding="utf-8") + + elif name == "write_file": + path = tool_input["path"] + content = tool_input.get("content", "") + target = _resolve_safe(run_folder, path) + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + return f"Written: {path} ({len(content)} chars)" + + elif name == "list_files": + directory = tool_input.get("directory", ".") + target = _resolve_safe(run_folder, directory) + if not target.is_dir(): + return f"Error: Not a directory: {directory}" + entries = sorted(target.iterdir()) + lines = [ + f" {e.relative_to(run_folder)}{'/' if e.is_dir() else ''}" + for e in entries + ] + return "\n".join(lines) if lines else f"(empty: {directory})" + + return f"[error: unknown tool: {name}]" + except ValueError as e: + return f"Error: {e}" + except Exception as e: + logger.exception("File tool %r failed", 
name) + return f"[error: {e}]" + + +class HumanSimulator: + """Anthropic SDK-based human simulator for CLI adapter review gates. + + Wraps a single stateless call: given a message from the executor (e.g. + the output of a kiro turn, or a handoff_to_simulator tool call), runs + a short simulator conversation and returns the simulator's text response. + + Token usage is accumulated separately per respond() call so callers can + report simulator and executor tokens independently. + """ + + def __init__( + self, + client: anthropic.AnthropicBedrock, + model: str, + system_prompt: str, + run_folder: Path, + ): + self._client = client + self._model = model + self._system_prompt = system_prompt + self._run_folder = run_folder + # Accumulated token counts across all respond() calls + self._input_tokens: int = 0 + self._output_tokens: int = 0 + self._cache_read_tokens: int = 0 + self._cache_write_tokens: int = 0 + + @property + def accumulated_usage(self) -> dict[str, int]: + """Token totals across all respond() calls, keyed by snake_case names + matching MetricsCollector's expected format.""" + total = self._input_tokens + self._output_tokens + return { + "inputTokens": self._input_tokens, + "outputTokens": self._output_tokens, + "totalTokens": total, + "cacheReadInputTokens": self._cache_read_tokens, + "cacheWriteInputTokens": self._cache_write_tokens, + } + + @classmethod + def from_adapter_config( + cls, + run_folder: Path, + vision_content: str, + tech_env_content: str | None, + openapi_content: str | None, + aws_profile: str | None, + aws_region: str | None, + model: str, + ) -> "HumanSimulator": + """Construct a HumanSimulator from the pieces available in an AdapterConfig.""" + system_prompt = build_simulator_system_prompt( + vision_content=vision_content, + tech_env_content=tech_env_content, + openapi_content=openapi_content, + ) + + session_kwargs: dict = {} + if aws_profile: + session_kwargs["profile_name"] = aws_profile + boto_session = boto3.Session(**session_kwargs) + frozen = boto_session.get_credentials().get_frozen_credentials() + client = anthropic.AnthropicBedrock( + aws_access_key=frozen.access_key, + aws_secret_key=frozen.secret_key, + aws_session_token=frozen.token, + aws_region=aws_region or "us-east-1", + ) + + return cls( + client=client, + model=model, + system_prompt=system_prompt, + run_folder=run_folder, + ) + + def respond(self, message: str, max_iterations: int = 50) -> str: + """Run one simulator turn and return the final text response. + + The simulator may make file tool calls before responding — this loop + handles those transparently. 
+ """ + messages: list[dict] = [{"role": "user", "content": message}] + + for _ in range(max_iterations): + response = self._client.messages.create( + model=self._model, + max_tokens=8192, + system=self._system_prompt, + tools=_SIMULATOR_TOOLS, + messages=messages, + ) + + # Accumulate token usage from this API call + u = response.usage + self._input_tokens += getattr(u, "input_tokens", 0) + self._output_tokens += getattr(u, "output_tokens", 0) + self._cache_read_tokens += getattr(u, "cache_read_input_tokens", 0) + self._cache_write_tokens += getattr(u, "cache_creation_input_tokens", 0) + + tool_uses = [b for b in response.content if b.type == "tool_use"] + text_blocks = [b for b in response.content if b.type == "text"] + + if not tool_uses: + return "\n".join(b.text for b in text_blocks).strip() or "(no response)" + + messages.append({"role": "assistant", "content": response.content}) + tool_results = [] + for tu in tool_uses: + result_text = _exec_file_tool(tu.name, tu.input, self._run_folder) + tool_results.append({ + "type": "tool_result", + "tool_use_id": tu.id, + "content": result_text, + }) + messages.append({"role": "user", "content": tool_results}) + + return "[error: simulator exceeded max iterations]" diff --git a/scripts/aidlc-evaluator/packages/execution/pyproject.toml b/scripts/aidlc-evaluator/packages/execution/pyproject.toml index b383dedb..3586fdf0 100644 --- a/scripts/aidlc-evaluator/packages/execution/pyproject.toml +++ b/scripts/aidlc-evaluator/packages/execution/pyproject.toml @@ -7,6 +7,8 @@ dependencies = [ "strands-agents>=0.1.0", "strands-agents-tools>=0.1.0", "pyyaml>=6.0", + "anthropic[bedrock]>=0.40", + "boto3>=1.42.47", ] [project.scripts] diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py index 32c29f05..1d4c5a0d 100644 --- a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py @@ -177,6 +177,7 @@ def create_executor( aws_region: str | None = None, callback_handler: Callable[..., Any] | None = None, execution_config: ExecutionConfig | None = None, + simulator_tool: Any | None = None, ) -> Agent: """Create the AIDLC Executor agent. @@ -188,6 +189,9 @@ def create_executor( aws_region: AWS region for Bedrock. callback_handler: Optional callback handler for progress reporting. execution_config: Optional execution config controlling run_command availability. + simulator_tool: Optional Strands @tool wrapping a HumanSimulator. When + provided it is added to the executor's tool list so handoff_to_simulator + calls are handled inline rather than via a separate Swarm agent. Returns: Configured Strands Agent instance. 
@@ -206,6 +210,9 @@ def create_executor( else: system_prompt = _EXECUTOR_PROMPT_NO_EXEC + if simulator_tool is not None: + tools.append(simulator_tool) + session_kwargs: dict = {} if aws_profile: session_kwargs["profile_name"] = aws_profile diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py index 8355400c..b1e63963 100644 --- a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py @@ -40,8 +40,7 @@ --- {vision_content} --- -{tech_env_section} -## How you work +{tech_env_section}{openapi_section}## How you work 1. When you receive a handoff from the "executor" agent, read the file path mentioned \ in the handoff message. @@ -57,8 +56,10 @@ describe what needs to change. - **Review requests**: Read the document, provide brief feedback, and approve. Only \ request revisions for significant issues that contradict the vision. - - **Code review**: Review generated code for correctness against the vision spec. \ -Approve if it implements the required functionality. Do not block on style issues. + - **Code review**: Review generated code for correctness against the vision spec \ +and the API contract above. Verify that all required endpoints, request/response shapes, \ +and error codes match the specification. Reject if critical endpoints are missing or \ +the contract is violated; approve otherwise. 3. Write your response to the same file (appending) or to a response file as directed \ by the question format. @@ -79,6 +80,52 @@ """ +def build_simulator_system_prompt( + vision_content: str, + tech_env_content: str | None = None, + openapi_content: str | None = None, +) -> str: + """Build the simulator system prompt string from project inputs. + + Extracted so other adapters (SDK, kiro) can construct the same prompt + without calling the full Strands-specific create_simulator(). + """ + if tech_env_content: + tech_env_section = ( + "\n## The technical environment\n\n" + "The following is the technical environment document that defines HOW the project " + "must be built — languages, frameworks, cloud services, security controls, testing " + "standards, and prohibited technologies. Use this as a binding reference when " + "answering technical questions and reviewing designs and code:\n\n" + "---\n" + f"{tech_env_content}\n" + "---\n" + ) + else: + tech_env_section = "" + + if openapi_content: + openapi_section = ( + "\n## The API contract (OpenAPI specification)\n\n" + "The following is the OpenAPI specification that defines the exact API contract " + "this project must implement — all required endpoints, request/response schemas, " + "status codes, and error shapes. Use this as a binding reference when reviewing " + "API design documents and generated code. Reject any design or code that violates " + "this contract.\n\n" + "---\n" + f"{openapi_content}\n" + "---\n\n" + ) + else: + openapi_section = "" + + return SIMULATOR_SYSTEM_PROMPT_TEMPLATE.format( + vision_content=vision_content, + tech_env_section=tech_env_section, + openapi_section=openapi_section, + ) + + def create_simulator( run_folder: Path, vision_content: str, @@ -87,6 +134,7 @@ def create_simulator( aws_region: str | None = None, callback_handler: Callable[..., Any] | None = None, tech_env_content: str | None = None, + openapi_content: str | None = None, ) -> Agent: """Create the Human Simulator agent. 
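For adapters that call Bedrock directly rather than through Strands, the extracted prompt builder can be used on its own. A minimal sketch, assuming the scenario documents have already been read into strings (the file paths shown are illustrative, not fixed framework paths):

```python
from pathlib import Path

from aidlc_runner.agents.simulator import build_simulator_system_prompt

vision_text = Path("test_cases/sci-calc/vision.md").read_text(encoding="utf-8")
openapi_text = Path("test_cases/sci-calc/openapi.yaml").read_text(encoding="utf-8")

# Same prompt text create_simulator() uses, but reusable by the Anthropic
# SDK-backed HumanSimulator and the kiro-cli adapter.
system_prompt = build_simulator_system_prompt(
    vision_content=vision_text,
    tech_env_content=None,          # optional; omitted in this sketch
    openapi_content=openapi_text,   # enables API-contract checks in reviews
)
```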
@@ -98,29 +146,17 @@ def create_simulator( aws_region: AWS region for Bedrock. callback_handler: Optional callback handler for progress reporting. tech_env_content: Optional full text of the technical environment file. + openapi_content: Optional full text of the OpenAPI spec (test contract). Returns: Configured Strands Agent instance. """ file_tools = make_file_tools(run_folder) - if tech_env_content: - tech_env_section = ( - "\n## The technical environment\n\n" - "The following is the technical environment document that defines HOW the project " - "must be built — languages, frameworks, cloud services, security controls, testing " - "standards, and prohibited technologies. Use this as a binding reference when " - "answering technical questions and reviewing designs and code:\n\n" - "---\n" - f"{tech_env_content}\n" - "---\n" - ) - else: - tech_env_section = "" - - system_prompt = SIMULATOR_SYSTEM_PROMPT_TEMPLATE.format( + system_prompt = build_simulator_system_prompt( vision_content=vision_content, - tech_env_section=tech_env_section, + tech_env_content=tech_env_content, + openapi_content=openapi_content, ) session_kwargs: dict = {} diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py index de49dc75..918b17c0 100644 --- a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py @@ -27,6 +27,12 @@ def build_parser() -> argparse.ArgumentParser: default=None, help="Path to the technical environment markdown file (optional).", ) + parser.add_argument( + "--openapi", + type=Path, + default=None, + help="Path to OpenAPI spec — injected into the simulator's system prompt for contract validation (optional).", + ) parser.add_argument( "--config", type=Path, @@ -143,4 +149,4 @@ def main(argv: list[str] | None = None) -> None: config = load_config(config_path=config_path, cli_overrides=cli_overrides) # Run the workflow - run(config=config, vision_path=args.vision, tech_env_path=args.tech_env) + run(config=config, vision_path=args.vision, tech_env_path=args.tech_env, openapi_path=args.openapi) diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py index aefd21be..836cb0ac 100644 --- a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py @@ -168,6 +168,7 @@ def __init__(self, config: RunnerConfig) -> None: self._handoffs: list[dict[str, Any]] = [] self._errors: list[dict[str, str]] = [] self._context_samples: list[dict[str, Any]] = [] + self._simulator_usage: dict[str, int] | None = None # -- Live recording (called during execution) -- @@ -187,6 +188,15 @@ def record_error(self, error_type: str, message: str) -> None: "message": message, }) + def record_simulator_usage(self, usage: dict[str, int]) -> None: + """Record accumulated token usage from the HumanSimulator (Anthropic SDK). + + Called after the swarm completes. The usage dict must use the same + camelCase key format as Strands accumulated_usage so _usage_to_dict + can normalise it uniformly. + """ + self._simulator_usage = usage + def record_context_sample(self, agent_name: str, input_tokens: int) -> None: """Record the input token count from a single model invocation. 
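The camelCase key requirement for `record_simulator_usage()` mirrors exactly what `HumanSimulator.accumulated_usage` returns. A short sketch of the expected shape (the token counts are made up for illustration):

```python
from aidlc_runner.metrics import MetricsCollector


def report_simulator_tokens(collector: MetricsCollector) -> None:
    # In the real flow this dict comes straight from
    # simulator_instance.accumulated_usage after the swarm finishes.
    collector.record_simulator_usage({
        "inputTokens": 1200,
        "outputTokens": 340,
        "totalTokens": 1540,
        "cacheReadInputTokens": 0,
        "cacheWriteInputTokens": 0,
    })
```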
@@ -227,11 +237,16 @@ def build_metrics(self, result: MultiAgentResult, run_folder: Path) -> dict[str, metrics: dict[str, Any] = {} # --- Tokens --- - # Extract per-agent token counts (unique tokens per agent) + # Extract per-agent token counts (unique tokens per agent). + # Strands nodes cover the executor. The simulator is tracked separately + # via record_simulator_usage() since it runs outside the Strands swarm. per_agent: dict[str, dict[str, int]] = {} for node_id, node_result in result.results.items(): per_agent[node_id] = _usage_to_dict(node_result.accumulated_usage) + if self._simulator_usage is not None: + per_agent["simulator"] = _usage_to_dict(self._simulator_usage) + # Calculate sum of per-agent tokens (unique tokens across all agents) unique_total = { "input_tokens": sum(agent["input_tokens"] for agent in per_agent.values()), diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py index 6b5f8829..a3d507f4 100644 --- a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py @@ -16,13 +16,58 @@ from shared.io import atomic_yaml_dump from strands.multiagent import Swarm +from strands import tool as strands_tool + from aidlc_runner.agents.executor import create_executor -from aidlc_runner.agents.simulator import create_simulator +from aidlc_runner.agents.simulator import build_simulator_system_prompt from aidlc_runner.config import AidlcConfig, RunnerConfig from aidlc_runner.metrics import MetricsCollector from aidlc_runner.post_run import run_post_evaluation from aidlc_runner.progress import AgentProgressHandler, SwarmProgressHook + +def _make_simulator_tool( + run_folder: Path, + vision_content: str, + model_id: str, + aws_profile: str | None, + aws_region: str | None, + tech_env_content: str | None = None, + openapi_content: str | None = None, +): + """Create a Strands @tool that delegates to HumanSimulator. + + Returns (tool, simulator) so the caller can harvest accumulated_usage + after the swarm completes and record it separately in MetricsCollector, + keeping executor and simulator token counts in distinct buckets. + """ + import sys as _sys + _CLI_HARNESS = Path(__file__).resolve().parents[4] / "cli-harness" / "src" + if str(_CLI_HARNESS) not in _sys.path: + _sys.path.insert(0, str(_CLI_HARNESS)) + from cli_harness.simulator import HumanSimulator # noqa: E402 + + simulator = HumanSimulator.from_adapter_config( + run_folder=run_folder, + vision_content=vision_content, + tech_env_content=tech_env_content, + openapi_content=openapi_content, + aws_profile=aws_profile, + aws_region=aws_region, + model=model_id, + ) + + @strands_tool + def handoff_to_simulator(message: str) -> str: + """Hand off to the Human Simulator for answers, approvals, or reviews. + + Args: + message: Message describing what input is needed and which file to read. + """ + return simulator.respond(message) + + return handoff_to_simulator, simulator + _SLUG_MAX_LEN = 80 @@ -45,34 +90,41 @@ def _rules_slug(aidlc: AidlcConfig) -> str: def create_run_folder(output_dir: str | Path, config: RunnerConfig) -> Path: - """Create a timestamped run folder named after the rules source. + """Create or use the specified run folder. - Format: {ISO8601_compact}-{rules_slug} - Example: 20260224T214917-aidlc-workflows_v0.1.0 + Two modes: + 1. 
If output_dir itself looks like a timestamped folder (name starts with + a digit and contains "T"), use it directly — the orchestrator pre-allocated + the exact path for deterministic, parallel-safe execution. + 2. Otherwise treat output_dir as a parent and create a timestamped subfolder. + Format: {ISO8601_compact}-{rules_slug} + Example: 20260224T214917-aidlc-workflows_v0.1.0 - Also writes a sentinel file (``{output_dir}/.last_run_folder``) containing - the absolute path of the new run folder so that parent orchestrators can - discover the folder without racy before/after directory listing. + Also writes a sentinel file (``{output_dir.parent}/.last_run_folder``) in + Mode 2 so legacy orchestrators can discover the folder. Returns: Path to the created run folder. """ output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") - slug = _rules_slug(config.aidlc) - folder_name = f"{timestamp}-{slug}" - run_folder = output_dir / folder_name - run_folder.mkdir() - (run_folder / "aidlc-docs" / "inception").mkdir(parents=True) - (run_folder / "aidlc-docs" / "construction").mkdir(parents=True) - (run_folder / "workspace").mkdir() - - # Write sentinel for orchestrator discovery (atomic via os.replace) - sentinel = output_dir / _SENTINEL_NAME - sentinel.write_text(str(run_folder.resolve()), encoding="utf-8") + folder_name = output_dir.name + if folder_name and folder_name[0].isdigit() and "T" in folder_name: + # Mode 1: orchestrator specified exact folder name + run_folder = output_dir + else: + # Mode 2: generate a timestamped subfolder + output_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + slug = _rules_slug(config.aidlc) + run_folder = output_dir / f"{timestamp}-{slug}" + # Write sentinel for legacy orchestrator discovery + sentinel = output_dir / _SENTINEL_NAME + sentinel.write_text(str(run_folder.resolve()), encoding="utf-8") + + (run_folder / "aidlc-docs" / "inception").mkdir(parents=True, exist_ok=True) + (run_folder / "aidlc-docs" / "construction").mkdir(parents=True, exist_ok=True) + (run_folder / "workspace").mkdir(exist_ok=True) return run_folder @@ -172,13 +224,21 @@ def write_run_meta( atomic_yaml_dump(meta, run_folder / "run-meta.yaml") -def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = None) -> None: +def run( + config: RunnerConfig, + vision_path: Path, + tech_env_path: Path | None = None, + openapi_path: Path | None = None, +) -> None: """Execute a full AIDLC workflow run. Args: config: Fully resolved runner configuration. vision_path: Path to the vision/constraints markdown file. tech_env_path: Optional path to the technical environment markdown file. + openapi_path: Optional path to the OpenAPI spec — injected into the + simulator's system prompt so it can validate the API contract + during design reviews and code review handoffs. """ # 1. Create run folder run_folder = create_run_folder(config.runs.output_dir, config) @@ -194,6 +254,11 @@ def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = No tech_env_content = tech_env_path.read_text(encoding="utf-8") (run_folder / "tech-env.md").write_text(tech_env_content, encoding="utf-8") + # 2c. 
Read OpenAPI spec if provided (not copied to run folder — used for simulator prompt only) + openapi_content: str | None = None + if openapi_path is not None and openapi_path.is_file(): + openapi_content = openapi_path.read_text(encoding="utf-8") + # 3. Set up AIDLC rules print("Setting up AIDLC rules...") rules_dir = setup_rules(run_folder, config) @@ -202,11 +267,25 @@ def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = No # 4. Write run metadata write_run_meta(run_folder, config, vision_path, tech_env_path=tech_env_path) - # 5. Create metrics collector and agents with progress handlers + # 5. Create metrics collector and executor with progress handler print("Creating agents...") collector = MetricsCollector(config) executor_handler = AgentProgressHandler("executor", collector=collector) - simulator_handler = AgentProgressHandler("simulator", collector=collector) + + # Build the HumanSimulator tool — same implementation as kiro-cli and + # claude-code-sdk, backed by build_simulator_system_prompt(). + # The simulator instance is kept so we can harvest its token usage after + # the swarm completes and inject it into MetricsCollector as a separate + # "simulator" bucket — keeping executor and simulator tokens distinct. + simulator_tool, simulator_instance = _make_simulator_tool( + run_folder=run_folder, + vision_content=vision_content, + model_id=config.models.simulator.model_id, + aws_profile=config.aws.profile, + aws_region=config.aws.region, + tech_env_content=tech_env_content, + openapi_content=openapi_content, + ) executor = create_executor( run_folder=run_folder, @@ -216,19 +295,11 @@ def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = No aws_region=config.aws.region, callback_handler=executor_handler, execution_config=config.execution, - ) - simulator = create_simulator( - run_folder=run_folder, - vision_content=vision_content, - model_config=config.models.simulator, - aws_profile=config.aws.profile, - aws_region=config.aws.region, - callback_handler=simulator_handler, - tech_env_content=tech_env_content, + simulator_tool=simulator_tool, ) - # 6. Create and run the Swarm - print("Starting AIDLC workflow swarm...") + # 6. Run the executor (single-agent; simulator is a tool call) + print("Starting AIDLC workflow executor...") initial_prompt = ( "Begin the AIDLC workflow and execute it TO COMPLETION through ALL phases. " "The project vision is available at vision.md in the run folder. " @@ -250,14 +321,12 @@ def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = No ) swarm = Swarm( - [executor, simulator], + [executor], entry_point=executor, max_handoffs=config.swarm.max_handoffs, max_iterations=config.swarm.max_iterations, execution_timeout=config.swarm.execution_timeout, node_timeout=config.swarm.node_timeout, - repetitive_handoff_detection_window=5, - repetitive_handoff_min_unique_agents=2, ) # Register progress hook for node-level events @@ -271,7 +340,11 @@ def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = No print(f"Execution time: {result.execution_time}ms") print(f"Total handoffs: {len(result.node_history)}") - # 8. Write run metrics + # 8. Record simulator token usage separately so metrics keep executor + # and simulator buckets distinct (simulator runs outside the Strands swarm). + collector.record_simulator_usage(simulator_instance.accumulated_usage) + + # 9. 
Write run metrics metrics_path = collector.write(result, run_folder) print(f"Metrics written to: {metrics_path}") diff --git a/scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py b/scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py index 625591db..6464d514 100644 --- a/scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py +++ b/scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py @@ -1,6 +1,7 @@ -"""Docker sandbox for running untrusted commands in an isolated container. +"""Container sandbox for running untrusted commands in an isolated container. -Provides a thin wrapper around ``docker run`` so that generated code +Supports Docker, Podman, and Finch — whichever is available on PATH. +Provides a thin wrapper around `` run`` so that generated code (post-run tests, contract-test servers) can be executed without granting access to the host filesystem, network credentials, or environment. @@ -10,6 +11,7 @@ from __future__ import annotations import os +import shutil import subprocess from dataclasses import dataclass from pathlib import Path @@ -28,16 +30,27 @@ class SandboxResult: _DOCKER_AVAILABLE: bool | None = None +_CONTAINER_CLI: str | None = None -def is_docker_available() -> bool: - """Check whether Docker can actually run containers. +def _get_container_cli() -> str | None: + """Return the first available container CLI: docker, podman, or finch.""" + global _CONTAINER_CLI + if _CONTAINER_CLI is not None: + return _CONTAINER_CLI + for cli in ("docker", "podman", "finch"): + if shutil.which(cli): + _CONTAINER_CLI = cli + return _CONTAINER_CLI + return None + - Goes beyond ``docker info`` by spawning a trivial container, which - catches cgroup v2 / OCI runtime errors that ``docker info`` misses. +def is_docker_available() -> bool: + """Check whether a container runtime can actually run containers. - Goes beyond ``docker info`` by spawning a trivial container, which - catches cgroup v2 / OCI runtime errors that ``docker info`` misses. + Tries docker, podman, and finch in that order. Goes beyond `` info`` + by spawning a trivial container, which catches cgroup v2 / OCI runtime + errors that plain info checks miss. The result is cached for the lifetime of the process. """ @@ -45,10 +58,15 @@ def is_docker_available() -> bool: if _DOCKER_AVAILABLE is not None: return _DOCKER_AVAILABLE + cli = _get_container_cli() + if cli is None: + _DOCKER_AVAILABLE = False + return _DOCKER_AVAILABLE + try: - # nosec B603, B607 - Static docker command for availability check + # nosec B603 - Static container CLI info command for availability check result = subprocess.run( - ["docker", "info"], + [cli, "info"], capture_output=True, timeout=10, ) @@ -57,10 +75,10 @@ def is_docker_available() -> bool: return _DOCKER_AVAILABLE # Verify containers can actually start *with resource limits* - # (catches cgroup v2 / OCI runtime errors that plain `docker run` misses) - # nosec B603, B607 - Static docker command for runtime verification + # (catches cgroup v2 / OCI runtime errors that plain info misses) + # nosec B603 - Static container CLI run command for runtime verification result = subprocess.run( - ["docker", "run", "--rm", "--memory=6m", "--cpus=1", "alpine", "true"], + [cli, "run", "--rm", "--memory=6m", "--cpus=1", "alpine", "true"], capture_output=True, timeout=30, ) @@ -108,8 +126,9 @@ def sandbox_run( cpus: Container CPU limit. 
""" + cli = _get_container_cli() or "docker" docker_cmd: list[str] = [ - "docker", "run", + cli, "run", "--rm", f"--memory={memory}", f"--cpus={cpus}", @@ -121,7 +140,7 @@ def sandbox_run( # no entry in the container's /etc/passwd. "-e", "HOME=/tmp", "-e", "UV_CACHE_DIR=/tmp/.cache/uv", - "-e", "NPM_CONFIG_CACHE=/tmp/.cache/npm", + "-e", "NPM_CONFIG_CACHE=/tmp/.cache/npm", ] if not network: @@ -185,8 +204,9 @@ def sandbox_run_detached( Raises ``RuntimeError`` if the container fails to start. """ + cli = _get_container_cli() or "docker" docker_cmd: list[str] = [ - "docker", "run", + cli, "run", "-d", "--rm", f"--memory={memory}", f"--cpus={cpus}", @@ -198,7 +218,7 @@ def sandbox_run_detached( # no entry in the container's /etc/passwd. "-e", "HOME=/tmp", "-e", "UV_CACHE_DIR=/tmp/.cache/uv", - "-e", "NPM_CONFIG_CACHE=/tmp/.cache/npm", + "-e", "NPM_CONFIG_CACHE=/tmp/.cache/npm", ] if not network: @@ -231,18 +251,19 @@ def sandbox_run_detached( def sandbox_stop(container_id: str, timeout: int = 10) -> None: """Stop a running container by ID.""" + cli = _get_container_cli() or "docker" try: - # nosec B603, B607 - Static docker stop command with container ID parameter + # nosec B603 - Static container stop command with container ID parameter subprocess.run( - ["docker", "stop", "-t", str(timeout), container_id], + [cli, "stop", "-t", str(timeout), container_id], capture_output=True, timeout=timeout + 5, ) except (subprocess.TimeoutExpired, OSError): # Force kill if graceful stop fails - # nosec B603, B607 - Static docker kill command with container ID parameter + # nosec B603 - Static container kill command with container ID parameter subprocess.run( - ["docker", "kill", container_id], + [cli, "kill", container_id], capture_output=True, timeout=5, ) @@ -250,10 +271,11 @@ def sandbox_stop(container_id: str, timeout: int = 10) -> None: def sandbox_is_running(container_id: str) -> bool: """Check whether a container is still running.""" + cli = _get_container_cli() or "docker" try: - # nosec B603, B607 - Static docker inspect command with container ID parameter + # nosec B603 - Static container inspect command with container ID parameter result = subprocess.run( - ["docker", "inspect", "-f", "{{.State.Running}}", container_id], + [cli, "inspect", "-f", "{{.State.Running}}", container_id], capture_output=True, text=True, timeout=5, @@ -265,10 +287,11 @@ def sandbox_is_running(container_id: str) -> bool: def sandbox_logs(container_id: str) -> tuple[str, str]: """Return (stdout, stderr) from a running or stopped container.""" + cli = _get_container_cli() or "docker" try: - # nosec B603, B607 - Static docker logs command with container ID parameter + # nosec B603 - Static container logs command with container ID parameter result = subprocess.run( - ["docker", "logs", container_id], + [cli, "logs", container_id], capture_output=True, text=True, timeout=10, diff --git a/scripts/aidlc-evaluator/pyproject.toml b/scripts/aidlc-evaluator/pyproject.toml index 2387e7d2..98c56e2c 100644 --- a/scripts/aidlc-evaluator/pyproject.toml +++ b/scripts/aidlc-evaluator/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "aidlc-reporting", "aidlc-shared", "aidlc-trend-reports", + "aidlc-cli-harness", ] [tool.uv.workspace] @@ -39,6 +40,7 @@ aidlc-nonfunctional = { workspace = true } aidlc-reporting = { workspace = true } aidlc-shared = { workspace = true } aidlc-trend-reports = { workspace = true } +aidlc-cli-harness = { workspace = true } [dependency-groups] dev = [ diff --git a/scripts/aidlc-evaluator/run.py 
b/scripts/aidlc-evaluator/run.py index 3190e7ca..c9ac74aa 100644 --- a/scripts/aidlc-evaluator/run.py +++ b/scripts/aidlc-evaluator/run.py @@ -5,15 +5,17 @@ It dispatches to specialized runner scripts in the scripts/ directory. Available modes: - - full Full evaluation (execute workflow + score outputs) - - cli Evaluation through a CLI AI assistant (kiro-cli, claude-code, etc.) - - ide Evaluation through an IDE AI assistant (cursor, cline, kiro) - - batch Batch evaluation across multiple models - - compare Generate cross-model comparison report - - ext-test Test extension hooks with different opt-in configurations - - ext-report Regenerate extension test comparison report - - trend Generate trend report across AIDLC rules releases - - test Run unit tests for all packages + - full Full evaluation (execute workflow + score outputs) + - cli Evaluation through a CLI AI assistant (kiro-cli, claude-code, etc.) + - ide Evaluation through an IDE AI assistant (cursor, cline, kiro) + - batch Batch evaluation across multiple models + - compare Generate cross-model comparison report + - ext-test Test extension hooks with different opt-in configurations + - ext-report Regenerate extension test comparison report + - git-compare Compare multiple git refs across scenarios with repeated runs + - git-compare-report Regenerate git comparison reports from existing runs + - trend Generate trend report across AIDLC rules releases + - test Run unit tests for all packages Usage: # Full pipeline evaluation @@ -37,6 +39,12 @@ # Regenerate extension comparison report python run.py ext-report --runs-dir runs/sci-calc/extension-test + # Compare git refs (branches, tags, commits) + python run.py git-compare --refs main,feat/my-feature --scenarios sci-calc --runs-per-ref 3 + + # Regenerate git comparison reports from existing runs + python run.py git-compare-report --runs-dir runs/sci-calc/git-compare + # Generate trend report across releases python run.py trend --baseline test_cases/sci-calc/golden.yaml @@ -47,6 +55,7 @@ python run.py full --help python run.py cli --help python run.py ext-test --help + python run.py git-compare --help """ from __future__ import annotations @@ -59,6 +68,52 @@ REPO_ROOT = Path(__file__).resolve().parent SCRIPTS_DIR = REPO_ROOT / "scripts" +# Modes that require Docker sandbox +DOCKER_DEPENDENT_MODES = {"full", "cli", "ide", "batch", "git-compare", "ext-test"} + + +def _container_cli() -> str | None: + """Return the first available container CLI: docker or podman.""" + import shutil + for cli in ("docker", "podman"): + if shutil.which(cli): + return cli + return None + + +def check_docker_sandbox() -> bool: + """Check if a container runtime is available and the sandbox image exists. + + Supports both Docker and Podman (checks in that order). 
+ + Returns: + True if a container runtime and sandbox image are available, False otherwise + """ + cli = _container_cli() + if cli is None: + return False + try: + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run( # nosec B603 + [cli, "info"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=5, + ) + if result.returncode != 0: + return False + + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run( # nosec B603 + [cli, "images", "-q", "aidlc-sandbox:latest"], + capture_output=True, + text=True, + timeout=5, + ) + return bool(result.stdout.strip()) + except (subprocess.TimeoutExpired, FileNotFoundError): + return False + def main() -> None: parser = argparse.ArgumentParser( @@ -124,6 +179,20 @@ def main() -> None: add_help=False, ) + # Git compare mode + subparsers.add_parser( + "git-compare", + help="Compare multiple git refs across scenarios with repeated runs", + add_help=False, + ) + + # Git compare report regeneration mode + subparsers.add_parser( + "git-compare-report", + help="Regenerate git comparison reports from existing runs", + add_help=False, + ) + # Trend report mode subparsers.add_parser( "trend", @@ -154,6 +223,8 @@ def main() -> None: "compare": SCRIPTS_DIR / "run_comparison_report.py", "ext-test": SCRIPTS_DIR / "run_extension_test.py", "ext-report": SCRIPTS_DIR / "regenerate_extension_report.py", + "git-compare": SCRIPTS_DIR / "run_git_compare.py", + "git-compare-report": SCRIPTS_DIR / "regenerate_git_compare_report.py", "trend": SCRIPTS_DIR / "run_trend_report.py", "test": SCRIPTS_DIR / "run_evaluation.py", # test mode is in run_evaluation.py } @@ -174,6 +245,29 @@ def main() -> None: # Forward all remaining arguments cmd.extend(remaining) + # Check container sandbox availability for modes that need it. + # Skip when --no-sandbox is explicitly passed (sandbox disabled by user). 
+ sandbox_disabled = "--no-sandbox" in remaining + if args.mode in DOCKER_DEPENDENT_MODES and not sandbox_disabled: + if not check_docker_sandbox(): + print("=" * 70, file=sys.stderr) + print("ERROR: Docker sandbox image not found", file=sys.stderr) + print("=" * 70, file=sys.stderr) + print(file=sys.stderr) + print("The evaluation framework requires the Docker sandbox image", file=sys.stderr) + print("'aidlc-sandbox:latest' to run generated code safely.", file=sys.stderr) + print(file=sys.stderr) + print("To build the image, run:", file=sys.stderr) + print(" ./docker/sandbox/build.sh", file=sys.stderr) + print(file=sys.stderr) + print("Or manually:", file=sys.stderr) + print(" docker build -t aidlc-sandbox:latest docker/sandbox/", file=sys.stderr) + print(file=sys.stderr) + print("To run without Docker (not recommended for untrusted code),", file=sys.stderr) + print("set 'execution.sandbox.enabled: false' in config/default.yaml", file=sys.stderr) + print("=" * 70, file=sys.stderr) + sys.exit(1) + # Execute the script try: # nosec B603 - Executing trusted framework scripts from scripts/ directory diff --git a/scripts/aidlc-evaluator/scripts/generate_html_report.py b/scripts/aidlc-evaluator/scripts/generate_html_report.py new file mode 100644 index 00000000..39b5b2c9 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/generate_html_report.py @@ -0,0 +1,931 @@ +#!/usr/bin/env python3 +"""Generate interactive HTML report with charts for git-compare results.""" + +import json +from pathlib import Path + + +def generate_interactive_html_report( + scenarios: list[str], + version_names: list[str], + all_results: list[dict], + generated_at: str, + runs_dir: Path, +) -> str: + """Generate an interactive HTML report with charts and navigation. + + Args: + scenarios: List of scenario names + version_names: List of version names in order + all_results: List of run result dicts with version_name, scenario, output_dir, etc. + generated_at: ISO timestamp of report generation + runs_dir: Path to runs directory for loading metrics + + Returns: + HTML string + """ + from run_git_compare import ( + load_run_metrics, + get_metric_value, + METRIC_ROWS, + ) + + # Collect metrics per version per scenario + scenario_data = {} + for scenario_name in scenarios: + scenario_results = [r for r in all_results if r["scenario"] == scenario_name] + if not scenario_results: + continue + + # Group by version + version_metrics = {vn: [] for vn in version_names} + for result in scenario_results: + vn = result["version_name"] + folder = Path(result["output_dir"]) + if folder.is_dir(): + metrics = load_run_metrics(folder) + if metrics: + version_metrics[vn].append(metrics) + + scenario_data[scenario_name] = version_metrics + + # Compute aggregated metrics for charts + chart_data = _prepare_chart_data(version_names, scenario_data, scenarios) + + # Generate HTML + html = f""" + + + + + Git Version Comparison Report + + + + +
+

🚀 Git Version Comparison Report

+
+
Generated: {generated_at}
+
Versions: {', '.join(version_names)}
+
Scenarios: {', '.join(scenarios)}
+
+
+ +
+
+ + + + + + +
+ +
+ {_generate_overview_section(version_names, chart_data, scenarios)} +
+ +
+ {_generate_performance_section(version_names, chart_data)} +
+ +
+ {_generate_quality_section(version_names, chart_data)} +
+ +
+ {_generate_tests_section(version_names, chart_data)} +
+ +
+ {_generate_artifacts_section(version_names, chart_data)} +
+ +
+ {_generate_raw_data_section(version_names, scenario_data, scenarios, all_results)} +
+
+ + + +""" + + return html + + +def _get_t_critical(n: int, confidence: float = 0.95) -> float: + """Get t-critical value for confidence interval. + + Uses t-distribution for small samples, z for large samples. + For 95% CI (two-tailed). + """ + if n < 2: + return 1.0 + + # t-critical values for 95% CI (two-tailed, α=0.05) + t_table = { + 2: 12.706, + 3: 4.303, + 4: 3.182, + 5: 2.776, + 6: 2.571, + 7: 2.447, + 8: 2.365, + 9: 2.306, + 10: 2.262, + 15: 2.145, + 20: 2.086, + 30: 2.045, + } + + # Use lookup table or approximate for large n + if n in t_table: + return t_table[n] + elif n > 30: + return 1.96 # z-value for 95% CI with large samples + else: + # Interpolate or use closest value + return 2.0 + + +def _prepare_chart_data(version_names: list[str], scenario_data: dict, scenarios: list[str]) -> dict: + """Prepare chart data structure for all metrics.""" + from run_git_compare import get_metric_value, METRIC_ROWS, _mean, _stdev + import math + + chart_data = {} + + # Key metrics to chart + chart_metrics = [ + ("tests_pass_pct", "Unit Test Pass %", True), + ("contract_passed", "Contract Tests Passed", True), + ("qualitative_score", "Qualitative Score", True), + ("wall_clock_min", "Execution Time (min)", False), + ("total_tokens", "Total Tokens", False), + ("lint_total", "Lint Findings", False), + ("security_total", "Security Findings", False), + ("lines_of_code", "Lines of Code", True), + ] + + for metric_key, metric_name, higher_is_better in chart_metrics: + chart_data[metric_key] = { + "name": metric_name, + "higher_is_better": higher_is_better, + "versions": version_names, + "scenarios": scenarios, + "values": [], # One entry per version + } + + for vn in version_names: + version_data = [] + for scenario in scenarios: + if scenario not in scenario_data: + version_data.append({"avg": None, "std": None}) + continue + + mlist = scenario_data[scenario].get(vn, []) + vals = [v for v in (get_metric_value(m, metric_key) for m in mlist) if v is not None] + + if not vals: + version_data.append({"avg": None, "ci": None, "n": 0}) + elif len(vals) == 1: + version_data.append({"avg": vals[0], "ci": None, "n": 1}) + else: + n = len(vals) + avg = _mean(vals) + std = _stdev(vals) + # Calculate 95% confidence interval: t * (std / sqrt(n)) + t_crit = _get_t_critical(n) + sem = std / math.sqrt(n) + ci_half_width = t_crit * sem + version_data.append({"avg": avg, "ci": ci_half_width, "n": n}) + + chart_data[metric_key]["values"].append(version_data) + + return chart_data + + +def _generate_overview_section(version_names: list[str], chart_data: dict, scenarios: list[str]) -> str: + """Generate overview section HTML.""" + from run_git_compare import get_metric_value, _mean, _stdev + + # Calculate key metrics + baseline = version_names[0] if version_names else None + + html = '
' + html += '

📊 Overview

' + html += '
' + + # Show key metrics for each version + for idx, vn in enumerate(version_names): + qualitative = chart_data.get("qualitative_score", {}) + if qualitative and qualitative["values"]: + scores = [v["avg"] for v in qualitative["values"][idx] if v["avg"] is not None] + avg_score = sum(scores) / len(scores) if scores else 0 + + delta_html = "" + if idx > 0 and baseline: + baseline_scores = [v["avg"] for v in qualitative["values"][0] if v["avg"] is not None] + baseline_avg = sum(baseline_scores) / len(baseline_scores) if baseline_scores else 0 + delta = avg_score - baseline_avg + if abs(delta) > 0.001: + delta_class = "better" if delta > 0 else "worse" + delta_html = f'
{delta:+.3f} vs {baseline}
' + + html += f''' +
+

{vn}

+
{avg_score:.3f}
+
Qualitative Score
+ {delta_html} +
+ ''' + + html += '
' + + # Add summary table + html += '

Key Metrics Summary

' + html += '' + for vn in version_names: + html += f'' + html += '' + + # Key metrics to show + summary_metrics = [ + ("qualitative_score", "Qualitative Score", 3), + ("tests_pass_pct", "Unit Test Pass %", 1), + ("contract_passed", "Contract Tests Passed", 0), + ("wall_clock_min", "Execution Time (min)", 1), + ("total_tokens", "Total Tokens", 0), + ("lines_of_code", "Lines of Code", 0), + ] + + for metric_key, metric_name, decimals in summary_metrics: + html += f'' + metric_data = chart_data.get(metric_key, {}) + if metric_data and metric_data.get("values"): + for idx in range(len(version_names)): + # Get the first scenario's data for this version (usually only one scenario) + version_data = metric_data["values"][idx] + if version_data and len(version_data) > 0: + point = version_data[0] # First scenario + if point["avg"] is None: + html += '' + elif point["ci"] is None or point["ci"] == 0: + html += f'' + else: + lower = point["avg"] - point["ci"] + upper = point["avg"] + point["ci"] + html += f'' + else: + html += '' + else: + for _ in version_names: + html += '' + html += '' + + html += '
Metric{vn}
{metric_name}{point["avg"]:.{decimals}f}{point["avg"]:.{decimals}f}
(95% CI: {lower:.{decimals}f}-{upper:.{decimals}f})
' + + # Key metrics charts + html += '
' + html += '
' + + html += '
' + return html + + +def _generate_performance_section(version_names: list[str], chart_data: dict) -> str: + """Generate performance section HTML.""" + html = '
' + html += '

⚡ Performance Metrics

' + html += '
' + html += '
' + html += '
' + html += '
' + return html + + +def _generate_quality_section(version_names: list[str], chart_data: dict) -> str: + """Generate code quality section HTML.""" + html = '
' + html += '

🔍 Code Quality

' + html += '
' + html += '
' + html += '
' + html += '
' + return html + + +def _generate_tests_section(version_names: list[str], chart_data: dict) -> str: + """Generate testing section HTML.""" + html = '
' + html += '

✅ Testing Metrics

' + html += '
' + html += '
' + html += '
' + return html + + +def _generate_artifacts_section(version_names: list[str], chart_data: dict) -> str: + """Generate artifacts section HTML.""" + html = '
' + html += '

📦 Generated Artifacts

' + html += '
' + html += '
' + html += '
' + return html + + +def _generate_raw_data_section(version_names: list[str], scenario_data: dict, scenarios: list[str], all_results: list[dict]) -> str: + """Generate raw data section HTML.""" + from run_git_compare import get_metric_value, _mean, _stdev + + html = '
' + html += '

📋 Raw Data

' + + for scenario in scenarios: + html += f'

{scenario}

' + html += '' + + for vn in version_names: + html += f'' + + html += '' + + # Key metrics + metrics_to_show = [ + ("tests_pass_pct", "Unit Test Pass %", 1), + ("tests_passed", "Unit Tests Passed", 0), + ("contract_passed", "Contract Tests Passed", 0), + ("qualitative_score", "Qualitative Score", 3), + ("wall_clock_min", "Execution Time (min)", 1), + ("total_tokens", "Total Tokens", 0), + ("lines_of_code", "Lines of Code", 0), + ] + + for metric_key, metric_name, decimals in metrics_to_show: + html += f'' + + for vn in version_names: + if scenario not in scenario_data: + html += '' + continue + + mlist = scenario_data[scenario].get(vn, []) + vals = [v for v in (get_metric_value(m, metric_key) for m in mlist) if v is not None] + + if not vals: + html += '' + elif len(vals) == 1: + html += f'' + else: + avg = _mean(vals) + std = _stdev(vals) + html += f'' + + html += '' + + html += '
Metric{vn}
{metric_name}{vals[0]:.{decimals}f}{avg:.{decimals}f} ± {std:.{decimals}f}
' + + # Run status table + html += '

Run Status

' + html += '' + + for result in sorted(all_results, key=lambda x: (x["version_name"], x["scenario"], x["run_index"])): + status_class = "status-pass" if result["status"] == "success" else "status-fail" + duration = result.get("elapsed_seconds", 0) / 60 + html += f''' + + + + + + + + + ''' + + html += '
VersionScenarioRunStatusDurationOutput
{result["version_name"]}{result["scenario"]}{result["run_index"]}{result["status"].upper()}{duration:.1f} min{result["output_dir"]}
' + html += '
' + return html + + +def _generate_chart_init_calls() -> str: + """Generate JavaScript calls to initialize all charts.""" + return """ + createLineChart('chart-overview-quality', 'Qualitative Score', 'qualitative_score', true); + createLineChart('chart-overview-performance', 'Execution Time (min)', 'wall_clock_min', false); + createLineChart('chart-perf-time', 'Execution Time (min)', 'wall_clock_min', false); + createLineChart('chart-perf-tokens', 'Total Tokens', 'total_tokens', false); + createLineChart('chart-quality-lint', 'Lint Findings', 'lint_total', false); + createLineChart('chart-quality-security', 'Security Findings', 'security_total', false); + createLineChart('chart-quality-qualitative', 'Qualitative Score', 'qualitative_score', true); + createLineChart('chart-tests-unit', 'Unit Test Pass %', 'tests_pass_pct', true); + createLineChart('chart-tests-contract', 'Contract Tests Passed', 'contract_passed', true); + createLineChart('chart-artifacts-loc', 'Lines of Code', 'lines_of_code', true); + """ diff --git a/scripts/aidlc-evaluator/scripts/regenerate_git_compare_report.py b/scripts/aidlc-evaluator/scripts/regenerate_git_compare_report.py new file mode 100644 index 00000000..32fa44e9 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/regenerate_git_compare_report.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""Regenerate git comparison reports from completed runs. + +Scans a git-compare runs directory for its git-compare-summary.yaml, groups +run folders by (version, scenario), and regenerates all per-scenario detail +reports and the rollup report without re-running any evaluations. + +Usage: + python run.py git-compare-report --runs-dir runs/sci-calc/git-compare + python run.py git-compare-report --runs-dir runs/git-compare +""" + +from __future__ import annotations + +import argparse +import sys +from datetime import UTC, datetime +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent + +# Add scripts dir so we can import shared report logic from run_git_compare +sys.path.insert(0, str(REPO_ROOT / "scripts")) +# Add packages needed by run_git_compare imports +sys.path.insert(0, str(REPO_ROOT / "packages" / "shared" / "src")) +sys.path.insert(0, str(REPO_ROOT / "packages" / "reporting" / "src")) + +from run_git_compare import write_reports # noqa: E402 + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="regenerate_git_compare_report", + description="Regenerate git comparison reports from completed runs", + ) + parser.add_argument( + "--runs-dir", type=Path, required=True, + help="Git compare runs directory containing git-compare-summary.yaml", + ) + args = parser.parse_args() + + summary_path = args.runs_dir / "git-compare-summary.yaml" + if not summary_path.exists(): + print(f"Error: {summary_path} not found", file=sys.stderr) + print( + "Make sure --runs-dir points to the git-compare output directory " + "that contains git-compare-summary.yaml.", + file=sys.stderr, + ) + sys.exit(1) + + with open(summary_path, encoding="utf-8") as f: + summary = yaml.safe_load(f) or {} + + version_names: list[str] = summary.get("version_names", []) + scenarios: list[str] = summary.get("scenarios", []) + all_results: list[dict] = summary.get("runs", []) + + if not version_names or not scenarios or not all_results: + print( + "Error: git-compare-summary.yaml is missing version_names, scenarios, or runs.", + file=sys.stderr, + ) + sys.exit(1) + + print( + f"Loaded summary: {len(all_results)} run(s) across " + f"{len(version_names)} 
version(s) and {len(scenarios)} scenario(s)" + ) + print(f" Versions: {', '.join(version_names)}") + print(f" Scenarios: {', '.join(scenarios)}") + + generated_at = datetime.now(UTC).isoformat(timespec="seconds") + + write_reports( + runs_dir=args.runs_dir, + scenarios=scenarios, + version_names=version_names, + all_results=all_results, + generated_at=generated_at, + ) + + print(f"\nReports regenerated in: {args.runs_dir / 'comparison'}") + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py b/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py index c5129f51..150372f5 100644 --- a/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py +++ b/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py @@ -41,7 +41,7 @@ # Add cli-harness to path sys.path.insert(0, str(PACKAGES / "cli-harness" / "src")) -from cli_harness.registry import get_adapter, list_adapters # noqa: E402 +from cli_harness.registry import get_adapter, list_adapters, load_adapters_from_config # noqa: E402 from cli_harness.orchestrator import run_cli_evaluation # noqa: E402 _SLUG_MAX_LEN = 80 @@ -176,6 +176,7 @@ def main() -> None: parser.add_argument("--profile", default=None, help="AWS profile (default: from config YAML)") parser.add_argument("--region", default=None, help="AWS region (default: from config YAML)") parser.add_argument("--scorer-model", default=None, help="Bedrock model for scoring (default: from config YAML)") + parser.add_argument("--simulator-model", default=None, help="Bedrock model for human simulator (default: from config YAML models.simulator.model_id)") parser.add_argument("--model", default=None, help="Model to use with the CLI adapter (e.g., claude-sonnet-4)") parser.add_argument( "--verbose", "-v", action="store_true", @@ -220,6 +221,9 @@ def main() -> None: with open(args.config, encoding="utf-8") as f: cfg_data = yaml.safe_load(f) or {} + # Register any custom adapters declared in config before resolving --cli + load_adapters_from_config(cfg_data) + if args.profile is None: args.profile = cfg_data.get("aws", {}).get("profile") if args.region is None: @@ -232,6 +236,10 @@ def main() -> None: parser.error( "--scorer-model is required (or set models.scorer.model_id in config YAML)" ) + if args.simulator_model is None: + args.simulator_model = ( + cfg_data.get("models", {}).get("simulator", {}).get("model_id") + ) # ── Resolve AIDLC rules config ──────────────────────────────────────── aidlc_cfg = cfg_data.get("aidlc", {}) @@ -281,6 +289,7 @@ def main() -> None: profile=args.profile, region=args.region, scorer_model=args.scorer_model, + simulator_model=args.simulator_model, model=args.model, rules_source=rules_source, rules_ref=rules_ref, diff --git a/scripts/aidlc-evaluator/scripts/run_evaluation.py b/scripts/aidlc-evaluator/scripts/run_evaluation.py index d57a08ae..03329707 100644 --- a/scripts/aidlc-evaluator/scripts/run_evaluation.py +++ b/scripts/aidlc-evaluator/scripts/run_evaluation.py @@ -38,10 +38,12 @@ import argparse import os +import re import subprocess import sys from datetime import datetime, timezone from pathlib import Path +from urllib.parse import urlparse import yaml @@ -198,6 +200,7 @@ def _rel(p: Path | None) -> str | None: "scorer_model": args.scorer_model, "executor_model": args.executor_model, "rules_ref": args.rules_ref, + "rules_repo": args.rules_repo, "output_dir": _rel(args.output_dir), "sandbox": args.sandbox, "report_format": args.report_format, @@ -225,59 +228,6 @@ def _rel(p: Path | None) -> str | None: 
atomic_yaml_dump(meta, meta_path) -_SENTINEL_NAME = ".last_run_folder" - - -def _read_run_sentinel(output_dir: Path) -> Path | None: - """Read the sentinel file written by create_run_folder(). - - Returns the run folder path if the sentinel exists and the directory - is valid, otherwise None. The sentinel is removed after reading so - it does not confuse subsequent runs. - """ - sentinel = output_dir / _SENTINEL_NAME - if not sentinel.is_file(): - return None - try: - run_folder = Path(sentinel.read_text(encoding="utf-8").strip()) - sentinel.unlink(missing_ok=True) - if run_folder.is_dir(): - return run_folder - except OSError: - pass - return None - - -def _list_run_folders(output_dir: Path | None = None) -> set[Path]: - """Return the current set of run folders under runs/. - - Args: - output_dir: Directory to search for run folders. Defaults to REPO_ROOT / "runs". - """ - runs_dir = output_dir if output_dir else REPO_ROOT / "runs" - if not runs_dir.is_dir(): - return set() - return {d for d in runs_dir.iterdir() if d.is_dir() and not d.name.startswith(".")} - - -def _find_new_run(before: set[Path], output_dir: Path | None = None) -> Path | None: - """Find the single new run folder created since *before* was captured. - - Falls back to the newest folder if multiple appeared (shouldn't happen - in normal single-run usage). - - Args: - before: Set of run folders that existed before execution. - output_dir: Directory to search for new run folders. Defaults to REPO_ROOT / "runs". - - .. deprecated:: - Prefer :func:`_read_run_sentinel` which avoids the TOCTOU race - condition inherent in before/after directory listing. - """ - after = _list_run_folders(output_dir) - new = sorted(after - before, reverse=True) - return new[0] if new else None - def _find_latest_run(scenario_name: str | None = None) -> Path | None: """Find the most recent timestamped run folder under runs/. @@ -304,17 +254,59 @@ def _find_latest_run(scenario_name: str | None = None) -> Path | None: # ── stages ─────────────────────────────────────────────────────────────────── -def stage_execute(args: argparse.Namespace) -> Path | None: +_SLUG_MAX_LEN = 80 + + +def _rules_slug(cfg_data: dict, args: argparse.Namespace) -> str: + """Derive a filesystem-safe slug matching runner.py's _rules_slug().""" + aidlc = cfg_data.get("aidlc", {}) + rules_source = aidlc.get("rules_source", "git") + rules_local_path = aidlc.get("rules_local_path") + rules_repo = args.rules_repo or aidlc.get("rules_repo", "") + rules_ref = args.rules_ref or aidlc.get("rules_ref", "main") + + if rules_source == "local" and rules_local_path: + raw = f"local_{Path(rules_local_path).name}" + else: + path = urlparse(rules_repo).path.rstrip("/") + repo_name = Path(path).stem + raw = f"{repo_name}_{rules_ref}" + + slug = raw.replace(" ", "-") + slug = re.sub(r"[^a-zA-Z0-9._-]", "", slug) + return slug[:_SLUG_MAX_LEN] + + +def stage_execute(args: argparse.Namespace, cfg_data: dict) -> Path | None: """Stage 1: Run the AIDLC workflow via packages/execution. + The run folder is pre-allocated here with the same timestamp+slug format + used by runner.py, then passed as the exact --output-dir. This makes the + folder deterministic and eliminates all post-hoc discovery, which is + required for safe parallel execution. + Returns the run folder even if the runner exits non-zero, as long as aidlc-docs were produced (the swarm may fail on a late handoff after all documents are already written). """ + # Pre-allocate the run folder with the same naming convention as runner.py. 
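+    # e.g. runs/sci-calc/20260224T214917-aidlc-workflows_main (illustrative path)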
+ # Passing a timestamped path as --output-dir triggers runner.py's Mode 1 + # (use the path directly rather than creating a new timestamped subfolder). + parent_dir = args.output_dir + if not parent_dir and hasattr(args, "_scenario_name"): + parent_dir = REPO_ROOT / "runs" / args._scenario_name + parent_dir = parent_dir or (REPO_ROOT / "runs") + parent_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + slug = _rules_slug(cfg_data, args) + run_folder = parent_dir / f"{timestamp}-{slug}" + cmd = [ sys.executable, "-m", "aidlc_runner", "--vision", str(args.vision), "--config", str(args.config), + "--output-dir", str(run_folder), ] if args.tech_env: cmd += ["--tech-env", str(args.tech_env)] @@ -326,12 +318,10 @@ def stage_execute(args: argparse.Namespace) -> Path | None: cmd += ["--executor-model", args.executor_model] if args.rules_ref: cmd += ["--rules-ref", args.rules_ref] - # Route output under runs// by default - output_dir = args.output_dir - if not output_dir and hasattr(args, "_scenario_name"): - output_dir = REPO_ROOT / "runs" / args._scenario_name - if output_dir: - cmd += ["--output-dir", str(output_dir)] + if args.rules_repo: + cmd += ["--rules-repo", args.rules_repo] + if args.openapi and args.openapi.is_file(): + cmd += ["--openapi", str(args.openapi)] env_pythonpath = os.pathsep.join([ str(PACKAGES / "execution" / "src"), @@ -339,22 +329,9 @@ def stage_execute(args: argparse.Namespace) -> Path | None: ]) env = {**os.environ, "PYTHONPATH": env_pythonpath} - # Determine the output directory so we can read the sentinel file after. - effective_output_dir = output_dir or (REPO_ROOT / "runs") - - # Snapshot for the legacy fallback (in case the runner doesn't write - # the sentinel, e.g. older runner versions). - existing_runs = _list_run_folders(output_dir) - result = _run_cmd(cmd, "Stage 1: AIDLC Workflow Execution", env=env) - # Prefer the sentinel file written by create_run_folder() — it avoids - # the TOCTOU race inherent in before/after directory listing. - run_folder = _read_run_sentinel(effective_output_dir) - if run_folder is None: - # Fall back to directory-diff for backwards compatibility. - run_folder = _find_new_run(existing_runs, output_dir) - if run_folder is None: + if not run_folder.is_dir(): return None docs_dir = run_folder / "aidlc-docs" @@ -652,6 +629,10 @@ def build_parser() -> argparse.ArgumentParser: "--rules-ref", default=None, help="Git ref (branch/tag/commit) for AIDLC rules (overrides config value)", ) + parser.add_argument( + "--rules-repo", default=None, + help="Git repository URL for AIDLC rules (overrides config aidlc.rules_repo)", + ) parser.add_argument( "--executor-model", default=None, help="Override executor model ID", @@ -789,7 +770,7 @@ def main() -> None: print(f" Sandbox: {'enabled' if args.sandbox else 'disabled'}") # Stage 1: Execute the AIDLC workflow - run_folder = stage_execute(args) + run_folder = stage_execute(args, cfg_data) if run_folder is None: print("\n[ABORT] Execution stage failed.", file=sys.stderr) sys.exit(1) diff --git a/scripts/aidlc-evaluator/scripts/run_git_compare.py b/scripts/aidlc-evaluator/scripts/run_git_compare.py new file mode 100644 index 00000000..74b7a09f --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/run_git_compare.py @@ -0,0 +1,1398 @@ +#!/usr/bin/env python3 +"""Git Version Comparison Runner — compare multiple versions of AIDLC rules. 
+ +Runs the AIDLC evaluation pipeline against multiple versions, where each +version specifies a git ref and optionally its own repository URL (GitHub, +GitLab, any git host), executor model, and base config. Supports repeated +runs per version for non-determinism analysis. + +Generates per-scenario detail reports (raw numbers per run) and a rollup +report with avg +/- std dev aggregated across repeated runs. + +Usage: + # Simple ref comparison (all refs share the repo URL from config) + python run.py git-compare \\ + --refs main,feat/my-feature \\ + --scenarios sci-calc \\ + --runs-per-ref 3 + + # Per-version sources via a versions file (different repos, models, etc.) + python run.py git-compare \\ + --versions-file versions.yaml \\ + --scenarios sci-calc,all-stages \\ + --runs-per-ref 2 + + # Incremental mode: add new versions to existing comparison + python run.py git-compare \\ + --versions-file versions-expanded.yaml \\ + --scenarios sci-calc \\ + --runs-per-ref 2 \\ + --runs-dir runs/sci-calc/git-compare \\ + --incremental + + # Parallel execution: run up to 3 evaluations concurrently + python run.py git-compare \\ + --versions-file versions.yaml \\ + --scenarios sci-calc \\ + --runs-per-ref 3 \\ + --max-parallel 3 + + # Regenerate reports from existing runs + python run.py git-compare-report \\ + --runs-dir runs/sci-calc/git-compare + +Versions file format (versions.yaml): + versions: + - name: main-github + ref: main + repo: https://github.com/awslabs/aidlc-workflows.git + + - name: my-feature-gitlab + ref: feat/new-rules + repo: https://gitlab.com/myorg/aidlc-fork.git + executor_model: global.anthropic.claude-sonnet-4-6-v1 # optional + config: config/sonnet-4-6.yaml # optional + +Incremental mode: + In incremental mode (--incremental), the script: + 1. Loads existing git-compare-summary.yaml from --runs-dir + 2. Identifies which versions have already been tested + 3. Runs evaluations ONLY for new versions not in the existing summary + 4. Merges new results with existing data + 5. Regenerates all reports with the complete dataset + + Example workflow: + # Week 1: Test 2 versions + python run.py git-compare --versions-file v1-2.yaml --scenarios sci-calc --runs-per-ref 3 + + # Week 2: Add 3rd version (only runs 1 new version, ~30 min vs ~90 min) + python run.py git-compare --versions-file v1-3.yaml --scenarios sci-calc --runs-per-ref 3 \\ + --runs-dir runs/sci-calc/git-compare --incremental + + Use --force-rerun to re-run versions that already exist in the summary. 
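+
+Output layout (see write_reports() and main() below):
+    <runs-dir>/git-compare-summary.yaml             # top-level summary of all runs
+    <runs-dir>/comparison/<scenario>-report.md      # per-scenario detail (plus .yaml)
+    <runs-dir>/comparison/rollup-report.md          # avg +/- std dev rollup
+    <runs-dir>/comparison/rollup-data.yaml
+    <runs-dir>/comparison/interactive-report.html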
+""" + +from __future__ import annotations + +import argparse +import math +import os +import shutil +import subprocess +import sys +import threading +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field +from datetime import UTC, datetime +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent +CONFIG_DIR = REPO_ROOT / "config" +DEFAULT_CONFIG = CONFIG_DIR / "default.yaml" +TEST_CASES_DIR = REPO_ROOT / "test_cases" +SCRIPTS_DIR = REPO_ROOT / "scripts" + +# Add shared and reporting packages to path +sys.path.insert(0, str(REPO_ROOT / "packages" / "shared" / "src")) +sys.path.insert(0, str(REPO_ROOT / "packages" / "reporting" / "src")) + +from shared.scenario import resolve_scenario, Scenario # noqa: E402 +from reporting.baseline import BaselineMetrics, extract_baseline # noqa: E402 +from reporting.collector import collect # noqa: E402 + + +# ── Version spec ─────────────────────────────────────────────────────────────── + + +@dataclass +class Version: + """A single version to compare — a named (repo, ref) pair with optional overrides.""" + + name: str + """Display label used in report column headers and run folder names.""" + + ref: str + """Git ref: branch name, tag, or commit SHA.""" + + repo: str | None = None + """Git repository URL. None means use the value from the base config YAML.""" + + executor_model: str | None = None + """Per-version executor model override. None means use the global default.""" + + config: Path | None = None + """Per-version base config YAML. None means use the global --config value.""" + + +def parse_versions_file(path: Path) -> list[Version]: + """Load a versions YAML file and return a list of Version objects. + + Expected format:: + + versions: + - name: main-github + ref: main + repo: https://github.com/awslabs/aidlc-workflows.git + - name: my-feature + ref: feat/my-feature + repo: https://gitlab.com/myorg/fork.git + executor_model: global.anthropic.claude-sonnet-4-6-v1 + config: config/sonnet-4-6.yaml # resolved relative to versions file + + ``repo``, ``executor_model``, and ``config`` are all optional. + """ + with open(path, encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + + raw = data.get("versions", []) + if not raw: + raise ValueError(f"versions file {path} contains no 'versions' list") + + versions: list[Version] = [] + for i, v in enumerate(raw): + name = v.get("name", "").strip() + if not name: + raise ValueError(f"version entry {i + 1} in {path} is missing 'name'") + ref = v.get("ref", "").strip() + if not ref: + raise ValueError(f"version '{name}' in {path} is missing 'ref'") + + cfg_override: Path | None = None + if v.get("config"): + cfg_path = Path(v["config"]) + if not cfg_path.is_absolute(): + cfg_path = path.parent / cfg_path + cfg_override = cfg_path + + versions.append(Version( + name=name, + ref=ref, + repo=v.get("repo") or None, + executor_model=v.get("executor_model") or None, + config=cfg_override, + )) + + return versions + + +def versions_from_refs(refs: list[str]) -> list[Version]: + """Build a list of Versions from a plain list of git refs. + + The version name is derived from the ref by replacing '/' with '_' and + truncating to 40 characters (same slug logic used for folder names). + The repo field is left None so each run inherits the repo URL from config. 
+ """ + return [Version(name=ref_to_slug(ref), ref=ref) for ref in refs] + + +# ── Metrics and formatting ───────────────────────────────────────────────────── + + +# Metric rows used in all reports: (display_name, attr_name, higher_is_better) +# attr_name="" marks a section-header row (no data cell). +# "wall_clock_min" is a computed alias for wall_clock_ms / 60000. +METRIC_ROWS: list[tuple[str, str, bool]] = [ + ("**Unit Tests**", "", True), + ("Pass %", "tests_pass_pct", True), + ("Passed", "tests_passed", True), + ("Failed", "tests_failed", False), + ("Total", "tests_total", True), + ("Coverage %", "coverage_pct", True), + ("**Contract Tests**", "", True), + ("Passed", "contract_passed", True), + ("Failed", "contract_failed", False), + ("Total", "contract_total", True), + ("**Code Quality**", "", True), + ("Lint Errors", "lint_errors", False), + ("Lint Warnings", "lint_warnings", False), + ("Lint Total", "lint_total", False), + ("Security Findings", "security_total", False), + ("Security High", "security_high", False), + ("Duplication Blocks", "duplication_blocks", False), + ("**Qualitative**", "", True), + ("Overall Score", "qualitative_score", True), + ("Inception Score", "inception_score", True), + ("Construction Score", "construction_score", True), + ("**Artifacts**", "", True), + ("Source Files", "source_files", True), + ("Test Files", "test_files", True), + ("Total Files", "total_files", True), + ("Lines of Code", "lines_of_code", True), + ("Doc Files", "doc_files", True), + ("**Execution**", "", True), + ("Total Tokens", "total_tokens", False), + ("Executor Tokens", "executor_total_tokens", False), + ("Simulator Tokens", "simulator_total_tokens", False), + ("Wall Clock (min)", "wall_clock_min", False), + ("Handoffs", "handoffs", False), + ("**Context Size**", "", True), + ("Max Tokens", "context_size_max", False), + ("Avg Tokens", "context_size_avg", False), + ("Median Tokens", "context_size_median", False), +] + + +def ref_to_slug(ref: str, max_len: int = 40) -> str: + """Convert a git ref or version name to a filesystem-safe slug. + + Replaces '/' with '_' and truncates to max_len characters. 
+ """ + return ref.replace("/", "_")[:max_len] + + +def get_metric_value(metrics: BaselineMetrics, attr: str) -> float | None: + """Extract a metric value from BaselineMetrics, handling the wall_clock_min alias.""" + if attr == "wall_clock_min": + return metrics.wall_clock_ms / 60000 if metrics.wall_clock_ms else None + return getattr(metrics, attr, None) + + +def format_num(val: float | int | None, decimals: int = 1) -> str: + """Format a number for display, returning em-dash for None.""" + if val is None: + return "\u2014" + if isinstance(val, float): + return f"{val:.{decimals}f}" + return str(val) + + +def _mean(values: list[float]) -> float: + return sum(values) / len(values) + + +def _stdev(values: list[float]) -> float: + if len(values) < 2: + return 0.0 + m = _mean(values) + return math.sqrt(sum((v - m) ** 2 for v in values) / (len(values) - 1)) + + +def load_run_metrics(run_folder: Path) -> BaselineMetrics | None: + """Load evaluation metrics from a run folder.""" + try: + data = collect(run_folder) + return extract_baseline(data) + except Exception as e: + print(f" [WARN] Failed to collect metrics from {run_folder}: {e}", file=sys.stderr) + return None + + +# ── Execution ────────────────────────────────────────────────────────────────── + + +def run_single_evaluation( + version: Version, + scenario: Scenario, + run_index: int, + runs_per_ref: int, + runs_dir: Path, + base_config: Path, + profile: str, + region: str, + scorer_model: str, + default_executor_model: str | None, + use_sandbox: bool, +) -> dict: + """Run a single evaluation for one (version, scenario, run_index) combination. + + The effective config, executor model, and rules repo/ref are resolved + by layering version-level overrides on top of the global defaults. + + Returns a summary dict describing the run result. 
+ """ + effective_config = version.config or base_config + effective_executor = version.executor_model or default_executor_model + folder_slug = ref_to_slug(version.name) + + # Generate folder name upfront - orchestrator controls the output location + timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%S-%f") + folder_name = f"{timestamp}-{folder_slug}" + run_folder = runs_dir / folder_name + + _safe_print(f"\n{'=' * 70}") + _safe_print(f" Version: {version.name}") + _safe_print(f" Ref: {version.ref}") + if version.repo: + _safe_print(f" Repo: {version.repo}") + _safe_print(f" Scenario: {scenario.name}") + _safe_print(f" Run: {run_index}/{runs_per_ref}") + _safe_print(f"{'=' * 70}\n") + + cmd = [ + sys.executable, str(SCRIPTS_DIR / "run_evaluation.py"), + "--config", str(effective_config), + "--vision", str(scenario.vision_path), + "--golden", str(scenario.golden_aidlc_docs_path), + "--profile", profile, + "--region", region, + "--scorer-model", scorer_model, + "--rules-ref", version.ref, + "--report-format", "both", + "--output-dir", str(run_folder), # Pass full folder path, not parent dir + ] + + if version.repo: + cmd += ["--rules-repo", version.repo] + if scenario.tech_env_path.is_file(): + cmd += ["--tech-env", str(scenario.tech_env_path)] + if scenario.openapi_path.is_file(): + cmd += ["--openapi", str(scenario.openapi_path)] + if scenario.golden_baseline_path.is_file(): + cmd += ["--baseline", str(scenario.golden_baseline_path)] + if effective_executor: + cmd += ["--executor-model", effective_executor] + cmd.append("--sandbox" if use_sandbox else "--no-sandbox") + + # Create log directory + log_dir = runs_dir / ".git-compare-logs" + log_dir.mkdir(parents=True, exist_ok=True) + log_path = log_dir / f"{timestamp}-{folder_slug}-{scenario.name}-run{run_index}.log" + + start_monotonic = time.monotonic() # Track elapsed time + started_at = datetime.now(UTC).isoformat(timespec="seconds") + runs_dir.mkdir(parents=True, exist_ok=True) + + with open(log_path, "w", encoding="utf-8") as log_file: + # Write header to identify this run in the log + log_file.write(f"=== Git-Compare Run Log ===\n") + log_file.write(f"Version: {version.name}\n") + log_file.write(f"Ref: {version.ref}\n") + log_file.write(f"Repo: {version.repo or '(from config)'}\n") + log_file.write(f"Scenario: {scenario.name}\n") + log_file.write(f"Run: {run_index}/{runs_per_ref}\n") + log_file.write(f"Started: {started_at}\n") + log_file.write(f"{'=' * 70}\n\n") + log_file.flush() + + result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT) # nosec B603 + + elapsed_s = time.monotonic() - start_monotonic + status = "success" if result.returncode == 0 else "failed" + _safe_print( + f" [{status.upper()}] version={version.name}, scenario={scenario.name}, " + f"run={run_index} \u2014 {elapsed_s / 60:.1f} min (exit {result.returncode})" + ) + + # We told run_evaluation.py exactly where to write, so use that folder + if run_folder.is_dir(): + output_dir = run_folder + _safe_print(f" Output: {output_dir.name}") + + # Move log file with descriptive name + final_log_name = f"git-compare-{folder_slug}-{scenario.name}-run{run_index}.log" + final_log_path = output_dir / final_log_name + shutil.move(str(log_path), str(final_log_path)) + + # Write metadata to identify this run + meta = { + "git_compare_version_name": version.name, + "git_compare_ref": version.ref, + "git_compare_repo": version.repo, + "git_compare_scenario": scenario.name, + "git_compare_run_index": run_index, + "git_compare_runs_per_version": runs_per_ref, + } + 
with open(output_dir / "git-compare-meta.yaml", "w", encoding="utf-8") as f: + yaml.safe_dump(meta, f, default_flow_style=False, sort_keys=False) + else: + _safe_print(f" [WARN] Run folder not created: {run_folder}") + output_dir = run_folder # Use the expected path even if it doesn't exist + output_dir.mkdir(parents=True, exist_ok=True) + shutil.move(str(log_path), str(output_dir / "git-compare-run.log")) + + # Clean temp log dir if empty + if log_dir.exists() and not any(log_dir.iterdir()): + log_dir.rmdir() + + return { + "version_name": version.name, + "ref": version.ref, + "repo": version.repo, + "scenario": scenario.name, + "run_index": run_index, + "started_at": started_at, + "elapsed_seconds": round(elapsed_s, 1), + "exit_code": result.returncode, + "status": status, + "output_dir": str(output_dir), + } + + +# ── Report generation ────────────────────────────────────────────────────────── + + +def _run_label(res: dict) -> str: + """Column label for an individual run in the detail report.""" + return f"{res['version_name']} run-{res['run_index']}" + + +def generate_scenario_detail_report( + scenario_name: str, + version_names: list[str], + run_results: list[dict], + generated_at: str, +) -> str: + """Generate a per-scenario detail report with one column per individual run. + + Columns are ordered by version (preserving the order in version_names) + then by run index. Each cell contains the raw numeric value for that run. + """ + lines: list[str] = [ + f"# Git Version Comparison \u2014 {scenario_name}", + "", + f"**Scenario:** {scenario_name}", + f"**Generated:** {generated_at}", + "", + "## Run Detail (Raw Numbers)", + "", + "Each column is one individual run. " + "Runs are grouped by version (in the order specified) then sorted by run index.", + "", + ] + + version_order = {n: i for i, n in enumerate(version_names)} + sorted_results = sorted( + run_results, + key=lambda r: (version_order.get(r["version_name"], 999), r["run_index"]), + ) + + col_labels: list[str] = [] + col_metrics: list[BaselineMetrics | None] = [] + for res in sorted_results: + col_labels.append(_run_label(res)) + folder = Path(res["output_dir"]) + col_metrics.append(load_run_metrics(folder) if folder.is_dir() else None) + + header = "| Metric |" + separator = "|--------|" + for label in col_labels: + header += f" {label} |" + separator += "---------|" + lines.append(header) + lines.append(separator) + + for display_name, attr, _ in METRIC_ROWS: + if not attr: + row = f"| {display_name} |" + for _ in col_labels: + row += " |" + lines.append(row) + continue + + row = f"| {display_name} |" + for metrics in col_metrics: + if metrics is None: + row += " \u2014 |" + else: + val = get_metric_value(metrics, attr) + row += f" {format_num(val)} |" + lines.append(row) + + lines.append("") + + # Run status table + lines.extend([ + "", + "## Run Status", + "", + "| Version | Ref | Repo | Run | Status | Duration (min) | Output |", + "|---------|-----|------|-----|--------|----------------|--------|", + ]) + for res in sorted_results: + marker = "PASS" if res["status"] == "success" else "FAIL" + duration = res.get("elapsed_seconds", 0) / 60 + repo_display = res.get("repo") or "*(from config)*" + lines.append( + f"| {res['version_name']} | {res['ref']} | {repo_display} " + f"| {res['run_index']} | {marker} | {duration:.1f} " + f"| `{res['output_dir']}` |" + ) + lines.append("") + + return "\n".join(lines) + + +def _build_rollup_section( + scenario_name: str, + version_names: list[str], + run_results: list[dict], +) -> list[str]: 
+ """Build markdown lines for one scenario's rollup table (avg +/- std dev).""" + lines: list[str] = [ + f"## Scenario: {scenario_name}", + "", + ] + + # Group loaded metrics by version name + version_metrics: dict[str, list[BaselineMetrics]] = {n: [] for n in version_names} + for res in run_results: + vn = res["version_name"] + folder = Path(res["output_dir"]) + if folder.is_dir(): + m = load_run_metrics(folder) + if m is not None: + version_metrics.setdefault(vn, []).append(m) + + # Build column descriptors: (header_label, version_name, metrics_list) + columns: list[tuple[str, str, list[BaselineMetrics]]] = [] + for vn in version_names: + mlist = version_metrics.get(vn, []) + columns.append((f"{vn} (n={len(mlist)})", vn, mlist)) + + if not any(mlist for _, _, mlist in columns): + lines.append("_No metrics available for this scenario._") + return lines + + baseline_name = version_names[0] if version_names else None + + header = "| Metric |" + separator = "|--------|" + for label, _, _ in columns: + header += f" {label} |" + separator += "---------|" + lines.append(header) + lines.append(separator) + + for display_name, attr, higher_is_better in METRIC_ROWS: + if not attr: + row = f"| {display_name} |" + for _ in columns: + row += " |" + lines.append(row) + continue + + # Compute per-version (avg, stdev) + version_stats: list[tuple[float | None, float | None]] = [] + for _, vn, mlist in columns: + vals = [v for v in (get_metric_value(m, attr) for m in mlist) if v is not None] + if not vals: + version_stats.append((None, None)) + elif len(vals) == 1: + version_stats.append((vals[0], None)) + else: + version_stats.append((_mean(vals), _stdev(vals))) + + baseline_avg = version_stats[0][0] if version_stats else None + + row = f"| {display_name} |" + for i, (_label, _vn, _mlist) in enumerate(columns): + avg, std = version_stats[i] + if avg is None: + row += " \u2014 |" + continue + + cell = format_num(avg) + if std is not None and std > 0: + cell += f" \u00b1 {format_num(std)}" + + # Delta indicator vs baseline version (skip for the baseline column itself) + if i > 0 and baseline_avg is not None: + delta = avg - baseline_avg + if abs(delta) > 0.001: + cell += (" ^" if delta > 0 else " v") if higher_is_better \ + else (" v" if delta > 0 else " ^") + + row += f" {cell} |" + lines.append(row) + + lines.append("") + lines.append( + f"**Legend:** ^ = better than `{baseline_name}` (baseline version), " + f"v = worse. \u00b1 = sample std dev across repeated runs." + ) + + return lines + + +def generate_rollup_report( + scenarios: list[str], + version_names: list[str], + all_results: list[dict], + generated_at: str, +) -> str: + """Generate the multi-scenario rollup report (avg +/- std dev per version). + + One section per scenario; delta indicators vs the first version listed. 
+ """ + lines: list[str] = [ + "# Git Version Comparison \u2014 Rollup Report", + "", + f"**Generated:** {generated_at}", + f"**Versions:** {', '.join(version_names)}", + f"**Scenarios:** {', '.join(scenarios)}", + "", + "> Values shown as `avg \u00b1 std_dev` when multiple runs were performed.", + "> ^ = better than baseline version (first version listed), v = worse.", + "", + ] + + for scenario_name in scenarios: + scenario_results = [r for r in all_results if r["scenario"] == scenario_name] + lines.extend(_build_rollup_section(scenario_name, version_names, scenario_results)) + lines.append("") + + return "\n".join(lines) + + +def write_reports( + runs_dir: Path, + scenarios: list[str], + version_names: list[str], + all_results: list[dict], + generated_at: str, +) -> None: + """Write all per-scenario detail reports and the rollup report to disk. + + Outputs are written to /comparison/: + - -report.md / -report.yaml (one per scenario) + - rollup-report.md / rollup-data.yaml + """ + comparison_dir = runs_dir / "comparison" + comparison_dir.mkdir(parents=True, exist_ok=True) + + for scenario_name in scenarios: + scenario_results = [r for r in all_results if r["scenario"] == scenario_name] + if not scenario_results: + continue + + print(f" Writing detail report: {scenario_name}...") + md = generate_scenario_detail_report( + scenario_name=scenario_name, + version_names=version_names, + run_results=scenario_results, + generated_at=generated_at, + ) + md_path = comparison_dir / f"{scenario_name}-report.md" + md_path.write_text(md, encoding="utf-8") + print(f" {md_path}") + + yaml_data: dict = { + "generated_at": generated_at, + "scenario": scenario_name, + "version_names": version_names, + "runs": [ + { + "version_name": r["version_name"], + "ref": r["ref"], + "repo": r.get("repo"), + "run_index": r["run_index"], + "status": r["status"], + "elapsed_seconds": r.get("elapsed_seconds"), + "output_dir": r["output_dir"], + } + for r in sorted(scenario_results, key=lambda x: (x["version_name"], x["run_index"])) + ], + } + yaml_path = comparison_dir / f"{scenario_name}-report.yaml" + with open(yaml_path, "w", encoding="utf-8") as f: + yaml.safe_dump(yaml_data, f, default_flow_style=False, sort_keys=False) + print(f" {yaml_path}") + + print(" Writing rollup report...") + rollup_md = generate_rollup_report( + scenarios=scenarios, + version_names=version_names, + all_results=all_results, + generated_at=generated_at, + ) + rollup_md_path = comparison_dir / "rollup-report.md" + rollup_md_path.write_text(rollup_md, encoding="utf-8") + print(f" {rollup_md_path}") + + rollup_yaml: dict = { + "generated_at": generated_at, + "version_names": version_names, + "scenarios": scenarios, + "runs": all_results, + } + rollup_yaml_path = comparison_dir / "rollup-data.yaml" + with open(rollup_yaml_path, "w", encoding="utf-8") as f: + yaml.safe_dump(rollup_yaml, f, default_flow_style=False, sort_keys=False) + print(f" {rollup_yaml_path}") + + print(" Writing interactive HTML report...") + from generate_html_report import generate_interactive_html_report + html_report = generate_interactive_html_report( + scenarios=scenarios, + version_names=version_names, + all_results=all_results, + generated_at=generated_at, + runs_dir=runs_dir, + ) + html_path = comparison_dir / "interactive-report.html" + html_path.write_text(html_report, encoding="utf-8") + print(f" {html_path}") + + +# ── Parallel execution ──────────────────────────────────────────────────────── + + +# Global lock for thread-safe printing +_print_lock = 
threading.Lock() + + +def _safe_print(*args, **kwargs): + """Thread-safe print for parallel execution.""" + with _print_lock: + print(*args, **kwargs) + + +@dataclass +class WorkItem: + """A single evaluation work item for parallel execution.""" + version: Version + scenario: "Scenario" + run_index: int + runs_per_ref: int + runs_dir: Path + base_config: Path + profile: str + region: str + scorer_model: str + default_executor_model: str | None + use_sandbox: bool + + +def execute_work_item(item: WorkItem) -> dict: + """Execute a single evaluation work item (thread-safe wrapper). + + This is called by ThreadPoolExecutor and wraps run_single_evaluation + with thread-safe output handling. + """ + return run_single_evaluation( + version=item.version, + scenario=item.scenario, + run_index=item.run_index, + runs_per_ref=item.runs_per_ref, + runs_dir=item.runs_dir, + base_config=item.base_config, + profile=item.profile, + region=item.region, + scorer_model=item.scorer_model, + default_executor_model=item.default_executor_model, + use_sandbox=item.use_sandbox, + ) + + +def run_parallel_evaluations( + work_items: list[WorkItem], + max_workers: int, +) -> list[dict]: + """Run evaluations in parallel with progress tracking. + + Args: + work_items: List of work items to execute + max_workers: Maximum number of concurrent workers + + Returns: + List of result dicts in original submission order + """ + all_results = [] + total = len(work_items) + completed = 0 + + _safe_print(f"\n{'=' * 70}") + _safe_print(f" Parallel Execution: {total} runs, max {max_workers} concurrent") + _safe_print(f"{'=' * 70}\n") + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all work with tracking + future_to_item = { + executor.submit(execute_work_item, item): (i, item) + for i, item in enumerate(work_items) + } + + # Collect results as they complete + for future in as_completed(future_to_item): + idx, item = future_to_item[future] + completed += 1 + + try: + result = future.result() + all_results.append((idx, result)) + + status = "✓" if result.get("status") == "success" else "✗" + duration = result.get("elapsed_seconds", 0) / 60 + _safe_print( + f" [{completed:2d}/{total}] {status} {item.version.name:30s} " + f"{item.scenario.name:15s} run-{item.run_index} ({duration:.1f} min)" + ) + + except Exception as e: + _safe_print( + f" [{completed:2d}/{total}] ✗ {item.version.name:30s} " + f"{item.scenario.name:15s} run-{item.run_index} ERROR: {e}" + ) + # Create error result + error_result = { + "version_name": item.version.name, + "ref": item.version.ref, + "repo": item.version.repo, + "scenario": item.scenario.name, + "run_index": item.run_index, + "status": "error", + "error": str(e), + "output_dir": str( + item.runs_dir + / f"failed-{ref_to_slug(item.version.name)}-{item.scenario.name}-run{item.run_index}" + ), + } + all_results.append((idx, error_result)) + + _safe_print(f"\n{'=' * 70}") + _safe_print(f" Parallel execution complete: {completed}/{total} finished") + _safe_print(f"{'=' * 70}\n") + + # Sort by original submission order + all_results.sort(key=lambda x: x[0]) + return [result for _, result in all_results] + + +# ── Incremental mode helpers ────────────────────────────────────────────────── + + +def load_existing_summary(runs_dir: Path) -> dict | None: + """Load existing git-compare-summary.yaml if it exists. + + Returns: + Summary dict with keys: version_names, scenarios, runs_per_version, runs + Returns None if summary doesn't exist. 
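+        Entries in 'runs' are the per-run summary dicts written by earlier
+        git-compare runs (the format produced by run_single_evaluation()).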
+ + Raises: + ValueError: If summary exists but is malformed. + """ + summary_path = runs_dir / "git-compare-summary.yaml" + if not summary_path.exists(): + return None + + with open(summary_path, encoding="utf-8") as f: + summary = yaml.safe_load(f) or {} + + required_keys = ["version_names", "scenarios", "runs_per_version", "runs"] + missing = [k for k in required_keys if k not in summary] + if missing: + raise ValueError( + f"Existing summary at {summary_path} is missing required keys: {missing}" + ) + + return summary + + +def filter_new_versions( + versions: list[Version], + existing_version_names: set[str], + force_rerun: bool, +) -> tuple[list[Version], list[Version]]: + """Separate versions into new vs. already-tested. + + Args: + versions: All versions from versions file + existing_version_names: Version names from existing summary + force_rerun: If True, treat all versions as new + + Returns: + (new_versions, skipped_versions) + """ + if force_rerun: + return versions, [] + + new_versions = [v for v in versions if v.name not in existing_version_names] + skipped_versions = [v for v in versions if v.name in existing_version_names] + + return new_versions, skipped_versions + + +def merge_summaries( + existing_summary: dict, + new_results: list[dict], + new_versions: list[Version], + new_elapsed_seconds: float, +) -> dict: + """Merge new run results into existing summary. + + Args: + existing_summary: Loaded from git-compare-summary.yaml + new_results: Run results from newly executed versions + new_versions: Version objects for newly tested versions + new_elapsed_seconds: Elapsed time for new runs + + Returns: + Updated summary dict with merged data + """ + new_version_names = [v.name for v in new_versions] + + # Merge version names (preserve order: existing + new) + all_version_names = existing_summary["version_names"] + new_version_names + + # Merge version specs + existing_version_specs = existing_summary.get("versions", []) + new_version_specs = [ + { + "name": v.name, + "ref": v.ref, + "repo": v.repo, + "executor_model": v.executor_model, + } + for v in new_versions + ] + all_version_specs = existing_version_specs + new_version_specs + + # Merge run results + all_runs = existing_summary["runs"] + new_results + + # Update counts + runs_succeeded = sum(1 for r in all_runs if r.get("status") == "success") + runs_failed = sum(1 for r in all_runs if r.get("status") != "success") + + # Track incremental runs + incremental_runs = existing_summary.get("incremental_runs", []) + incremental_runs.append({ + "added_at": datetime.now(UTC).isoformat(timespec="seconds"), + "versions_added": new_version_names, + "runs_added": len(new_results), + "elapsed_seconds": round(new_elapsed_seconds, 1), + }) + + return { + "started_at": existing_summary["started_at"], # Keep original start time + "generated_at": datetime.now(UTC).isoformat(timespec="seconds"), # Update + "total_elapsed_seconds": existing_summary["total_elapsed_seconds"], # Original only + "incremental_runs": incremental_runs, # Track all incremental additions + "version_names": all_version_names, + "versions": all_version_specs, + "scenarios": existing_summary["scenarios"], + "runs_per_version": existing_summary["runs_per_version"], + "total_runs": len(all_runs), + "runs_succeeded": runs_succeeded, + "runs_failed": runs_failed, + "runs": all_runs, + } + + +# ── CLI ──────────────────────────────────────────────────────────────────────── + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="run_git_compare", + 
description=(
+            "Compare multiple versions of AIDLC rules across scenarios and repeated runs. "
+            "Each version can target a different git repository (GitHub, GitLab, etc.), "
+            "ref, executor model, and base config."
+        ),
+    )
+
+    # Version specification — mutually exclusive
+    version_group = parser.add_mutually_exclusive_group(required=True)
+    version_group.add_argument(
+        "--refs", type=str, default=None,
+        help=(
+            "Comma-separated git refs to compare. "
+            "All refs share the repository URL from the base config YAML. "
+            "Use --versions-file when different repos or per-version settings are needed."
+        ),
+    )
+    version_group.add_argument(
+        "--versions-file", type=Path, default=None,
+        help=(
+            "Path to a YAML file defining named versions with per-version repo URL, "
+            "ref, executor model, and config overrides. "
+            "Mutually exclusive with --refs."
+        ),
+    )
+
+    parser.add_argument(
+        "--scenarios", type=str, default="sci-calc",
+        help="Comma-separated scenario names (default: sci-calc)",
+    )
+    parser.add_argument(
+        "--runs-per-ref", type=int, default=1,
+        help="Number of evaluation runs per (version, scenario) pair (default: 1)",
+    )
+
+    # Global config (can be overridden per-version via versions file)
+    parser.add_argument(
+        "--config", type=Path, default=DEFAULT_CONFIG,
+        help="Base config YAML (default: config/default.yaml)",
+    )
+    parser.add_argument("--profile", default=None, help="AWS profile")
+    parser.add_argument("--region", default=None, help="AWS region")
+    parser.add_argument(
+        "--executor-model", default=None,
+        help="Default executor model ID (can be overridden per-version in versions file)",
+    )
+    parser.add_argument("--scorer-model", default=None, help="Override scorer model ID")
+
+    # Output
+    parser.add_argument(
+        "--runs-dir", type=Path, default=None,
+        help=(
+            "Base directory for all run outputs. "
+            "Defaults to runs/<scenario>/git-compare/ for a single scenario "
+            "or runs/git-compare/ when multiple scenarios are specified."
+        ),
+    )
+
+    # Sandbox
+    sandbox_group = parser.add_mutually_exclusive_group()
+    sandbox_group.add_argument(
+        "--sandbox", action="store_true", default=True,
+        help="Run generated code in Docker sandbox (default)",
+    )
+    sandbox_group.add_argument(
+        "--no-sandbox", action="store_false", dest="sandbox",
+        help="Run generated code directly on host (no isolation)",
+    )
+
+    # Incremental mode
+    parser.add_argument(
+        "--incremental", action="store_true", default=False,
+        help=(
+            "Incremental mode: only run evaluations for versions not present in "
+            "existing git-compare-summary.yaml, then merge results and regenerate "
+            "reports. Requires --runs-dir to point to an existing git-compare output."
+        ),
+    )
+    parser.add_argument(
+        "--force-rerun", action="store_true", default=False,
+        help=(
+            "With --incremental, re-run evaluations for versions that already exist "
+            "in the summary (default: skip existing versions)."
+        ),
+    )
+
+    # Parallel execution
+    parser.add_argument(
+        "--max-parallel", type=int, default=1,
+        help=(
+            "Maximum number of evaluations to run in parallel (default: 1). "
+            "Recommended: 2-4 depending on system resources. Each parallel run "
+            "consumes ~2GB RAM and spawns a Docker container in sandbox mode. "
+            "Higher values may hit Bedrock API rate limits."
+ ), + ) + + args = parser.parse_args() + + # Build version list + versions: list[Version] + if args.versions_file: + if not args.versions_file.exists(): + parser.error(f"versions file not found: {args.versions_file}") + try: + versions = parse_versions_file(args.versions_file) + except (ValueError, yaml.YAMLError) as e: + parser.error(str(e)) + else: + refs = [r.strip() for r in args.refs.split(",") if r.strip()] + if not refs: + parser.error("--refs must specify at least one git ref") + versions = versions_from_refs(refs) + + if not versions: + parser.error("No versions to compare") + + # Parse scenarios + scenario_names = [s.strip() for s in args.scenarios.split(",") if s.strip()] + if not scenario_names: + parser.error("--scenarios must specify at least one scenario name") + + resolved_scenarios: list[Scenario] = [] + for name in scenario_names: + try: + resolved_scenarios.append(resolve_scenario(name, TEST_CASES_DIR)) + except FileNotFoundError as e: + parser.error(str(e)) + + # Default runs_dir + if args.runs_dir is None: + if len(resolved_scenarios) == 1: + args.runs_dir = REPO_ROOT / "runs" / resolved_scenarios[0].name / "git-compare" + else: + args.runs_dir = REPO_ROOT / "runs" / "git-compare" + + # Load base config for credential/model defaults + base_cfg: dict = {} + if args.config and args.config.exists(): + with open(args.config, encoding="utf-8") as f: + base_cfg = yaml.safe_load(f) or {} + + if args.profile is None: + args.profile = base_cfg.get("aws", {}).get("profile") + # Allow None profile to use default credentials (e.g., EC2 instance role) + # Just ensure it's explicitly set to something (even if None) + + if args.region is None: + args.region = base_cfg.get("aws", {}).get("region") + if args.region is None: + parser.error("--region is required (or set aws.region in config YAML)") + + if args.scorer_model is None: + args.scorer_model = base_cfg.get("models", {}).get("scorer", {}).get("model_id") + if args.scorer_model is None: + parser.error("--scorer-model is required (or set models.scorer.model_id in config YAML)") + + # Validate parallel execution settings + if args.max_parallel < 1: + parser.error("--max-parallel must be >= 1") + + if args.max_parallel > 8: + print( + f"WARNING: --max-parallel {args.max_parallel} is quite high. " + f"Each parallel run consumes ~2GB RAM and may hit Bedrock rate limits.", + file=sys.stderr + ) + + # Suggest optimal settings based on system resources + cpu_count = os.cpu_count() or 1 + if args.max_parallel > cpu_count: + print( + f"INFO: --max-parallel {args.max_parallel} exceeds CPU count ({cpu_count}). 
" + f"Consider using --max-parallel {min(cpu_count, 4)} for optimal performance.", + file=sys.stderr + ) + + # Handle incremental mode + existing_summary = None + skipped_versions: list[Version] = [] + all_versions = versions # Keep reference to all versions for final version_names + + if args.incremental: + if not args.runs_dir: + parser.error("--incremental requires --runs-dir to be specified") + if not args.runs_dir.exists(): + parser.error(f"--runs-dir does not exist: {args.runs_dir}") + + try: + existing_summary = load_existing_summary(args.runs_dir) + except ValueError as e: + parser.error(str(e)) + + if existing_summary is None: + parser.error( + f"--incremental requires existing git-compare-summary.yaml in {args.runs_dir}" + ) + + # Validate consistency + existing_scenarios = existing_summary["scenarios"] + if set(scenario_names) != set(existing_scenarios): + parser.error( + f"Scenarios mismatch: new={scenario_names}, existing={existing_scenarios}" + ) + if args.runs_per_ref != existing_summary["runs_per_version"]: + parser.error( + f"--runs-per-ref mismatch: new={args.runs_per_ref}, " + f"existing={existing_summary['runs_per_version']}" + ) + + # Filter versions + existing_version_names = set(existing_summary["version_names"]) + new_versions, skipped_versions = filter_new_versions( + versions, existing_version_names, args.force_rerun + ) + + if skipped_versions: + print("Git Version Comparison (Incremental Mode)") + print(f" Skipping {len(skipped_versions)} already-tested versions:") + for v in skipped_versions: + print(f" - {v.name}") + print() + + if not new_versions: + print("No new versions to test. Regenerating reports from existing data...\n") + write_reports( + runs_dir=args.runs_dir, + scenarios=scenario_names, + version_names=existing_summary["version_names"], + all_results=existing_summary["runs"], + generated_at=datetime.now(UTC).isoformat(timespec="seconds"), + ) + print(f"\n Results: {args.runs_dir}") + sys.exit(0) + + versions = new_versions # Only run new versions + + version_names = [v.name for v in versions] + total_runs = len(versions) * len(resolved_scenarios) * args.runs_per_ref + + mode_str = "Git Version Comparison (Incremental Mode)" if args.incremental else "Git Version Comparison" + print(mode_str) + if args.incremental and existing_summary: + print(f" Existing vers: {len(existing_summary['version_names'])} ({', '.join(existing_summary['version_names'])})") + print(f" New versions: {len(version_names)} ({', '.join(version_names)})") + else: + print(f" Versions: {', '.join(version_names)}") + print(f" Scenarios: {', '.join(s.name for s in resolved_scenarios)}") + print(f" Runs per ver: {args.runs_per_ref}") + print(f" Total runs: {total_runs}") + print(f" Max parallel: {args.max_parallel}") + print(f" Profile: {args.profile}") + print(f" Region: {args.region}") + print(f" Scorer: {args.scorer_model}") + print(f" Output: {args.runs_dir}") + for v in versions: + repo_display = v.repo or "*(from config)*" + model_display = v.executor_model or args.executor_model or "*(from config)*" + print(f" [{v.name}] ref={v.ref} repo={repo_display} model={model_display}") + + overall_start = time.monotonic() + overall_started_at = datetime.now(UTC).isoformat(timespec="seconds") + + # Choose execution mode based on --max-parallel + if args.max_parallel == 1: + # Sequential execution (original behavior) + all_results: list[dict] = [] + for version in versions: + for scenario in resolved_scenarios: + for run_idx in range(1, args.runs_per_ref + 1): + try: + summary = 
run_single_evaluation( + version=version, + scenario=scenario, + run_index=run_idx, + runs_per_ref=args.runs_per_ref, + runs_dir=args.runs_dir, + base_config=args.config, + profile=args.profile, + region=args.region, + scorer_model=args.scorer_model, + default_executor_model=args.executor_model, + use_sandbox=args.sandbox, + ) + all_results.append(summary) + except Exception as e: + print( + f"\n[ERROR] Failed version={version.name}, " + f"scenario={scenario.name}, run={run_idx}: {e}", + file=sys.stderr, + ) + all_results.append({ + "version_name": version.name, + "ref": version.ref, + "repo": version.repo, + "scenario": scenario.name, + "run_index": run_idx, + "status": "error", + "error": str(e), + "output_dir": str( + args.runs_dir + / f"failed-{ref_to_slug(version.name)}-{scenario.name}-run{run_idx}" + ), + }) + else: + # Parallel execution + work_items = [] + for version in versions: + for scenario in resolved_scenarios: + for run_idx in range(1, args.runs_per_ref + 1): + work_items.append(WorkItem( + version=version, + scenario=scenario, + run_index=run_idx, + runs_per_ref=args.runs_per_ref, + runs_dir=args.runs_dir, + base_config=args.config, + profile=args.profile, + region=args.region, + scorer_model=args.scorer_model, + default_executor_model=args.executor_model, + use_sandbox=args.sandbox, + )) + + all_results = run_parallel_evaluations(work_items, args.max_parallel) + + overall_elapsed = time.monotonic() - overall_start + generated_at = datetime.now(UTC).isoformat(timespec="seconds") + + # Write top-level summary + args.runs_dir.mkdir(parents=True, exist_ok=True) + + # Merge results if incremental mode + report_version_names = version_names + report_all_results = all_results + if existing_summary: + print("\nMerging results with existing runs...") + summary_data = merge_summaries( + existing_summary=existing_summary, + new_results=all_results, + new_versions=versions, + new_elapsed_seconds=overall_elapsed, + ) + report_version_names = summary_data["version_names"] + report_all_results = summary_data["runs"] + else: + version_specs = [ + { + "name": v.name, + "ref": v.ref, + "repo": v.repo, + "executor_model": v.executor_model, + } + for v in versions + ] + summary_data = { + "started_at": overall_started_at, + "generated_at": generated_at, + "total_elapsed_seconds": round(overall_elapsed, 1), + "version_names": version_names, + "versions": version_specs, + "scenarios": [s.name for s in resolved_scenarios], + "runs_per_version": args.runs_per_ref, + "total_runs": total_runs, + "runs_succeeded": sum(1 for r in all_results if r.get("status") == "success"), + "runs_failed": sum(1 for r in all_results if r.get("status") != "success"), + "runs": all_results, + } + + summary_path = args.runs_dir / "git-compare-summary.yaml" + with open(summary_path, "w", encoding="utf-8") as f: + yaml.safe_dump(summary_data, f, default_flow_style=False, sort_keys=False) + print(f"\n Summary: {summary_path}") + + # Generate all reports (with merged data if incremental) + print("\nGenerating reports...") + write_reports( + runs_dir=args.runs_dir, + scenarios=[s.name for s in resolved_scenarios], + version_names=report_version_names, + all_results=report_all_results, + generated_at=generated_at, + ) + + # Final summary + print(f"\n{'=' * 70}") + print(" Git Compare Complete") + print(f"{'=' * 70}") + if existing_summary: + print(f" New runs time: {overall_elapsed / 60:.1f} min") + print(f" New runs: {len(all_results)}") + print(f" Total versions: {len(report_version_names)} 
({len(existing_summary['version_names'])} existing + {len(version_names)} new)") + print(f" Total runs: {len(report_all_results)} ({len(existing_summary['runs'])} existing + {len(all_results)} new)") + print(f" Succeeded: {sum(1 for r in report_all_results if r.get('status') == 'success')}") + print(f" Failed: {sum(1 for r in report_all_results if r.get('status') != 'success')}") + else: + print(f" Total time: {overall_elapsed / 60:.1f} min") + print(f" Total runs: {len(all_results)}") + print(f" Succeeded: {sum(1 for r in all_results if r.get('status') == 'success')}") + print(f" Failed: {sum(1 for r in all_results if r.get('status') != 'success')}") + + # Show run details (only new runs in incremental mode) + for r in all_results: + marker = "PASS" if r.get("status") == "success" else "FAIL" + duration = r.get("elapsed_seconds", 0) / 60 + print( + f" [{marker}] {r['version_name']:30s} {r['scenario']:15s} " + f"run-{r['run_index']} {duration:.1f} min" + ) + print(f"\n Results: {args.runs_dir}") + + # Exit status based on all results (including existing in incremental mode) + failed = sum(1 for r in report_all_results if r.get("status") != "success") + sys.exit(1 if failed > 0 else 0) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/uv.lock b/scripts/aidlc-evaluator/uv.lock index 0dfeca26..fe96c77e 100644 --- a/scripts/aidlc-evaluator/uv.lock +++ b/scripts/aidlc-evaluator/uv.lock @@ -22,6 +22,8 @@ name = "aidlc-cli-harness" version = "0.1.0" source = { editable = "packages/cli-harness" } dependencies = [ + { name = "anthropic", extra = ["bedrock"] }, + { name = "boto3" }, { name = "pyyaml" }, ] @@ -32,6 +34,8 @@ dev = [ [package.metadata] requires-dist = [ + { name = "anthropic", extras = ["bedrock"], specifier = ">=0.40" }, + { name = "boto3", specifier = ">=1.42.47" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" }, { name = "pyyaml", specifier = ">=6.0" }, ] @@ -57,6 +61,7 @@ name = "aidlc-evaluation-framework" version = "0.1.0" source = { virtual = "." 
} dependencies = [ + { name = "aidlc-cli-harness" }, { name = "aidlc-contracttest" }, { name = "aidlc-nonfunctional" }, { name = "aidlc-qualitative" }, @@ -80,6 +85,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "aidlc-cli-harness", editable = "packages/cli-harness" }, { name = "aidlc-contracttest", editable = "packages/contracttest" }, { name = "aidlc-nonfunctional", editable = "packages/nonfunctional" }, { name = "aidlc-qualitative", editable = "packages/qualitative" }, @@ -193,6 +199,8 @@ name = "aidlc-runner" version = "0.1.0" source = { editable = "packages/execution" } dependencies = [ + { name = "anthropic", extra = ["bedrock"] }, + { name = "boto3" }, { name = "pyyaml" }, { name = "strands-agents" }, { name = "strands-agents-tools" }, @@ -200,6 +208,8 @@ dependencies = [ [package.metadata] requires-dist = [ + { name = "anthropic", extras = ["bedrock"], specifier = ">=0.40" }, + { name = "boto3", specifier = ">=1.42.47" }, { name = "pyyaml", specifier = ">=6.0" }, { name = "strands-agents", specifier = ">=0.1.0" }, { name = "strands-agents-tools", specifier = ">=0.1.0" }, @@ -325,6 +335,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "anthropic" +version = "0.97.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "docstring-parser" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/14/93/f66ea8bfe39f2e6bb9da8e27fa5457ad2520e8f7612dfc547b17fad55c4d/anthropic-0.97.0.tar.gz", hash = "sha256:021e79fd8e21e90ad94dc5ba2bbbd8b1599f424f5b1fab6c06204009cab764be", size = 669502, upload-time = "2026-04-23T20:52:34.445Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/b6/8e851369fa661ad0fef2ae6266bf3b7d52b78ccf011720058f4adaca59e2/anthropic-0.97.0-py3-none-any.whl", hash = "sha256:8a1a472dfabcfc0c52ff6a3eecf724ac7e07107a2f6e2367be55ceb42f5d5613", size = 662126, upload-time = "2026-04-23T20:52:32.377Z" }, +] + +[package.optional-dependencies] +bedrock = [ + { name = "boto3" }, + { name = "botocore" }, +] + [[package]] name = "anyio" version = "4.12.1" @@ -631,6 +666,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = "2026-01-19T02:36:55.663Z" }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] + [[package]] name = 
"docstring-parser" version = "0.17.0" @@ -836,6 +880,60 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "jiter" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6e/c1/0cddc6eb17d4c53a99840953f95dd3accdc5cfc7a337b0e9b26476276be9/jiter-0.14.0.tar.gz", hash = "sha256:e8a39e66dac7153cf3f964a12aad515afa8d74938ec5cc0018adcdae5367c79e", size = 165725, upload-time = "2026-04-10T14:28:42.01Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/97/2a/09f70020898507a89279659a1afe3364d57fc1b2c89949081975d135f6f5/jiter-0.14.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:af72f204cf4d44258e5b4c1745130ac45ddab0e71a06333b01de660ab4187a94", size = 315502, upload-time = "2026-04-10T14:26:47.697Z" }, + { url = "https://files.pythonhosted.org/packages/d6/be/080c96a45cd74f9fce5db4fd68510b88087fb37ffe2541ff73c12db92535/jiter-0.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4b77da71f6e819be5fbcec11a453fde5b1d0267ef6ed487e2a392fd8e14e4e3a", size = 314870, upload-time = "2026-04-10T14:26:49.149Z" }, + { url = "https://files.pythonhosted.org/packages/7d/5e/2d0fee155826a968a832cc32438de5e2a193292c8721ca70d0b53e58245b/jiter-0.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f4ea612fe8b84b8b04e51d0e78029ecf3466348e25973f953de6e6a59aa4c1", size = 343406, upload-time = "2026-04-10T14:26:50.762Z" }, + { url = "https://files.pythonhosted.org/packages/70/af/bf9ee0d3a4f8dc0d679fc1337f874fe60cdbf841ebbb304b374e1c9aaceb/jiter-0.14.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:62fe2451f8fcc0240261e6a4df18ecbcd58327857e61e625b2393ea3b468aac9", size = 369415, upload-time = "2026-04-10T14:26:52.188Z" }, + { url = "https://files.pythonhosted.org/packages/0f/83/8e8561eadba31f4d3948a5b712fb0447ec71c3560b57a855449e7b8ddc98/jiter-0.14.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6112f26f5afc75bcb475787d29da3aa92f9d09c7858f632f4be6ffe607be82e9", size = 461456, upload-time = "2026-04-10T14:26:53.611Z" }, + { url = "https://files.pythonhosted.org/packages/f6/c9/c5299e826a5fe6108d172b344033f61c69b1bb979dd8d9ddd4278a160971/jiter-0.14.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:215a6cb8fb7dc702aa35d475cc00ddc7f970e5c0b1417fb4b4ac5d82fa2a29db", size = 378488, upload-time = "2026-04-10T14:26:55.211Z" }, + { url = "https://files.pythonhosted.org/packages/5d/37/c16d9d15c0a471b8644b1abe3c82668092a707d9bedcf076f24ff2e380cd/jiter-0.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4ab96a30fb3cb2c7e0cd33f7616c8860da5f5674438988a54ac717caccdbaa", size = 353242, upload-time = "2026-04-10T14:26:56.705Z" }, + { url = "https://files.pythonhosted.org/packages/58/ea/8050cb0dc654e728e1bfacbc0c640772f2181af5dedd13ae70145743a439/jiter-0.14.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:3a99c1387b1f2928f799a9de899193484d66206a50e98233b6b088a7f0c1edb2", size = 356823, upload-time = "2026-04-10T14:26:58.281Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3b/cf71506d270e5f84d97326bf220e47aed9b95e9a4a060758fb07772170ab/jiter-0.14.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:ab18d11074485438695f8d34a1b6da61db9754248f96d51341956607a8f39985", size = 392564, upload-time = "2026-04-10T14:27:00.018Z" }, + { url = "https://files.pythonhosted.org/packages/b0/cc/8c6c74a3efb5bd671bfd14f51e8a73375464ca914b1551bc3b40e26ac2c9/jiter-0.14.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:801028dcfc26ac0895e4964cbc0fd62c73be9fd4a7d7b1aaf6e5790033a719b7", size = 520322, upload-time = "2026-04-10T14:27:01.664Z" }, + { url = "https://files.pythonhosted.org/packages/41/24/68d7b883ec959884ddf00d019b2e0e82ba81b167e1253684fa90519ce33c/jiter-0.14.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ad425b087aafb4a1c7e1e98a279200743b9aaf30c3e0ba723aec93f061bd9bc8", size = 552619, upload-time = "2026-04-10T14:27:03.316Z" }, + { url = "https://files.pythonhosted.org/packages/b6/89/b1a0985223bbf3150ff9e8f46f98fc9360c1de94f48abe271bbe1b465682/jiter-0.14.0-cp313-cp313-win32.whl", hash = "sha256:882bcb9b334318e233950b8be366fe5f92c86b66a7e449e76975dfd6d776a01f", size = 205699, upload-time = "2026-04-10T14:27:04.662Z" }, + { url = "https://files.pythonhosted.org/packages/4c/19/3f339a5a7f14a11730e67f6be34f9d5105751d547b615ef593fa122a5ded/jiter-0.14.0-cp313-cp313-win_amd64.whl", hash = "sha256:9b8c571a5dba09b98bd3462b5a53f27209a5cbbe85670391692ede71974e979f", size = 201323, upload-time = "2026-04-10T14:27:06.139Z" }, + { url = "https://files.pythonhosted.org/packages/50/56/752dd89c84be0e022a8ea3720bcfa0a8431db79a962578544812ce061739/jiter-0.14.0-cp313-cp313-win_arm64.whl", hash = "sha256:34f19dcc35cb1abe7c369b3756babf8c7f04595c0807a848df8f26ef8298ef92", size = 191099, upload-time = "2026-04-10T14:27:07.564Z" }, + { url = "https://files.pythonhosted.org/packages/91/28/292916f354f25a1fe8cf2c918d1415c699a4a659ae00be0430e1c5d9ffea/jiter-0.14.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e89bcd7d426a75bb4952c696b267075790d854a07aad4c9894551a82c5b574ab", size = 320880, upload-time = "2026-04-10T14:27:09.326Z" }, + { url = "https://files.pythonhosted.org/packages/ad/c7/b002a7d8b8957ac3d469bd59c18ef4b1595a5216ae0de639a287b9816023/jiter-0.14.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b25beaa0d4447ea8c7ae0c18c688905d34840d7d0b937f2f7bdd52162c98a40", size = 346563, upload-time = "2026-04-10T14:27:11.287Z" }, + { url = "https://files.pythonhosted.org/packages/f9/3b/f8d07580d8706021d255a6356b8fab13ee4c869412995550ce6ed4ddf97d/jiter-0.14.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:651a8758dd413c51e3b7f6557cdc6921faf70b14106f45f969f091f5cda990ea", size = 357928, upload-time = "2026-04-10T14:27:12.729Z" }, + { url = "https://files.pythonhosted.org/packages/47/5b/ac1a974da29e35507230383110ffec59998b290a8732585d04e19a9eb5ba/jiter-0.14.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e1a7eead856a5038a8d291f1447176ab0b525c77a279a058121b5fccee257f6f", size = 203519, upload-time = "2026-04-10T14:27:14.125Z" }, + { url = "https://files.pythonhosted.org/packages/96/6d/9fc8433d667d2454271378a79747d8c76c10b51b482b454e6190e511f244/jiter-0.14.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e692633a12cda97e352fdcd1c4acc971b1c28707e1e33aeef782b0cbf051975", size = 190113, upload-time = "2026-04-10T14:27:16.638Z" }, + { url = "https://files.pythonhosted.org/packages/4f/1e/354ed92461b165bd581f9ef5150971a572c873ec3b68a916d5aa91da3cc2/jiter-0.14.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:6f396837fc7577871ca8c12edaf239ed9ccef3bbe39904ae9b8b63ce0a48b140", size = 315277, upload-time = "2026-04-10T14:27:18.109Z" }, + { url = 
"https://files.pythonhosted.org/packages/a6/95/8c7c7028aa8636ac21b7a55faef3e34215e6ed0cbf5ae58258427f621aa3/jiter-0.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a4d50ea3d8ba4176f79754333bd35f1bbcd28e91adc13eb9b7ca91bc52a6cef9", size = 315923, upload-time = "2026-04-10T14:27:19.603Z" }, + { url = "https://files.pythonhosted.org/packages/47/40/e2a852a44c4a089f2681a16611b7ce113224a80fd8504c46d78491b47220/jiter-0.14.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce17f8a050447d1b4153bda4fb7d26e6a9e74eb4f4a41913f30934c5075bf615", size = 344943, upload-time = "2026-04-10T14:27:21.262Z" }, + { url = "https://files.pythonhosted.org/packages/fc/1f/670f92adee1e9895eac41e8a4d623b6da68c4d46249d8b556b60b63f949e/jiter-0.14.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f4f1c4b125e1652aefbc2e2c1617b60a160ab789d180e3d423c41439e5f32850", size = 369725, upload-time = "2026-04-10T14:27:22.766Z" }, + { url = "https://files.pythonhosted.org/packages/01/2f/541c9ba567d05de1c4874a0f8f8c5e3fd78e2b874266623da9a775cf46e0/jiter-0.14.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be808176a6a3a14321d18c603f2d40741858a7c4fc982f83232842689fe86dd9", size = 461210, upload-time = "2026-04-10T14:27:24.315Z" }, + { url = "https://files.pythonhosted.org/packages/ce/a9/c31cbec09627e0d5de7aeaec7690dba03e090caa808fefd8133137cf45bc/jiter-0.14.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26679d58ba816f88c3849306dd58cb863a90a1cf352cdd4ef67e30ccf8a77994", size = 380002, upload-time = "2026-04-10T14:27:26.155Z" }, + { url = "https://files.pythonhosted.org/packages/50/02/3c05c1666c41904a2f607475a73e7a4763d1cbde2d18229c4f85b22dc253/jiter-0.14.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80381f5a19af8fa9aef743f080e34f6b25ebd89656475f8cf0470ec6157052aa", size = 354678, upload-time = "2026-04-10T14:27:27.701Z" }, + { url = "https://files.pythonhosted.org/packages/7d/97/e15b33545c2b13518f560d695f974b9891b311641bdcf178d63177e8801e/jiter-0.14.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:004df5fdb8ecbd6d99f3227df18ba1a259254c4359736a2e6f036c944e02d7c5", size = 358920, upload-time = "2026-04-10T14:27:29.256Z" }, + { url = "https://files.pythonhosted.org/packages/ad/d2/8b1461def6b96ba44530df20d07ef7a1c7da22f3f9bf1727e2d611077bf1/jiter-0.14.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cff5708f7ed0fa098f2b53446c6fa74c48469118e5cd7497b4f1cd569ab06928", size = 394512, upload-time = "2026-04-10T14:27:31.344Z" }, + { url = "https://files.pythonhosted.org/packages/e3/88/837566dd6ed6e452e8d3205355afd484ce44b2533edfa4ed73a298ea893e/jiter-0.14.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:2492e5f06c36a976d25c7cc347a60e26d5470178d44cde1b9b75e60b4e519f28", size = 521120, upload-time = "2026-04-10T14:27:33.299Z" }, + { url = "https://files.pythonhosted.org/packages/89/6b/b00b45c4d1b4c031777fe161d620b755b5b02cdade1e316dcb46e4471d63/jiter-0.14.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:7609cfbe3a03d37bfdbf5052012d5a879e72b83168a363deae7b3a26564d57de", size = 553668, upload-time = "2026-04-10T14:27:34.868Z" }, + { url = "https://files.pythonhosted.org/packages/ad/d8/6fe5b42011d19397433d345716eac16728ac241862a2aac9c91923c7509a/jiter-0.14.0-cp314-cp314-win32.whl", hash = "sha256:7282342d32e357543565286b6450378c3cd402eea333fc1ebe146f1fabb306fc", size = 207001, upload-time = "2026-04-10T14:27:36.455Z" }, + { url = 
"https://files.pythonhosted.org/packages/e5/43/5c2e08da1efad5e410f0eaaabeadd954812612c33fbbd8fd5328b489139d/jiter-0.14.0-cp314-cp314-win_amd64.whl", hash = "sha256:bd77945f38866a448e73b0b7637366afa814d4617790ecd88a18ca74377e6c02", size = 202187, upload-time = "2026-04-10T14:27:38Z" }, + { url = "https://files.pythonhosted.org/packages/aa/1f/6e39ac0b4cdfa23e606af5b245df5f9adaa76f35e0c5096790da430ca506/jiter-0.14.0-cp314-cp314-win_arm64.whl", hash = "sha256:f2d4c61da0821ee42e0cdf5489da60a6d074306313a377c2b35af464955a3611", size = 192257, upload-time = "2026-04-10T14:27:39.504Z" }, + { url = "https://files.pythonhosted.org/packages/05/57/7dbc0ffbbb5176a27e3518716608aa464aee2e2887dc938f0b900a120449/jiter-0.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1bf7ff85517dd2f20a5750081d2b75083c1b269cf75afc7511bdf1f9548beb3b", size = 323441, upload-time = "2026-04-10T14:27:41.039Z" }, + { url = "https://files.pythonhosted.org/packages/83/6e/7b3314398d8983f06b557aa21b670511ec72d3b79a68ee5e4d9bff972286/jiter-0.14.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8ef8791c3e78d6c6b157c6d360fbb5c715bebb8113bc6a9303c5caff012754a", size = 348109, upload-time = "2026-04-10T14:27:42.552Z" }, + { url = "https://files.pythonhosted.org/packages/ae/4f/8dc674bcd7db6dba566de73c08c763c337058baff1dbeb34567045b27cdc/jiter-0.14.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e74663b8b10da1fe0f4e4703fd7980d24ad17174b6bb35d8498d6e3ebce2ae6a", size = 368328, upload-time = "2026-04-10T14:27:44.574Z" }, + { url = "https://files.pythonhosted.org/packages/3b/5f/188e09a1f20906f98bbdec44ed820e19f4e8eb8aff88b9d1a5a497587ff3/jiter-0.14.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1aca29ba52913f78362ec9c2da62f22cdc4c3083313403f90c15460979b84d9b", size = 463301, upload-time = "2026-04-10T14:27:46.717Z" }, + { url = "https://files.pythonhosted.org/packages/ac/f0/19046ef965ed8f349e8554775bb12ff4352f443fbe12b95d31f575891256/jiter-0.14.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b39b7d87a952b79949af5fef44d2544e58c21a28da7f1bae3ef166455c61746", size = 378891, upload-time = "2026-04-10T14:27:48.32Z" }, + { url = "https://files.pythonhosted.org/packages/c4/c3/da43bd8431ee175695777ee78cf0e93eacbb47393ff493f18c45231b427d/jiter-0.14.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d918a68b26e9fab068c2b5453577ef04943ab2807b9a6275df2a812599a310", size = 360749, upload-time = "2026-04-10T14:27:49.88Z" }, + { url = "https://files.pythonhosted.org/packages/72/26/e054771be889707c6161dbdec9c23d33a9ec70945395d70f07cfea1e9a6f/jiter-0.14.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:b08997c35aee1201c1a5361466a8fb9162d03ae7bf6568df70b6c859f1e654a4", size = 358526, upload-time = "2026-04-10T14:27:51.504Z" }, + { url = "https://files.pythonhosted.org/packages/c3/0f/7bea65ea2a6d91f2bf989ff11a18136644392bf2b0497a1fa50934c30a9c/jiter-0.14.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:260bf7ca20704d58d41f669e5e9fe7fe2fa72901a6b324e79056f5d52e9c9be2", size = 393926, upload-time = "2026-04-10T14:27:53.368Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a1/b1ff7d70deef61ac0b7c6c2f12d2ace950cdeecb4fdc94500a0926802857/jiter-0.14.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:37826e3df29e60f30a382f9294348d0238ef127f4b5d7f5f8da78b5b9e050560", size = 521052, upload-time = "2026-04-10T14:27:55.058Z" }, + { url = 
"https://files.pythonhosted.org/packages/0b/7b/3b0649983cbaf15eda26a414b5b1982e910c67bd6f7b1b490f3cfc76896a/jiter-0.14.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:645be49c46f2900937ba0eaf871ad5183c96858c0af74b6becc7f4e367e36e06", size = 553716, upload-time = "2026-04-10T14:27:57.269Z" }, + { url = "https://files.pythonhosted.org/packages/97/f8/33d78c83bd93ae0c0af05293a6660f88a1977caef39a6d72a84afab94ce0/jiter-0.14.0-cp314-cp314t-win32.whl", hash = "sha256:2f7877ed45118de283786178eceaf877110abacd04fde31efff3940ae9672674", size = 207957, upload-time = "2026-04-10T14:27:59.285Z" }, + { url = "https://files.pythonhosted.org/packages/d6/ac/2b760516c03e2227826d1f7025d89bf6bf6357a28fe75c2a2800873c50bf/jiter-0.14.0-cp314-cp314t-win_amd64.whl", hash = "sha256:14c0cb10337c49f5eafe8e7364daca5e29a020ea03580b8f8e6c597fed4e1588", size = 204690, upload-time = "2026-04-10T14:28:00.962Z" }, + { url = "https://files.pythonhosted.org/packages/dc/2e/a44c20c58aeed0355f2d326969a181696aeb551a25195f47563908a815be/jiter-0.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:5419d4aa2024961da9fe12a9cfe7484996735dca99e8e090b5c88595ef1951ff", size = 191338, upload-time = "2026-04-10T14:28:02.853Z" }, +] + [[package]] name = "jmespath" version = "1.1.0" @@ -1818,6 +1916,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl", hash = "sha256:f2bada5ed3adb10a01e154e90db01d6d8938d0461b5790c12bcb807b2d28bbe2", size = 312786, upload-time = "2026-02-10T22:12:11.258Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + [[package]] name = "soupsieve" version = "2.8.3"