9 changes: 9 additions & 0 deletions autoharness/.dockerignore
@@ -0,0 +1,9 @@
.git
data/
tasks/*/logs/
node_modules/
target/
__pycache__
*.pyc
.env
workers/
8 changes: 8 additions & 0 deletions autoharness/.gitignore
@@ -0,0 +1,8 @@
data/
node_modules/
__pycache__/
*.pyc
.env
target/
tasks/*/logs/
.agent/
18 changes: 18 additions & 0 deletions autoharness/Dockerfile.base
@@ -0,0 +1,18 @@
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim

RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        ca-certificates \
        curl \
        build-essential \
        nodejs \
        npm \
    && rm -rf /var/lib/apt/lists/*

RUN mkdir -p /task/logs
WORKDIR /task

CMD ["sleep", "infinity"]
425 changes: 425 additions & 0 deletions autoharness/README.md

Large diffs are not rendered by default.

155 changes: 155 additions & 0 deletions autoharness/agent.py
@@ -0,0 +1,155 @@
"""
autoagent-iii harness — the file that the meta-agent modifies.

Everything above the FIXED ADAPTER line is fair game for the meta-agent.
Everything below is the Harbor integration and must not be modified
unless a human explicitly requests it.
"""

import os
import json
import asyncio
import subprocess
from agents import Agent, Runner, function_tool
from agents.run import RunResult


# ========================== EDITABLE SECTION ==========================

SYSTEM_PROMPT = """\
You are an autonomous coding agent. You have access to a shell tool that
lets you run any command in the task container.

Approach:
1. Read the task instruction carefully.
2. Explore the environment (ls, cat files, check language/framework).
3. Plan your approach before writing code.
4. Implement the solution step by step.
5. Verify your work by running tests or checking output.
6. If something fails, read the error, diagnose, and fix.

Rules:
- Do not ask for help. You are autonomous.
- Do not give up. Try alternative approaches.
- Verify your solution before finishing.
"""

MODEL = "gpt-5"
@coderabbitai coderabbitai bot Apr 6, 2026

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🌐 Web query:

What are the currently available OpenAI GPT models for the Agents SDK?

💡 Result:

For the OpenAI Agents SDK (which uses the Responses API), the currently available GPT models listed in the API docs are: gpt-5.4, gpt-5.4-mini, and gpt-5.4-nano. [1]

Source(s): [1]


🏁 Script executed:

cat -n autoharness/agent.py | sed -n '30,45p'

Repository: iii-hq/workers

Length of output: 529


Update model identifier to a complete, available version.

MODEL = "gpt-5" is an incomplete model identifier. Update to a complete gpt-5 variant like "gpt-5.4", "gpt-5.4-mini", or "gpt-5.4-nano", whichever is appropriate for your use case.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@autoharness/agent.py` at line 37, The MODEL constant currently uses an
incomplete identifier "gpt-5"; update the MODEL value to a full, available gpt-5
variant (for example "gpt-5.4", "gpt-5.4-mini", or "gpt-5.4-nano") by replacing
the string assigned to MODEL in autoharness.agent (symbol: MODEL) with the
appropriate complete model name for your usage.

Author


Intentional — MODEL lives in the editable section (the whole point of autoharness is that the meta-agent modifies this). "gpt-5" is a valid OpenAI alias that resolves to the latest gpt-5 variant. Pinning to a specific sub-version like "gpt-5.4" would make the baseline less portable. The meta-agent can (and should) experiment with model selection as part of the loop — it's one of the 12 tracked categories (model_selection).


@rohitg00, that's a well-reasoned design choice. OpenAI's own docs confirm that gpt-5 is a valid alias with snapshots available, so using the short alias rather than a pinned version makes the baseline more forward-compatible. The meta-agent's ability to experiment with model_selection as a tracked category is a great demonstration of the self-improving loop in action. Withdrawing my concern.

🐇✨


✏️ Learnings added
Learnt from: rohitg00
Repo: iii-hq/workers PR: 7
File: autoharness/agent.py:37-37
Timestamp: 2026-04-06T16:21:56.017Z
Learning: In `autoharness/agent.py`, `MODEL = "gpt-5"` in the editable section is intentional. The `gpt-5` string is a valid OpenAI alias that resolves to the latest gpt-5 variant. It is deliberately unpinned to keep the baseline portable; the meta-agent is expected to experiment with model selection (tracked under the `model_selection` category) as part of the autoharness experiment loop. Do not flag this as an incomplete or invalid model identifier.

Note: Learnings are effective only in the context of similar code segments. To apply general coding guidelines, please configure review instructions. You can manage existing learnings in the app.

MAX_TURNS = 30


@function_tool
def run_shell(command: str) -> str:
    """Execute a shell command and return combined stdout+stderr."""
    try:
        result = subprocess.run(
            command, shell=True, capture_output=True, text=True, timeout=120
        )
        output = result.stdout + result.stderr
        # Placeholder metrics; the meta-agent may wire these to real token accounting.
        total_tokens = 0
        estimated_cost = 0.0
        print(f"total_tokens:{total_tokens}")
        print(f"estimated_cost:{estimated_cost}")
        # Keep only the tail so very large outputs don't blow up the context window.
        return output[-10000:] if len(output) > 10000 else output
    except subprocess.TimeoutExpired:
        return "ERROR: command timed out after 120 seconds"


def create_tools():
    return [run_shell]


def create_agent():
    return Agent(
        name="autoagent",
        instructions=SYSTEM_PROMPT,
        model=MODEL,
        tools=create_tools(),
    )


async def run_task(instruction: str) -> RunResult:
    agent = create_agent()
    result = await Runner.run(agent, instruction, max_turns=MAX_TURNS)
    return result


# --- FIXED ADAPTER BELOW --- do not modify unless human requests ---


def to_atif(result: RunResult, duration: float, instruction: str) -> dict:
    steps = []
    for item in result.raw_responses:
        step = {
            "action": {"type": "message"},
            "observation": "",
        }
        if hasattr(item, "output"):
            for output in item.output:
                if hasattr(output, "type"):
                    if output.type == "function_call":
                        step["action"] = {
                            "type": "tool_call",
                            "tool": output.name,
                            "input": output.arguments,
                        }
                    elif output.type == "message":
                        step["observation"] = getattr(output, "content", "")
        steps.append(step)

    return {
        "version": "atif-v1.6",
        "steps": steps,
        "metrics": {
            "duration_seconds": round(duration, 1),
            "turns": len(result.raw_responses),
            "final_output": result.final_output[:2000] if result.final_output else "",
        },
    }


class AutoAgent:
    """Harbor BaseAgent adapter."""

    async def run(self, task_path: str) -> dict:
        import time

        instruction_file = os.path.join(task_path, "instruction.md")
        with open(instruction_file) as f:
            instruction = f.read()

        start = time.time()
        result = await run_task(instruction)
        duration = time.time() - start

        trajectory = to_atif(result, duration, instruction)

        logs_dir = os.path.join(task_path, "logs", "agent")
        os.makedirs(logs_dir, exist_ok=True)
        with open(os.path.join(logs_dir, "trajectory.json"), "w") as f:
            json.dump(trajectory, f, indent=2)

        return trajectory


if __name__ == "__main__":
    import time

    task_dir = os.environ.get("TASK_DIR", "/task")
    instruction_file = os.path.join(task_dir, "instruction.md")

    with open(instruction_file) as f:
        instruction = f.read()

    start = time.time()
    result = asyncio.run(run_task(instruction))
    duration = time.time() - start

    trajectory = to_atif(result, duration, instruction)

    logs_dir = os.path.join(task_dir, "logs", "agent")
    os.makedirs(logs_dir, exist_ok=True)
    with open(os.path.join(logs_dir, "trajectory.json"), "w") as f:
        json.dump(trajectory, f, indent=2)

    print(json.dumps(trajectory["metrics"], indent=2))
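For reference, here is the shape of the trajectory that `to_atif` writes to `logs/agent/trajectory.json` — a minimal sketch using the field names from the adapter above; the concrete step and metric values are made up:

```python
import json

# Illustrative atif-v1.6 trajectory (field names from to_atif; values invented).
trajectory = {
    "version": "atif-v1.6",
    "steps": [
        {
            # A tool-call step: the agent invoked the run_shell tool.
            "action": {"type": "tool_call", "tool": "run_shell", "input": '{"command": "ls"}'},
            "observation": "",
        },
        {
            # A plain message step with the model's final text.
            "action": {"type": "message"},
            "observation": "Done. Tests pass.",
        },
    ],
    "metrics": {
        "duration_seconds": 12.3,
        "turns": 2,
        "final_output": "Done. Tests pass.",
    },
}

print(json.dumps(trajectory["metrics"], indent=2))
```

The `turns` metric equals the number of raw responses, so it matches `len(steps)` in this format.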
185 changes: 185 additions & 0 deletions autoharness/bench.sh
@@ -0,0 +1,185 @@
#!/bin/bash
set -euo pipefail

API="http://localhost:3111"
TAG="${1:-$(date +%b%d | tr '[:upper:]' '[:lower:]')}"

echo "=========================================="
echo " autoagent-iii benchmark runner"
echo " tag: $TAG"
echo "=========================================="

# -------------------------------------------------------------------
# 1. Check prerequisites
# -------------------------------------------------------------------

check() {
    if ! command -v "$1" &>/dev/null; then
        echo "ERROR: $1 not found. Install it first."
        exit 1
    fi
}

check iii
check curl
check jq

if ! curl -sf "$API/api/report/tags" >/dev/null 2>&1; then
    echo ""
    echo "iii-engine + orchestrator not running. Starting them..."
    echo ""

    cd "$(dirname "$0")"

    iii --config iii-config.yaml &
    III_PID=$!
    sleep 2

    cd workers/orchestrator
    python3 orchestrator.py &
    ORCH_PID=$!
    sleep 3
    cd ../..

⚠️ Potential issue | 🔴 Critical

Bootstrap the orchestrator from the real path.

After cd "$(dirname "$0")", this script descends into workers/orchestrator, but the worker added in this PR lives under orchestrator/orchestrator.py. The auto-start path will fail before any benchmark runs.

💡 Suggested fix
-    cd workers/orchestrator
+    cd orchestrator
     python3 orchestrator.py &
     ORCH_PID=$!
     sleep 3
-    cd ../..
+    cd ..
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@autoharness/bench.sh` around lines 32 - 42, The script currently attempts to
start the orchestrator from workers/orchestrator but the actual worker is at
orchestrator/orchestrator.py; update the bootstrapping sequence in bench.sh so
after cd "$(dirname "$0")" you change into the correct directory (relative path
from autoharness to the orchestrator, e.g. ../orchestrator) before launching
python3 orchestrator.py &, then capture its PID into ORCH_PID as before; ensure
the sleep and subsequent cd ../.. (or adjusted return path) are updated so the
script returns to the original working location and III_PID/ORCH_PID behavior is
unchanged.


    echo "Started iii-engine (PID $III_PID) + orchestrator (PID $ORCH_PID)"

    trap "kill $III_PID $ORCH_PID 2>/dev/null" EXIT
fi

echo ""
echo "API: $API"
echo ""

# -------------------------------------------------------------------
# 2. Setup experiment tag
# -------------------------------------------------------------------

echo "--- Setting up tag: $TAG ---"
SETUP=$(curl -sf -X POST "$API/api/experiment/setup" \
    -H "Content-Type: application/json" \
    -d "{\"tag\":\"$TAG\"}" 2>/dev/null || echo '{"body":{"error":"exists"}}')

echo "$SETUP" | jq -r '.body // .' 2>/dev/null || echo "$SETUP"

# -------------------------------------------------------------------
# 3. List available tasks
# -------------------------------------------------------------------

echo ""
echo "--- Available tasks ---"
TASKS=$(curl -sf "$API/api/task/list" | jq -r '.body.tasks[].name' 2>/dev/null)
echo "$TASKS"
TASK_COUNT=$(echo "$TASKS" | wc -l | tr -d ' ')
echo "Total: $TASK_COUNT tasks"

# -------------------------------------------------------------------
# 4. Register baseline experiment
# -------------------------------------------------------------------

echo ""
echo "--- Registering baseline experiment ---"
COMMIT=$(git rev-parse --short HEAD 2>/dev/null || echo "none")
REG=$(curl -sf -X POST "$API/api/experiment/register" \
    -H "Content-Type: application/json" \
    -d "{
        \"tag\": \"$TAG\",
        \"hypothesis\": \"Baseline harness — no modifications\",
        \"description\": \"baseline\",
        \"category\": \"other\",
        \"commit_sha\": \"$COMMIT\"
    }")

EXP_ID=$(echo "$REG" | jq -r '.body.experiment_id // empty')
echo "Experiment ID: $EXP_ID"

if [ -z "$EXP_ID" ]; then
echo "ERROR: Failed to register experiment"
echo "$REG"
exit 1
fi

# -------------------------------------------------------------------
# 5. Run benchmark (all tasks)
# -------------------------------------------------------------------

echo ""
echo "--- Running benchmark (all tasks, concurrency=4) ---"
echo "This may take a few minutes..."

BATCH=$(curl -sf -X POST "$API/api/task/batch" \
    -H "Content-Type: application/json" \
    -d "{\"experiment_id\": \"$EXP_ID\", \"concurrency\": 4}" \
    --max-time 900 2>/dev/null || echo '{"body":{"error":"batch failed"}}')

PASSED=$(echo "$BATCH" | jq -r '.body.passed // 0')
TOTAL=$(echo "$BATCH" | jq -r '.body.total_tasks // 0')
SCORE=$(echo "$BATCH" | jq -r '.body.aggregate_score // 0')
DURATION=$(echo "$BATCH" | jq -r '.body.duration_seconds // 0')

echo ""
echo "Results: $PASSED/$TOTAL passed (score: $SCORE)"
echo "Duration: ${DURATION}s"

# -------------------------------------------------------------------
# 6. Record completion
# -------------------------------------------------------------------

echo ""
echo "--- Recording results ---"
COMPLETE=$(curl -sf -X POST "$API/api/experiment/complete" \
    -H "Content-Type: application/json" \
    -d "{
        \"experiment_id\": \"$EXP_ID\",
        \"passed\": $PASSED,
        \"total_tasks\": $TOTAL,
        \"aggregate_score\": $SCORE,
        \"task_scores\": $(echo "$BATCH" | jq '.body.task_scores // {}'),
        \"duration_seconds\": $DURATION
    }")

STATUS=$(echo "$COMPLETE" | jq -r '.body.status // "unknown"')
ACTION=$(echo "$COMPLETE" | jq -r '.body.action // "unknown"')
echo "Status: $STATUS | Action: $ACTION"

# -------------------------------------------------------------------
# 7. Show summary
# -------------------------------------------------------------------

echo ""
echo "--- Summary ---"
curl -sf -X POST "$API/api/report/summary" \
    -H "Content-Type: application/json" \
    -d "{\"tag\": \"$TAG\"}" | jq '.body | {
        tag,
        best: .best,
        stats: .stats,
        strategy,
        total_duration_minutes,
        common_failures: (.common_failures | keys)
    }' 2>/dev/null

# -------------------------------------------------------------------
# 8. Get suggestions for next experiment
# -------------------------------------------------------------------

echo ""
echo "--- Suggestions for next experiment ---"
curl -sf -X POST "$API/api/search/suggest" \
    -H "Content-Type: application/json" \
    -d "{\"tag\": \"$TAG\"}" | jq '.body.suggestions[]' 2>/dev/null

echo ""
echo "=========================================="
echo " Benchmark complete!"
echo " Tag: $TAG"
echo " Passed: $PASSED/$TOTAL"
echo "=========================================="
echo ""
echo "Next steps:"
echo " 1. Edit agent.py (the editable section)"
echo " 2. Run: ./bench.sh $TAG"
echo " 3. The system auto-keeps or discards"
echo " 4. Repeat until score converges"
echo ""
echo "Or give program.md to a meta-agent:"
echo " claude -p program.md"
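bench.sh drives the orchestrator's HTTP API with curl. The same request bodies can be built from Python — a minimal sketch that only constructs the `register` and `complete` payloads (field names taken from the script above; it does not contact the server, and the base URL is just the script's default):

```python
import json

API = "http://localhost:3111"  # orchestrator base URL, as in bench.sh


def register_payload(tag: str, commit_sha: str) -> str:
    """JSON body for POST /api/experiment/register (fields as in bench.sh)."""
    return json.dumps({
        "tag": tag,
        "hypothesis": "Baseline harness — no modifications",
        "description": "baseline",
        "category": "other",
        "commit_sha": commit_sha,
    })


def complete_payload(experiment_id: str, passed: int, total: int,
                     score: float, duration: float) -> str:
    """JSON body for POST /api/experiment/complete (fields as in bench.sh)."""
    return json.dumps({
        "experiment_id": experiment_id,
        "passed": passed,
        "total_tasks": total,
        "aggregate_score": score,
        "task_scores": {},  # bench.sh fills this from the batch response
        "duration_seconds": duration,
    })


print(register_payload("apr06", "abc1234"))
```

A meta-agent driving the loop programmatically would POST these bodies to the same endpoints the script hits, then read `.body` from each response as bench.sh does with jq.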