diff --git a/.agents/skills/commit-push/SKILL.md b/.agents/skills/commit-push/SKILL.md index 14ae623..606e9f8 100644 --- a/.agents/skills/commit-push/SKILL.md +++ b/.agents/skills/commit-push/SKILL.md @@ -63,7 +63,66 @@ If preconditions fail, stop and report. - flaky/infra/transient - permission/workflow policy failure -8. Pre-merge unresolved comment triage and fix loop (max 2 loops): +8. Codex review settle gate (mandatory, latest PR head SHA): +- After PR creation/update and green CI, wait for Codex review output before merge. +- Poll PR reviews/comments every `15s` for up to `5 minutes`, scoped to the latest PR head SHA. +- Default reviewer identity for this gate: `chatgpt-codex-connector` (GitHub UI may render as `chatgpt-codex-connector bot`). +- Accepted settle signals: + - Codex posts actionable review comments/suggestions -> proceed to pre-merge fix loop. + - Codex posts explicit approval/all-good signal -> review gate is satisfied. +- If no Codex review signal appears within timeout, stop and report blocker (`review pending`). 
# Example `gh api` polling implementation for the Codex review settle gate.
#
# Polls the PR every POLL_INTERVAL seconds, for up to 5 minutes, for a signal
# from the Codex reviewer on the latest PR head SHA. Exits 1 with a
# "review pending" blocker when no signal arrives in time.
PR_NUMBER="$(gh pr view --json number --jq .number)"
REPO="$(gh repo view --json nameWithOwner --jq .nameWithOwner)"
HEAD_SHA="$(gh pr view --json headRefOid --jq .headRefOid)"
# BOT_RE is spliced into a jq *string literal* below, so every regex backslash
# has to be doubled: jq rejects "\[" as an invalid escape, while "\\[" reaches
# the regex engine as \[. Override if your org uses a different reviewer bot
# login, keeping the same escaping.
BOT_RE="${BOT_RE:-^chatgpt-codex-connector(\\\\[bot\\\\])?$}"
POLL_INTERVAL=15
DEADLINE=$(( $(date +%s) + 300 ))
SETTLED=0
SETTLE_KIND=""

# jq predicate: authored by the reviewer bot and attached to the current head SHA.
BOT_ON_HEAD="(.user.login | test(\"$BOT_RE\"; \"i\")) and .commit_id==\"$HEAD_SHA\""

# count_matches RESOURCE EXTRA_FILTER
#   RESOURCE      "reviews" or "comments" (sub-resource of the pull request)
#   EXTRA_FILTER  additional jq boolean expression, or "true"
# Prints the number of matching items across ALL pages. With --paginate the
# --jq filter runs once per page, so emit one line per match and count lines
# instead of using `length` (which would print one number per page).
count_matches() {
  gh api --paginate "repos/$REPO/pulls/$PR_NUMBER/$1" \
    --jq ".[] | select($BOT_ON_HEAD and ($2)) | .id" | wc -l | tr -d '[:space:]'
}

while [ "$(date +%s)" -lt "$DEADLINE" ]; do
  APPROVAL_COUNT="$(count_matches reviews '.state=="APPROVED"')"
  ACTION_REVIEW_COUNT="$(count_matches reviews '.state=="CHANGES_REQUESTED" or .state=="COMMENTED"')"
  ACTION_COMMENT_COUNT="$(count_matches comments 'true')"

  # Actionable feedback wins over approval: it routes into the pre-merge fix loop.
  if [ "$ACTION_REVIEW_COUNT" -gt 0 ] || [ "$ACTION_COMMENT_COUNT" -gt 0 ]; then
    SETTLED=1
    SETTLE_KIND="actionable"
    break
  fi

  if [ "$APPROVAL_COUNT" -gt 0 ]; then
    SETTLED=1
    SETTLE_KIND="approved"
    break
  fi

  sleep "$POLL_INTERVAL"
done

if [ "$SETTLED" -ne 1 ]; then
  echo "BLOCKER: review pending (no Codex signal within 5 minutes for head $HEAD_SHA)"
  exit 1
fi

echo "Codex review settle result: $SETTLE_KIND"

# Example signal inspection commands for reporting (reuse the variables above).
gh api --paginate "repos/$REPO/pulls/$PR_NUMBER/reviews" \
  --jq ".[] | select($BOT_ON_HEAD) | {state, user: .user.login, submitted_at, body}"

gh api --paginate "repos/$REPO/pulls/$PR_NUMBER/comments" \
  --jq ".[] | select($BOT_ON_HEAD) | {path, user: .user.login, created_at, body}"
Pre-merge unresolved comment triage and fix loop (max 2 loops): - Fetch unresolved PR review threads/comments (including bot comments) for the latest PR head SHA. - Triage each unresolved item: `implement`, `defer`, `reject`. - Auto-fix only `implement` items that are: @@ -76,21 +135,26 @@ If preconditions fail, stop and report. - `git commit -m "fix: address actionable PR comments (loop )"` (skip only if no changes) - push branch - re-watch PR CI to green + - re-run Codex review settle gate on the new PR head SHA (poll `15s`, timeout `5 minutes`) - re-fetch unresolved threads/comments - If unresolved `P0/P1` remain after loop cap, stop and report blocker. -9. Merge PR after green: +10. Merge PR after green and review gate satisfied: +- Merge only when all are true on latest PR head SHA: + - required PR CI is green + - Codex review settle gate is satisfied + - no unresolved `P0/P1` review items remain - Merge non-interactively (repo-default merge strategy or explicitly chosen one). - Record merged PR URL and merge commit SHA. -10. Switch to main and sync: +11. Switch to main and sync: - `git checkout main` - `git pull --ff-only origin main` -11. Monitor post-merge CI on `main`: +12. Monitor post-merge CI on `main`: - Watch the latest `main` CI run with timeout `25 minutes`. -12. Hotfix loop on post-merge red (max 2 loops): +13. Hotfix loop on post-merge red (max 2 loops): - Run only for actionable failures. - Loop cap: `2`. - For each loop: @@ -106,8 +170,9 @@ If preconditions fail, stop and report. - `git checkout main && git pull --ff-only origin main` - Monitor post-merge CI again (25 min timeout). -13. Stop conditions: +14. Stop conditions: - CI green on main: success. +- Codex review signal not received within settle timeout (`5 minutes`): stop and report blocker. - Unresolved pre-merge `P0/P1` comments after 2 fix loops: stop and report blocker. - Non-actionable failure class: stop and report. - Loop count exceeded (`>2`): stop and report blocker. 
@@ -140,6 +205,8 @@ Never use inline `--body "..."` for multi-line PR text. - Required local gate before push: `make prepush-full` (includes CodeQL in this repo). - PR CI watch timeout: `25 minutes`. +- Codex review settle polling interval: `15 seconds`. +- Codex review settle timeout: `5 minutes` (mandatory pre-merge gate). - Pre-merge comment-fix loop cap: `2`. - Post-merge main CI watch timeout: `25 minutes`. - Retry/hotfix loop cap: `2`. @@ -150,6 +217,7 @@ Never use inline `--body "..."` for multi-line PR text. - Commit SHA(s) - PR URL(s) - CI status per cycle +- Codex review settle status per cycle - Merge commit SHA(s) - Post-merge CI status on `main` - If stopped: blocker reason and last failing check diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 93a5645..b6be8b2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -93,6 +93,11 @@ jobs: go-version: '1.25.7' check-latest: false + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + - name: Run acceptance lane run: | make test-integration @@ -101,6 +106,14 @@ jobs: scripts/validate_contracts.sh scripts/validate_scenarios.sh go test ./internal/scenarios -count=1 -tags=scenario + scripts/run_agent_benchmarks.sh --output .tmp/agent-benchmarks-main.json + + - name: Upload agent benchmark report + if: always() + uses: actions/upload-artifact@v4 + with: + name: agent-benchmark-main + path: .tmp/agent-benchmarks-main.json docs-smoke: name: docs-smoke diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index d366b91..74509c1 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -55,6 +55,9 @@ jobs: - name: Run performance budgets run: scripts/test_perf_budgets.sh + - name: Run agent benchmark gate + run: scripts/run_agent_benchmarks.sh --output .tmp/agent-benchmarks-nightly.json + - name: Run cross-product interop suite run: go test ./internal/integration/interop -count=1 @@ -75,3 
+78,10 @@ jobs: path: | .tmp/release/v1-scorecard.json .tmp/release/v1-scorecard.md + + - name: Upload agent benchmark report + if: always() + uses: actions/upload-artifact@v4 + with: + name: agent-benchmark-nightly + path: .tmp/agent-benchmarks-nightly.json diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 5086207..de1a444 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -81,6 +81,14 @@ jobs: make test-fast make test-contracts scripts/validate_scenarios.sh + scripts/run_agent_benchmarks.sh --output .tmp/agent-benchmarks-pr.json + + - name: Upload agent benchmark report + if: always() + uses: actions/upload-artifact@v4 + with: + name: agent-benchmark-pr + path: .tmp/agent-benchmarks-pr.json - name: Docs parity and smoke subset if: steps.changes.outputs.go == 'true' || steps.changes.outputs.docs == 'true' || steps.changes.outputs.workflow_or_policy == 'true' diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f07d325..190b80f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,6 +33,11 @@ jobs: go-version: '1.25.7' check-latest: false + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + - name: Setup Node uses: actions/setup-node@v4 with: @@ -79,8 +84,16 @@ jobs: run: | scripts/test_hardening_core.sh scripts/test_perf_budgets.sh + scripts/run_agent_benchmarks.sh --output .tmp/release/agent-benchmarks-release.json go test ./internal/integration/interop -count=1 + - name: Upload agent benchmark report + if: always() + uses: actions/upload-artifact@v4 + with: + name: agent-benchmark-release + path: .tmp/release/agent-benchmarks-release.json + - name: Run pre-publish install-path UAT smoke env: HOMEBREW_TAP_GITHUB_TOKEN: ${{ secrets.HOMEBREW_TAP_GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore index d21affc..4b14000 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,15 @@ venv/ 
!scenarios/wrkr/extension-detectors/repos/ext-repo/.wrkr/ !scenarios/wrkr/extension-detectors/repos/ext-repo/.wrkr/detectors/ !scenarios/wrkr/extension-detectors/repos/ext-repo/.wrkr/detectors/extensions.json +!scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.wrkr/ +!scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.wrkr/agents/ +!scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.wrkr/agents/custom-agent.yaml +!scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.wrkr/agents/mcp-client.yaml +!scenarios/wrkr/agent-relationship-correlation/repos/correlated-agent-app/.wrkr/ +!scenarios/wrkr/agent-relationship-correlation/repos/correlated-agent-app/.wrkr/agents/ +!scenarios/wrkr/agent-relationship-correlation/repos/correlated-agent-app/.wrkr/agents/mcp-client.yaml +!testinfra/benchmarks/agents/fixtures/**/.wrkr/ +!testinfra/benchmarks/agents/fixtures/**/.wrkr/** *.sarif /wrkr /wrkr.exe diff --git a/Makefile b/Makefile index c545e37..0847509 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ GOFILES := $(shell git ls-files '*.go') DOCS_SITE_NPM_CACHE ?= $(CURDIR)/.tmp/npm-cache .PHONY: fmt lint lint-fast test test-fast test-integration test-e2e test-contracts test-scenarios \ - test-hardening test-chaos test-perf test-risk-lane build hooks prepush prepush-full codeql lint-ci \ + test-hardening test-chaos test-perf test-agent-benchmarks test-risk-lane build hooks prepush prepush-full codeql lint-ci \ test-docs-consistency test-docs-storyline test-adapter-parity test-v1-acceptance test-uat-local test-release-smoke \ docs-site-install docs-site-lint docs-site-build docs-site-check docs-site-audit-prod @@ -51,7 +51,10 @@ test-chaos: test-perf: @scripts/test_perf_budgets.sh -test-risk-lane: test-contracts test-scenarios test-hardening test-chaos test-perf +test-agent-benchmarks: + @scripts/run_agent_benchmarks.sh --output .tmp/agent-benchmarks.json + +test-risk-lane: test-contracts test-scenarios test-hardening test-chaos test-perf 
test-agent-benchmarks test-docs-consistency: @scripts/check_docs_cli_parity.sh diff --git a/core/aggregate/inventory/inventory.go b/core/aggregate/inventory/inventory.go index 63c216f..d590eed 100644 --- a/core/aggregate/inventory/inventory.go +++ b/core/aggregate/inventory/inventory.go @@ -740,7 +740,7 @@ func classifyToolCategory(toolType string) string { switch normalized { case "claude", "cursor", "codex", "copilot", "cody", "windsurf": return "assistant" - case "a2a", "agent", "agent_framework", "ci_agent", "compiled_action", "langchain", "crewai", "autogen": + case "a2a", "agent", "agent_framework", "ci_agent", "compiled_action", "langchain", "crewai", "autogen", "llamaindex", "openai_agents", "mcp_client", "custom_agent": return "agent_framework" case "mcp", "mcpgateway", "webmcp": return "mcp_integration" diff --git a/core/detect/agentautogen/detector.go b/core/detect/agentautogen/detector.go index b76351c..5aff377 100644 --- a/core/detect/agentautogen/detector.go +++ b/core/detect/agentautogen/detector.go @@ -17,10 +17,31 @@ func New() Detector { return Detector{} } func (Detector) ID() string { return detectorID } func (Detector) Detect(ctx context.Context, scope detect.Scope, _ detect.Options) ([]model.Finding, error) { - return agentframework.Detect(ctx, scope, agentframework.DetectorConfig{ - DetectorID: detectorID, - Framework: "autogen", - ConfigPath: ".wrkr/agents/autogen.json", - Format: "json", + _ = ctx + return agentframework.DetectMany(scope, []agentframework.DetectorConfig{ + { + DetectorID: detectorID, + Framework: "autogen", + ConfigPath: ".wrkr/agents/autogen.json", + Format: "json", + }, + { + DetectorID: detectorID, + Framework: "autogen", + ConfigPath: ".wrkr/agents/autogen.yaml", + Format: "yaml", + }, + { + DetectorID: detectorID, + Framework: "autogen", + ConfigPath: ".wrkr/agents/autogen.yml", + Format: "yaml", + }, + { + DetectorID: detectorID, + Framework: "autogen", + ConfigPath: ".wrkr/agents/autogen.toml", + Format: "toml", + }, 
}) } diff --git a/core/detect/agentautogen/detector_test.go b/core/detect/agentautogen/detector_test.go index 1f39aa8..2f4a108 100644 --- a/core/detect/agentautogen/detector_test.go +++ b/core/detect/agentautogen/detector_test.go @@ -4,6 +4,7 @@ import ( "context" "os" "path/filepath" + "reflect" "testing" "github.com/Clyra-AI/wrkr/core/detect" @@ -36,6 +37,47 @@ func TestAutoGenDetector_PrecisionBaseline(t *testing.T) { } } +func TestAutoGenDetector_ExpandedFormatsDeterministic(t *testing.T) { + t.Parallel() + + root := t.TempDir() + writeFile(t, root, ".wrkr/agents/autogen.yaml", `agents: + - name: planner + file: agents/planner.py +`) + writeFile(t, root, ".wrkr/agents/autogen.toml", `[[agents]] +name = "executor" +file = "agents/executor.py" +`) + + scope := detect.Scope{Org: "acme", Repo: "platform", Root: root} + first, err := New().Detect(context.Background(), scope, detect.Options{}) + if err != nil { + t.Fatalf("detect: %v", err) + } + if len(first) != 2 { + t.Fatalf("expected two findings from yaml+toml declarations, got %d", len(first)) + } + for _, finding := range first { + if finding.ToolType != "autogen" { + t.Fatalf("unexpected tool type %q", finding.ToolType) + } + if finding.FindingType != "agent_framework" { + t.Fatalf("unexpected finding type %q", finding.FindingType) + } + } + + for i := 0; i < 10; i++ { + next, err := New().Detect(context.Background(), scope, detect.Options{}) + if err != nil { + t.Fatalf("detect run %d: %v", i+1, err) + } + if !reflect.DeepEqual(first, next) { + t.Fatalf("non-deterministic output at run %d", i+1) + } + } +} + func writeFile(t *testing.T, root, rel, content string) { t.Helper() path := filepath.Join(root, filepath.FromSlash(rel)) diff --git a/core/detect/agentcustom/detector.go b/core/detect/agentcustom/detector.go new file mode 100644 index 0000000..ad3c9f8 --- /dev/null +++ b/core/detect/agentcustom/detector.go @@ -0,0 +1,349 @@ +package agentcustom + +import ( + "context" + "fmt" + "os" + "path/filepath" + 
"sort" + "strings" + + "github.com/Clyra-AI/wrkr/core/detect" + "github.com/Clyra-AI/wrkr/core/model" +) + +const detectorID = "agentcustom" + +const confidenceGate = 0.85 + +type Detector struct{} + +type customAgent struct { + Name string `json:"name" yaml:"name" toml:"name"` + File string `json:"file" yaml:"file" toml:"file"` + Tools []string `json:"tools" yaml:"tools" toml:"tools"` + Auth []string `json:"auth_surfaces" yaml:"auth_surfaces" toml:"auth_surfaces"` + Deploy []string `json:"deployment_artifacts" yaml:"deployment_artifacts" toml:"deployment_artifacts"` + AutoDeploy bool `json:"auto_deploy" yaml:"auto_deploy" toml:"auto_deploy"` + HumanGate bool `json:"human_gate" yaml:"human_gate" toml:"human_gate"` +} + +type declaration struct { + Agents []customAgent `json:"agents" yaml:"agents" toml:"agents"` +} + +type signalSet struct { + Names map[string]struct{} +} + +func New() Detector { return Detector{} } + +func (Detector) ID() string { return detectorID } + +func (Detector) Detect(_ context.Context, scope detect.Scope, _ detect.Options) ([]model.Finding, error) { + if err := detect.ValidateScopeRoot(scope.Root); err != nil { + return nil, err + } + + configs := []struct { + Path string + Format string + }{ + {Path: ".wrkr/agents/custom-agent.yaml", Format: "yaml"}, + {Path: ".wrkr/agents/custom-agent.yml", Format: "yaml"}, + {Path: ".wrkr/agents/custom-agent.json", Format: "json"}, + {Path: ".wrkr/agents/custom-agent.toml", Format: "toml"}, + } + + findings := make([]model.Finding, 0) + workspaceSignals, err := detectWorkspaceSignals(scope) + if err != nil { + return nil, err + } + + for _, cfg := range configs { + if !detect.FileExists(scope.Root, cfg.Path) { + continue + } + + parsed, parseErr := parseConfig(scope.Root, cfg.Path, cfg.Format) + if parseErr != nil { + findings = append(findings, parseErrorFinding(scope, cfg.Path, cfg.Format, *parseErr)) + continue + } + if len(parsed.Agents) == 0 { + findings = append(findings, parseErrorFinding(scope, 
cfg.Path, cfg.Format, model.ParseError{ + Kind: "schema_validation_error", + Format: cfg.Format, + Path: cfg.Path, + Detector: detectorID, + Message: "expected at least one agents entry", + })) + continue + } + + for _, agent := range parsed.Agents { + if strings.TrimSpace(agent.Name) == "" || strings.TrimSpace(agent.File) == "" { + findings = append(findings, parseErrorFinding(scope, cfg.Path, cfg.Format, model.ParseError{ + Kind: "schema_validation_error", + Format: cfg.Format, + Path: cfg.Path, + Detector: detectorID, + Message: "each agent requires non-empty name and file", + })) + continue + } + scored := scoreSignals(workspaceSignals, agent) + if !meetsConfidenceGate(scored.score, scored.count, scored.operational) { + continue + } + findings = append(findings, toFinding(scope, cfg.Path, agent, scored)) + } + } + + model.SortFindings(findings) + return findings, nil +} + +func parseConfig(root, rel, format string) (declaration, *model.ParseError) { + var parsed declaration + switch format { + case "yaml": + if parseErr := detect.ParseYAMLFile(detectorID, root, rel, &parsed); parseErr != nil { + return declaration{}, parseErr + } + case "json": + if parseErr := detect.ParseJSONFile(detectorID, root, rel, &parsed); parseErr != nil { + return declaration{}, parseErr + } + case "toml": + if parseErr := detect.ParseTOMLFile(detectorID, root, rel, &parsed); parseErr != nil { + return declaration{}, parseErr + } + default: + return declaration{}, &model.ParseError{ + Kind: "parse_error", + Format: format, + Path: rel, + Detector: detectorID, + Message: "unsupported custom-agent config format", + } + } + return parsed, nil +} + +func detectWorkspaceSignals(scope detect.Scope) (signalSet, error) { + signals := signalSet{Names: map[string]struct{}{}} + + if detect.FileExists(scope.Root, "AGENTS.md") || detect.FileExists(scope.Root, "AGENTS.override.md") || detect.FileExists(scope.Root, "CLAUDE.md") || detect.FileExists(scope.Root, ".claude/CLAUDE.md") { + 
signals.Names["agent_instruction_surface"] = struct{}{} + } + + skillPaths, err := detect.Glob(scope.Root, ".agents/skills/*/SKILL.md") + if err != nil { + return signalSet{}, err + } + claudeSkillPaths, err := detect.Glob(scope.Root, ".claude/skills/*/SKILL.md") + if err != nil { + return signalSet{}, err + } + if len(skillPaths)+len(claudeSkillPaths) > 0 { + signals.Names["skill_pack_surface"] = struct{}{} + } + + workflowFiles, err := detect.Glob(scope.Root, ".github/workflows/*") + if err != nil { + return signalSet{}, err + } + if detect.FileExists(scope.Root, "Jenkinsfile") { + workflowFiles = append(workflowFiles, "Jenkinsfile") + } + sort.Strings(workflowFiles) + + for _, rel := range workflowFiles { + path := filepath.Join(scope.Root, filepath.FromSlash(rel)) + // #nosec G304 -- reads workflow definitions from the selected repository root. + payload, readErr := os.ReadFile(path) + if readErr != nil { + return signalSet{}, readErr + } + lower := strings.ToLower(string(payload)) + if strings.Contains(lower, "codex --full-auto") || strings.Contains(lower, "claude -p") || strings.Contains(lower, "claude code -p") || strings.Contains(lower, "gait eval --script") { + signals.Names["headless_agent_runtime"] = struct{}{} + break + } + } + + return signals, nil +} + +type scoredSignals struct { + names []string + score float64 + count int + operational bool +} + +func scoreSignals(workspace signalSet, agent customAgent) scoredSignals { + names := map[string]float64{ + "custom_config_declared": 0.45, + } + operational := false + + for name := range workspace.Names { + switch name { + case "skill_pack_surface": + names[name] = 0.20 + case "agent_instruction_surface": + names[name] = 0.15 + case "headless_agent_runtime": + names[name] = 0.30 + operational = true + } + } + + if len(uniqueSorted(agent.Tools)) > 0 { + names["tool_binding_declared"] = 0.20 + operational = true + } + if len(uniqueSorted(agent.Auth)) > 0 { + names["auth_binding_declared"] = 0.15 + 
operational = true + } + if len(uniqueSorted(agent.Deploy)) > 0 || agent.AutoDeploy { + names["deployment_signal_declared"] = 0.20 + operational = true + } + if agent.AutoDeploy && agent.HumanGate { + names["deployment_gate_declared"] = 0.10 + } + + ordered := make([]string, 0, len(names)) + score := 0.0 + for name, weight := range names { + ordered = append(ordered, name) + score += weight + } + sort.Strings(ordered) + return scoredSignals{ + names: ordered, + score: score, + count: len(ordered), + operational: operational, + } +} + +func meetsConfidenceGate(score float64, count int, operational bool) bool { + return score >= confidenceGate && count >= 3 && operational +} + +func toFinding(scope detect.Scope, declarationPath string, agent customAgent, scored scoredSignals) model.Finding { + severity := model.SeverityLow + if contains(scored.names, "headless_agent_runtime") { + severity = model.SeverityMedium + } + if agent.AutoDeploy && !agent.HumanGate { + severity = model.SeverityHigh + } + + evidence := []model.Evidence{ + {Key: "reason_code", Value: "AGENT-CUSTOM-SCAFFOLD"}, + {Key: "confidence_score", Value: fmt.Sprintf("%.2f", scored.score)}, + {Key: "confidence_gate", Value: fmt.Sprintf("%.2f", confidenceGate)}, + {Key: "signal_count", Value: fmt.Sprintf("%d", scored.count)}, + {Key: "signal_set", Value: strings.Join(scored.names, ",")}, + {Key: "declaration_path", Value: strings.TrimSpace(declarationPath)}, + } + + return model.Finding{ + FindingType: "agent_custom_scaffold", + Severity: severity, + ToolType: "custom_agent", + Location: strings.TrimSpace(agent.File), + Repo: strings.TrimSpace(scope.Repo), + Org: fallbackOrg(scope.Org), + Detector: detectorID, + Permissions: derivePermissions(agent), + Evidence: evidence, + Remediation: "Keep custom-agent scaffolding gated by deterministic approval and explicit runtime controls.", + } +} + +func parseErrorFinding(scope detect.Scope, path string, format string, parseErr model.ParseError) model.Finding { + 
parseErr.Path = strings.TrimSpace(path) + parseErr.Format = strings.TrimSpace(format) + parseErr.Detector = detectorID + return model.Finding{ + FindingType: "parse_error", + Severity: model.SeverityMedium, + ToolType: "custom_agent", + Location: strings.TrimSpace(path), + Repo: strings.TrimSpace(scope.Repo), + Org: fallbackOrg(scope.Org), + Detector: detectorID, + ParseError: &parseErr, + Remediation: "Fix malformed custom-agent declaration and preserve deterministic schema compliance.", + } +} + +func derivePermissions(agent customAgent) []string { + perms := make([]string, 0) + for _, tool := range uniqueSorted(agent.Tools) { + lower := strings.ToLower(tool) + if strings.Contains(lower, "write") || strings.Contains(lower, "deploy") { + perms = append(perms, "deploy.write") + } + if strings.Contains(lower, "exec") { + perms = append(perms, "proc.exec") + } + } + for _, auth := range uniqueSorted(agent.Auth) { + lower := strings.ToLower(auth) + if strings.Contains(lower, "secret") || strings.Contains(lower, "token") || strings.Contains(lower, "credential") { + perms = append(perms, "secret.read") + } + } + if agent.AutoDeploy { + perms = append(perms, "deploy.write") + } + return uniqueSorted(perms) +} + +func uniqueSorted(in []string) []string { + if len(in) == 0 { + return nil + } + set := map[string]struct{}{} + for _, item := range in { + trimmed := strings.TrimSpace(item) + if trimmed == "" { + continue + } + set[trimmed] = struct{}{} + } + out := make([]string, 0, len(set)) + for item := range set { + out = append(out, item) + } + sort.Strings(out) + if len(out) == 0 { + return nil + } + return out +} + +func contains(items []string, target string) bool { + for _, item := range items { + if item == target { + return true + } + } + return false +} + +func fallbackOrg(org string) string { + if strings.TrimSpace(org) == "" { + return "local" + } + return strings.TrimSpace(org) +} diff --git a/core/detect/agentcustom/detector_test.go 
b/core/detect/agentcustom/detector_test.go new file mode 100644 index 0000000..1c13d27 --- /dev/null +++ b/core/detect/agentcustom/detector_test.go @@ -0,0 +1,144 @@ +package agentcustom + +import ( + "context" + "os" + "path/filepath" + "reflect" + "testing" + + "github.com/Clyra-AI/wrkr/core/detect" + "github.com/Clyra-AI/wrkr/core/model" +) + +func TestCustomAgentDetector_RequiresStrongSignalCooccurrence(t *testing.T) { + t.Parallel() + + root := t.TempDir() + writeFile(t, root, ".wrkr/agents/custom-agent.yaml", `agents: + - name: custom_triage + file: agents/triage.py +`) + + weakFindings, err := New().Detect(context.Background(), detect.Scope{Org: "acme", Repo: "weak", Root: root}, detect.Options{}) + if err != nil { + t.Fatalf("detect weak: %v", err) + } + if len(weakFindings) != 0 { + t.Fatalf("expected no finding under weak signals, got %+v", weakFindings) + } + + writeFile(t, root, "AGENTS.md", "# agents\n") + writeFile(t, root, ".agents/skills/release/SKILL.md", "release policy\n") + writeFile(t, root, ".github/workflows/release.yml", "jobs:\n release:\n steps:\n - run: codex --full-auto --approval never\n") + + strongFindings, err := New().Detect(context.Background(), detect.Scope{Org: "acme", Repo: "strong", Root: root}, detect.Options{}) + if err != nil { + t.Fatalf("detect strong: %v", err) + } + if len(strongFindings) != 1 { + t.Fatalf("expected one finding under strong signals, got %d (%+v)", len(strongFindings), strongFindings) + } + if strongFindings[0].FindingType != "agent_custom_scaffold" { + t.Fatalf("unexpected finding type %q", strongFindings[0].FindingType) + } + if evidenceValue(strongFindings[0], "signal_count") == "" { + t.Fatalf("expected signal_count evidence in finding %+v", strongFindings[0]) + } +} + +func TestCustomAgentDetector_LowFalsePositiveFixtures(t *testing.T) { + t.Parallel() + + root := t.TempDir() + writeFile(t, root, "README.md", "custom agent architecture discussion only\n") + writeFile(t, root, 
".github/workflows/ci.yml", "jobs:\n test:\n steps:\n - run: go test ./...\n") + writeFile(t, root, ".agents/skills/docs/SKILL.md", "docs skill\n") + + findings, err := New().Detect(context.Background(), detect.Scope{Org: "acme", Repo: "low-fp", Root: root}, detect.Options{}) + if err != nil { + t.Fatalf("detect: %v", err) + } + if len(findings) != 0 { + t.Fatalf("expected no custom-agent findings in low-FP fixture, got %+v", findings) + } +} + +func TestCustomAgentDetector_DeterministicEvidenceKeyOrder(t *testing.T) { + t.Parallel() + + root := t.TempDir() + writeFile(t, root, ".wrkr/agents/custom-agent.toml", `[[agents]] +name = "ops_agent" +file = "agents/ops.py" +tools = ["deploy.write", "proc.exec"] +auth_surfaces = ["token"] +`) + writeFile(t, root, "AGENTS.md", "ops agent\n") + writeFile(t, root, ".agents/skills/ops/SKILL.md", "ops playbook\n") + writeFile(t, root, ".github/workflows/ops.yml", "jobs:\n ops:\n steps:\n - run: claude -p \"deploy\"\n") + + scope := detect.Scope{Org: "acme", Repo: "ops", Root: root} + first, err := New().Detect(context.Background(), scope, detect.Options{}) + if err != nil { + t.Fatalf("detect: %v", err) + } + if len(first) != 1 { + t.Fatalf("expected one finding, got %d", len(first)) + } + + for i := 0; i < 12; i++ { + next, err := New().Detect(context.Background(), scope, detect.Options{}) + if err != nil { + t.Fatalf("detect run %d: %v", i+1, err) + } + if !reflect.DeepEqual(first, next) { + t.Fatalf("non-deterministic detector output at run %d", i+1) + } + } + + if signalSet := evidenceValue(first[0], "signal_set"); signalSet == "" { + t.Fatalf("expected signal_set evidence") + } +} + +func TestCustomAgentDetector_ParseErrorForMalformedDeclaration(t *testing.T) { + t.Parallel() + + root := t.TempDir() + writeFile(t, root, ".wrkr/agents/custom-agent.json", `{"agents":[`) + + findings, err := New().Detect(context.Background(), detect.Scope{Org: "acme", Repo: "broken", Root: root}, detect.Options{}) + if err != nil { + 
t.Fatalf("detect: %v", err) + } + if len(findings) != 1 { + t.Fatalf("expected one parse error finding, got %d", len(findings)) + } + if findings[0].FindingType != "parse_error" { + t.Fatalf("expected parse_error finding type, got %q", findings[0].FindingType) + } + if findings[0].ParseError == nil || findings[0].ParseError.Path != ".wrkr/agents/custom-agent.json" { + t.Fatalf("unexpected parse error payload %+v", findings[0].ParseError) + } +} + +func evidenceValue(finding model.Finding, key string) string { + for _, evidence := range finding.Evidence { + if evidence.Key == key { + return evidence.Value + } + } + return "" +} + +func writeFile(t *testing.T, root, rel, content string) { + t.Helper() + path := filepath.Join(root, filepath.FromSlash(rel)) + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatalf("mkdir %s: %v", rel, err) + } + if err := os.WriteFile(path, []byte(content), 0o600); err != nil { + t.Fatalf("write %s: %v", rel, err) + } +} diff --git a/core/detect/agentframework/detector.go b/core/detect/agentframework/detector.go index d8661ac..b9cb21c 100644 --- a/core/detect/agentframework/detector.go +++ b/core/detect/agentframework/detector.go @@ -11,25 +11,25 @@ import ( ) type AgentSpec struct { - Name string `json:"name" yaml:"name"` - File string `json:"file" yaml:"file"` - StartLine int `json:"start_line" yaml:"start_line"` - EndLine int `json:"end_line" yaml:"end_line"` - Tools []string `json:"tools" yaml:"tools"` - DataSources []string `json:"data_sources" yaml:"data_sources"` - AuthSurfaces []string `json:"auth_surfaces" yaml:"auth_surfaces"` - Deployment []string `json:"deployment_artifacts" yaml:"deployment_artifacts"` - DataClass string `json:"data_class" yaml:"data_class"` - ApprovalStatus string `json:"approval_status" yaml:"approval_status"` - DynamicDiscovery bool `json:"dynamic_discovery" yaml:"dynamic_discovery"` - KillSwitch bool `json:"kill_switch" yaml:"kill_switch"` - AutoDeploy bool `json:"auto_deploy" 
yaml:"auto_deploy"` - HumanGate bool `json:"human_gate" yaml:"human_gate"` - DeploymentGate string `json:"deployment_gate" yaml:"deployment_gate"` + Name string `json:"name" yaml:"name" toml:"name"` + File string `json:"file" yaml:"file" toml:"file"` + StartLine int `json:"start_line" yaml:"start_line" toml:"start_line"` + EndLine int `json:"end_line" yaml:"end_line" toml:"end_line"` + Tools []string `json:"tools" yaml:"tools" toml:"tools"` + DataSources []string `json:"data_sources" yaml:"data_sources" toml:"data_sources"` + AuthSurfaces []string `json:"auth_surfaces" yaml:"auth_surfaces" toml:"auth_surfaces"` + Deployment []string `json:"deployment_artifacts" yaml:"deployment_artifacts" toml:"deployment_artifacts"` + DataClass string `json:"data_class" yaml:"data_class" toml:"data_class"` + ApprovalStatus string `json:"approval_status" yaml:"approval_status" toml:"approval_status"` + DynamicDiscovery bool `json:"dynamic_discovery" yaml:"dynamic_discovery" toml:"dynamic_discovery"` + KillSwitch bool `json:"kill_switch" yaml:"kill_switch" toml:"kill_switch"` + AutoDeploy bool `json:"auto_deploy" yaml:"auto_deploy" toml:"auto_deploy"` + HumanGate bool `json:"human_gate" yaml:"human_gate" toml:"human_gate"` + DeploymentGate string `json:"deployment_gate" yaml:"deployment_gate" toml:"deployment_gate"` } type declaration struct { - Agents []AgentSpec `json:"agents" yaml:"agents"` + Agents []AgentSpec `json:"agents" yaml:"agents" toml:"agents"` } type DetectorConfig struct { @@ -40,16 +40,39 @@ type DetectorConfig struct { } func Detect(_ context.Context, scope detect.Scope, cfg DetectorConfig) ([]model.Finding, error) { + return DetectMany(scope, []DetectorConfig{cfg}) +} + +func DetectMany(scope detect.Scope, configs []DetectorConfig) ([]model.Finding, error) { if err := detect.ValidateScopeRoot(scope.Root); err != nil { return nil, err } - if !detect.FileExists(scope.Root, cfg.ConfigPath) { + + normalized := normalizeConfigs(configs) + if len(normalized) == 0 { + 
return nil, nil + } + + findings := make([]model.Finding, 0) + for _, cfg := range normalized { + if !detect.FileExists(scope.Root, cfg.ConfigPath) { + continue + } + fileFindings := detectOne(scope, cfg) + findings = append(findings, fileFindings...) + } + + if len(findings) == 0 { return nil, nil } + model.SortFindings(findings) + return findings, nil +} +func detectOne(scope detect.Scope, cfg DetectorConfig) []model.Finding { parsed, parseErr := parse(scope, cfg) if parseErr != nil { - return []model.Finding{parseErrorFinding(scope, cfg, *parseErr)}, nil + return []model.Finding{parseErrorFinding(scope, cfg, *parseErr)} } if len(parsed.Agents) == 0 { return []model.Finding{parseErrorFinding(scope, cfg, model.ParseError{ @@ -58,7 +81,7 @@ func Detect(_ context.Context, scope detect.Scope, cfg DetectorConfig) ([]model. Path: cfg.ConfigPath, Detector: cfg.DetectorID, Message: "expected at least one agents entry", - })}, nil + })} } findings := make([]model.Finding, 0, len(parsed.Agents)) @@ -70,12 +93,11 @@ func Detect(_ context.Context, scope detect.Scope, cfg DetectorConfig) ([]model. 
Path: cfg.ConfigPath, Detector: cfg.DetectorID, Message: "each agent requires non-empty name and file", - })}, nil + })} } findings = append(findings, frameworkFinding(scope, cfg, agent)) } - model.SortFindings(findings) - return findings, nil + return findings } func parse(scope detect.Scope, cfg DetectorConfig) (declaration, *model.ParseError) { @@ -89,12 +111,59 @@ func parse(scope detect.Scope, cfg DetectorConfig) (declaration, *model.ParseErr if parseErr := detect.ParseYAMLFile(cfg.DetectorID, scope.Root, cfg.ConfigPath, &parsed); parseErr != nil { return declaration{}, parseErr } + case "toml": + if parseErr := detect.ParseTOMLFile(cfg.DetectorID, scope.Root, cfg.ConfigPath, &parsed); parseErr != nil { + return declaration{}, parseErr + } default: return declaration{}, &model.ParseError{Kind: "parse_error", Format: cfg.Format, Path: cfg.ConfigPath, Detector: cfg.DetectorID, Message: "unsupported detector config format"} } return parsed, nil } +func normalizeConfigs(configs []DetectorConfig) []DetectorConfig { + if len(configs) == 0 { + return nil + } + unique := map[string]DetectorConfig{} + for _, cfg := range configs { + detectorID := strings.TrimSpace(cfg.DetectorID) + framework := strings.TrimSpace(cfg.Framework) + configPath := strings.TrimSpace(cfg.ConfigPath) + format := strings.ToLower(strings.TrimSpace(cfg.Format)) + if detectorID == "" || framework == "" || configPath == "" || format == "" { + continue + } + key := fmt.Sprintf("%s|%s|%s", configPath, format, detectorID) + unique[key] = DetectorConfig{ + DetectorID: detectorID, + Framework: framework, + ConfigPath: configPath, + Format: format, + } + } + if len(unique) == 0 { + return nil + } + out := make([]DetectorConfig, 0, len(unique)) + for _, cfg := range unique { + out = append(out, cfg) + } + sort.Slice(out, func(i, j int) bool { + if out[i].ConfigPath != out[j].ConfigPath { + return out[i].ConfigPath < out[j].ConfigPath + } + if out[i].Format != out[j].Format { + return out[i].Format < 
out[j].Format + } + if out[i].DetectorID != out[j].DetectorID { + return out[i].DetectorID < out[j].DetectorID + } + return out[i].Framework < out[j].Framework + }) + return out +} + func frameworkFinding(scope detect.Scope, cfg DetectorConfig, agent AgentSpec) model.Finding { permissions := derivePermissions(agent) tools := uniqueSorted(agent.Tools) diff --git a/core/detect/agentframework/detector_test.go b/core/detect/agentframework/detector_test.go index 2644d46..9085480 100644 --- a/core/detect/agentframework/detector_test.go +++ b/core/detect/agentframework/detector_test.go @@ -4,6 +4,7 @@ import ( "context" "os" "path/filepath" + "reflect" "strings" "testing" @@ -70,6 +71,84 @@ func TestDetect_UsesExplicitDeploymentGate(t *testing.T) { } } +func TestDetectMany_DeterministicAcrossFormats(t *testing.T) { + t.Parallel() + + root := t.TempDir() + writeFile(t, root, ".wrkr/agents/autogen.yaml", `agents: + - name: alpha + file: agents/alpha.py +`) + writeFile(t, root, ".wrkr/agents/autogen.toml", `[[agents]] +name = "beta" +file = "agents/beta.py" +`) + + scope := detect.Scope{Org: "acme", Repo: "payments", Root: root} + configs := []DetectorConfig{ + {DetectorID: "agentframework_autogen", Framework: "autogen", ConfigPath: ".wrkr/agents/autogen.toml", Format: "toml"}, + {DetectorID: "agentframework_autogen", Framework: "autogen", ConfigPath: ".wrkr/agents/autogen.yaml", Format: "yaml"}, + } + + first, err := DetectMany(scope, configs) + if err != nil { + t.Fatalf("detect many: %v", err) + } + if len(first) != 2 { + t.Fatalf("expected two findings, got %d", len(first)) + } + for _, finding := range first { + if finding.ToolType != "autogen" { + t.Fatalf("expected autogen tool type, got %q", finding.ToolType) + } + } + for i := 0; i < 12; i++ { + next, err := DetectMany(scope, configs) + if err != nil { + t.Fatalf("detect many run %d: %v", i+1, err) + } + if !reflect.DeepEqual(first, next) { + t.Fatalf("non-deterministic output at run %d", i+1) + } + } +} + +func 
TestDetectMany_ParseErrorDoesNotAbortOtherConfig(t *testing.T) { + t.Parallel() + + root := t.TempDir() + writeFile(t, root, ".wrkr/agents/autogen.json", `{"agents":[`) + writeFile(t, root, ".wrkr/agents/autogen.yaml", `agents: + - name: rescue + file: agents/rescue.py +`) + + findings, err := DetectMany(detect.Scope{Org: "acme", Repo: "payments", Root: root}, []DetectorConfig{ + {DetectorID: "agentframework_autogen", Framework: "autogen", ConfigPath: ".wrkr/agents/autogen.json", Format: "json"}, + {DetectorID: "agentframework_autogen", Framework: "autogen", ConfigPath: ".wrkr/agents/autogen.yaml", Format: "yaml"}, + }) + if err != nil { + t.Fatalf("detect many: %v", err) + } + if len(findings) != 2 { + t.Fatalf("expected one parse error and one finding, got %d", len(findings)) + } + + seenParseErr := false + seenFramework := false + for _, finding := range findings { + switch finding.FindingType { + case "parse_error": + seenParseErr = true + case "agent_framework": + seenFramework = true + } + } + if !seenParseErr || !seenFramework { + t.Fatalf("expected parse_error and agent_framework findings, got %+v", findings) + } +} + func evidenceValue(finding model.Finding, key string) string { target := strings.ToLower(strings.TrimSpace(key)) for _, evidence := range finding.Evidence { diff --git a/core/detect/agentllamaindex/detector.go b/core/detect/agentllamaindex/detector.go index f4e1ee6..7f620a4 100644 --- a/core/detect/agentllamaindex/detector.go +++ b/core/detect/agentllamaindex/detector.go @@ -17,10 +17,31 @@ func New() Detector { return Detector{} } func (Detector) ID() string { return detectorID } func (Detector) Detect(ctx context.Context, scope detect.Scope, _ detect.Options) ([]model.Finding, error) { - return agentframework.Detect(ctx, scope, agentframework.DetectorConfig{ - DetectorID: detectorID, - Framework: "llamaindex", - ConfigPath: ".wrkr/agents/llamaindex.yaml", - Format: "yaml", + _ = ctx + return agentframework.DetectMany(scope, 
[]agentframework.DetectorConfig{ + { + DetectorID: detectorID, + Framework: "llamaindex", + ConfigPath: ".wrkr/agents/llamaindex.yaml", + Format: "yaml", + }, + { + DetectorID: detectorID, + Framework: "llamaindex", + ConfigPath: ".wrkr/agents/llamaindex.yml", + Format: "yaml", + }, + { + DetectorID: detectorID, + Framework: "llamaindex", + ConfigPath: ".wrkr/agents/llamaindex.json", + Format: "json", + }, + { + DetectorID: detectorID, + Framework: "llamaindex", + ConfigPath: ".wrkr/agents/llamaindex.toml", + Format: "toml", + }, }) } diff --git a/core/detect/agentllamaindex/detector_test.go b/core/detect/agentllamaindex/detector_test.go index 5868168..886e36e 100644 --- a/core/detect/agentllamaindex/detector_test.go +++ b/core/detect/agentllamaindex/detector_test.go @@ -4,6 +4,7 @@ import ( "context" "os" "path/filepath" + "reflect" "testing" "github.com/Clyra-AI/wrkr/core/detect" @@ -31,6 +32,43 @@ func TestLlamaIndexDetector_PrecisionBaseline(t *testing.T) { } } +func TestLlamaIndexDetector_ExpandedFormatsDeterministic(t *testing.T) { + t.Parallel() + + root := t.TempDir() + writeFile(t, root, ".wrkr/agents/llamaindex.json", `{"agents":[{"name":"json_agent","file":"agents/json.py"}]}`) + writeFile(t, root, ".wrkr/agents/llamaindex.toml", `[[agents]] +name = "toml_agent" +file = "agents/toml.py" +`) + + scope := detect.Scope{Org: "acme", Repo: "search", Root: root} + first, err := New().Detect(context.Background(), scope, detect.Options{}) + if err != nil { + t.Fatalf("detect: %v", err) + } + if len(first) != 2 { + t.Fatalf("expected two findings from json+toml declarations, got %d", len(first)) + } + for _, finding := range first { + if finding.ToolType != "llamaindex" { + t.Fatalf("unexpected tool type %q", finding.ToolType) + } + if finding.FindingType != "agent_framework" { + t.Fatalf("unexpected finding type %q", finding.FindingType) + } + } + for i := 0; i < 10; i++ { + next, err := New().Detect(context.Background(), scope, detect.Options{}) + if err != nil 
{ + t.Fatalf("detect run %d: %v", i+1, err) + } + if !reflect.DeepEqual(first, next) { + t.Fatalf("non-deterministic output at run %d", i+1) + } + } +} + func writeFile(t *testing.T, root, rel, content string) { t.Helper() path := filepath.Join(root, filepath.FromSlash(rel)) diff --git a/core/detect/agentmcpclient/detector.go b/core/detect/agentmcpclient/detector.go new file mode 100644 index 0000000..008143d --- /dev/null +++ b/core/detect/agentmcpclient/detector.go @@ -0,0 +1,46 @@ +package agentmcpclient + +import ( + "context" + + "github.com/Clyra-AI/wrkr/core/detect" + "github.com/Clyra-AI/wrkr/core/detect/agentframework" + "github.com/Clyra-AI/wrkr/core/model" +) + +const detectorID = "agentmcpclient" + +type Detector struct{} + +func New() Detector { return Detector{} } + +func (Detector) ID() string { return detectorID } + +func (Detector) Detect(_ context.Context, scope detect.Scope, _ detect.Options) ([]model.Finding, error) { + return agentframework.DetectMany(scope, []agentframework.DetectorConfig{ + { + DetectorID: detectorID, + Framework: "mcp_client", + ConfigPath: ".wrkr/agents/mcp-client.yaml", + Format: "yaml", + }, + { + DetectorID: detectorID, + Framework: "mcp_client", + ConfigPath: ".wrkr/agents/mcp-client.yml", + Format: "yaml", + }, + { + DetectorID: detectorID, + Framework: "mcp_client", + ConfigPath: ".wrkr/agents/mcp-client.json", + Format: "json", + }, + { + DetectorID: detectorID, + Framework: "mcp_client", + ConfigPath: ".wrkr/agents/mcp-client.toml", + Format: "toml", + }, + }) +} diff --git a/core/detect/agentmcpclient/detector_test.go b/core/detect/agentmcpclient/detector_test.go new file mode 100644 index 0000000..d460950 --- /dev/null +++ b/core/detect/agentmcpclient/detector_test.go @@ -0,0 +1,75 @@ +package agentmcpclient + +import ( + "context" + "os" + "path/filepath" + "testing" + + "github.com/Clyra-AI/wrkr/core/detect" +) + +func TestMCPClientDetector_FixtureCoverage(t *testing.T) { + t.Parallel() + + root := t.TempDir() + 
writeFile(t, root, ".wrkr/agents/mcp-client.yaml", `agents: + - name: mcp_orchestrator + file: agents/orchestrator.py + tools: [mcp.server.search, mcp.server.docs] + auth_surfaces: [token] +`) + + findings, err := New().Detect(context.Background(), detect.Scope{Org: "acme", Repo: "platform", Root: root}, detect.Options{}) + if err != nil { + t.Fatalf("detect: %v", err) + } + if len(findings) != 1 { + t.Fatalf("expected one finding, got %d", len(findings)) + } + finding := findings[0] + if finding.FindingType != "agent_framework" { + t.Fatalf("expected agent_framework finding type, got %q", finding.FindingType) + } + if finding.Detector != detectorID { + t.Fatalf("expected detector %q, got %q", detectorID, finding.Detector) + } + if finding.ToolType != "mcp_client" { + t.Fatalf("expected tool_type=mcp_client, got %q", finding.ToolType) + } +} + +func TestMCPClientDetector_ParseErrorsAreDeterministic(t *testing.T) { + t.Parallel() + + root := t.TempDir() + writeFile(t, root, ".wrkr/agents/mcp-client.json", `{"agents":[`) + + findings, err := New().Detect(context.Background(), detect.Scope{Org: "acme", Repo: "broken", Root: root}, detect.Options{}) + if err != nil { + t.Fatalf("detect: %v", err) + } + if len(findings) != 1 { + t.Fatalf("expected one finding, got %d", len(findings)) + } + if findings[0].FindingType != "parse_error" { + t.Fatalf("expected parse_error finding, got %q", findings[0].FindingType) + } + if findings[0].ParseError == nil { + t.Fatalf("expected parse error payload") + } + if findings[0].ParseError.Path != ".wrkr/agents/mcp-client.json" { + t.Fatalf("unexpected parse error path %q", findings[0].ParseError.Path) + } +} + +func writeFile(t *testing.T, root, rel, content string) { + t.Helper() + path := filepath.Join(root, filepath.FromSlash(rel)) + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatalf("mkdir %s: %v", rel, err) + } + if err := os.WriteFile(path, []byte(content), 0o600); err != nil { + t.Fatalf("write %s: %v", 
rel, err) + } +} diff --git a/core/detect/defaults/defaults.go b/core/detect/defaults/defaults.go index f733bf8..08e7d7f 100644 --- a/core/detect/defaults/defaults.go +++ b/core/detect/defaults/defaults.go @@ -5,8 +5,10 @@ import ( "github.com/Clyra-AI/wrkr/core/detect/a2a" "github.com/Clyra-AI/wrkr/core/detect/agentautogen" "github.com/Clyra-AI/wrkr/core/detect/agentcrewai" + "github.com/Clyra-AI/wrkr/core/detect/agentcustom" "github.com/Clyra-AI/wrkr/core/detect/agentlangchain" "github.com/Clyra-AI/wrkr/core/detect/agentllamaindex" + "github.com/Clyra-AI/wrkr/core/detect/agentmcpclient" "github.com/Clyra-AI/wrkr/core/detect/agentopenai" "github.com/Clyra-AI/wrkr/core/detect/ciagent" "github.com/Clyra-AI/wrkr/core/detect/claude" @@ -34,6 +36,8 @@ func Registry() (*detect.Registry, error) { agentopenai.New(), agentautogen.New(), agentllamaindex.New(), + agentmcpclient.New(), + agentcustom.New(), claude.New(), cursor.New(), codex.New(), diff --git a/core/detect/defaults/defaults_test.go b/core/detect/defaults/defaults_test.go index 42950ff..c26332e 100644 --- a/core/detect/defaults/defaults_test.go +++ b/core/detect/defaults/defaults_test.go @@ -77,6 +77,10 @@ func TestRegistryIncludesAgentFrameworkDetectors(t *testing.T) { writeFixtureFile(t, root, ".wrkr/agents/openai-agents.json", `{"agents":[{"name":"oa_agent","file":"agents/openai.py"}]}`) writeFixtureFile(t, root, ".wrkr/agents/autogen.json", `{"agents":[{"name":"ag_agent","file":"agents/autogen.py"}]}`) writeFixtureFile(t, root, ".wrkr/agents/llamaindex.yaml", "agents:\n - name: li_agent\n file: agents/llamaindex.py\n") + writeFixtureFile(t, root, ".wrkr/agents/mcp-client.yaml", "agents:\n - name: mcpc_agent\n file: agents/mcp_client.py\n") + writeFixtureFile(t, root, ".wrkr/agents/custom-agent.yaml", "agents:\n - name: custom_agent\n file: agents/custom.py\n tools: [deploy.write]\n") + writeFixtureFile(t, root, "AGENTS.md", "# Agent instructions\n") + writeFixtureFile(t, root, 
".agents/skills/release/SKILL.md", "release playbook\n") registry, err := Registry() if err != nil { @@ -94,7 +98,7 @@ func TestRegistryIncludesAgentFrameworkDetectors(t *testing.T) { for _, finding := range result.Findings { seen[finding.Detector] = true } - for _, detectorID := range []string{"agentlangchain", "agentcrewai", "agentopenai", "agentautogen", "agentllamaindex"} { + for _, detectorID := range []string{"agentlangchain", "agentcrewai", "agentopenai", "agentautogen", "agentllamaindex", "agentmcpclient", "agentcustom"} { if !seen[detectorID] { t.Fatalf("expected detector %s finding in registry run, got %+v", detectorID, result.Findings) } diff --git a/internal/scenarios/contracts_test.go b/internal/scenarios/contracts_test.go index 41797f3..7d4ffba 100644 --- a/internal/scenarios/contracts_test.go +++ b/internal/scenarios/contracts_test.go @@ -26,6 +26,8 @@ func TestScenarioContracts(t *testing.T) { "scenarios/wrkr/extension-detectors/repos", "scenarios/wrkr/attack-path-correlation/repos", "scenarios/wrkr/mcp-enrich-supplychain/repos", + "scenarios/wrkr/agent-relationship-correlation/repos", + "scenarios/wrkr/agent-policy-outcomes/repos", "scenarios/cross-product/proof-record-interop/records-from-all-3.jsonl", "scenarios/cross-product/proof-record-interop/expected.yaml", "internal/scenarios/coverage_map.json", @@ -101,7 +103,7 @@ func TestScenarioContracts(t *testing.T) { t.Fatalf("parse coverage map: %v", err) } - requiredMappings := []string{"FR11", "FR12", "FR13", "AC10", "AC11", "AC15", "AC18", "AC19", "AC20", "AC21", "AC22", "AC23", "AC24", "AC25"} + requiredMappings := []string{"FR11", "FR12", "FR13", "FR14", "FR15", "AC10", "AC11", "AC15", "AC18", "AC19", "AC20", "AC21", "AC22", "AC23", "AC24", "AC25", "AC26", "AC27"} testSymbols := scenarioTestSymbols(t, repoRoot) for _, key := range requiredMappings { mapped, ok := coverage[key] diff --git a/internal/scenarios/coverage_map.json b/internal/scenarios/coverage_map.json index 1c0084f..6192f16 100644 
--- a/internal/scenarios/coverage_map.json +++ b/internal/scenarios/coverage_map.json @@ -43,5 +43,17 @@ ], "AC25": [ "TestScenarioEpic10WebMCPStaticDetectionAC25" + ], + "FR14": [ + "TestScenario_AgentRelationshipCorrelation" + ], + "FR15": [ + "TestScenario_AgentPolicyOutcomes" + ], + "AC26": [ + "TestScenario_AgentRelationshipCorrelation" + ], + "AC27": [ + "TestScenario_AgentPolicyOutcomes" ] } diff --git a/internal/scenarios/epic12_scenario_test.go b/internal/scenarios/epic12_scenario_test.go new file mode 100644 index 0000000..7951ca2 --- /dev/null +++ b/internal/scenarios/epic12_scenario_test.go @@ -0,0 +1,125 @@ +//go:build scenario + +package scenarios + +import ( + "path/filepath" + "slices" + "testing" +) + +func TestScenario_AgentRelationshipCorrelation(t *testing.T) { + t.Parallel() + + repoRoot := mustFindRepoRoot(t) + scanPath := filepath.Join(repoRoot, "scenarios", "wrkr", "agent-relationship-correlation", "repos") + payload := runScenarioCommandJSON(t, []string{"scan", "--path", scanPath, "--state", filepath.Join(t.TempDir(), "state.json"), "--json"}) + + findings, ok := payload["findings"].([]any) + if !ok || len(findings) == 0 { + t.Fatalf("expected findings array, got %T", payload["findings"]) + } + + hasMCPClient := false + for _, item := range findings { + finding, ok := item.(map[string]any) + if !ok { + continue + } + if finding["finding_type"] == "agent_framework" && finding["detector"] == "agentmcpclient" { + hasMCPClient = true + break + } + } + if !hasMCPClient { + t.Fatalf("expected agentmcpclient agent_framework finding, got %v", findings) + } + + inventory, ok := payload["inventory"].(map[string]any) + if !ok { + t.Fatalf("expected inventory payload, got %T", payload["inventory"]) + } + agents, ok := inventory["agents"].([]any) + if !ok || len(agents) == 0 { + t.Fatalf("expected inventory.agents entries, got %v", inventory["agents"]) + } + firstAgent, ok := agents[0].(map[string]any) + if !ok { + t.Fatalf("unexpected inventory agent 
shape: %T", agents[0]) + } + if firstAgent["deployment_status"] != "deployed" { + t.Fatalf("expected deployment_status=deployed, got %v", firstAgent["deployment_status"]) + } + boundTools := toStringSlice(firstAgent["bound_tools"]) + for _, required := range []string{"mcp.server.deploy", "shell.exec"} { + if !slices.Contains(boundTools, required) { + t.Fatalf("expected bound tool %q in %v", required, boundTools) + } + } +} + +func TestScenario_AgentPolicyOutcomes(t *testing.T) { + t.Parallel() + + repoRoot := mustFindRepoRoot(t) + scanPath := filepath.Join(repoRoot, "scenarios", "wrkr", "agent-policy-outcomes", "repos") + payload := runScenarioCommandJSON(t, []string{"scan", "--path", scanPath, "--state", filepath.Join(t.TempDir(), "state.json"), "--json"}) + + findings, ok := payload["findings"].([]any) + if !ok || len(findings) == 0 { + t.Fatalf("expected findings array, got %T", payload["findings"]) + } + + hasCustomFinding := false + violations := map[string]bool{} + for _, item := range findings { + finding, ok := item.(map[string]any) + if !ok { + continue + } + if finding["finding_type"] == "agent_custom_scaffold" { + hasCustomFinding = true + } + if finding["finding_type"] == "policy_violation" { + ruleID, _ := finding["rule_id"].(string) + if ruleID != "" { + violations[ruleID] = true + } + } + } + if !hasCustomFinding { + t.Fatalf("expected agent_custom_scaffold finding in scenario output, got %v", findings) + } + requiredRuleAliases := [][]string{ + {"WRKR-A001", "WRKR-001"}, + {"WRKR-A010", "WRKR-010"}, + } + for _, aliases := range requiredRuleAliases { + found := false + for _, alias := range aliases { + if violations[alias] { + found = true + break + } + } + if !found { + t.Fatalf("expected policy_violation for one of %v, got %v", aliases, violations) + } + } +} + +func toStringSlice(value any) []string { + items, ok := value.([]any) + if !ok { + return nil + } + out := make([]string, 0, len(items)) + for _, item := range items { + text, ok := 
item.(string) + if !ok { + continue + } + out = append(out, text) + } + return out +} diff --git a/scenarios/wrkr/agent-policy-outcomes/README.md b/scenarios/wrkr/agent-policy-outcomes/README.md new file mode 100644 index 0000000..91858ce --- /dev/null +++ b/scenarios/wrkr/agent-policy-outcomes/README.md @@ -0,0 +1,3 @@ +# Agent Policy Outcomes Scenario + +Validates deterministic agent policy outcomes for high-risk custom-agent scaffolding. diff --git a/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.agents/skills/deploy/SKILL.md b/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.agents/skills/deploy/SKILL.md new file mode 100644 index 0000000..c8a72be --- /dev/null +++ b/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.agents/skills/deploy/SKILL.md @@ -0,0 +1 @@ +name: deploy diff --git a/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.github/workflows/release.yml b/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.github/workflows/release.yml new file mode 100644 index 0000000..72f589a --- /dev/null +++ b/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.github/workflows/release.yml @@ -0,0 +1,6 @@ +name: release +jobs: + release: + runs-on: ubuntu-latest + steps: + - run: codex --full-auto --approval never diff --git a/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.wrkr/agents/custom-agent.yaml b/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.wrkr/agents/custom-agent.yaml new file mode 100644 index 0000000..c5f5d63 --- /dev/null +++ b/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.wrkr/agents/custom-agent.yaml @@ -0,0 +1,8 @@ +agents: + - name: policy_sensitive_agent + file: agents/policy_sensitive.py + tools: [deploy.write, proc.exec] + auth_surfaces: [token] + deployment_artifacts: [.github/workflows/release.yml] + auto_deploy: true + human_gate: false diff --git 
a/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.wrkr/agents/mcp-client.yaml b/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.wrkr/agents/mcp-client.yaml new file mode 100644 index 0000000..36edd73 --- /dev/null +++ b/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/.wrkr/agents/mcp-client.yaml @@ -0,0 +1,8 @@ +agents: + - name: policy_framework_agent + file: agents/policy_framework.py + tools: [deploy.write] + auth_surfaces: [token] + deployment_artifacts: [.github/workflows/release.yml] + auto_deploy: true + human_gate: false diff --git a/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/AGENTS.md b/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/AGENTS.md new file mode 100644 index 0000000..fcd4b05 --- /dev/null +++ b/scenarios/wrkr/agent-policy-outcomes/repos/policy-agent-app/AGENTS.md @@ -0,0 +1,2 @@ +# Agent Instructions +Must run unattended release tasks. diff --git a/scenarios/wrkr/agent-relationship-correlation/README.md b/scenarios/wrkr/agent-relationship-correlation/README.md new file mode 100644 index 0000000..8bfef48 --- /dev/null +++ b/scenarios/wrkr/agent-relationship-correlation/README.md @@ -0,0 +1,3 @@ +# Agent Relationship Correlation Scenario + +Validates deterministic agent relationship and deployment correlation outputs. 
diff --git a/scenarios/wrkr/agent-relationship-correlation/repos/correlated-agent-app/.github/workflows/release.yml b/scenarios/wrkr/agent-relationship-correlation/repos/correlated-agent-app/.github/workflows/release.yml new file mode 100644 index 0000000..5dd5815 --- /dev/null +++ b/scenarios/wrkr/agent-relationship-correlation/repos/correlated-agent-app/.github/workflows/release.yml @@ -0,0 +1,6 @@ +name: release +jobs: + release: + runs-on: ubuntu-latest + steps: + - run: echo release diff --git a/scenarios/wrkr/agent-relationship-correlation/repos/correlated-agent-app/.wrkr/agents/mcp-client.yaml b/scenarios/wrkr/agent-relationship-correlation/repos/correlated-agent-app/.wrkr/agents/mcp-client.yaml new file mode 100644 index 0000000..83d2fc3 --- /dev/null +++ b/scenarios/wrkr/agent-relationship-correlation/repos/correlated-agent-app/.wrkr/agents/mcp-client.yaml @@ -0,0 +1,11 @@ +agents: + - name: release_coordinator + file: agents/release.py + start_line: 12 + end_line: 42 + tools: [mcp.server.deploy, shell.exec] + data_sources: [postgres.audit] + auth_surfaces: [token] + deployment_artifacts: [.github/workflows/release.yml] + auto_deploy: true + human_gate: false diff --git a/schemas/v1/benchmarks/agent-benchmark.schema.json b/schemas/v1/benchmarks/agent-benchmark.schema.json new file mode 100644 index 0000000..3762790 --- /dev/null +++ b/schemas/v1/benchmarks/agent-benchmark.schema.json @@ -0,0 +1,65 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://schemas.clyra.ai/wrkr/v1/agent-benchmark.schema.json", + "title": "Wrkr Agent Benchmark Report", + "type": "object", + "required": ["version", "thresholds", "metrics", "cases", "status", "violations"], + "properties": { + "version": {"type": "string", "const": "v1"}, + "thresholds": { + "type": "object", + "required": ["minimum_precision", "minimum_recall", "baseline_recall", "max_recall_regression", "recall_regression_floor"], + "properties": { + "minimum_precision": {"type": 
"number", "minimum": 0, "maximum": 1}, + "minimum_recall": {"type": "number", "minimum": 0, "maximum": 1}, + "baseline_recall": {"type": "number", "minimum": 0, "maximum": 1}, + "max_recall_regression": {"type": "number", "minimum": 0, "maximum": 1}, + "recall_regression_floor": {"type": "number", "minimum": 0, "maximum": 1} + }, + "additionalProperties": false + }, + "metrics": { + "type": "object", + "required": ["tp", "fp", "tn", "fn", "precision", "recall"], + "properties": { + "tp": {"type": "integer", "minimum": 0}, + "fp": {"type": "integer", "minimum": 0}, + "tn": {"type": "integer", "minimum": 0}, + "fn": {"type": "integer", "minimum": 0}, + "precision": {"type": "number", "minimum": 0, "maximum": 1}, + "recall": {"type": "number", "minimum": 0, "maximum": 1} + }, + "additionalProperties": false + }, + "cases": { + "type": "array", + "items": { + "type": "object", + "required": ["id", "kind", "path", "expected_detectors", "matched_detectors", "outcome"], + "properties": { + "id": {"type": "string", "minLength": 1}, + "kind": {"type": "string", "enum": ["positive", "negative"]}, + "path": {"type": "string", "minLength": 1}, + "expected_detectors": { + "type": "array", + "items": {"type": "string"}, + "uniqueItems": true + }, + "matched_detectors": { + "type": "array", + "items": {"type": "string"}, + "uniqueItems": true + }, + "outcome": {"type": "string", "enum": ["tp", "fp", "tn", "fn"]} + }, + "additionalProperties": false + } + }, + "status": {"type": "string", "enum": ["pass", "fail"]}, + "violations": { + "type": "array", + "items": {"type": "string"} + } + }, + "additionalProperties": false +} diff --git a/scripts/run_agent_benchmarks.sh b/scripts/run_agent_benchmarks.sh new file mode 100755 index 0000000..f8930c7 --- /dev/null +++ b/scripts/run_agent_benchmarks.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash +set -euo pipefail + +if ! 
"""Agent-detector benchmark harness embedded in scripts/run_agent_benchmarks.sh.

Invoked by the wrapper as ``python3 - REPO_ROOT OUTPUT_PATH PRINT_JSON``.
It builds the ``wrkr`` binary, scans every fixture listed in
``testinfra/benchmarks/agents/corpus.json``, scores each case as
tp/fp/tn/fn, compares precision and recall against
``testinfra/benchmarks/agents/thresholds.json`` and writes a JSON report.

Exit status: 0 when every threshold holds, 1 on a failed scan, an
unparseable scan payload, an unsupported case kind, or a threshold violation.
"""
import json
import pathlib
import subprocess
import sys
import tempfile


def fail(message):
    """Print ``message`` to stderr and terminate the harness with status 1."""
    print(message, file=sys.stderr)
    sys.exit(1)


def matched_target_detectors(findings, target_detectors, positive_types):
    """Return the sorted, de-duplicated target detectors with a positive finding.

    A finding counts only when its ``detector`` is in ``target_detectors`` and
    its ``finding_type`` is in ``positive_types``; both values are stripped of
    surrounding whitespace first. ``findings`` may be ``None`` (a JSON null),
    which is treated as "no findings".
    """
    hits = set()
    for finding in findings or []:
        detector = str(finding.get("detector", "")).strip()
        finding_type = str(finding.get("finding_type", "")).strip()
        if detector in target_detectors and finding_type in positive_types:
            hits.add(detector)
    return sorted(hits)


def classify_case(case_kind, expected_detectors, matched_detectors):
    """Score one corpus case as ``"tp"``, ``"fn"``, ``"fp"`` or ``"tn"``.

    A positive case is a true positive only when at least one target detector
    matched AND every expected detector is among the matches. A negative case
    is a false positive as soon as any target detector matched.

    Raises:
        ValueError: ``case_kind`` is neither ``"positive"`` nor ``"negative"``.
    """
    predicted_positive = len(matched_detectors) > 0
    if case_kind == "positive":
        expected_hit = all(detector in matched_detectors for detector in expected_detectors)
        return "tp" if predicted_positive and expected_hit else "fn"
    if case_kind == "negative":
        return "fp" if predicted_positive else "tn"
    raise ValueError(case_kind)


def compute_metrics(tp, fp, tn, fn):
    """Return ``(precision, recall, metrics)`` for the confusion counts.

    An empty denominator yields 1.0 so that a corpus without positives (or
    without predictions) does not trip the gates. ``metrics`` holds the counts
    plus precision/recall rounded to six decimals for the report; the
    unrounded values are returned separately for threshold comparison.
    """
    precision_denominator = tp + fp
    recall_denominator = tp + fn
    precision = 1.0 if precision_denominator == 0 else tp / precision_denominator
    recall = 1.0 if recall_denominator == 0 else tp / recall_denominator
    metrics = {
        "tp": tp,
        "fp": fp,
        "tn": tn,
        "fn": fn,
        "precision": round(precision, 6),
        "recall": round(recall, 6),
    }
    return precision, recall, metrics


def evaluate_thresholds(precision, recall, thresholds):
    """Compare the metrics with the configured gates.

    Returns ``(limits, violations)``: ``limits`` is the ``thresholds`` section
    of the report (including the derived ``recall_regression_floor``) and
    ``violations`` is the list of human-readable failures, empty on success.
    """
    minimum_precision = float(thresholds["minimum_precision"])
    minimum_recall = float(thresholds["minimum_recall"])
    baseline_recall = float(thresholds["baseline_recall"])
    max_recall_regression = float(thresholds["max_recall_regression"])
    recall_floor = baseline_recall - max_recall_regression

    violations = []
    if precision < minimum_precision:
        violations.append(
            f"precision {precision:.4f} below minimum threshold {minimum_precision:.4f}"
        )
    if recall < minimum_recall:
        violations.append(
            f"recall {recall:.4f} below minimum threshold {minimum_recall:.4f}"
        )
    if recall < recall_floor:
        violations.append(
            f"recall {recall:.4f} below regression floor {recall_floor:.4f} (baseline={baseline_recall:.4f}, max_regression={max_recall_regression:.4f})"
        )

    limits = {
        "minimum_precision": minimum_precision,
        "minimum_recall": minimum_recall,
        "baseline_recall": baseline_recall,
        "max_recall_regression": max_recall_regression,
        "recall_regression_floor": round(recall_floor, 6),
    }
    return limits, violations


def scan_case(wrkr_bin, repo_root, case_id, case_path):
    """Run ``wrkr scan`` on one fixture and return its decoded JSON payload.

    The scan state file lives in a temporary directory that is removed as soon
    as the scan finishes, so repeated runs do not accumulate directories.
    Exits with status 1 when the scan fails or prints something that is not a
    JSON object.
    """
    with tempfile.TemporaryDirectory(prefix=f"agent-bench-{case_id}-") as state_dir:
        state_path = pathlib.Path(state_dir) / "state.json"
        cmd = [str(wrkr_bin), "scan", "--path", str(case_path), "--state", str(state_path), "--json", "--quiet"]
        proc = subprocess.run(cmd, cwd=repo_root, capture_output=True, text=True)
    if proc.returncode != 0:
        fail(f"benchmark case {case_id} failed scan with exit {proc.returncode}: {proc.stderr.strip()}")

    try:
        payload = json.loads(proc.stdout)
    except json.JSONDecodeError as exc:
        fail(f"benchmark case {case_id} produced invalid scan JSON: {exc}")
    if not isinstance(payload, dict):
        fail(f"benchmark case {case_id} produced a non-object scan payload")
    return payload


def main(argv):
    """Run the whole benchmark; return the process exit status."""
    repo_root = pathlib.Path(argv[1])
    output_path = pathlib.Path(argv[2])
    print_json = argv[3] == "1"

    bench_dir = repo_root / "testinfra" / "benchmarks" / "agents"
    corpus = json.loads((bench_dir / "corpus.json").read_text(encoding="utf-8"))
    thresholds = json.loads((bench_dir / "thresholds.json").read_text(encoding="utf-8"))

    target_detectors = set(corpus["target_detectors"])
    positive_types = set(corpus["positive_finding_types"])
    # Sorting by id keeps the report order independent of corpus.json ordering.
    cases = sorted(corpus["cases"], key=lambda item: item["id"])

    wrkr_bin = repo_root / ".tmp" / "wrkr"
    wrkr_bin.parent.mkdir(parents=True, exist_ok=True)
    subprocess.run(["go", "build", "-o", str(wrkr_bin), "./cmd/wrkr"], cwd=repo_root, check=True)

    results = []
    counts = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}

    for case in cases:
        case_id = case["id"]
        case_kind = case["kind"]
        expected_detectors = sorted(case.get("expected_detectors", []))

        payload = scan_case(wrkr_bin, repo_root, case_id, repo_root / case["path"])
        matched_detectors = matched_target_detectors(
            payload.get("findings", []), target_detectors, positive_types
        )

        try:
            outcome = classify_case(case_kind, expected_detectors, matched_detectors)
        except ValueError:
            fail(f"benchmark case {case_id} has unsupported kind {case_kind}")
        counts[outcome] += 1

        results.append(
            {
                "id": case_id,
                "kind": case_kind,
                "path": case["path"],
                "expected_detectors": expected_detectors,
                "matched_detectors": matched_detectors,
                "outcome": outcome,
            }
        )

    precision, recall, metrics = compute_metrics(**counts)
    limits, violations = evaluate_thresholds(precision, recall, thresholds)

    output = {
        "version": "v1",
        "thresholds": limits,
        "metrics": metrics,
        "cases": results,
        "status": "pass" if not violations else "fail",
        "violations": violations,
    }

    output_path.write_text(json.dumps(output, indent=2, sort_keys=True) + "\n", encoding="utf-8")

    if print_json:
        print(json.dumps(output, indent=2, sort_keys=True))
    else:
        print(f"agent benchmark status={output['status']} precision={precision:.4f} recall={recall:.4f} -> {output_path}")

    if violations:
        for violation in violations:
            print(violation, file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
"path": "testinfra/benchmarks/agents/fixtures/pos-autogen-json/repos", + "expected_detectors": [ + "agentautogen" + ] + }, + { + "id": "pos-autogen-toml", + "kind": "positive", + "path": "testinfra/benchmarks/agents/fixtures/pos-autogen-toml/repos", + "expected_detectors": [ + "agentautogen" + ] + }, + { + "id": "pos-autogen-yaml", + "kind": "positive", + "path": "testinfra/benchmarks/agents/fixtures/pos-autogen-yaml/repos", + "expected_detectors": [ + "agentautogen" + ] + }, + { + "id": "pos-custom-strong", + "kind": "positive", + "path": "testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos", + "expected_detectors": [ + "agentcustom" + ] + }, + { + "id": "pos-llama-json", + "kind": "positive", + "path": "testinfra/benchmarks/agents/fixtures/pos-llama-json/repos", + "expected_detectors": [ + "agentllamaindex" + ] + }, + { + "id": "pos-llama-yaml", + "kind": "positive", + "path": "testinfra/benchmarks/agents/fixtures/pos-llama-yaml/repos", + "expected_detectors": [ + "agentllamaindex" + ] + }, + { + "id": "pos-mcpclient-toml", + "kind": "positive", + "path": "testinfra/benchmarks/agents/fixtures/pos-mcpclient-toml/repos", + "expected_detectors": [ + "agentmcpclient" + ] + }, + { + "id": "pos-mcpclient-yaml", + "kind": "positive", + "path": "testinfra/benchmarks/agents/fixtures/pos-mcpclient-yaml/repos", + "expected_detectors": [ + "agentmcpclient" + ] + } + ] +} diff --git a/testinfra/benchmarks/agents/fixtures/neg-custom-weak/repos/custom-weak/.wrkr/agents/custom-agent.yaml b/testinfra/benchmarks/agents/fixtures/neg-custom-weak/repos/custom-weak/.wrkr/agents/custom-agent.yaml new file mode 100644 index 0000000..cb13365 --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/neg-custom-weak/repos/custom-weak/.wrkr/agents/custom-agent.yaml @@ -0,0 +1,3 @@ +agents: + - name: weak_custom_agent + file: agents/triage.py diff --git a/testinfra/benchmarks/agents/fixtures/neg-custom-weak/repos/custom-weak/README.md 
b/testinfra/benchmarks/agents/fixtures/neg-custom-weak/repos/custom-weak/README.md new file mode 100644 index 0000000..0a1b758 --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/neg-custom-weak/repos/custom-weak/README.md @@ -0,0 +1 @@ +Negative benchmark fixture for weak custom-agent declaration without strong co-occurrence signals. diff --git a/testinfra/benchmarks/agents/fixtures/neg-docs-only/repos/docs-only/.agents/skills/docs/SKILL.md b/testinfra/benchmarks/agents/fixtures/neg-docs-only/repos/docs-only/.agents/skills/docs/SKILL.md new file mode 100644 index 0000000..f5a5b49 --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/neg-docs-only/repos/docs-only/.agents/skills/docs/SKILL.md @@ -0,0 +1 @@ +name: docs diff --git a/testinfra/benchmarks/agents/fixtures/neg-docs-only/repos/docs-only/.github/workflows/ci.yml b/testinfra/benchmarks/agents/fixtures/neg-docs-only/repos/docs-only/.github/workflows/ci.yml new file mode 100644 index 0000000..47a684d --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/neg-docs-only/repos/docs-only/.github/workflows/ci.yml @@ -0,0 +1,6 @@ +name: ci +jobs: + test: + runs-on: ubuntu-latest + steps: + - run: go test ./... 
diff --git a/testinfra/benchmarks/agents/fixtures/neg-docs-only/repos/docs-only/AGENTS.md b/testinfra/benchmarks/agents/fixtures/neg-docs-only/repos/docs-only/AGENTS.md new file mode 100644 index 0000000..1784bf6 --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/neg-docs-only/repos/docs-only/AGENTS.md @@ -0,0 +1 @@ +# Docs only diff --git a/testinfra/benchmarks/agents/fixtures/neg-random/repos/random-repo/src/readme.txt b/testinfra/benchmarks/agents/fixtures/neg-random/repos/random-repo/src/readme.txt new file mode 100644 index 0000000..ff2ae61 --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/neg-random/repos/random-repo/src/readme.txt @@ -0,0 +1 @@ +placeholder file for negative benchmark fixture diff --git a/testinfra/benchmarks/agents/fixtures/pos-autogen-json/repos/autogen-json/.wrkr/agents/autogen.json b/testinfra/benchmarks/agents/fixtures/pos-autogen-json/repos/autogen-json/.wrkr/agents/autogen.json new file mode 100644 index 0000000..2c5c055 --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/pos-autogen-json/repos/autogen-json/.wrkr/agents/autogen.json @@ -0,0 +1 @@ +{"agents":[{"name":"ops_json","file":"agents/ops_json.py","tools":["search.read"]}]} diff --git a/testinfra/benchmarks/agents/fixtures/pos-autogen-toml/repos/autogen-toml/.wrkr/agents/autogen.toml b/testinfra/benchmarks/agents/fixtures/pos-autogen-toml/repos/autogen-toml/.wrkr/agents/autogen.toml new file mode 100644 index 0000000..674d9a0 --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/pos-autogen-toml/repos/autogen-toml/.wrkr/agents/autogen.toml @@ -0,0 +1,4 @@ +[[agents]] +name = "ops_toml" +file = "agents/ops_toml.py" +tools = ["search.read"] diff --git a/testinfra/benchmarks/agents/fixtures/pos-autogen-yaml/repos/autogen-yaml/.wrkr/agents/autogen.yaml b/testinfra/benchmarks/agents/fixtures/pos-autogen-yaml/repos/autogen-yaml/.wrkr/agents/autogen.yaml new file mode 100644 index 0000000..1805447 --- /dev/null +++ 
b/testinfra/benchmarks/agents/fixtures/pos-autogen-yaml/repos/autogen-yaml/.wrkr/agents/autogen.yaml @@ -0,0 +1,4 @@ +agents: + - name: ops_yaml + file: agents/ops_yaml.py + tools: [search.read] diff --git a/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/.agents/skills/deploy/SKILL.md b/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/.agents/skills/deploy/SKILL.md new file mode 100644 index 0000000..c8a72be --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/.agents/skills/deploy/SKILL.md @@ -0,0 +1 @@ +name: deploy diff --git a/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/.github/workflows/release.yml b/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/.github/workflows/release.yml new file mode 100644 index 0000000..72f589a --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/.github/workflows/release.yml @@ -0,0 +1,6 @@ +name: release +jobs: + release: + runs-on: ubuntu-latest + steps: + - run: codex --full-auto --approval never diff --git a/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/.wrkr/agents/custom-agent.yaml b/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/.wrkr/agents/custom-agent.yaml new file mode 100644 index 0000000..4fd8d2b --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/.wrkr/agents/custom-agent.yaml @@ -0,0 +1,5 @@ +agents: + - name: custom_ops + file: agents/custom_ops.py + tools: [deploy.write, proc.exec] + auth_surfaces: [token] diff --git a/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/AGENTS.md b/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/AGENTS.md new file mode 100644 index 0000000..378b025 --- /dev/null +++ 
b/testinfra/benchmarks/agents/fixtures/pos-custom-strong/repos/custom-strong/AGENTS.md @@ -0,0 +1,2 @@ +# Agent Instructions +Use deterministic release controls. diff --git a/testinfra/benchmarks/agents/fixtures/pos-llama-json/repos/llama-json/.wrkr/agents/llamaindex.json b/testinfra/benchmarks/agents/fixtures/pos-llama-json/repos/llama-json/.wrkr/agents/llamaindex.json new file mode 100644 index 0000000..5ac6c79 --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/pos-llama-json/repos/llama-json/.wrkr/agents/llamaindex.json @@ -0,0 +1 @@ +{"agents":[{"name":"llama_json","file":"agents/llama_json.py","data_sources":["vector.rag"]}]} diff --git a/testinfra/benchmarks/agents/fixtures/pos-llama-yaml/repos/llama-yaml/.wrkr/agents/llamaindex.yaml b/testinfra/benchmarks/agents/fixtures/pos-llama-yaml/repos/llama-yaml/.wrkr/agents/llamaindex.yaml new file mode 100644 index 0000000..fe2c39d --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/pos-llama-yaml/repos/llama-yaml/.wrkr/agents/llamaindex.yaml @@ -0,0 +1,4 @@ +agents: + - name: llama_yaml + file: agents/llama_yaml.py + data_sources: [vector.rag] diff --git a/testinfra/benchmarks/agents/fixtures/pos-mcpclient-toml/repos/mcp-toml/.wrkr/agents/mcp-client.toml b/testinfra/benchmarks/agents/fixtures/pos-mcpclient-toml/repos/mcp-toml/.wrkr/agents/mcp-client.toml new file mode 100644 index 0000000..6cb2321 --- /dev/null +++ b/testinfra/benchmarks/agents/fixtures/pos-mcpclient-toml/repos/mcp-toml/.wrkr/agents/mcp-client.toml @@ -0,0 +1,5 @@ +[[agents]] +name = "mcp_toml" +file = "agents/mcp_toml.py" +tools = ["mcp.server.docs"] +auth_surfaces = ["token"] diff --git a/testinfra/benchmarks/agents/fixtures/pos-mcpclient-yaml/repos/mcp-yaml/.wrkr/agents/mcp-client.yaml b/testinfra/benchmarks/agents/fixtures/pos-mcpclient-yaml/repos/mcp-yaml/.wrkr/agents/mcp-client.yaml new file mode 100644 index 0000000..b8a0117 --- /dev/null +++ 
b/testinfra/benchmarks/agents/fixtures/pos-mcpclient-yaml/repos/mcp-yaml/.wrkr/agents/mcp-client.yaml @@ -0,0 +1,5 @@ +agents: + - name: mcp_yaml + file: agents/mcp_yaml.py + tools: [mcp.server.search] + auth_surfaces: [token] diff --git a/testinfra/benchmarks/agents/thresholds.json b/testinfra/benchmarks/agents/thresholds.json new file mode 100644 index 0000000..ebb3178 --- /dev/null +++ b/testinfra/benchmarks/agents/thresholds.json @@ -0,0 +1,6 @@ +{ + "minimum_precision": 0.95, + "minimum_recall": 0.70, + "baseline_recall": 0.95, + "max_recall_regression": 0.03 +} diff --git a/testinfra/contracts/story14_contracts_test.go b/testinfra/contracts/story14_contracts_test.go new file mode 100644 index 0000000..f122a34 --- /dev/null +++ b/testinfra/contracts/story14_contracts_test.go @@ -0,0 +1,170 @@ +package contracts + +import ( + "encoding/json" + "os" + "os/exec" + "path/filepath" + "reflect" + "strings" + "testing" +) + +type benchmarkThresholds struct { + MinimumPrecision float64 `json:"minimum_precision"` + MinimumRecall float64 `json:"minimum_recall"` + BaselineRecall float64 `json:"baseline_recall"` + MaxRecallRegression float64 `json:"max_recall_regression"` +} + +type benchmarkMetrics struct { + Precision float64 `json:"precision"` + Recall float64 `json:"recall"` +} + +func TestStory14AgentBenchmarkContractsPresent(t *testing.T) { + t.Parallel() + + repoRoot := mustFindRepoRoot(t) + requiredPaths := []string{ + "scripts/run_agent_benchmarks.sh", + "testinfra/benchmarks/agents/corpus.json", + "testinfra/benchmarks/agents/thresholds.json", + "schemas/v1/benchmarks/agent-benchmark.schema.json", + } + for _, rel := range requiredPaths { + if _, err := os.Stat(filepath.Join(repoRoot, rel)); err != nil { + t.Fatalf("missing benchmark contract path %s: %v", rel, err) + } + } +} + +func TestAgentBenchmarkHarness_FailsPrecisionBelowThreshold(t *testing.T) { + t.Parallel() + + thresholds := benchmarkThresholds{ + MinimumPrecision: 0.95, + MinimumRecall: 0.70, + 
BaselineRecall: 0.95, + MaxRecallRegression: 0.03, + } + metrics := benchmarkMetrics{Precision: 0.90, Recall: 0.95} + violations := evaluateBenchmarkThresholds(metrics, thresholds) + if len(violations) == 0 { + t.Fatal("expected precision violation") + } + if !containsSubstring(violations, "precision") { + t.Fatalf("expected precision violation detail, got %v", violations) + } +} + +func TestAgentBenchmarkHarness_FailsRecallRegressionBudget(t *testing.T) { + t.Parallel() + + thresholds := benchmarkThresholds{ + MinimumPrecision: 0.95, + MinimumRecall: 0.70, + BaselineRecall: 0.95, + MaxRecallRegression: 0.03, + } + metrics := benchmarkMetrics{Precision: 0.99, Recall: 0.90} + violations := evaluateBenchmarkThresholds(metrics, thresholds) + if len(violations) == 0 { + t.Fatal("expected recall regression violation") + } + if !containsSubstring(violations, "regression floor") { + t.Fatalf("expected recall regression floor violation detail, got %v", violations) + } +} + +func TestAgentBenchmarkRunner_JsonSchemaAndDeterminism(t *testing.T) { + repoRoot := mustFindRepoRoot(t) + firstPath := filepath.Join(t.TempDir(), "agent-bench-first.json") + secondPath := filepath.Join(t.TempDir(), "agent-bench-second.json") + + first := runAgentBenchmarkJSON(t, repoRoot, firstPath) + second := runAgentBenchmarkJSON(t, repoRoot, secondPath) + if !reflect.DeepEqual(first, second) { + t.Fatalf("benchmark output must be deterministic across runs\nfirst=%v\nsecond=%v", first, second) + } + + if first["version"] != "v1" { + t.Fatalf("expected version=v1, got %v", first["version"]) + } + status, _ := first["status"].(string) + if status != "pass" { + t.Fatalf("expected benchmark status pass, got %q (%v)", status, first["violations"]) + } + + metrics, ok := first["metrics"].(map[string]any) + if !ok { + t.Fatalf("expected metrics object, got %T", first["metrics"]) + } + if _, ok := metrics["precision"]; !ok { + t.Fatalf("metrics missing precision: %v", metrics) + } + if _, ok := 
metrics["recall"]; !ok { + t.Fatalf("metrics missing recall: %v", metrics) + } + + cases, ok := first["cases"].([]any) + if !ok || len(cases) == 0 { + t.Fatalf("expected non-empty cases list, got %T", first["cases"]) + } + for _, item := range cases { + entry, ok := item.(map[string]any) + if !ok { + t.Fatalf("unexpected case payload type %T", item) + } + for _, key := range []string{"id", "kind", "path", "expected_detectors", "matched_detectors", "outcome"} { + if _, present := entry[key]; !present { + t.Fatalf("case payload missing key %q: %v", key, entry) + } + } + } +} + +func evaluateBenchmarkThresholds(metrics benchmarkMetrics, thresholds benchmarkThresholds) []string { + violations := make([]string, 0) + if metrics.Precision < thresholds.MinimumPrecision { + violations = append(violations, "precision below minimum threshold") + } + if metrics.Recall < thresholds.MinimumRecall { + violations = append(violations, "recall below minimum threshold") + } + recallFloor := thresholds.BaselineRecall - thresholds.MaxRecallRegression + if metrics.Recall < recallFloor { + violations = append(violations, "recall below regression floor") + } + return violations +} + +func runAgentBenchmarkJSON(t *testing.T, repoRoot string, outputPath string) map[string]any { + t.Helper() + + cmd := exec.Command("bash", "scripts/run_agent_benchmarks.sh", "--json", "--output", outputPath) + cmd.Dir = repoRoot + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("run agent benchmark: %v\noutput=%s", err, string(output)) + } + + payload, readErr := os.ReadFile(outputPath) + if readErr != nil { + t.Fatalf("read benchmark output %s: %v", outputPath, readErr) + } + parsed := map[string]any{} + if err := json.Unmarshal(payload, &parsed); err != nil { + t.Fatalf("parse benchmark output json: %v", err) + } + return parsed +} + +func containsSubstring(values []string, token string) bool { + for _, item := range values { + if strings.Contains(item, token) { + return true + } + } + 
return false +} diff --git a/testinfra/contracts/story15_contracts_test.go b/testinfra/contracts/story15_contracts_test.go new file mode 100644 index 0000000..9c553e8 --- /dev/null +++ b/testinfra/contracts/story15_contracts_test.go @@ -0,0 +1,152 @@ +package contracts + +import ( + "bytes" + "encoding/json" + "os" + "path/filepath" + "reflect" + "strings" + "testing" + + "github.com/Clyra-AI/wrkr/core/cli" +) + +func TestStory15ScenarioPacksPresent(t *testing.T) { + t.Parallel() + + repoRoot := mustFindRepoRoot(t) + requiredPaths := []string{ + "scenarios/wrkr/agent-relationship-correlation/repos", + "scenarios/wrkr/agent-policy-outcomes/repos", + "internal/scenarios/epic12_scenario_test.go", + "internal/scenarios/coverage_map.json", + } + for _, rel := range requiredPaths { + if _, err := os.Stat(filepath.Join(repoRoot, rel)); err != nil { + t.Fatalf("missing Story 15 scenario path %s: %v", rel, err) + } + } +} + +func TestStory15CoverageMapIncludesAgentWave3Mappings(t *testing.T) { + t.Parallel() + + repoRoot := mustFindRepoRoot(t) + coveragePath := filepath.Join(repoRoot, "internal", "scenarios", "coverage_map.json") + payload, err := os.ReadFile(coveragePath) + if err != nil { + t.Fatalf("read coverage map: %v", err) + } + coverage := map[string][]string{} + if err := json.Unmarshal(payload, &coverage); err != nil { + t.Fatalf("parse coverage map: %v", err) + } + + required := map[string][]string{ + "FR14": {"TestScenario_AgentRelationshipCorrelation"}, + "FR15": {"TestScenario_AgentPolicyOutcomes"}, + "AC26": {"TestScenario_AgentRelationshipCorrelation"}, + "AC27": {"TestScenario_AgentPolicyOutcomes"}, + } + for key, expectedTests := range required { + mapped := coverage[key] + if len(mapped) == 0 { + t.Fatalf("coverage map missing %s", key) + } + for _, testName := range expectedTests { + if !slicesContains(mapped, testName) { + t.Fatalf("coverage map for %s missing %q: %v", key, testName, mapped) + } + } + } +} + +func 
// isStory15VolatileKey reports whether key names a field whose value changes
// from run to run (timestamps and durations). Matching ignores case and
// surrounding whitespace.
func isStory15VolatileKey(key string) bool {
	switch strings.ToLower(strings.TrimSpace(key)) {
	case "generated_at", "scan_started_at", "scan_completed_at", "scan_duration_seconds":
		return true
	default:
		return false
	}
}

// normalizeStory15Volatile returns a deep copy of a decoded scan payload with
// every volatile field removed, so two runs can be compared with
// reflect.DeepEqual. The top level is filtered with the same key set as nested
// objects; previously only an exact "generated_at" was dropped there.
func normalizeStory15Volatile(in map[string]any) map[string]any {
	out := make(map[string]any, len(in))
	for key, value := range in {
		if isStory15VolatileKey(key) {
			continue
		}
		out[key] = normalizeStory15Any(value)
	}
	return out
}

// normalizeStory15Any applies the volatile-field filter recursively: objects
// are filtered key by key, arrays element by element, and every other value is
// returned unchanged.
func normalizeStory15Any(value any) any {
	switch typed := value.(type) {
	case map[string]any:
		return normalizeStory15Volatile(typed)
	case []any:
		out := make([]any, 0, len(typed))
		for _, item := range typed {
			out = append(out, normalizeStory15Any(item))
		}
		return out
	default:
		return value
	}
}

// slicesContains reports whether target equals any element of values after
// that element has been trimmed of surrounding whitespace. target itself is
// compared as given.
func slicesContains(values []string, target string) bool {
	for _, candidate := range values {
		if strings.TrimSpace(candidate) == target {
			return true
		}
	}
	return false
}