From e4d09fa1674cf0a177d4d4124df9fff1987be9e5 Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 5 Jun 2026 17:05:36 +0000 Subject: [PATCH 1/2] feat: plan harness-aware doctor fix --- ...issue-877-harness-aware-doctor.inputs.json | 72 +++ .../issue-877-harness-aware-doctor.mjs | 481 ++++++++++++++++++ 2 files changed, 553 insertions(+) create mode 100644 .a5c/processes/issue-877-harness-aware-doctor.inputs.json create mode 100644 .a5c/processes/issue-877-harness-aware-doctor.mjs diff --git a/.a5c/processes/issue-877-harness-aware-doctor.inputs.json b/.a5c/processes/issue-877-harness-aware-doctor.inputs.json new file mode 100644 index 0000000000..4720208399 --- /dev/null +++ b/.a5c/processes/issue-877-harness-aware-doctor.inputs.json @@ -0,0 +1,72 @@ +{ + "issueNumber": 877, + "baseBranch": "staging", + "implementationBranch": "fix/issue-877-harness-aware-doctor", + "maxAttempts": 3, + "targetFiles": [ + "plugins/babysitter-unified/commands/doctor.md", + "packages/sdk/src/prompts/templates/commands/doctor.md", + "scripts/sync-sdk-command-templates.cjs", + "scripts/generate-plugins.mjs", + "plugins/babysitter/skills/doctor/SKILL.md", + "plugins/babysitter-unified/per-harness/pi/extensions-index.ts", + "packages/sdk/src/harness/adapters/pi.ts", + "packages/sdk/src/harness/adapters/claude-code.ts", + "packages/sdk/src/harness/adapters/codex.ts", + "packages/sdk/src/harness/adapters/cursor.ts", + "packages/sdk/src/harness/adapters/opencode.ts", + "packages/sdk/src/harness/adapters/openclaw.ts", + "packages/sdk/src/harness/discovery.ts", + "packages/sdk/src/harness/registry.ts", + "packages/sdk/src/harness/types.ts", + "packages/sdk/src/harness/__tests__/discovery.test.ts", + "packages/sdk/src/harness/__tests__/harness.test.ts", + "packages/sdk/src/prompts/__tests__", + "packages/sdk/src/cli/commands/hooks/run.ts" + ], + "verificationCommands": [ + "npm run check:sdk-command-templates", + "npm run generate:plugins", + "npm run build:sdk", + "npm run test:sdk", + "npm run 
verify:metadata", + "git diff --check" + ], + "acceptanceCriteria": [ + "The doctor command performs an initial harness detection step using SDK-owned harness capability truth or an equivalent existing CLI wrapper.", + "Section 10 hook execution health is gated on HarnessCapability.StopHook or adapter supportsHookType(\"stop\").", + "When the detected harness lacks StopHook, section 10 reports N/A with a clear harness-aware explanation instead of FAIL.", + "N/A outcomes are neutral in the final health determination and cannot make the report WARNING or CRITICAL.", + "Pi no longer requires CLAUDE_PLUGIN_ROOT, hooks.json, babysitter-stop-hook.sh, babysitter-session-start-hook.sh, or ~/.claude settings for a healthy doctor run.", + "Claude Code and other StopHook harnesses retain actionable hook registration, script availability, CLI availability, execution evidence, and root-cause diagnostics.", + "Generic session-provenance and escalation guidance is harness-neutral, with Claude-specific remediation shown only for Claude Code.", + "The canonical unified doctor command, SDK generated template, and relevant generated plugin skill/command copies are synchronized through existing scripts.", + "Regression coverage or static verification prevents a non-StopHook harness from being marked critical solely because Claude-style hook files are absent.", + "The implementation preserves unrelated local changes and avoids broad plugin or generated-file churn outside the doctor surfaces needed for #877." 
+ ], + "knownIssueContext": { + "title": "[Docs] Question: doctor /skill:doctor SKILL.md sections 6/7/10 hard-code Claude Code plugin layout -- false-FAIL on Pi (and other non-StopHook harnesses)", + "labels": [ + "bug", + "documentation", + "sdk", + "effort:medium", + "ready-for-dev", + "plugins", + "priority:medium", + "automated-triage", + "plugin-pi", + "root-cause" + ], + "reportedFailure": "A healthy Pi run can be reported as CRITICAL because doctor section 10 asserts Claude-style Stop and SessionStart hook registration, scripts, hook logs, and ~/.claude settings even though the Pi adapter explicitly does not support StopHook.", + "triageRecommendation": "Fix the canonical doctor source first, reuse SDK harness detection/capability APIs, gate hook checks on StopHook support, make non-StopHook hook health N/A and neutral, clean up Claude-specific generic wording, synchronize generated copies, and add a regression guard.", + "affectedComponents": [ + "plugins", + "plugin-pi", + "sdk", + "doctor command guidance", + "generated command templates", + "harness capability detection" + ] + } +} diff --git a/.a5c/processes/issue-877-harness-aware-doctor.mjs b/.a5c/processes/issue-877-harness-aware-doctor.mjs new file mode 100644 index 0000000000..685d3f9114 --- /dev/null +++ b/.a5c/processes/issue-877-harness-aware-doctor.mjs @@ -0,0 +1,481 @@ +/** + * @process repo/issue-877-harness-aware-doctor + * @description Fix issue #877: make /babysitter:doctor harness-aware so non-StopHook harnesses do not report false CRITICAL hook failures. 
+ * @inputs { issueNumber: number, baseBranch: string, implementationBranch: string, maxAttempts: number, targetFiles: string[], verificationCommands: string[], acceptanceCriteria: string[], knownIssueContext?: object } + * @outputs { success: boolean, phases: string[], changedFiles: string[], reuseAudit: object, sourceTrace: object, regressionPlan: object, design: object, implementation: object|null, verification: object|null, review: object|null, attempts: object[], finalGate: object, delivery: object|null } + * + * References used while authoring: + * - docs/agent-reference/process-authoring.md + * - library/methodologies/gsd/iterative-convergence.js + * - library/methodologies/superpowers/verification-before-completion.js + * - .a5c/processes/issue-531-plugin-marketplace-version-sync.mjs + * + * Process-library note: + * - The requested .a5c/process-library/ directory is not present in this checkout. + * - Closest available methodology references are under library/methodologies/. + * + * Reuse-audit findings (REVIEW BEFORE PROCEEDING): + * - The canonical doctor command source is plugins/babysitter-unified/commands/doctor.md. + * - The SDK command template is generated from that source at packages/sdk/src/prompts/templates/commands/doctor.md by scripts/sync-sdk-command-templates.cjs. + * - The SDK already exposes harness truth through packages/sdk/src/harness/discovery.ts, packages/sdk/src/harness/registry.ts, packages/sdk/src/harness/types.ts, and adapter getCapabilities()/supportsHookType(). + * - Pi advertises Programmatic, SessionBinding, and HeadlessPrompt, with hookDriven: false and noHookSupport: true in packages/sdk/src/harness/adapters/pi.ts. + * - Existing harness tests cover capability declarations and unsupported stop hooks in packages/sdk/src/harness/__tests__/, so the implementation should extend or mirror those guardrails instead of duplicating capability truth in prose. + */ + +import { defineTask } from '@a5c-ai/babysitter-sdk'; + +export async function process(inputs, ctx) { + const maxAttempts = inputs.maxAttempts ?? 
3; + + const issueContext = await ctx.task(readIssueContextTask, inputs, { + key: 'issue-877.read-issue-context', + }); + + const reuseAudit = await ctx.task(reuseAuditTask, { + inputs, + issueContext, + }, { + key: 'issue-877.reuse-audit', + }); + + const sourceTrace = await ctx.task(traceDoctorSourceTask, { + inputs, + issueContext, + reuseAudit, + }, { + key: 'issue-877.source-trace', + }); + + const regressionPlan = await ctx.task(authorRegressionPlanTask, { + inputs, + issueContext, + reuseAudit, + sourceTrace, + }, { + key: 'issue-877.regression-plan', + }); + + const design = await ctx.task(designHarnessAwareDoctorTask, { + inputs, + issueContext, + reuseAudit, + sourceTrace, + regressionPlan, + }, { + key: 'issue-877.fix-design', + }); + + let implementation = null; + let verification = null; + let review = null; + const attempts = []; + + for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { + implementation = await ctx.task(implementFixTask, { + inputs, + issueContext, + reuseAudit, + sourceTrace, + regressionPlan, + design, + previousVerification: verification, + previousReview: review, + attempt, + }, { + key: `issue-877.implementation.${attempt}`, + }); + + verification = await ctx.task(verifyFixTask, { + inputs, + issueContext, + reuseAudit, + sourceTrace, + regressionPlan, + design, + implementation, + attempt, + }, { + key: `issue-877.verification.${attempt}`, + }); + + review = await ctx.task(reviewFixTask, { + inputs, + issueContext, + reuseAudit, + sourceTrace, + regressionPlan, + design, + implementation, + verification, + attempt, + }, { + key: `issue-877.review.${attempt}`, + }); + + attempts.push({ attempt, implementation, verification, review }); + + if (verification?.passed === true && review?.approved === true) { + break; + } + } + + const finalGate = await ctx.task(finalAcceptanceTask, { + inputs, + issueContext, + reuseAudit, + sourceTrace, + regressionPlan, + design, + implementation, + verification, + review, + attempts, + }, { 
+ key: 'issue-877.final-acceptance', + }); + + if (finalGate?.needsMaintainerDecision === true) { + await ctx.breakpoint({ + title: 'Issue #877 Needs Maintainer Decision', + question: finalGate.question, + options: ['Proceed with recommended scope', 'Pause for maintainer guidance'], + expert: 'owner', + tags: ['approval-gate', 'issue-877', 'doctor', 'harness-capabilities'], + context: { + runId: ctx.runId, + finalGate, + attempts: attempts.length, + }, + }); + } + + const delivery = finalGate?.passed === true + ? await ctx.task(deliverTask, { + inputs, + issueContext, + reuseAudit, + sourceTrace, + regressionPlan, + design, + implementation, + verification, + review, + finalGate, + }, { + key: 'issue-877.delivery', + }) + : null; + + return { + success: finalGate?.passed === true, + phases: [ + 'issue-context', + 'reuse-audit', + 'doctor-source-trace', + 'regression-plan', + 'harness-aware-design', + 'implementation-loop', + 'verification-gate', + 'review-gate', + 'final-acceptance', + 'delivery', + ], + changedFiles: finalGate?.changedFiles ?? implementation?.changedFiles ?? [], + reuseAudit, + sourceTrace, + regressionPlan, + design, + implementation, + verification, + review, + attempts, + finalGate, + delivery, + }; +} + +export const readIssueContextTask = defineTask('issue-877.read-issue-context', (args, taskCtx) => ({ + kind: 'agent', + title: 'Read issue #877 and extract the authoritative spec', + labels: ['issue-877', 'issue-context', 'doctor'], + agent: { + name: 'doctor-issue-context-reader', + prompt: { + role: 'senior Babysitter SDK and plugin maintainer', + task: 'Read the issue and produce the implementation spec. 
Do not edit files.', + instructions: [ + `Run: gh issue view ${args.issueNumber} --json title,body,labels,comments`, + `Also run: gh pr view ${args.issueNumber} --json files,title,body,comments; if GitHub says there is no PR with that number, record that result and continue.`, + 'Treat the issue body, every comment, and labels as the source of truth.', + 'Preserve the triage comment requirements around StopHook capability gating, neutral N/A verdicts, Pi extension health checks, generic session wording, and regression coverage.', + 'Return JSON: { title, labels, rawIssueSummary, commentsSummary, acceptanceCriteria, explicitNonGoals, affectedFilesFromIssue, reproduction, recommendedFix, risks, openQuestions }.', + ], + }, + }, + io: { + inputJsonPath: `tasks/${taskCtx.effectId}/inputs.json`, + outputJsonPath: `tasks/${taskCtx.effectId}/output.json`, + }, +})); + +export const reuseAuditTask = defineTask('issue-877.reuse-audit', (args, taskCtx) => ({ + kind: 'agent', + title: 'Run Phase 0 reuse audit for doctor harness awareness', + labels: ['issue-877', 'reuse-audit', 'sdk', 'plugins'], + agent: { + name: 'doctor-reuse-auditor', + prompt: { + role: 'senior TypeScript monorepo engineer', + task: 'Find existing infrastructure that should be reused before changing doctor guidance or adding tests. 
Do not edit files.', + instructions: [ + 'Render a section named exactly: Reuse-audit findings (REVIEW BEFORE PROCEEDING).', + 'Extract and scan these keywords: doctor, StopHook, stop-hook, supportsHookType, getCapabilities, detectCallerHarness, detectAdapter, HarnessCapability, noHookSupport, hookDriven, hooks.json, CLAUDE_PLUGIN_ROOT, PI_PLUGIN_ROOT, pi.registerCommand, session_start, sync-sdk-command-templates, generate:plugins.', + 'Inspect the target files and related tests before proposing any new helper, CLI surface, generated file, or duplicated harness capability table.', + 'Start with target files:', + JSON.stringify(args.inputs.targetFiles, null, 2), + 'Identify canonical source files, generated copies, command-template sync checks, generated plugin surfaces, and existing harness capability tests that can be extended.', + 'Return JSON: { findingsMarkdown, canonicalSources, generatedSurfaces, capabilitySources, candidateTests, syncCommands, noNewInfrastructureNeeded, risks }.', + ], + }, + }, + io: { + inputJsonPath: `tasks/${taskCtx.effectId}/inputs.json`, + outputJsonPath: `tasks/${taskCtx.effectId}/output.json`, + }, +})); + +export const traceDoctorSourceTask = defineTask('issue-877.trace-doctor-source', (args, taskCtx) => ({ + kind: 'agent', + title: 'Trace doctor command sources, generated surfaces, and capability APIs', + labels: ['issue-877', 'source-trace', 'doctor'], + agent: { + name: 'doctor-source-tracer', + prompt: { + role: 'senior SDK runtime engineer', + task: 'Map the current doctor command propagation path and harness capability APIs before code changes.', + instructions: [ + 'Work from the issue context and reuse-audit JSON below. 
Inspect the repository directly.', + 'Issue context JSON:', + JSON.stringify(args.issueContext, null, 2), + 'Reuse-audit JSON:', + JSON.stringify(args.reuseAudit, null, 2), + 'Trace plugins/babysitter-unified/commands/doctor.md into packages/sdk/src/prompts/templates/commands/doctor.md through scripts/sync-sdk-command-templates.cjs.', + 'Identify any generated per-harness doctor command or skill copies that should be updated by existing generation/sync commands rather than by hand.', + 'Trace how detectCallerHarness(), detectAdapter(), getCapabilities(), supportsHookType("stop"), and HarnessCapability.StopHook are exposed to command guidance or CLI helpers.', + 'Inspect Pi, Claude Code, Codex, Cursor, OpenCode, and OpenClaw adapters enough to distinguish StopHook and non-StopHook harnesses.', + 'Return JSON: { sourceOfTruthFiles, generatedFiles, capabilityApis, stopHookHarnesses, nonStopHookHarnesses, currentFalseFailurePaths, syncPath, risks }.', + ], + }, + }, + io: { + inputJsonPath: `tasks/${taskCtx.effectId}/inputs.json`, + outputJsonPath: `tasks/${taskCtx.effectId}/output.json`, + }, +})); + +export const authorRegressionPlanTask = defineTask('issue-877.regression-plan', (args, taskCtx) => ({ + kind: 'agent', + title: 'Author regression strategy for non-StopHook doctor behavior', + labels: ['issue-877', 'tests', 'quality-gate'], + agent: { + name: 'doctor-regression-planner', + prompt: { + role: 'test strategy architect for SDK prompt and harness behavior', + task: 'Design focused regression coverage before implementation.', + instructions: [ + 'Do not edit files.', + 'Use the issue context, reuse audit, and source trace to identify the smallest deterministic guardrails.', + 'The regression plan must prove that doctor guidance no longer requires Claude hook files for a detected harness that lacks HarnessCapability.StopHook.', + 'Include coverage for neutral N/A verdict handling so N/A does not count as WARN, FAIL, or CRITICAL.', + 'Include sync coverage so 
the SDK doctor template remains generated from the unified doctor source.', + 'Prefer extending existing SDK prompt/template or harness tests over adding broad snapshot churn.', + 'Return JSON: { testFilesToModify, newTests, staticChecks, fixturesOrEnv, commands, expectedFailuresBeforeFix, acceptanceMapping, risks }.', + ], + }, + }, + io: { + inputJsonPath: `tasks/${taskCtx.effectId}/inputs.json`, + outputJsonPath: `tasks/${taskCtx.effectId}/output.json`, + }, +})); + +export const designHarnessAwareDoctorTask = defineTask('issue-877.design-fix', (args, taskCtx) => ({ + kind: 'agent', + title: 'Design the harness-aware doctor fix', + labels: ['issue-877', 'design', 'doctor'], + agent: { + name: 'doctor-fix-designer', + prompt: { + role: 'senior Babysitter plugin and SDK maintainer', + task: 'Produce the concrete implementation design for issue #877. Do not edit files.', + instructions: [ + 'Use SDK-owned harness capability truth. Do not duplicate a manually-maintained harness capability table in the doctor prose if an existing API or CLI helper can expose it.', + 'Design a Phase 0 harness detection section for doctor that records detected harness name, matched env vars, capabilities, and whether StopHook is advertised.', + 'Gate section 10 hook-specific checks on HarnessCapability.StopHook or supportsHookType("stop").', + 'When StopHook is absent, make section 10 emit N/A with an explicit harness-aware explanation. 
N/A must be neutral in the final health determination.', + 'For Pi, either leave section 10 at N/A or add a small Pi extension/command registration health check only if it can be implemented from existing PI_PLUGIN_ROOT/package surfaces without broad new infrastructure.', + 'Replace Claude-specific wording in generic session-provenance and escalation sections with harness-neutral language, keeping Claude-specific remediation only under Claude Code conditions.', + 'Plan generated file sync from unified doctor source to SDK template and generated plugin copies using existing scripts.', + 'Return JSON: { implementationSteps, filePlan, dataFlow, nAVerdictRules, piSpecificBehavior, syncPlan, testPlan, riskControls, maintainerDecisionNeeded, question }.', + ], + }, + }, + io: { + inputJsonPath: `tasks/${taskCtx.effectId}/inputs.json`, + outputJsonPath: `tasks/${taskCtx.effectId}/output.json`, + }, +})); + +export const implementFixTask = defineTask('issue-877.implement', (args, taskCtx) => ({ + kind: 'agent', + title: `Implement issue #877 harness-aware doctor fix (attempt ${args.attempt})`, + labels: ['issue-877', 'implementation', 'doctor'], + agent: { + name: 'doctor-harness-aware-implementer', + responderType: 'agent', + adapter: 'codex', + fallbackType: 'internal', + prompt: { + role: 'senior TypeScript, SDK prompt, and plugin generation engineer', + task: 'Implement the issue #877 fix and focused regression coverage in the repository.', + instructions: [ + 'Edit the repository directly.', + 'Keep the change scoped to doctor command guidance, generated SDK command templates, generated plugin copies if required by existing scripts, and focused regression tests or sync checks.', + 'Do not alter unrelated dirty worktree files.', + 'Update plugins/babysitter-unified/commands/doctor.md as the canonical source before generated copies.', + 'Sync packages/sdk/src/prompts/templates/commands/doctor.md through the existing command-template sync path.', + 'Use SDK-owned harness 
capability truth in the guidance: detectCallerHarness()/detectAdapter()/getCapabilities()/supportsHookType("stop") or an existing CLI wrapper exposing equivalent adapter capabilities.', + 'Ensure non-StopHook harnesses, especially Pi, do not require CLAUDE_PLUGIN_ROOT, hooks.json, babysitter-stop-hook.sh, babysitter-session-start-hook.sh, or ~/.claude settings for section 10.', + 'Ensure N/A is neutral in final health determination and does not produce WARNING or CRITICAL.', + 'Clean up Claude-specific session-provenance and /debug wording so it appears only when the detected harness is Claude Code or when explicitly diagnosing Claude Code.', + 'Add or update the focused regression coverage from the regression plan.', + 'Run only the verification commands needed while iterating; the verification task will run the full gate.', + 'Previous verification JSON:', + JSON.stringify(args.previousVerification ?? {}, null, 2), + 'Previous review JSON:', + JSON.stringify(args.previousReview ?? {}, null, 2), + 'Design JSON:', + JSON.stringify(args.design ?? 
{}, null, 2), + 'Return JSON: { changedFiles, summary, rootCauseAddressed, testsAdded, generatedSyncRun, verificationCommandsRun, residualRisks }.', + ], + }, + }, + io: { + inputJsonPath: `tasks/${taskCtx.effectId}/inputs.json`, + outputJsonPath: `tasks/${taskCtx.effectId}/output.json`, + }, +})); + +export const verifyFixTask = defineTask('issue-877.verify', (args, taskCtx) => ({ + kind: 'agent', + title: `Run issue #877 verification gate (attempt ${args.attempt})`, + labels: ['issue-877', 'verification', 'quality-gate'], + agent: { + name: 'doctor-fix-verifier', + responderType: 'agent', + adapter: 'codex', + fallbackType: 'internal', + prompt: { + role: 'evidence-focused SDK verification engineer', + task: 'Run the full verification gate and report exact evidence.', + instructions: [ + 'Run the verification commands from inputs unless a command is clearly inapplicable; if skipped, explain why.', + JSON.stringify(args.inputs.verificationCommands, null, 2), + 'Also inspect the diff for doctor-specific invariants:', + '- non-StopHook path emits N/A rather than FAIL for section 10', + '- N/A is neutral in overall health determination', + '- Pi/non-StopHook path does not require CLAUDE_PLUGIN_ROOT, hooks.json, hook shell scripts, or ~/.claude settings', + '- Claude Code StopHook diagnostics remain available when StopHook is advertised', + '- SDK template and generated/synced command copies match the canonical doctor source', + 'Return JSON: { passed, commands, skippedCommands, invariantChecks, changedFiles, failures, evidence, residualRisks }.', + ], + }, + }, + io: { + inputJsonPath: `tasks/${taskCtx.effectId}/inputs.json`, + outputJsonPath: `tasks/${taskCtx.effectId}/output.json`, + }, +})); + +export const reviewFixTask = defineTask('issue-877.review', (args, taskCtx) => ({ + kind: 'agent', + title: `Review issue #877 fix against spec (attempt ${args.attempt})`, + labels: ['issue-877', 'review', 'quality-gate'], + agent: { + name: 'doctor-capability-reviewer', + 
responderType: 'agent', + adapter: 'codex', + fallbackType: 'internal', + prompt: { + role: 'senior SDK/plugin code reviewer', + task: 'Review the implementation against issue #877, the triage comment, and the current diff.', + instructions: [ + 'Use a code-review stance. Prioritize behavioral regressions, stale generated copies, false positives/negatives in doctor verdicts, and missing tests.', + 'Compare directly against the issue context, design, implementation result, and verification result.', + 'Check that the implementation reuses SDK capability truth instead of duplicating fragile per-harness tables.', + 'Check that StopHook harnesses still get actionable hook diagnostics.', + 'Check that non-StopHook harnesses get neutral N/A behavior and no Claude-only requirements.', + 'Check that generated files were updated by the established sync path and unrelated dirty files were not included.', + 'Return JSON: { approved, findings, blockingIssues, requiredFixes, changedFiles, testGaps, residualRisks }.', + ], + }, + }, + io: { + inputJsonPath: `tasks/${taskCtx.effectId}/inputs.json`, + outputJsonPath: `tasks/${taskCtx.effectId}/output.json`, + }, +})); + +export const finalAcceptanceTask = defineTask('issue-877.final-acceptance', (args, taskCtx) => ({ + kind: 'agent', + title: 'Evaluate final acceptance for issue #877', + labels: ['issue-877', 'final-gate', 'acceptance'], + agent: { + name: 'doctor-final-acceptance', + prompt: { + role: 'release-minded maintainer', + task: 'Decide whether the run is ready to deliver.', + instructions: [ + 'Evaluate all acceptance criteria line by line.', + JSON.stringify(args.inputs.acceptanceCriteria, null, 2), + 'Require verification.passed === true and review.approved === true.', + 'Require evidence that N/A is neutral and Pi/non-StopHook harnesses are not marked CRITICAL for missing Claude hooks.', + 'If the only unresolved question is whether to add a Pi-specific extension health check beyond N/A, mark needsMaintainerDecision 
only if the issue cannot be safely closed without that choice.', + 'Return JSON: { passed, acceptanceResults, changedFiles, needsMaintainerDecision, question, blockers, deliveryNotes }.', + ], + }, + }, + io: { + inputJsonPath: `tasks/${taskCtx.effectId}/inputs.json`, + outputJsonPath: `tasks/${taskCtx.effectId}/output.json`, + }, +})); + +export const deliverTask = defineTask('issue-877.deliver', (args, taskCtx) => ({ + kind: 'agent', + title: 'Commit, open implementation PR, and comment on issue #877', + labels: ['issue-877', 'delivery', 'github'], + agent: { + name: 'doctor-fix-deliverer', + responderType: 'agent', + adapter: 'codex', + fallbackType: 'internal', + prompt: { + role: 'GitHub delivery engineer', + task: 'Deliver the completed implementation after all gates pass.', + instructions: [ + `Create or switch to implementation branch ${args.inputs.implementationBranch} from ${args.inputs.baseBranch}.`, + 'Stage only files changed for issue #877. Do not stage unrelated dirty worktree files.', + 'Commit with a concise fix-scoped message.', + 'Push the branch.', + 'Create a GitHub PR against the base branch that links to the issue.', + 'Comment on the issue with a summary, verification evidence, and PR link.', + 'Return JSON: { committed, commitSha, branch, prUrl, issueCommentUrl, stagedFiles, summary }.', + ], + }, + }, + io: { + inputJsonPath: `tasks/${taskCtx.effectId}/inputs.json`, + outputJsonPath: `tasks/${taskCtx.effectId}/output.json`, + }, +})); From d943cee04cd7e14b63fc4216472e42d7c5b71682 Mon Sep 17 00:00:00 2001 From: a5c-ai Date: Fri, 5 Jun 2026 18:32:42 +0000 Subject: [PATCH 2/2] Fix harness-aware doctor hook diagnostics --- .../__tests__/doctorCommandTemplate.test.ts | 37 +++++++ .../prompts/templates/commands/blueprints.md | 4 +- .../src/prompts/templates/commands/doctor.md | 102 ++++++++++++++---- plugins/babysitter-unified/commands/doctor.md | 102 ++++++++++++++---- 4 files changed, 203 insertions(+), 42 deletions(-) create mode 100644 
packages/sdk/src/prompts/__tests__/doctorCommandTemplate.test.ts diff --git a/packages/sdk/src/prompts/__tests__/doctorCommandTemplate.test.ts b/packages/sdk/src/prompts/__tests__/doctorCommandTemplate.test.ts new file mode 100644 index 0000000000..942adbd204 --- /dev/null +++ b/packages/sdk/src/prompts/__tests__/doctorCommandTemplate.test.ts @@ -0,0 +1,37 @@ +import { describe, expect, it } from 'vitest'; +import { renderCommandTemplate } from '../commandTemplates'; + +describe('doctor command template', () => { + it('keeps StopHook diagnostics gated by SDK harness capability truth', () => { + const output = renderCommandTemplate('doctor'); + + expect(output).toContain('## Phase 0. Harness Capability Detection'); + expect(output).toContain('detectCallerHarness'); + expect(output).toContain('getAdapterByName'); + expect(output).toContain("supportsHookType?.('stop')"); + expect(output).toContain('HarnessCapability.StopHook'); + expect(output).toContain('If `supportsStopHook` is `false`, mark check 10 as `N/A`'); + expect(output).toContain('Do not mark missing `CLAUDE_PLUGIN_ROOT`'); + expect(output).toContain('Pi uses command-backed skills and extension/session events'); + expect(output).toContain('Continue with 10a-10f only when `supportsStopHook` is `true`'); + }); + + it('treats N/A as neutral for final health and recommendations', () => { + const output = renderCommandTemplate('doctor'); + + expect(output).toContain('Treat N/A as neutral'); + expect(output).toContain('Do not add this N/A to warnings, failures, or recommendations.'); + expect(output).toContain('All 14 checks are PASS or N/A'); + expect(output).toContain('A report with PASS checks plus N/A-only capability skips is HEALTHY.'); + expect(output).toContain('Do not list N/A checks here unless the user asks for skipped capability checks'); + }); + + it('keeps Claude-only remediation conditional on Claude Code', () => { + const output = renderCommandTemplate('doctor'); + + expect(output).toContain('For 
Claude Code only, check Claude settings files'); + expect(output).toContain("For non-Claude StopHook-capable harnesses, use that harness's plugin enablement file or extension registration mechanism instead of `~/.claude`."); + expect(output).toContain('If the detected harness is `claude-code`'); + expect(output).toContain('For non-Claude harnesses, do not suggest `/debug`'); + }); +}); diff --git a/packages/sdk/src/prompts/templates/commands/blueprints.md b/packages/sdk/src/prompts/templates/commands/blueprints.md index 757b4c4322..3c9396b2e1 100644 --- a/packages/sdk/src/prompts/templates/commands/blueprints.md +++ b/packages/sdk/src/prompts/templates/commands/blueprints.md @@ -32,12 +32,12 @@ babysitter blueprints:update-marketplace --marketplace-name [--marketplac ### List blueprints in a marketplace ```bash -babysitter blueprints:list --marketplace-name --global|--project [--json] +babysitter blueprints:list-plugins --marketplace-name --global|--project [--json] ``` ## Blueprint Lifecycle -For `blueprint:install`, `blueprint:update`, `blueprint:configure`, and `blueprint:list`, the `--marketplace-name` flag is auto-detected when only one marketplace is cloned for the selected scope. +For `blueprint:install`, `blueprint:update`, `blueprint:configure`, and `blueprint:list-plugins`, the `--marketplace-name` flag is auto-detected when only one marketplace is cloned for the selected scope. ```bash babysitter blueprints:install --plugin-name [--marketplace-name ] --global|--project [--json] diff --git a/packages/sdk/src/prompts/templates/commands/doctor.md b/packages/sdk/src/prompts/templates/commands/doctor.md index 3cc1a7a520..8f68f4bdc0 100644 --- a/packages/sdk/src/prompts/templates/commands/doctor.md +++ b/packages/sdk/src/prompts/templates/commands/doctor.md @@ -6,7 +6,7 @@ You are a diagnostic agent for the babysitter runtime. Your job is to perform a comprehensive health check across 14 areas and produce a structured diagnostic report. 
Follow each section methodically. Track results as you go and produce the final summary at the end. -Initialize a results tracker with these 14 checks, all starting as PENDING: +Initialize a results tracker with these 14 checks, all starting as PENDING. Valid final check statuses are PASS, WARN, FAIL, ERROR, INFO, and N/A. Treat N/A as neutral: it must never contribute to WARNING or CRITICAL health. 1. Run Discovery 2. Journal Integrity 3. State Cache Consistency @@ -22,6 +22,46 @@ Initialize a results tracker with these 14 checks, all starting as PENDING: 13. Concurrent Session Detection 14. Windows Ancestor-Walk Strategy +## Phase 0. Harness Capability Detection + +**Goal:** Identify the active harness and whether stop-hook diagnostics apply before evaluating hook health. + +- Run this detection before section 1 and save the result for sections 7, 10, 11, 12, 13, and escalation guidance. +- Prefer SDK-owned capability truth. Use `detectCallerHarness()` first, then `detectAdapter()` / `getAdapterByName()` as a fallback. A direct Node probe is acceptable: + +```bash +node - <<'NODE' +const sdk = require('@a5c-ai/babysitter-sdk'); +const caller = sdk.detectCallerHarness?.() ?? null; +const explicit = process.env.BABYSITTER_HARNESS || process.env.BABYSITTER_HARNESS_NAME; +const adapter = explicit + ? sdk.getAdapterByName?.(explicit) + : caller?.name + ? sdk.getAdapterByName?.(caller.name) + : sdk.detectAdapter?.(); +const capabilities = (adapter?.getCapabilities?.() ?? caller?.capabilities ?? []).map(String); +const supportsStopHook = adapter?.supportsHookType?.('stop') + ?? (capabilities.length ? capabilities.includes(String(sdk.HarnessCapability.StopHook)) : null); +const supportsSessionStartHook = adapter?.supportsHookType?.('session-start') ?? null; +console.log(JSON.stringify({ + harness: explicit ?? caller?.name ?? adapter?.name ?? 'unknown', + source: explicit ? 'env' : caller ? 'detectCallerHarness' : adapter ? 
'detectAdapter' : 'fallback', + matchedEnvVars: caller?.matchedEnvVars ?? [], + capabilities, + supportsStopHook, + supportsSessionStartHook, +}, null, 2)); +NODE +``` + +- This probe must use SDK-owned harness truth (`detectCallerHarness`, `getAdapterByName`, `getCapabilities`, and `supportsHookType`) rather than a local capability table in the doctor guidance. +- If the Node probe cannot import the SDK, record the failure and try again after `npm run build:sdk`. If it still fails, use `npx babysitter session:whoami --json`, `run.json`, and harness env vars as low-confidence evidence. Record the fallback and confidence level. +- Display the detected harness, matched evidence, capabilities, and `supportsStopHook` value near the top of the report. +- If the detected harness is known and does not advertise `HarnessCapability.StopHook`, section 10 must be marked `N/A` with a harness-aware explanation. Do not inspect Claude hook files, `hooks.json`, hook shell scripts, or `~/.claude` settings for that harness. +- If the harness is `pi`, explicitly note that Pi uses command-backed skills and extension/session events rather than Claude Code StopHook registration. +- If the harness is unknown and no StopHook capability evidence is available, do not turn missing Claude hook files into a FAIL. Mark section 10 as N/A unless there is explicit evidence that the current harness should provide StopHook hooks. +- Any check may report `N/A` when a capability is explicitly unsupported. `N/A` is neutral: it must not count as PASS, WARN, FAIL, or ERROR for the overall health verdict. + --- ## 1. Run Discovery @@ -255,15 +295,26 @@ Mark as PASS if total size < 500MB and no files > 10MB. Mark as WARN if total si ## 10. Hook Execution Health -**Goal:** Verify that the stop hook and session-start hook are properly configured, can execute, and have been running. If the stop hook has NOT been running, diagnose why. 
+**Goal:** Verify stop-hook health only for harnesses that support StopHook. If the active harness does not support StopHook, report a neutral N/A instead of a failure. + +Before running 10a, inspect the Phase 0 harness detection result: + +- If `supportsStopHook` is `false`, mark check 10 as `N/A` and skip 10a-10f. +- The N/A detail must say: `N/A - harness does not advertise HarnessCapability.StopHook; stop-hook registration and Claude-style hook files are not required for this harness.` +- Include the detected capability list and matched evidence in the N/A detail. +- For Pi and Oh My Pi, also note that these harnesses use command-backed skills and extension/session events instead of Claude-style `hooks.json`, `CLAUDE_PLUGIN_ROOT`, hook shell scripts, or `~/.claude` plugin settings. +- Do not mark missing `CLAUDE_PLUGIN_ROOT`, `hooks.json`, `babysitter-stop-hook.sh`, `babysitter-session-start-hook.sh`, or `~/.claude` files as FAIL when `supportsStopHook` is `false`. +- If harness detection is inconclusive but the environment exposes non-StopHook markers such as `PI_SESSION_ID` or `PI_PLUGIN_ROOT`, treat stop-hook execution health as `N/A` for the same reason. +- If `supportsStopHook` is `null` or unknown and there is no explicit evidence that the current harness should provide StopHook hooks, mark check 10 as `N/A` instead of turning missing Claude-style hook files into FAIL. +- Continue with 10a-10f only when `supportsStopHook` is `true` or there is explicit evidence that the current harness should provide StopHook hooks. ### 10a. Hook Registration -- Locate the plugin root. Check for `CLAUDE_PLUGIN_ROOT` env var first, or search for a babysitter `hooks.json` by walking up from the current directory. +- Locate the StopHook-capable plugin root. For Claude Code, check `CLAUDE_PLUGIN_ROOT` first. Otherwise, search for a babysitter `hooks.json` by walking up from the current directory or use the harness-specific plugin root env var from Phase 0. 
- If found, read `hooks.json` and verify: - A `Stop` hook entry exists with a command referencing `babysitter-stop-hook.sh`. - A `SessionStart` hook entry exists with a command referencing `babysitter-session-start-hook.sh`. -- If `hooks.json` is not found, mark as FAIL ("Hook registration file not found — hooks are not registered with Claude Code"). +- If `hooks.json` is not found for a StopHook-capable harness, mark as FAIL ("Hook registration file not found -- hooks are not registered for the detected StopHook-capable harness"). ### 10b. Hook Script Availability @@ -314,12 +365,13 @@ If the stop hook shows NO evidence of execution (no log entries, no journal even Perform these diagnostic steps in order and report the first failure found: -1. **Plugin not installed**: Check if `CLAUDE_PLUGIN_ROOT` is set or if a babysitter plugin directory exists relative to the project root. If neither exists, report: "Plugin not installed — the babysitter plugin directory is missing." +1. **Plugin not installed**: For Claude Code, check if `CLAUDE_PLUGIN_ROOT` is set. For other StopHook-capable harnesses, check the harness-specific plugin root from Phase 0. Also check if a babysitter plugin directory exists relative to the project root. If none exist, report: "Plugin not installed -- the babysitter plugin directory is missing." -2. **Plugin not enabled**: Check for Claude settings files: +2. **Plugin not enabled**: For Claude Code only, check Claude settings files: - `~/.claude/settings.json` — look for `babysitter` in `enabledPlugins`. - `~/.claude/plugins/installed_plugins.json` — look for `babysitter` in the plugins list. - If not found in either, report: "Plugin not enabled in Claude Code settings." + For non-Claude StopHook-capable harnesses, use that harness's plugin enablement file or extension registration mechanism instead of `~/.claude`. 3. 
**hooks.json not registered**: If `hooks.json` doesn't contain a `Stop` hook entry (checked in 10a), report: "Stop hook not registered in hooks.json." @@ -351,6 +403,12 @@ Mark as FAIL if: - CLI is not available - Stop hook is failing (consistent non-zero exit codes or stderr errors) +Mark as N/A if: +- The Phase 0 SDK harness capability probe shows `supportsStopHook: false` +- The detected harness lacks `HarnessCapability.StopHook`, or `supportsStopHook` is `null`/unknown and there is no explicit evidence that the current harness should provide StopHook hooks + +`N/A` is terminal for check 10 and neutral for the final verdict. + --- ## 11. Session-ID Provenance @@ -359,10 +417,10 @@ Mark as FAIL if: - Invoke: `npx babysitter session:whoami --json` - Parse the output and inspect the `resolvedFrom` field. Classify as follows: - - `resolvedFrom: "pid-marker"` → mark as PASS ("Session ID derives from the live Claude Code ancestor process -- authoritative"). - - `resolvedFrom: "env-file"` → mark as PASS with a note ("CLAUDE_ENV_FILE was used; typically healthy"). - - `resolvedFrom: "env-var"` → mark as WARN ("`AGENT_SESSION_ID` is set without a corroborating PID marker. Likely stale from a prior Claude Code session -- see GitHub issue #130"). - - Remediation: run `babysitter session:cleanup` and start a fresh Claude Code session, or `unset AGENT_SESSION_ID` before invoking babysitter. + - `resolvedFrom: "pid-marker"` → mark as PASS ("Session ID derives from the live harness ancestor process -- authoritative"). + - `resolvedFrom: "env-file"` → mark as PASS with a note ("A harness env file was used; typically healthy. For Claude Code this is commonly `CLAUDE_ENV_FILE`."). + - `resolvedFrom: "env-var"` → mark as WARN ("`AGENT_SESSION_ID` is set without a corroborating PID marker. Likely stale from a prior harness session -- see GitHub issue #130"). + - Remediation: run `babysitter session:cleanup` and start a fresh harness session, or `unset AGENT_SESSION_ID` before invoking babysitter. - `resolvedFrom: "none"` → mark as ERROR ("No session ID resolvable.
Either no session-start hook fired, or the ancestor walk failed"). **Env-var shadow check:** @@ -373,11 +431,11 @@ Mark as FAIL if: ## 12. Ancestor Liveness -**Goal:** Confirm the PID marker references a live Claude Code process. +**Goal:** Confirm the PID marker references a live harness process. - Reuse the `session:whoami --json` output from check 11. - Inspect the `ancestorAlive` field. -- If `ancestorAlive === false`, mark as ERROR ("The PID marker references a dead Claude Code process"). +- If `ancestorAlive === false`, mark as ERROR ("The PID marker references a dead harness process"). - Remediation: `babysitter session:cleanup`. - Otherwise mark as PASS. @@ -389,7 +447,7 @@ Mark as FAIL if: - Enumerate files in `~/.a5c/` matching the pattern `current-session-*-pid-*`. - Count markers per harness (derived from the filename). -- If more than one live marker exists for the same harness, mark as INFO ("Multiple live Claude Code / harness sessions detected; ensure each shell scopes `AGENT_SESSION_ID` appropriately -- the PID marker handles this automatically"). +- If more than one live marker exists for the same harness, mark as INFO ("Multiple live harness sessions detected; ensure each shell scopes `AGENT_SESSION_ID` appropriately -- the PID marker handles this automatically"). - Otherwise mark as PASS. --- @@ -447,31 +505,35 @@ OVERALL HEALTH: ISSUES & RECOMMENDATIONS -------------------------------------------- - -- [WARN|FAIL] : + +- [WARN|FAIL|ERROR] : Fix: +- Do not add N/A checks to warnings, failures, or recommendations. -------------------------------------------- ``` **Overall health determination:** -- **HEALTHY**: All 14 checks are PASS (INFO notes are acceptable). +- **HEALTHY**: All 14 checks are PASS or N/A (INFO notes are acceptable). - **WARNING**: At least one check is WARN but none are FAIL or ERROR. - **CRITICAL**: At least one check is FAIL or ERROR. +- **N/A is neutral**: Do not count N/A as PASS, WARN, FAIL, or ERROR.
A report with PASS checks plus N/A-only capability skips is HEALTHY. Present the full detailed findings for each check BEFORE the summary table, so the user can see the evidence. End with the summary table and recommendations. Also, create a single HTML report file with all the findings that uses the arwes UI framework and open it for the user in the browser. --- -## Escalation: Claude /debug +## Escalation -If any check results in FAIL and the root cause is unclear after your own analysis -- especially for environment issues, hook execution failures, CLI availability problems, or permission errors that may relate to the Claude Code runtime itself -- invoke the built-in Claude `/debug` command to get additional diagnostic context from the Claude Code environment. This is particularly useful for: +If the detected harness is `claude-code` and any check results in FAIL with an unclear root cause after your own analysis -- especially for environment issues, hook execution failures, CLI availability problems, or permission errors that may relate to the Claude Code runtime itself -- invoke the built-in Claude `/debug` command to get additional diagnostic context from the Claude Code environment. This is particularly useful for: - Hook scripts that should be running but show no evidence of execution (check 10) - Permission or path resolution issues that don't match expected behavior - Unexpected CLI behavior that might be a Claude Code environment issue rather than a babysitter issue Call `/debug` with a summary of the failing check and what you've already ruled out, so it can focus on environment-level causes. +For non-Claude harnesses, do not suggest `/debug`; use the harness's native diagnostics if available, or `/babysitter:contrib` when the issue should be reported upstream. + --- ## After Diagnosis: Contribute Back @@ -505,7 +567,7 @@ unset AGENT_SESSION_ID # 3. 
Re-bind a run explicitly if needed babysitter session:resume --session-id --state-dir ~/.a5c --run-id --runs-dir .a5c/runs -# 4. Start a fresh Claude Code session (closes and reopens the session) +# 4. Start a fresh harness session (closes and reopens the session) ``` -Run steps 1 and 2 first; re-run `/babysitter:doctor` after each step to confirm the session-provenance checks return to PASS. Step 3 is only needed when a specific run must be re-bound to the fresh session. If the issue persists after step 4, escalate via `/debug` or `/babysitter:contrib`. +Run steps 1 and 2 first; re-run `/babysitter:doctor` after each step to confirm the session-provenance checks return to PASS. Step 3 is only needed when a specific run must be re-bound to the fresh session. If the issue persists after step 4, escalate via Claude `/debug` only on Claude Code; otherwise use the harness's native diagnostics or `/babysitter:contrib`. diff --git a/plugins/babysitter-unified/commands/doctor.md b/plugins/babysitter-unified/commands/doctor.md index 6a3e4ce73f..e371eb3901 100644 --- a/plugins/babysitter-unified/commands/doctor.md +++ b/plugins/babysitter-unified/commands/doctor.md @@ -6,7 +6,7 @@ allowed-tools: Read, Grep, Write, Task, Bash, Edit, Grep, Glob, WebFetch, WebSea You are a diagnostic agent for the babysitter runtime. Your job is to perform a comprehensive health check across 14 areas and produce a structured diagnostic report. Follow each section methodically. Track results as you go and produce the final summary at the end. -Initialize a results tracker with these 14 checks, all starting as PENDING: +Initialize a results tracker with these 14 checks, all starting as PENDING. Valid final check statuses are PASS, WARN, FAIL, ERROR, INFO, and N/A. Treat N/A as neutral: it must never contribute to WARNING or CRITICAL health. 1. Run Discovery 2. Journal Integrity 3. 
State Cache Consistency @@ -22,6 +22,46 @@ Initialize a results tracker with these 14 checks, all starting as PENDING: 13. Concurrent Session Detection 14. Windows Ancestor-Walk Strategy +## Phase 0. Harness Capability Detection + +**Goal:** Identify the active harness and whether stop-hook diagnostics apply before evaluating hook health. + +- Run this detection before section 1 and save the result for sections 7, 10, 11, 12, 13, and escalation guidance. +- Prefer SDK-owned capability truth. Use `detectCallerHarness()` first, then `detectAdapter()` / `getAdapterByName()` as a fallback. An explicit `BABYSITTER_HARNESS` or `BABYSITTER_HARNESS_NAME` environment variable, when set, overrides detection (the probe resolves the adapter from it first). A direct Node probe is acceptable: + +```bash +node - <<'NODE' +const sdk = require('@a5c-ai/babysitter-sdk'); +const caller = sdk.detectCallerHarness?.() ?? null; +const explicit = process.env.BABYSITTER_HARNESS || process.env.BABYSITTER_HARNESS_NAME; +const adapter = explicit + ? sdk.getAdapterByName?.(explicit) + : caller?.name + ? sdk.getAdapterByName?.(caller.name) + : sdk.detectAdapter?.(); +const capabilities = (adapter?.getCapabilities?.() ?? caller?.capabilities ?? []).map(String); +const supportsStopHook = adapter?.supportsHookType?.('stop') + ?? (capabilities.length ? capabilities.includes(String(sdk.HarnessCapability.StopHook)) : null); +const supportsSessionStartHook = adapter?.supportsHookType?.('session-start') ?? null; +console.log(JSON.stringify({ + harness: explicit ?? caller?.name ?? adapter?.name ?? 'unknown', + source: explicit ? 'env' : caller ? 'detectCallerHarness' : adapter ? 'detectAdapter' : 'fallback', + matchedEnvVars: caller?.matchedEnvVars ?? [], + capabilities, + supportsStopHook, + supportsSessionStartHook, +}, null, 2)); +NODE +``` + +- This probe must use SDK-owned harness truth (`detectCallerHarness`, `getAdapterByName`, `getCapabilities`, and `supportsHookType`) rather than a local capability table in the doctor guidance. +- If the Node probe cannot import the SDK, record the failure and try again after `npm run build:sdk`.
If it still fails, use `npx babysitter session:whoami --json`, `run.json`, and harness env vars as low-confidence evidence. Record the fallback and confidence level. +- Display the detected harness, matched evidence, capabilities, and `supportsStopHook` value near the top of the report. +- If the detected harness is known and does not advertise `HarnessCapability.StopHook`, section 10 must be marked `N/A` with a harness-aware explanation. Do not inspect Claude hook files, `hooks.json`, hook shell scripts, or `~/.claude` settings for that harness. +- If the harness is `pi`, explicitly note that Pi uses command-backed skills and extension/session events rather than Claude Code StopHook registration. +- If the harness is unknown and no StopHook capability evidence is available, do not turn missing Claude hook files into a FAIL. Mark section 10 as N/A unless there is explicit evidence that the current harness should provide StopHook hooks. +- Any check may report `N/A` when a capability is explicitly unsupported. `N/A` is neutral: it must not count as PASS, WARN, FAIL, or ERROR for the overall health verdict. + --- ## 1. Run Discovery @@ -255,15 +295,26 @@ Mark as PASS if total size < 500MB and no files > 10MB. Mark as WARN if total si ## 10. Hook Execution Health -**Goal:** Verify that the stop hook and session-start hook are properly configured, can execute, and have been running. If the stop hook has NOT been running, diagnose why. +**Goal:** Verify stop-hook health only for harnesses that support StopHook. If the active harness does not support StopHook, report a neutral N/A instead of a failure. + +Before running 10a, inspect the Phase 0 harness detection result: + +- If `supportsStopHook` is `false`, mark check 10 as `N/A` and skip 10a-10f. 
+- The N/A detail must say: `N/A - harness does not advertise HarnessCapability.StopHook; stop-hook registration and Claude-style hook files are not required for this harness.` +- Include the detected capability list and matched evidence in the N/A detail. +- For Pi and Oh My Pi, also note that these harnesses use command-backed skills and extension/session events instead of Claude-style `hooks.json`, `CLAUDE_PLUGIN_ROOT`, hook shell scripts, or `~/.claude` plugin settings. +- Do not mark missing `CLAUDE_PLUGIN_ROOT`, `hooks.json`, `babysitter-stop-hook.sh`, `babysitter-session-start-hook.sh`, or `~/.claude` files as FAIL when `supportsStopHook` is `false`. +- If harness detection is inconclusive but the environment exposes non-StopHook markers such as `PI_SESSION_ID` or `PI_PLUGIN_ROOT`, treat stop-hook execution health as `N/A` for the same reason. +- If `supportsStopHook` is `null` or unknown and there is no explicit evidence that the current harness should provide StopHook hooks, mark check 10 as `N/A` instead of turning missing Claude-style hook files into FAIL. +- Continue with 10a-10f only when `supportsStopHook` is `true` or there is explicit evidence that the current harness should provide StopHook hooks. ### 10a. Hook Registration -- Locate the plugin root. Check for `CLAUDE_PLUGIN_ROOT` env var first, or search for a babysitter `hooks.json` by walking up from the current directory. +- Locate the StopHook-capable plugin root. For Claude Code, check `CLAUDE_PLUGIN_ROOT` first. Otherwise, search for a babysitter `hooks.json` by walking up from the current directory or use the harness-specific plugin root env var from Phase 0. - If found, read `hooks.json` and verify: - A `Stop` hook entry exists with a command referencing `babysitter-stop-hook.sh`. - A `SessionStart` hook entry exists with a command referencing `babysitter-session-start-hook.sh`. 
-- If `hooks.json` is not found, mark as FAIL ("Hook registration file not found — hooks are not registered with Claude Code"). +- If `hooks.json` is not found for a StopHook-capable harness, mark as FAIL ("Hook registration file not found -- hooks are not registered for the detected StopHook-capable harness"). ### 10b. Hook Script Availability @@ -314,12 +365,13 @@ If the stop hook shows NO evidence of execution (no log entries, no journal even Perform these diagnostic steps in order and report the first failure found: -1. **Plugin not installed**: Check if `CLAUDE_PLUGIN_ROOT` is set or if a babysitter plugin directory exists relative to the project root. If neither exists, report: "Plugin not installed — the babysitter plugin directory is missing." +1. **Plugin not installed**: For Claude Code, check if `CLAUDE_PLUGIN_ROOT` is set. For other StopHook-capable harnesses, check the harness-specific plugin root from Phase 0. Also check if a babysitter plugin directory exists relative to the project root. If none exist, report: "Plugin not installed -- the babysitter plugin directory is missing." -2. **Plugin not enabled**: Check for Claude settings files: +2. **Plugin not enabled**: For Claude Code only, check Claude settings files: - `~/.claude/settings.json` — look for `babysitter` in `enabledPlugins`. - `~/.claude/plugins/installed_plugins.json` — look for `babysitter` in the plugins list. - If not found in either, report: "Plugin not enabled in Claude Code settings." + For non-Claude StopHook-capable harnesses, use that harness's plugin enablement file or extension registration mechanism instead of `~/.claude`. 3. **hooks.json not registered**: If `hooks.json` doesn't contain a `Stop` hook entry (checked in 10a), report: "Stop hook not registered in hooks.json." 
@@ -351,6 +403,12 @@ Mark as FAIL if: - CLI is not available - Stop hook is failing (consistent non-zero exit codes or stderr errors) +Mark as N/A if: +- The Phase 0 SDK harness capability probe shows `supportsStopHook: false` +- The detected harness lacks `HarnessCapability.StopHook`, or `supportsStopHook` is `null`/unknown and there is no explicit evidence that the current harness should provide StopHook hooks + +`N/A` is terminal for check 10 and neutral for the final verdict. + --- ## 11. Session-ID Provenance @@ -359,10 +417,10 @@ Mark as FAIL if: - Invoke: `npx babysitter session:whoami --json` - Parse the output and inspect the `resolvedFrom` field. Classify as follows: - - `resolvedFrom: "pid-marker"` → mark as PASS ("Session ID derives from the live Claude Code ancestor process -- authoritative"). - - `resolvedFrom: "env-file"` → mark as PASS with a note ("CLAUDE_ENV_FILE was used; typically healthy"). - - `resolvedFrom: "env-var"` → mark as WARN ("`AGENT_SESSION_ID` is set without a corroborating PID marker. Likely stale from a prior Claude Code session -- see GitHub issue #130"). - - Remediation: run `babysitter session:cleanup` and start a fresh Claude Code session, or `unset AGENT_SESSION_ID` before invoking babysitter. + - `resolvedFrom: "pid-marker"` → mark as PASS ("Session ID derives from the live harness ancestor process -- authoritative"). + - `resolvedFrom: "env-file"` → mark as PASS with a note ("A harness env file was used; typically healthy. For Claude Code this is commonly `CLAUDE_ENV_FILE`."). + - `resolvedFrom: "env-var"` → mark as WARN ("`AGENT_SESSION_ID` is set without a corroborating PID marker. Likely stale from a prior harness session -- see GitHub issue #130"). + - Remediation: run `babysitter session:cleanup` and start a fresh harness session, or `unset AGENT_SESSION_ID` before invoking babysitter. - `resolvedFrom: "none"` → mark as ERROR ("No session ID resolvable. Either no session-start hook fired, or the ancestor walk failed"). **Env-var shadow check:** @@ -373,11 +431,11 @@ Mark as FAIL if: ## 12.
Ancestor Liveness -**Goal:** Confirm the PID marker references a live Claude Code process. +**Goal:** Confirm the PID marker references a live harness process. - Reuse the `session:whoami --json` output from check 11. - Inspect the `ancestorAlive` field. -- If `ancestorAlive === false`, mark as ERROR ("The PID marker references a dead Claude Code process"). +- If `ancestorAlive === false`, mark as ERROR ("The PID marker references a dead harness process"). - Remediation: `babysitter session:cleanup`. - Otherwise mark as PASS. @@ -389,7 +447,7 @@ Mark as FAIL if: - Enumerate files in `~/.a5c/` matching the pattern `current-session-*-pid-*`. - Count markers per harness (derived from the filename). -- If more than one live marker exists for the same harness, mark as INFO ("Multiple live Claude Code / harness sessions detected; ensure each shell scopes `AGENT_SESSION_ID` appropriately -- the PID marker handles this automatically"). +- If more than one live marker exists for the same harness, mark as INFO ("Multiple live harness sessions detected; ensure each shell scopes `AGENT_SESSION_ID` appropriately -- the PID marker handles this automatically"). - Otherwise mark as PASS. --- @@ -447,31 +505,35 @@ OVERALL HEALTH: ISSUES & RECOMMENDATIONS -------------------------------------------- - -- [WARN|FAIL] : + +- [WARN|FAIL|ERROR] : Fix: +- Do not add N/A checks to warnings, failures, or recommendations. -------------------------------------------- ``` **Overall health determination:** -- **HEALTHY**: All 14 checks are PASS (INFO notes are acceptable). +- **HEALTHY**: All 14 checks are PASS or N/A (INFO notes are acceptable). - **WARNING**: At least one check is WARN but none are FAIL or ERROR. - **CRITICAL**: At least one check is FAIL or ERROR. +- **N/A is neutral**: Do not count N/A as PASS, WARN, FAIL, or ERROR. A report with PASS checks plus N/A-only capability skips is HEALTHY.
Present the full detailed findings for each check BEFORE the summary table, so the user can see the evidence. End with the summary table and recommendations. Also, create a single HTML report file with all the findings that uses the arwes UI framework and open it for the user in the browser. --- -## Escalation: Claude /debug +## Escalation -If any check results in FAIL and the root cause is unclear after your own analysis -- especially for environment issues, hook execution failures, CLI availability problems, or permission errors that may relate to the Claude Code runtime itself -- invoke the built-in Claude `/debug` command to get additional diagnostic context from the Claude Code environment. This is particularly useful for: +If the detected harness is `claude-code` and any check results in FAIL with an unclear root cause after your own analysis -- especially for environment issues, hook execution failures, CLI availability problems, or permission errors that may relate to the Claude Code runtime itself -- invoke the built-in Claude `/debug` command to get additional diagnostic context from the Claude Code environment. This is particularly useful for: - Hook scripts that should be running but show no evidence of execution (check 10) - Permission or path resolution issues that don't match expected behavior - Unexpected CLI behavior that might be a Claude Code environment issue rather than a babysitter issue Call `/debug` with a summary of the failing check and what you've already ruled out, so it can focus on environment-level causes. +For non-Claude harnesses, do not suggest `/debug`; use the harness's native diagnostics if available, or `/babysitter:contrib` when the issue should be reported upstream. + --- ## After Diagnosis: Contribute Back @@ -505,7 +567,7 @@ unset AGENT_SESSION_ID # 3. Re-bind a run explicitly if needed babysitter session:resume --session-id --state-dir ~/.a5c --run-id --runs-dir .a5c/runs -# 4. 
Start a fresh Claude Code session (closes and reopens the session) +# 4. Start a fresh harness session (closes and reopens the session) ``` -Run steps 1 and 2 first; re-run `/babysitter:doctor` after each step to confirm the session-provenance checks return to PASS. Step 3 is only needed when a specific run must be re-bound to the fresh session. If the issue persists after step 4, escalate via `/debug` or `/babysitter:contrib`. +Run steps 1 and 2 first; re-run `/babysitter:doctor` after each step to confirm the session-provenance checks return to PASS. Step 3 is only needed when a specific run must be re-bound to the fresh session. If the issue persists after step 4, escalate via Claude `/debug` only on Claude Code; otherwise use the harness's native diagnostics or `/babysitter:contrib`.