diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index ad1f4a8..3d529b2 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -6,8 +6,7 @@ }, "metadata": { "description": "Official Bitwarden Claude Plugin Marketplace", - "version": "1.0.1", - "pluginRoot": "./plugins" + "version": "1.1.0" }, "plugins": [ { @@ -25,7 +24,7 @@ { "name": "bitwarden-code-review", "source": "./plugins/bitwarden-code-review", - "version": "1.10.0", + "version": "1.11.0", "description": "Comprehensive code review system with organization-wide standards." }, { diff --git a/README.md b/README.md index 2266252..8ab23c8 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ A curated collection of plugins for AI-assisted development at Bitwarden. Enable | ------------------------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------- | | [bitwarden-tech-lead](plugins/bitwarden-tech-lead/) | 2.0.0 | Software architect for technical planning, architecture reviews, and implementation phasing | | [bitwarden-atlassian-tools](plugins/bitwarden-atlassian-tools/) | 2.2.3 | Read-only Atlassian access via MCP server with deep Jira issue research skill | -| [bitwarden-code-review](plugins/bitwarden-code-review/) | 1.10.0 | Autonomous code review agent following Bitwarden engineering standards with GitHub integration | +| [bitwarden-code-review](plugins/bitwarden-code-review/) | 1.11.0 | Autonomous code review agent following Bitwarden engineering standards with GitHub integration | | [bitwarden-delivery-tools](plugins/bitwarden-delivery-tools/) | 1.0.0 | Generic delivery workflow skills for committing, PR creation, preflight checks, and change labeling | | [bitwarden-devops-engineer](plugins/bitwarden-devops-engineer/) | 0.1.1 | DevOps engineering assistant: workflow compliance linting, action security auditing, and org-wide CI/CD remediation | | [bitwarden-init](plugins/bitwarden-init/) | 1.1.0 | Initialize and enhance CLAUDE.md files with Bitwarden's standardized template format | diff --git a/plugins/bitwarden-code-review/.claude-plugin/plugin.json b/plugins/bitwarden-code-review/.claude-plugin/plugin.json index 56ea8ba..5134995 100644 --- a/plugins/bitwarden-code-review/.claude-plugin/plugin.json +++ b/plugins/bitwarden-code-review/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "bitwarden-code-review", - "version": "1.10.0", + "version": "1.11.0", "description": "Comprehensive code review system with organization-wide standards.", "author": { "name": "Bitwarden", diff --git a/plugins/bitwarden-code-review/CHANGELOG.md b/plugins/bitwarden-code-review/CHANGELOG.md index 486e73a..c03b989 100644 --- a/plugins/bitwarden-code-review/CHANGELOG.md +++ b/plugins/bitwarden-code-review/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to the Bitwarden Code Review Plugin will be documented in th The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.11.0] - 2026-05-04 + +### Added + +- New `performing-multi-agent-code-review` skill: orchestrates a multi-agent code review pipeline. + ## [1.10.0] - 2026-04-28 ### Added diff --git a/plugins/bitwarden-code-review/README.md b/plugins/bitwarden-code-review/README.md index 420d1e0..af276c5 100644 --- a/plugins/bitwarden-code-review/README.md +++ b/plugins/bitwarden-code-review/README.md @@ -17,14 +17,15 @@ This plugin provides an autonomous code review agent that conducts thorough, pro ## Skills -| Skill | Triggers | Purpose | -| --------------------------------------------------------------------------------- | --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | -| [`classifying-review-findings`](./skills/classifying-review-findings/SKILL.md) | "classify finding", "severity" | 5-tier severity system (CRITICAL / IMPORTANT / DEBT / SUGGESTED / QUESTION) with emoji and label mapping | -| [`avoiding-false-positives`](./skills/avoiding-false-positives/SKILL.md) | "validate finding", "verify before posting" | Rejection criteria and verification checks that drop low-confidence findings before they reach a comment | -| [`posting-bitwarden-review-comments`](./skills/posting-bitwarden-review-comments/SKILL.md) | "post inline comment", "post PR comment" | Inline PR comment formatting per Bitwarden standards (severity emojis, explanation, actionable suggestion) | -| [`posting-review-summary`](./skills/posting-review-summary/SKILL.md) | "post summary", "summary comment" | Final summary comment handling — routes to sticky comment, GitHub Actions MCP tool, or local file based on context | -| [`reviewing-dependency-changes`](./skills/reviewing-dependency-changes/SKILL.md) | "package.json", "Renovate PR", "dependency manifest" | Flags dependency manifest changes for AppSec approval, version-bump significance, and lock-file hygiene | -| [`addressing-code-review-comments`](./skills/addressing-code-review-comments/SKILL.md) | "address review comments", "respond to PR feedback" | Guides developers working through review comments locally — verify before implementing, surface ambiguity, no performative agreement | +| Skill | Triggers | Purpose | +| ------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | +| [`classifying-review-findings`](./skills/classifying-review-findings/SKILL.md) | "classify finding", "severity" | 5-tier severity system (CRITICAL / IMPORTANT / DEBT / SUGGESTED / QUESTION) with emoji and label mapping | +| [`avoiding-false-positives`](./skills/avoiding-false-positives/SKILL.md) | "validate finding", "verify before posting" | Rejection criteria and verification checks that drop low-confidence findings before they reach a comment | +| [`performing-multi-agent-code-review`](./skills/performing-multi-agent-code-review) | "perform multi-agent code review", "review the last week of commits in this repo" | Perform a rigorous, multi-agent code review | +| [`posting-bitwarden-review-comments`](./skills/posting-bitwarden-review-comments/SKILL.md) | "post inline comment", "post PR comment" | Inline PR comment formatting per Bitwarden standards (severity emojis, explanation, actionable suggestion) | +| [`posting-review-summary`](./skills/posting-review-summary/SKILL.md) | "post summary", "summary comment" | Final summary comment handling — routes to sticky comment, GitHub Actions MCP tool, or local file based on context | +| [`reviewing-dependency-changes`](./skills/reviewing-dependency-changes/SKILL.md) | "package.json", "Renovate PR", "dependency manifest" | Flags dependency manifest changes for AppSec approval, version-bump significance, and lock-file hygiene | +| [`addressing-code-review-comments`](./skills/addressing-code-review-comments/SKILL.md) | "address review comments", "respond to PR feedback" | Guides developers working through review comments locally — verify before implementing, surface ambiguity, no performative agreement | ## Architecture diff --git a/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/SKILL.md b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/SKILL.md new file mode 100644 index 0000000..ffbc891 --- /dev/null +++ b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/SKILL.md @@ -0,0 +1,180 @@ +--- +name: performing-multi-agent-code-review +description: Perform a rigorous, multi-agent code review with architecture-compliance, parallel quality/security analysis, finding validation, and severity audit. Use whenever the user asks for a structured, deep, thorough, multi-pass, or multi-agent code review — or a review that includes architecture/pattern compliance, confidence-scored findings, or a severity audit — even if they don't say the exact phrase "multi-agent". Prefer this over a single-agent review when the user wants high-signal findings with validation. Also use whenever the user asks for a code review across a commit range, time window, or N most recent commits in a locally checked-out repo (e.g. "review the last week of commits in bitwarden/server", "review the last 20 commits", "review changes since 2026-04-23") — these route to the commit-range mode below. +allowed-tools: "Bash(gh pr diff:*), Bash(gh pr view:*), Bash(git diff:*), Bash(git status:*), Bash(git rev-parse:*), Bash(git check-ignore:*), Bash(git log:*), Bash(git rev-list:*), Read, Write, Grep, Glob, Task, Skill, AskUserQuestion" +--- + +# Overview + +Execute a structured, multi-agent code review on a set of code changes. Follow the process below precisely — skipping steps degrades consistency and accuracy. + +## Prerequisites + +This skill depends on the following sibling plugins. If any are not installed, **abort the review with a clear error message** identifying the missing plugin — do not attempt to proceed with a degraded pipeline. + +- **`bitwarden-tech-lead`** — provides the architecture review subagent. +- **`bitwarden-security-engineer`** — provides security context and analysis skills. + +Before Step 1, verify each prerequisite is resolvable. If a prerequisite is missing, print: + +> Prerequisite plugin `` is not installed. Install it and retry. Review aborted. + +…and stop. + +## Mode + +Read `references/modes.md`. Loaded in Step 1; the orchestrator determines the mode from the invocation, runs the resolution sequence (commit-range mode only), and uses the matching diff-source commands to populate Step 1's gathered context. Modes are orchestrator-only and not propagated to subagents. + +## Operating Rules + +Applies to all agents and subagents. + +- Model: Default to the opus model unless `--model` is specified. +- Announce which model is being used before starting the review. +- Don't write to GitHub. All findings go to a local markdown file. +- Tool discipline (see Orchestration → Tool Discipline) applies to the main agent and is propagated verbatim to every subagent. Rationale for the WebFetch/WebSearch ban: bypasses `gh` auth, skips audit trails, can return stale cached pages. + +## Orchestration + +Applied when launching subagents. + +### Project Preamble Propagation + +Subagents do not inherit the main agent's CLAUDE.md context. Every subagent prompt in Steps 2–5 MUST open with the two required blocks below, in order, followed by the conditional block if it applies. + +**Required — Bitwarden security context.** Include this directive verbatim: + +> At the start of your analysis, invoke `Skill(bitwarden-security-engineer:bitwarden-security-context)`. Use its principles, vocabulary, and requirement categories verbatim when classifying findings — do not paraphrase. + +**Required — zero-knowledge and threat-model preamble.** Include this block verbatim in the subagent prompt: + +> **Zero-knowledge invariant.** Bitwarden servers only store and synchronize encrypted vault data. The server, Bitwarden employees, and third parties must never be able to access unencrypted vault data. Encryption and decryption happen client-side only. The Master Key and Stretched Master Key are never stored on or transmitted to Bitwarden servers. +> +> **Threat-model directive.** Evaluate every change against P01–P06 and the requirements under VD/EK/AT/SC/TC (loaded via the `bitwarden-security-context` skill per the preceding block). For each finding that touches vault data, keys, auth tokens, or user authenticity, name the principle or category it implicates. + +**Conditional — repo-specific forwarding.** A repo's checked-in `CLAUDE.md` may contain a section that explicitly instructs you to forward it to subagents (e.g., _"when spawning subagents, include..."_ or _"propagate this to subagents"_). If so, paste that section verbatim. If not, the two required blocks alone suffice. + +### Tool Discipline + +Include this block verbatim in every Step 2–5 subagent prompt, immediately after the Preamble Propagation blocks: + +> **Tool discipline.** +> +> - Use Bash for all `gh`/`git` commands. Never use WebFetch or WebSearch. +> - Assume tools work. Do not probe — no `ls`, `pwd`, `which`, `--version`, `--help`, or pre-read existence checks. +> - The diff, file paths, and PR metadata are in this prompt. Do not re-fetch. +> - On tool failure: note in output and continue. Do not probe to diagnose. + +### Context Partitioning + +Feature context — issue descriptions, Jira tickets, PR history, removed-predecessor rationale, product framing — sharpens adversarial thinking but biases baseline diff reading. Classify each subagent before launch: + +- **Context-allowed** (Step 2 architecture agent; Step 3 Agent 3 security & logic): pass full feature context. These agents think adversarially from intent. +- **Context-forbidden** (Step 3 Agent 1 code quality; Step 3 Agent 2 bug analysis): **ONLY** pass the diff and the Review Rules. **DO NOT** paste issue summaries, Jira tickets, or PR description prose into these prompts. +- **Style-matching requirement.** The main agent's tone and framing across parallel agents leaks — a rich-context prompt for the security agent alongside a bare prompt for the bug agent still implicitly biases the bug agent through the shared authored reality. When drafting context-forbidden prompts, match the terse style of the diff-only sibling prompts; do not echo the framing of the context-allowed siblings. + +## Discovery Standards + +Read `references/discovery-standards.md`. Referenced by Step 2 (architect doc/code consistency pass) and Step 3 Agent 1 (Hygiene Sweep). The Line Number Accuracy rule is propagated verbatim into every Step 2–5 subagent prompt. + +## Evaluation Standards + +Read `references/evaluation-standards.md`. Severity Levels, Do Not Flag, and Confidence Scoring are propagated verbatim into every Step 2–5 subagent prompt; the Finding Shape schema lives in `references/finding-shape.md` and is also propagated verbatim. + +## Review Rules + +Every Step 2–5 subagent prompt MUST include all of the following blocks verbatim, in order. Throughout this skill, this bundle is referred to as the **Review Rules**: + +- **Project Preamble Propagation** (above) — Bitwarden security context, zero-knowledge invariant, threat-model directive. +- **Tool Discipline** (above). +- **Line Number Accuracy** from `references/discovery-standards.md`. +- **Severity Levels**, **Do Not Flag**, and **Confidence Scoring** from `references/evaluation-standards.md`. +- **Finding Shape** schema from `references/finding-shape.md`. + +When a step below says "the Review Rules," it means this exact bundle — never a subset. + +## Code Review Process + +Execute these steps in order. Do not skip, reorder, or combine steps. + +1. Gather context (no subagents). All `references/...` paths below resolve relative to this skill's directory — do not search elsewhere. + - **READ** `references/modes.md`. The orchestrator follows it to determine the review mode and the matching diff-source commands. + - Determine the mode per `references/modes.md`. Fetch the list of changed files with the mode's command: `gh pr diff {number} --name-only` (PR), `git diff --name-only` (local), `git diff origin/HEAD --name-only` (branch comparison), or `git diff .. --name-only` (commit range). In PR mode, also fetch the title and description with `gh pr view`. + - **READ** CLAUDE.md, README.md, and any other relevant .md files in or near the directories containing modified files. + - **READ** `references/report-template.md` for formatting the final report in Step 7. + - **READ** `references/finding-shape.md`. Its contents are pasted verbatim into every Step 2–5 subagent prompt. + - **READ** `references/discovery-standards.md`. The Hygiene Sweep is referenced by name in the Step 3 Agent 1 prompt; Line Number Accuracy is propagated verbatim into every Step 2–5 subagent prompt. + - **READ** `references/evaluation-standards.md`. Severity Levels, Do Not Flag, and Confidence Scoring are propagated verbatim into every Step 2–5 subagent prompt. + +2. Launch a single architecture & pattern compliance agent using the `bitwarden-tech-lead:bitwarden-tech-lead` subagent type. Give it the diff, the list of changed file paths, and — in PR mode only — the PR title and description. + + Unlike the diff agents in Step 3, this agent reads BEYOND the diff to check whether changes fit the codebase. + + Responsibilities: + - Read the full files being modified (not just diff hunks) to understand surrounding context. + - Read CLAUDE.md, README.md, and other relevant .md files in or near the modified directories; verify each change complies with explicit project rules. + - Use Glob and Grep to find how similar code is structured elsewhere in the codebase. + - **Doc/code consistency pass** — flag contradictions this diff creates between the code and same-repo documentation, configuration, or agent-facing files (e.g., a `CLAUDE.md` entry describing handler behavior the diff now changes; a README example that no longer matches the new signature; `.claude/` agent instructions referencing behavior the PR removes). Only flag divergence this change creates or worsens — do not audit pre-existing drift. + + **Scope.** Raise pattern inconsistencies, architectural boundary violations, duplicated abstractions, and new conventions introduced where an established one applies. Do NOT raise correctness bugs, security issues, or code-quality concerns — those belong to Step 3. + + Apply the Review Rules. Threshold ≥ 80. Emit findings as a JSON array per the Finding Shape schema. + +3. Launch 3 agents using the `general-purpose` subagent type to independently review the changes. Each receives the diff and the Review Rules; each emits findings as a JSON array per the Finding Shape schema. In PR mode, pass the PR title and description only to Agent 3 per Context Partitioning — Agents 1 and 2 receive diff + Review Rules only. Send all 3 Agent tool calls in a single message (do NOT use run_in_background). + + **Agent 1: Code quality agent** + Read the introduced code as a senior engineer reviewing it for the first time. Surface anything that hurts correctness, clarity, or long-term maintainability — code duplication, missing critical error handling, accessibility gaps, inadequate test coverage, overly complex logic, unclear naming, inconsistent patterns. Prefer readable, explicit code over compact solutions; flag readability problems alongside correctness ones rather than treating them as separate categories. + + Before submitting findings, perform the **Hygiene Sweep** defined in `references/discovery-standards.md`. + + **Agent 2: Bug analysis agent** + Scan the diff for significant bugs visible without outside context. Skip nitpicks, likely false positives, and anything you'd need to read other files to confirm. + + **Agent 3: Security & logic agent** + Find security flaws and logic errors in the introduced code. Stay scoped to changed lines. + + Invoke `Skill(bitwarden-security-engineer:analyzing-code-security)`, `Skill(bitwarden-security-engineer:detecting-secrets)`, and `Skill(bitwarden-security-engineer:reviewing-dependencies)` from the `bitwarden-security-engineer` plugin to cover classic application-security items. + + In addition to attacker-as-LLM and attacker-as-server threat models, evaluate the **user-side threat surface**. Apply the **Trusted Channel** concept from the loaded security context — ask whether the user-facing surface qualifies: + - **Authenticity of prompts shown to the user** — can the user tell which application is requesting sensitive input? Dialog titles, branding, and prompt strings should allow the user to resist spoofed-dialog phishing. + - **Consent gates** — is every action requiring user authorization clearly labeled, with sufficient context for the user to make an informed decision? + - **Output authenticity** — are success/failure messages returned to the user distinguishable from messages an attacker could forge through the same channel? + + This vector is distinct from preventing secrets from reaching the LLM. Both must be evaluated. + + Apply the Review Rules. Threshold ≥ 80. + +4. Launch a single `general-purpose` validation subagent for all findings from Steps 2 and 3. The subagent receives the diff fetched with the mode's diff command from Step 1, the full array of finding objects, the Review Rules, and — in PR mode only — the PR title and description. The subagent returns an array of Step 4 objects (one per input finding) per the Finding Shape schema. + + **Chunking escape hatch.** If raw findings from Steps 2 and 3 number more than 25, partition them into chunks of ≤ 15 (preserving collateral context within each chunk; do not split a `source_agent` group across chunks if it would put related findings on opposite sides) and launch one validation subagent per chunk in a single message (do NOT use run_in_background). + + A finding is **dismissed** if ANY of the following are true: + - It is a pre-existing finding, not introduced by this change. In commit-range mode, treat the cumulative diff of `..` as "this change" and the parent of `` as the pre-existing baseline. + - **Bugs**: The problem does not actually exist in the code (e.g., the variable is not truly undefined, the logic error does not actually produce wrong results) + - It is a nitpick that a senior engineer would not flag in a real code review + - It would be caught by a linter (**do not run** the linter to verify) + - It is a general code quality concern that wouldn't be flagged in a real code review. In other words, do not state generics. All findings **MUST** be specific and actionable. + + **Collateral-change check.** When a finding is about to be dismissed as "deliberate divergence from an established pattern" or "documented exception," before dismissing it check whether supporting code was updated _consistent with_ the divergence. Specifically, scan the diff for: + - Allowlist, registry, or lookup-table entries that assume the old pattern and are now stale or dead. + - Schema, type, or interface definitions that still describe the pre-divergence contract. + - Documentation, comments, or error messages that reference the abandoned path. + + If the divergence is deliberate but its collateral was not updated, the collateral is a new finding (typically ♻️ Refactor) — do not dismiss the original finding silently; route the collateral problem as its own finding instead. + +5. Launch a single `general-purpose` severity-audit agent. Give it all validated findings from step 4, the diff, and the Review Rules. For each finding, the agent must: + - Confirm the severity assigned by the review agent, or + - Downgrade it to a lower severity if the evidence doesn't support the original rating, or + - Dismiss it entirely if it does not meet the bar for any severity level. + + The agent returns a Step 5 object per the Finding Shape schema for each input finding. + +6. Merge all Step 4 and Step 5 returns by `id` into the master finding map. Creation-time fields are immutable (see `references/finding-shape.md`). For dismissed findings, set `dismissal_stage` to `"Step 4 validation"` or `"Step 5 severity audit"` based on which step set the dismissal status — it renders as `**Dismissed at:**`. Partition by final status: validated (Step 5 `confirmed` or `downgraded`) becomes the main Findings section; dismissed (Step 4 `dismissed` or Step 5 `dismissed`) preserves original severity, original confidence, dismissal stage, and dismissal reason for rendering in the Dismissed block. + +7. Format the report using the template in `references/report-template.md`. Cite every validated AND dismissed finding with full file path and line: `file/path.ext:{line}` (or `:{start}-{end}` for ranges). Omit any severity section with zero findings. If zero findings total, replace the Findings section with: "No findings found." For every rendered finding (validated and dismissed), populate the `**Caught by:**` line from the finding's `source_agent` field, translated to the friendly label per the table in `references/report-template.md`. Dismissed findings additionally render `**Original severity:**`, `**Original confidence:**`, `**Dismissed at:**`, and `**Dismissed because:**` per the template — past runs have silently dropped these, so do not omit any of them; per-finding traceability requires the full set. + +8. Print the full formatted report to the terminal. + +9. Write the formatted report to the repository root in a markdown file with the following naming convention: + +- File name: `code-review-PR-{number}.md` (PR mode), `code-review-{YYYY-MM-DD}.md` (local mode), `code-review-{branch}-{YYYY-MM-DD}.md` (branch comparison mode), or `code-review-{from-short}..{to-short}.md` (commit-range mode, where `{from-short}`/`{to-short}` are 7-char SHAs or shorter ref names). diff --git a/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/discovery-standards.md b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/discovery-standards.md new file mode 100644 index 0000000..742cc0c --- /dev/null +++ b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/discovery-standards.md @@ -0,0 +1,22 @@ +# Discovery Standards + +Loaded by the orchestrator in Step 1. The **Hygiene Sweep** is invoked by name from the Step 3 Agent 1 (code quality) prompt. The **Line Number Accuracy** rule is propagated verbatim into every Step 2–5 subagent prompt. + +## Hygiene Sweep + +Agent 1 (code quality) performs a hygiene sweep of the diff before submitting findings; the Step 2 architect performs an analogous doc/code consistency pass per its own directive. When referenced, look specifically for: + +- **Dead code added by this PR** — allowlist/registry/lookup-table entries added for features that don't flow through the validated entry point; unused imports; unreachable branches. +- **Stale references** — documentation, comments, error messages, or assertions in this diff that contradict the same diff's implementation. +- **Cross-site inconsistency** — a new call site that differs from established sibling sites in a way not explained by the change (e.g., four platform dialogs where three carry a title and the fourth silently drops it). + +This is not an exhaustive checklist — surface anything diff-visible that a senior engineer would flag in a real review. + +## Line Number Accuracy + +Cite **actual file line numbers**, not positions within the diff. Derive them from the hunk header: + +- Parse `@@ -A,B +C,D @@` — `+C` is the starting file line for the hunk. New files use `@@ -0,0 +1,N @@`, so C=1. +- From `+C`, count `+` lines and context lines (no prefix) up to your target. Skip `-` lines, `@@` lines, and `---`/`+++` lines. + +**Never guess. Always derive from the hunk header.** diff --git a/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/evaluation-standards.md b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/evaluation-standards.md new file mode 100644 index 0000000..d0bd6e6 --- /dev/null +++ b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/evaluation-standards.md @@ -0,0 +1,40 @@ +# Evaluation Standards + +Loaded by the orchestrator in Step 1. **Severity Levels**, **Do Not Flag**, and **Confidence Scoring** below are propagated verbatim into every Step 2–5 subagent prompt. The **Finding Shape** schema lives in `finding-shape.md` and is propagated the same way. + +## Severity Levels + +Every finding must be assigned one of the following. Do not guess — apply these definitions literally. + +- 🛑 **Blocker** — Will cause a production failure, data loss, or security breach. +- ⚠️ **Important** — A real bug or significant risk that is likely to be hit in practice. +- ♻️ **Refactor** — True technical debt being created that will cost more to maintain over time, even if it doesn't cause immediate problems. Must cite concrete evidence — duplication of an existing pattern, violation of a documented convention, or a measurable structural improvement. If the rationale can't be made concrete, it isn't a finding. + +There is no "suggestion" or other lower tier. Findings that don't clear the Refactor bar are not findings. + +## Do Not Flag + +The following are not valid findings under any tier. Subagents must not emit them, and Step 5 dismisses any that slip through. + +- Code style or quality concerns absent a documented project rule. +- Subjective suggestions or improvements — "could be cleaner", "consider doing X", "this might be simpler". +- Pedantic nit-picks a senior engineer would not raise in code review. +- Issues a linter would catch. +- Speculative issues that depend on specific inputs or runtime state without evidence those inputs occur in practice. +- Pre-existing issues not introduced or worsened by this change. + +## Confidence Scoring + +Rate each potential finding on a 0–100 scale: + +- **0**: Not confident — false positive or pre-existing issue. +- **25**: Somewhat confident — might be real, might be a false positive. Stylistic issues not called out in project guidelines land here. +- **50**: Moderately confident — real issue, but a nitpick, unlikely to hit in practice, or is a stylistic preference without project-rule backing. +- **80**: Highly confident — verified; very likely to hit in practice. Directly impacts functionality or violates a project guideline. +- **100**: Certain — evidence directly confirms it will happen frequently. + +**Only report findings with confidence ≥ 80.** Findings rated 50–79 are dismissed silently; do not re-rate upward to clear the threshold. + +## Finding Shape + +Every finding and every Step 4/5 return object follows the JSON schema in `finding-shape.md`. The main orchestrator loads that file in Step 1 and propagates its contents verbatim to every subagent. diff --git a/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/finding-shape.md b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/finding-shape.md new file mode 100644 index 0000000..c47c10b --- /dev/null +++ b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/finding-shape.md @@ -0,0 +1,53 @@ +# Finding Shape + +Every finding and every Step 4/5 return object follows the JSON schema below. Subagents emit JSON arrays; the main orchestrator parses by field. + +## Finding object (created in Steps 2 and 3) + +Emit as a JSON array. Each finding: + +| field | type | notes | +| -------------- | ------- | -------------------------------------------------------------------------------- | +| `id` | string | `{source}-{n}`, e.g. `"bug-3"`. Source ∈ `arch`, `quality`, `bug`, `sec`, `val`. | +| `file` | string | Repo-relative path. | +| `line` | string | `"42"` or `"42-50"`. Derived per Line Number Accuracy. | +| `severity` | string | `"blocker"` \| `"important"` \| `"refactor"`. | +| `confidence` | integer | 0–100. Only findings ≥ 80 are emitted. | +| `title` | string | < 100 chars. Renders as the section header in the final report. | +| `detail` | string | Markdown. Explanation, why it matters, suggested fix. | +| `source_agent` | string | `"architect"` \| `"quality"` \| `"bug"` \| `"security"` \| `"validation"`. | + +If an agent produces no findings, return `[]`. + +The orchestrator renders `source_agent` on every finding in the final report — set it accurately. The id-prefix → source_agent mapping is fixed: `arch → architect`, `quality → quality`, `bug → bug`, `sec → security`, `val → validation`. + +## Step 4 return (validation) + +One entry per incoming finding, keyed by `id`: + +| field | type | notes | +| ------------------ | ------ | ----------------------------------------- | +| `id` | string | Matches input. | +| `status` | string | `"validated"` \| `"dismissed"`. | +| `dismissal_reason` | string | Present only when `status = "dismissed"`. | + +**Collateral findings** produced during Step 4 (per the collateral-change check) use the full **Finding object** schema above with `source_agent: "validation"` and `id: "val-N"`. They append to Step 5's input. + +## Step 5 return (severity audit) + +One entry per incoming finding, keyed by `id`: + +| field | type | notes | +| ------------------ | ------ | ------------------------------------------------- | +| `id` | string | Matches input. | +| `status` | string | `"confirmed"` \| `"downgraded"` \| `"dismissed"`. | +| `final_severity` | string | Severity value. Omit when `status = "dismissed"`. | +| `dismissal_reason` | string | Present only when `status = "dismissed"`. | + +## Orchestrator behavior + +- Maintains a master finding map keyed by `id`. +- Each step's return merges into the master object by `id`. +- Creation-time fields — `severity`, `confidence`, `source_agent`, `title`, `detail`, `file`, `line` — are set by the Step 2/3 agent and **MUST NOT** be rewritten in Step 4, Step 5, or Step 6 merge. Step 4 and Step 5 returns carry only `id`, `status`, and disposition fields by design; the merge MUST preserve all creation-time fields from the original Step 2/3 finding. +- For dismissed findings, the orchestrator records a `dismissal_stage` field on the master-map entry: `"Step 4 validation"` if Step 4 set the dismissal status, or `"Step 5 severity audit"` if Step 5 did. This field is rendered in the final report as `**Dismissed at:**`. +- Step 6 partitions the master map by final status (validated vs dismissed) and renders the report. diff --git a/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/modes.md b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/modes.md new file mode 100644 index 0000000..3390863 --- /dev/null +++ b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/modes.md @@ -0,0 +1,64 @@ +# Modes + +Loaded by the orchestrator in Step 1. Mode logic is orchestrator-only — it determines which diff-source commands run before subagents launch, and is **not** propagated to subagents. + +Determine review mode from the invocation. Inspect both the slash-command argument and any natural-language framing the user provided. The four modes are mutually exclusive — if more than one seems to apply, invoke `AskUserQuestion` to disambiguate before proceeding rather than guessing. + +## Mode 1 — PR mode + +**Trigger:** the user supplied a GitHub pull request reference. Recognize a bare number (`123`), a `#`-prefixed reference (`#123`, `PR #123`), or a pull-request URL (`https://github.com/owner/repo/pull/123`). + +**Diff sources:** + +- Title & description: `gh pr view ` +- Changed files: `gh pr diff --name-only` +- Diff: `gh pr diff ` + +## Mode 2 — Local changes mode + +**Trigger:** no PR reference, no commit-range framing, AND `git status --porcelain` returns non-empty (working tree has uncommitted changes). + +**Diff sources:** + +- Changed files: `git diff --name-only` +- Diff: `git diff` (combines staged + unstaged) + +## Mode 3 — Branch comparison mode + +**Trigger:** no PR reference, no commit-range framing, AND `git status --porcelain` returns empty (clean working tree). + +**Diff sources:** + +- Current branch: `git rev-parse --abbrev-ref HEAD` (needed for the Step 9 filename) +- Base ref: `git rev-parse --abbrev-ref origin/HEAD` (yields e.g. `origin/main`) +- Changed files: `git diff origin/HEAD --name-only` +- Diff: `git diff origin/HEAD` + +## Mode 4 — Commit-range mode + +**Trigger:** the user described a commit range, time window, or commit count against a locally checked-out repo. Recognize natural-language phrases such as: + +- **Time windows** — "the last week", "the last 7 days", "the past month", "since 2026-04-23", "between Apr 1 and Apr 28" +- **Commit counts** — "the last 20 commits", "the last 5 commits" +- **Explicit refs** — "from abc123 to def456", "between v1.0 and v1.1", "since the v2.0 tag" + +The user is expected to invoke this skill from inside the target repo's working tree. Mentions like "in the bitwarden/server repo" are confirmatory framing — the orchestrator does NOT navigate to other paths or search the filesystem. + +**Resolution sequence (perform before launching any subagents):** + +1. **Confirm the working directory is a git work tree.** Run `git rev-parse --is-inside-work-tree`. If it fails or returns false, abort with: "commit-range mode must be invoked from inside the target repo." Do not search elsewhere. + +2. **Resolve the commit range to a `..` pair.** + - **Time windows** → `=HEAD`. Determine the oldest commit in the window with `git log --since='' --reverse --pretty=%H | head -1`; `` is that commit's first parent (suffix `^`). If the window contains zero commits, abort with a clear message — there is nothing to review. + - **Commit counts** → `=HEAD~N`, `=HEAD`. + - **Explicit refs** → use them verbatim after validating each with `git rev-parse `. + +3. **Confirm with the user before launching subagents.** Print the `..` range (with short SHAs), the commit count, and the changed-file list, then invoke `AskUserQuestion` to confirm before proceeding. Reason: the multi-agent pipeline is expensive — a wrong range wastes substantial tokens and time, and the natural-language inputs leave room for misinterpretation that subagents cannot recover from. + +**Diff sources (after confirmation):** + +- Commits in range (for context only, not validation): `git log .. --oneline` +- Changed files: `git diff .. --name-only` +- Diff (cumulative across the range): `git diff ..` + +**Interpretation of "introduced by this change" in commit-range mode:** "introduced" means present in the cumulative diff of `..`; "pre-existing" means present at the parent of ``. Step 4 validation subagents must use this interpretation when applying the dismissal rules. diff --git a/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/report-template.md b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/report-template.md new file mode 100644 index 0000000..4352662 --- /dev/null +++ b/plugins/bitwarden-code-review/skills/performing-multi-agent-code-review/references/report-template.md @@ -0,0 +1,73 @@ +# Report Template + +## Severity Icons + +- 🛑 **Blocker** — Must fix before merge +- ⚠️ **Important** — Potential issue, should fix +- ♻️ **Refactor** — Code restructuring needed + +## Source-Agent Friendly Names + +Every finding carries a `source_agent` value (per `finding-shape.md`). Render it on each finding using the friendly label below — it tells the reader which subagent caught the issue, which aids triage and per-agent calibration. + +| `source_agent` | Rendered label | +| -------------- | ----------------------------- | +| `architect` | Architecture agent | +| `quality` | Code quality agent | +| `bug` | Bug analysis agent | +| `security` | Security & logic agent | +| `validation` | Validation agent (collateral) | + +## Template + +```markdown +# Code Review: {PR title} (#{number}) + +**Date:** {YYYY-MM-DD} | **Reviewed by:** Claude Code + +## Summary + +| Severity | Count | +| ------------ | ----- | +| 🛑 Blocker | {n} | +| ⚠️ Important | {n} | +| ♻️ Refactor | {n} | + +{1-5 sentences for overall assessment.} + +## Findings + +### 🛑 Blockers + +#### {One-line summary (<100 chars)} + +`{file/path.ext}:{line}` +**Caught by:** {Friendly agent label} + +
Details + {Explanation, why it matters, suggested fix. Include code snippets where helpful.} +
+ +### ⚠️ Important + +### ♻️ Refactor + + + +## Reviewed and Dismissed + +
🔍 {n} initial findings dismissed after validation + + + +#### {One-line summary} + +`{file/path.ext}:{line}` +**Caught by:** {Friendly agent label} +**Original severity:** {🛑|⚠️|♻️} {Blocker|Important|Refactor} +**Original confidence:** {n}/100 +**Dismissed at:** {Step 4 validation | Step 5 severity audit} +**Dismissed because:** {One-sentence rejection reason} + +
+```