diff --git a/.github/workflows/test-all-integration.yml b/.github/workflows/test-all-integration.yml
index c1d76f639..f583ddf0e 100644
--- a/.github/workflows/test-all-integration.yml
+++ b/.github/workflows/test-all-integration.yml
@@ -292,6 +292,13 @@ jobs:
            echo "No skill report found"
          fi
 
+      - name: Generate quality report
+        if: always() && matrix.skill == 'microsoft-foundry'
+        # NOTE(review): continue-on-error keeps this step best-effort while
+        # still surfacing a failure annotation in the job UI; the previous
+        # `|| true` suppressed the exit code and hid failures entirely.
+        continue-on-error: true
+        run: npm run quality-report
+
       - name: Export report
         if: always()
         id: export-report
diff --git a/tests/package.json b/tests/package.json
index 3f44e7002..80d715db2 100644
--- a/tests/package.json
+++ b/tests/package.json
@@ -16,6 +16,7 @@
     "coverage:grid": "node scripts/generate-coverage-grid.js",
     "report": "npx tsx scripts/generate-test-reports.ts",
     "results": "node scripts/show-test-results.js",
+    "quality-report": "node scripts/generate-quality-report.js",
     "update:snapshots": "node scripts/update-snapshots.js",
     "typecheck": "tsc --noEmit",
     "lint": "eslint",
diff --git a/tests/scripts/generate-quality-report.js b/tests/scripts/generate-quality-report.js
new file mode 100644
index 000000000..6586bd777
--- /dev/null
+++ b/tests/scripts/generate-quality-report.js
@@ -0,0 +1,1013 @@
+#!/usr/bin/env node
+
+/**
+ * Quality Report Generator
+ *
+ * Post-processes raw test outputs (JUnit XML, token-usage.json, agent-metadata)
+ * into a single skill-quality-report.json contract file.
+ *
+ * This is the "Layer 2" processor — sits between raw pipeline outputs and the
+ * reporting dashboard. The dashboard reads only this JSON, doing zero processing.
+ */
/*
 * Usage (placeholders reconstructed — the originals were lost to markup
 * stripping in the copy under review; verify against the committed file):
 *   node generate-quality-report.js                 # Process most recent test run
 *   node generate-quality-report.js --run <dir>     # Process a specific test run
 *   node generate-quality-report.js --junit <file>  # Use a specific JUnit XML
 */

import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const REPORTS_PATH = path.resolve(__dirname, "../reports");
const TEST_RUN_PREFIX = "test-run-";
const CONTRACT_VERSION = "1.0";

// ─── CLI Argument Parsing ────────────────────────────────────────────────────

/**
 * Parse CLI flags. Supports `--run <dir>` and `--junit <path>`; any flag
 * missing its value argument is ignored.
 *
 * @param {string[]} argv - Full process.argv (first two entries are skipped).
 * @returns {{ runDir: string|null, junitPath: string|null }}
 */
function parseArgs(argv) {
  const args = argv.slice(2);
  let runDir = null;
  let junitPath = null;

  for (let i = 0; i < args.length; i++) {
    if (args[i] === "--run" && i + 1 < args.length) runDir = args[++i];
    if (args[i] === "--junit" && i + 1 < args.length) junitPath = args[++i];
  }

  return { runDir, junitPath };
}

// ─── Find Most Recent Test Run ───────────────────────────────────────────────

/**
 * Return the name of the newest `test-run-*` directory under REPORTS_PATH,
 * or null when none exists. Lexicographic sort is sufficient because run
 * names embed sortable timestamps.
 */
function getMostRecentTestRun() {
  if (!fs.existsSync(REPORTS_PATH)) return null;
  const entries = fs.readdirSync(REPORTS_PATH, { withFileTypes: true });
  const testRuns = entries
    .filter(e => e.isDirectory() && e.name.startsWith(TEST_RUN_PREFIX))
    .map(e => e.name)
    .sort()
    .reverse();
  return testRuns.length > 0 ? testRuns[0] : null;
}

// ─── JUnit XML Parser (regex-based, matches existing show-test-results.js) ──

/**
 * Extract an XML attribute value from a raw tag string.
 *
 * FIX: the attribute name is anchored with a word boundary so that e.g.
 * `extractAttr(tag, "name")` can no longer match inside `classname="..."`,
 * which previously returned the wrong attribute's value.
 *
 * @param {string} tag - Raw tag text, e.g. `<testcase name="t1">`.
 * @param {string} name - Attribute name.
 * @returns {string|null} Attribute value, or null when absent.
 */
function extractAttr(tag, name) {
  const match = tag.match(new RegExp(`\\b${name}="([^"]*)"`));
  return match ? match[1] : null;
}

/**
 * Parse a JUnit XML report into { totalTests, failures, errors, time, suites }.
 * Regex-based on purpose (no XML dependency), mirroring show-test-results.js.
 *
 * NOTE(review): the tag-matching regex literals were illegible in the copy
 * under review (markup stripping); they are reconstructed here. `\b` after the
 * tag name prevents `<testsuite` from also matching `<testsuites`, and the
 * self-closing `<testcase .../>` alternative is tried FIRST so the lazy
 * `[\s\S]*?` of the open/close form cannot swallow a self-closing case plus
 * its neighbour in a single match.
 *
 * @param {string} xmlPath - Path to the JUnit XML file.
 * @returns {object|null} Parsed summary, or null when the file is missing.
 */
function parseJunitXml(xmlPath) {
  if (!fs.existsSync(xmlPath)) return null;
  const xml = fs.readFileSync(xmlPath, "utf-8");

  const result = {
    totalTests: 0,
    failures: 0,
    errors: 0,
    time: 0,
    suites: [],
  };

  // Root <testsuites> header carries the aggregate counters.
  const headerMatch = xml.match(/<testsuites\b[^>]*>/);
  if (headerMatch) {
    const attrs = headerMatch[0];
    result.totalTests = parseInt(extractAttr(attrs, "tests") || "0", 10);
    result.failures = parseInt(extractAttr(attrs, "failures") || "0", 10);
    result.errors = parseInt(extractAttr(attrs, "errors") || "0", 10);
    result.time = parseFloat(extractAttr(attrs, "time") || "0");
  }

  const suiteRegex = /<testsuite\b[^>]*>[\s\S]*?<\/testsuite>/g;
  let suiteMatch;
  while ((suiteMatch = suiteRegex.exec(xml)) !== null) {
    const suiteXml = suiteMatch[0];
    const suiteAttrsMatch = suiteXml.match(/<testsuite\b[^>]*>/);
    if (!suiteAttrsMatch) continue;

    const sa = suiteAttrsMatch[0];
    const suite = {
      name: extractAttr(sa, "name") || "Unknown",
      tests: parseInt(extractAttr(sa, "tests") || "0", 10),
      failures: parseInt(extractAttr(sa, "failures") || "0", 10),
      time: parseFloat(extractAttr(sa, "time") || "0"),
      testcases: [],
    };

    // Self-closing form first — see function comment.
    const tcRegex = /<testcase\b[^>]*\/>|<testcase\b[^>]*>[\s\S]*?<\/testcase>/g;
    let tcMatch;
    while ((tcMatch = tcRegex.exec(suiteXml)) !== null) {
      const tcXml = tcMatch[0];
      const tcAttrs = tcXml.match(/<testcase\b[^>]*>/)?.[0];
      if (!tcAttrs) continue;

      const tc = {
        classname: extractAttr(tcAttrs, "classname") || "",
        name: extractAttr(tcAttrs, "name") || "Unknown",
        time: parseFloat(extractAttr(tcAttrs, "time") || "0"),
        status: "passed",
        failure: null,
      };

      const failMatch = tcXml.match(/<failure[^>]*>([\s\S]*?)<\/failure>/);
      if (failMatch) {
        tc.status = "failed";
        tc.failure = failMatch[1].trim().substring(0, 500);
      }
      const errMatch = tcXml.match(/<error[^>]*>([\s\S]*?)<\/error>/);
      if (errMatch) {
        tc.status = "error";
        tc.failure = errMatch[1].trim().substring(0, 500);
      }
      if (tcXml.includes("<skipped")) tc.status = "skipped";

      suite.testcases.push(tc);
    }

    result.suites.push(suite);
  }

  return result;
}

// ─── Token Data Loading ──────────────────────────────────────────────────────

/**
 * Load per-test token usage for a test run.
 *
 * Prefers the JSONL format (`token-summary.jsonl`, one JSON object per line;
 * unparsable lines are silently dropped) and falls back to the legacy single
 * JSON array (`token-summary.json`). Returns [] when neither can be read.
 *
 * NOTE(review): part of the JSONL branch was illegible in the copy under
 * review; it is reconstructed to match the visible fragments — confirm
 * against the committed file.
 *
 * @param {string} testRunPath - Directory of the test run.
 * @returns {Array<object>} Token usage entries.
 */
function loadTokenSummary(testRunPath) {
  const jsonlPath = path.join(testRunPath, "token-summary.jsonl");
  try {
    return fs
      .readFileSync(jsonlPath, "utf-8")
      .split("\n")
      .filter(line => line.trim().length > 0)
      .map(line => {
        try { return JSON.parse(line); } catch { return null; }
      }).filter(Boolean);
  } catch {
    // Fall back to legacy JSON format
  }
  const summaryPath = path.join(testRunPath, "token-summary.json");
  try {
    const raw = fs.readFileSync(summaryPath, "utf-8");
    if (!raw.trim()) return [];
    return JSON.parse(raw);
  } catch (err) {
    console.warn(
      `Warning: Failed to load token summary at ${testRunPath}: ${
        err && err.message ? err.message : err
      }`
    );
    return [];
  }
}

// ─── Skill Area Extraction ───────────────────────────────────────────────────

/**
 * Extract skill area from test directory name or JUnit classname.
 * Handles both formats:
 *   Directory: "microsoft-foundry_deploy-model_-_Integration_Tests_..." → "microsoft-foundry/deploy-model"
 *   JUnit:     "microsoft-foundry_deploy-model - Integration Tests ›..." → "microsoft-foundry/deploy-model"
 *   JUnit:     "microsoft-foundry_ - Integration Tests"                  → "microsoft-foundry"
 */
function extractSkillArea(name) {
  // Try JUnit classname format first: "skill_sub - Integration Tests..."
  const match = name.match(/^(.+?)\s*-\s*Integration Tests/);
  if (match) {
    let skillPart = match[1].trim();
    // "microsoft-foundry_deploy-model" → "microsoft-foundry/deploy-model"
    // "microsoft-foundry_"             → "microsoft-foundry"
    skillPart = skillPart.replace(/_$/, ""); // trailing underscore
    return skillPart.replace(/_/g, "/");
  }

  // Try directory name format: "skill_sub_-_Integration_Tests_..."
  const integrationIdx = name.indexOf("_-_Integration_Tests");
  if (integrationIdx !== -1) {
    const skillPart = name.substring(0, integrationIdx);
    return skillPart.replace(/_/g, "/");
  }

  // Fallback: first underscore-delimited token.
  const parts = name.split("_");
  return parts[0];
}

// ─── Tool Call Extraction ────────────────────────────────────────────────────
/**
 * Extract the ordered tool-call sequence from a test's structured
 * agent-metadata.json (written by the agent runner).
 *
 * Recognizes `tool.execution_start` events (the `skill` tool is kept verbatim;
 * other tools get a truncated, human-readable argument summary) and attaches
 * the following `assistant.reasoning` text to the most recent tool call.
 *
 * @param {string} testRunPath - Test-run reports directory.
 * @param {string} dirName - Per-test subdirectory name.
 * @returns {Array<{tool: string, args: string, source: string, reasoning?: string}>}
 */
function extractToolCalls(testRunPath, dirName) {
  const dirPath = path.join(testRunPath, dirName);
  if (!fs.existsSync(dirPath)) return [];

  const jsonPath = path.join(dirPath, "agent-metadata.json");
  if (!fs.existsSync(jsonPath)) return [];

  try {
    const data = JSON.parse(fs.readFileSync(jsonPath, "utf-8"));
    const toolCalls = [];

    for (const event of (data.events || [])) {
      // SDK events use tool.execution_start with data.toolName
      if (event.type === "tool.execution_start") {
        const toolName = event.data?.toolName || "";
        if (toolName === "skill") {
          const args = event.data?.arguments;
          const skillName = typeof args === "string" ? args : JSON.stringify(args || "");
          toolCalls.push({ tool: "skill", args: skillName, source: "agent-metadata.json" });
        } else {
          const args = event.data?.arguments || "";
          let argsStr = "";
          if (typeof args === "string") {
            // Arguments may be a JSON string; fall back to raw truncation.
            try {
              const parsed = JSON.parse(args);
              argsStr = Object.entries(parsed)
                .map(([k, v]) => `${k}: ${typeof v === "string" ? v.substring(0, 80) : v}`)
                .join(", ");
            } catch { argsStr = args.substring(0, 100); }
          } else if (typeof args === "object") {
            argsStr = Object.entries(args)
              .map(([k, v]) => `${k}: ${typeof v === "string" ? v.substring(0, 80) : v}`)
              .join(", ");
          }
          toolCalls.push({ tool: toolName, args: argsStr, source: "agent-metadata.json" });
        }
      } else if (event.type === "assistant.reasoning") {
        // Attach reasoning to the tool call it preceded/accompanied.
        const reasoning = event.data?.content || event.data?.text || "";
        const lastCall = toolCalls[toolCalls.length - 1];
        if (lastCall && reasoning) {
          lastCall.reasoning = reasoning.substring(0, 500);
        }
      }
    }

    return toolCalls;
  } catch (err) {
    console.warn(`Warning: Failed to parse ${jsonPath}: ${err.message}`);
    return [];
  }
}

// ─── Build Area Summaries ────────────────────────────────────────────────────

/**
 * Merge JUnit test results and token usage into per-skill-area summaries.
 *
 * FIX: the original computed `totalTests = entry.tests || 1`, which made its
 * own `: null` pass-rate branch unreachable — an area with token data but no
 * JUnit results reported a 0% pass rate instead of the intended null
 * ("no test results"). `passRate` is now null in that case, and the sort
 * places null-rate areas last.
 *
 * @param {object|null} junit - Output of parseJunitXml (may be null).
 * @param {Array<object>} tokenEntries - Output of loadTokenSummary.
 * @returns {Array<object>} Area summaries, worst pass rate first.
 */
function buildAreaSummaries(junit, tokenEntries) {
  // Shared factory for the per-area accumulator (previously duplicated).
  const makeEntry = (name) => ({
    name,
    tests: 0,
    passed: 0,
    failed: 0,
    skipped: 0,
    totalInputTokens: 0,
    totalOutputTokens: 0,
    totalLLMCalls: 0,
    totalDurationMs: 0,
    testDetails: [],
  });

  const areaMap = new Map();
  const getEntry = (area) => {
    if (!areaMap.has(area)) areaMap.set(area, makeEntry(area));
    return areaMap.get(area);
  };

  // Process JUnit test cases
  if (junit) {
    for (const suite of junit.suites) {
      for (const tc of suite.testcases) {
        const area = extractSkillArea(tc.classname.split(" › ")[0] || suite.name);
        const entry = getEntry(area);
        entry.tests++;
        if (tc.status === "passed") entry.passed++;
        else if (tc.status === "failed" || tc.status === "error") entry.failed++;
        else if (tc.status === "skipped") entry.skipped++;

        entry.testDetails.push({
          name: tc.name,
          status: tc.status,
          time: tc.time,
          failure: tc.failure,
        });
      }
    }
  }

  // Merge token data
  for (const tokenEntry of tokenEntries) {
    const entry = getEntry(extractSkillArea(tokenEntry.testName));
    entry.totalInputTokens += tokenEntry.inputTokens || 0;
    entry.totalOutputTokens += tokenEntry.outputTokens || 0;
    entry.totalLLMCalls += tokenEntry.apiCallCount || 0;
    entry.totalDurationMs += tokenEntry.totalApiDurationMs || 0;
  }

  // Compute derived metrics
  const areas = [];
  for (const [, entry] of areaMap) {
    const denom = Math.max(entry.tests, 1); // safe divisor for averages
    areas.push({
      name: entry.name,
      tests: entry.tests,
      passed: entry.passed,
      failed: entry.failed,
      skipped: entry.skipped,
      passRate: entry.tests > 0 ? Math.round((entry.passed / entry.tests) * 1000) / 10 : null,
      avgInputTokens: Math.round(entry.totalInputTokens / denom),
      avgOutputTokens: Math.round(entry.totalOutputTokens / denom),
      totalInputTokens: entry.totalInputTokens,
      totalOutputTokens: entry.totalOutputTokens,
      avgLLMCalls: Math.round((entry.totalLLMCalls / denom) * 10) / 10,
      totalLLMCalls: entry.totalLLMCalls,
      avgDurationMs: Math.round(entry.totalDurationMs / denom),
      testDetails: entry.testDetails,
    });
  }

  // Worst pass rate first; areas without test results (null) sort last.
  return areas.sort((a, b) => (a.passRate ?? 101) - (b.passRate ?? 101));
}

// ─── Build Token Usage Per Test ──────────────────────────────────────────────

/**
 * Normalize raw token entries into the dashboard's token-usage shape,
 * including a 1-indexed per-call breakdown. Cache counters default to 0.
 *
 * @param {Array<object>} tokenEntries - Output of loadTokenSummary.
 * @returns {Array<object>}
 */
function buildTokenUsage(tokenEntries) {
  return tokenEntries.map(entry => ({
    testName: entry.testName,
    prompt: entry.prompt,
    timestamp: entry.timestamp,
    model: entry.model,
    inputTokens: entry.inputTokens,
    outputTokens: entry.outputTokens,
    cacheReadTokens: entry.cacheReadTokens || 0,
    cacheWriteTokens: entry.cacheWriteTokens || 0,
    llmCalls: entry.apiCallCount,
    durationMs: entry.totalApiDurationMs,
    perCall: (entry.perCallUsage || []).map((call, i) => ({
      call: i + 1,
      model: call.model,
      inputTokens: call.inputTokens,
      outputTokens: call.outputTokens,
      durationMs: call.durationMs,
      initiator: call.initiator,
    })),
  }));
}

// ─── Expected Path Definitions ───────────────────────────────────────────────

/**
 * Expected execution paths per skill area.
 * Each entry defines the ideal sequence of tool calls the agent should make.
 * Used for side-by-side comparison in the trace viewer.
 */
const EXPECTED_PATHS = {
  // Derived from passing test traces — each path reflects the common
  // tool-call sequence observed across successful test executions.

  // Model deployment flow: load skill → log intent → read docs → inspect → deploy.
  "microsoft-foundry/deploy-model": {
    steps: [
      { tool: "skill", args: "microsoft-foundry", label: "🔧 skill:\nmicrosoft-foundry" },
      { tool: "report_intent", label: "📋 report_intent\nLog deployment intent" },
      { tool: "view", label: "📖 Read skill doc\ndeploy-model SKILL.md" },
      { tool: "azure-foundry", label: "🏗️ azure-foundry\nList/check resources" },
      { tool: "powershell", label: "💻 powershell\naz cognitiveservices deploy" },
    ],
    endLabel: "⬜ EXPECTED END\nModel deployed",
  },

  "microsoft-foundry/quota": {
    steps: [
      { tool: "skill", args: "microsoft-foundry", label: "🔧 skill:\nmicrosoft-foundry" },
      { tool: "report_intent", label: "📋 report_intent\nLog quota check" },
      { tool: "view", label: "📖 Read skill doc\nquota SKILL.md" },
      { tool: "azure-foundry", label: "🏗️ azure-foundry\nQuery Foundry resources" },
      { tool: "azure-quota", label: "📊 azure-quota\nRetrieve quota data" },
    ],
    endLabel: "⬜ EXPECTED END\nQuota info returned",
  },

  "microsoft-foundry/resource/create": {
    steps: [
      { tool: "skill", args: "microsoft-foundry", label: "🔧 skill:\nmicrosoft-foundry" },
      { tool: "report_intent", label: "📋 report_intent\nLog resource creation" },
      { tool: "view", label: "📖 Read skill doc\nresource/create SKILL.md" },
      { tool: "powershell", label: "💻 powershell\naz group create" },
      { tool: "powershell", label: "💻 powershell\naz cognitiveservices create" },
    ],
    endLabel: "⬜ EXPECTED END\nResource created",
  },

  // Capacity path has no leading skill-load step (matches observed traces).
  "microsoft-foundry/capacity": {
    steps: [
      { tool: "report_intent", label: "📋 report_intent\nLog capacity check" },
      { tool: "azure-foundry", label: "🏗️ azure-foundry\nQuery Foundry resources" },
      { tool: "azure-quota", label: "📊 azure-quota\nCheck capacity/quota" },
      { tool: "azure-extension_cli_generate", label: "🔧 cli_generate\nGenerate CLI command" },
    ],
    endLabel: "⬜ EXPECTED END\nCapacity report shown",
  },

  "microsoft-foundry/customize-deployment": {
    steps: [
      { tool: "skill", args: "microsoft-foundry", label: "🔧 skill:\nmicrosoft-foundry" },
      { tool: "report_intent", label: "📋 report_intent\nLog customization" },
      { tool: "view", label: "📖 Read skill doc\ncustomize-deployment SKILL.md" },
      { tool: "view", label: "📖 Read references\nDeployment config docs" },
      { tool: "azure-foundry", label: "🏗️ azure-foundry\nApply custom settings" },
    ],
    endLabel: "⬜ EXPECTED END\nDeployment customized",
  },

  "microsoft-foundry/deploy-model-optimal-region": {
    steps: [
      { tool: "skill", args: "microsoft-foundry", label: "🔧 skill:\nmicrosoft-foundry" },
      { tool: "report_intent", label: "📋 report_intent\nLog region search" },
      { tool: "view", label: "📖 Read skill doc\noptimal-region SKILL.md" },
      { tool: "azure-foundry", label: "🏗️ azure-foundry\nCheck region availability" },
      { tool: "powershell", label: "💻 powershell\nDeploy to optimal region" },
      { tool: "azure-quota", label: "📊 azure-quota\nVerify region capacity" },
    ],
    endLabel: "⬜ EXPECTED END\nDeployed to best region",
  },

  "microsoft-foundry/create": {
    steps: [
      { tool: "skill", args: "microsoft-foundry", label: "🔧 skill:\nmicrosoft-foundry" },
      { tool: "report_intent", label: "📋 report_intent\nLog agent creation" },
      { tool: "view", label: "📖 Read skill doc\ncreate agent SKILL.md" },
      { tool: "powershell", label: "💻 powershell\nSetup agent resources" },
    ],
    endLabel: "⬜ EXPECTED END\nAgent created",
  },

  "microsoft-foundry/foundry-agent": {
    steps: [
      { tool: "skill", args: "microsoft-foundry", label: "🔧 skill:\nmicrosoft-foundry" },
      { tool: "report_intent", label: "📋 report_intent\nLog agent task" },
      { tool: "view", label: "📖 Read skill doc\nfoundry-agent SKILL.md" },
      { tool: "azure-foundry", label: "🏗️ azure-foundry\nManage agent resources" },
      { tool: "powershell", label: "💻 powershell\nExecute agent operation" },
    ],
    endLabel: "⬜ EXPECTED END\nAgent task complete",
  },

  "microsoft-foundry/observe": {
    steps: [
      { tool: "skill", args: "microsoft-foundry", label: "🔧 skill:\nmicrosoft-foundry" },
      { tool: "report_intent", label: "📋 report_intent\nLog agent observe" },
      { tool: "view", label: "📖 Read skill doc\nobserve SKILL.md" },
      { tool: "powershell", label: "💻 powershell\nQuery agent metrics" },
    ],
    endLabel: "⬜ EXPECTED END\nAgent observability shown",
  },

  // Fallback path for the top-level skill (routing to a sub-skill).
  "microsoft-foundry": {
    steps: [
      { tool: "skill", args: "microsoft-foundry", label: "🔧 skill:\nmicrosoft-foundry" },
      { tool: "report_intent", label: "📋 report_intent\nLog skill routing" },
      { tool: "view", label: "📖 Read skill doc\nmicrosoft-foundry SKILL.md" },
      { tool: "view", label: "📖 Read sub-skill doc\nSub-skill reference" },
      { tool: "powershell", label: "💻 powershell\nExecute Azure operation" },
    ],
    endLabel: "⬜ EXPECTED END\nOperation complete",
  },
};
/**
 * Build expected path nodes and edges for a skill area.
 * Returns { nodes: [], edges: [] } with Cytoscape-ready data, or null when
 * no expected path is defined for the area.
 */
function buildExpectedPath(skillArea) {
  const spec = EXPECTED_PATHS[skillArea];
  if (!spec) return null;

  const nodes = [];
  const edges = [];
  let previousId = "start"; // share the start node with the actual trace

  // Each expected step is rendered as an LLM-decision node followed by the
  // tool node itself (`el<i>` / `e<i>` ids, both 1-indexed).
  spec.steps.forEach((step, idx) => {
    const stepNum = idx + 1;
    const llmId = `el${stepNum}`;
    const toolId = `e${stepNum}`;

    nodes.push({
      id: llmId,
      label: `🤖 LLM Call #${stepNum}\nAnalyze & route`,
      type: "expected_llm",
      detail: { desc: `Expected: LLM decides to call ${step.tool}` },
    });
    edges.push({ source: previousId, target: llmId, type: "expected" });

    nodes.push({
      id: toolId,
      label: step.label,
      type: "expected",
      step: stepNum,
      detail: { tool: step.tool, args: step.args || "", desc: step.label.replace(/\n/g, " ") },
    });
    edges.push({ source: llmId, target: toolId, type: "expected" });

    previousId = toolId;
  });

  // Final LLM call that produces the user-facing answer, then the end node.
  const finalLlmId = `el${spec.steps.length + 1}`;
  nodes.push({
    id: finalLlmId,
    label: `🤖 LLM Call #${spec.steps.length + 1}\nGenerate response`,
    type: "expected_llm",
    detail: { desc: "Expected: LLM generates final response to user" },
  });
  edges.push({ source: previousId, target: finalLlmId, type: "expected" });

  nodes.push({
    id: "end_exp",
    label: spec.endLabel,
    type: "endOk",
    shape: "ellipse",
    detail: { desc: "Expected outcome" },
  });
  edges.push({ source: finalLlmId, target: "end_exp", type: "expected" });

  return { nodes, edges, stepCount: spec.steps.length };
}
/**
 * Build execution trace graphs for the dashboard trace viewer.
 *
 * For each test that has agent-metadata output (from agent-runner), this function:
 *   1. Extracts the ordered sequence of tool/skill calls from agent-metadata.json.
 *   2. Pairs each call with token usage data from the per-call breakdown.
 *   3. Builds a Cytoscape-compatible node/edge graph: Start → LLM Call → Tool → ... → End.
 *   4. Overlays the expected execution path (if defined) and computes path adherence
 *      metrics (matched/deviated/extra/skipped steps and adherence percentage).
 *
 * @param {string} testRunPath - Path to the test-run reports directory.
 * @param {Array} tokenEntries - Token usage entries from token-summary.jsonl.
 * @returns {Object} Map of testName → { prompt, model, summary, nodes, edges,
 *                   expectedNodes?, expectedEdges?, pathAdherence? }
 */
function buildTraces(testRunPath, tokenEntries) {
  const traces = {};

  // Group token entries by testName; only the most recent entry per test is rendered.
  const tokenByTest = new Map();
  for (const entry of tokenEntries) {
    if (!tokenByTest.has(entry.testName)) {
      tokenByTest.set(entry.testName, []);
    }
    tokenByTest.get(entry.testName).push(entry);
  }

  for (const [testName, entries] of tokenByTest) {
    const latestEntry = entries[entries.length - 1];
    const toolCalls = extractToolCalls(testRunPath, testName);

    const nodes = [];
    const edges = [];

    // Start node carries the (truncated) prompt.
    nodes.push({
      id: "start",
      label: `🟢 START\n${(latestEntry.prompt || "").substring(0, 40)}`,
      type: "start",
      shape: "ellipse",
      detail: { prompt: latestEntry.prompt },
    });

    // One LLM node per API call; a tool node follows when a matching tool
    // call exists at the same index (alignment may not be 1:1).
    const perCall = latestEntry.perCallUsage || [];
    let prevNodeId = "start";

    for (let i = 0; i < perCall.length; i++) {
      const call = perCall[i];
      const llmNodeId = `llm${i + 1}`;
      const toolCall = toolCalls[i];

      nodes.push({
        id: llmNodeId,
        label: `🤖 LLM Call #${i + 1}\n${call.inputTokens.toLocaleString()}↓ ${call.outputTokens.toLocaleString()}↑`,
        type: "llmcall",
        detail: {
          desc: `LLM inference call #${i + 1}`,
          tokens: { in: call.inputTokens, out: call.outputTokens },
          durationMs: call.durationMs,
          model: call.model,
        },
      });
      edges.push({
        source: prevNodeId,
        target: llmNodeId,
        type: "llmcall",
        label: `${call.inputTokens.toLocaleString()} in`,
      });

      if (toolCall) {
        const toolNodeId = `t${i + 1}`;
        nodes.push({
          id: toolNodeId,
          label: `🔧 ${toolCall.tool}\n${toolCall.args.substring(0, 30)}`,
          type: "matched",
          detail: {
            tool: toolCall.tool,
            args: toolCall.args,
            reasoning: toolCall.reasoning,
          },
        });
        edges.push({
          source: llmNodeId,
          target: toolNodeId,
          type: "matched",
          label: `${call.outputTokens.toLocaleString()} out`,
        });
        prevNodeId = toolNodeId;
      } else {
        prevNodeId = llmNodeId;
      }
    }

    // End node summarizing totals; restyled below when adherence is known.
    const endNodeId = "end_act";
    nodes.push({
      id: endNodeId,
      label: `⬜ END\n${perCall.length} LLM calls`,
      type: "endOk",
      shape: "ellipse",
      detail: {
        totalTokens: {
          in: latestEntry.inputTokens,
          out: latestEntry.outputTokens,
          llmCalls: latestEntry.apiCallCount,
        },
      },
    });
    edges.push({ source: prevNodeId, target: endNodeId, type: "matched" });

    traces[testName] = {
      prompt: latestEntry.prompt,
      model: latestEntry.model,
      summary: {
        inputTokens: latestEntry.inputTokens,
        outputTokens: latestEntry.outputTokens,
        llmCalls: latestEntry.apiCallCount,
        durationMs: latestEntry.totalApiDurationMs,
      },
      nodes,
      edges,
    };

    // Overlay expected path + adherence when a definition exists for the area.
    const skillArea = extractSkillArea(testName);
    const expectedPath = buildExpectedPath(skillArea);
    if (expectedPath) {
      traces[testName].expectedNodes = expectedPath.nodes;
      traces[testName].expectedEdges = expectedPath.edges;
      traces[testName].expectedSteps = expectedPath.stepCount;

      // Snapshot actual tool nodes (type "matched" at this point) and the
      // expected tool steps for greedy in-order matching.
      const actualTools = nodes.filter(n => n.type === "matched").map(n => ({
        tool: n.detail.tool,
        args: n.detail.args || "",
      }));
      const expectedSteps = expectedPath.nodes
        .filter(n => n.type === "expected")
        .map(n => ({ tool: n.detail.tool, args: n.detail.args || "" }));

      let matched = 0, deviated = 0, extra = 0;
      const matchedExpectedIdx = new Set();

      for (let ai = 0; ai < actualTools.length; ai++) {
        const at = actualTools[ai];
        // Greedy: first unmatched expected step with same tool and
        // compatible args (empty expected args match anything).
        let found = false;
        for (let ei = 0; ei < expectedSteps.length; ei++) {
          if (matchedExpectedIdx.has(ei)) continue;
          const et = expectedSteps[ei];
          if (at.tool === et.tool && (et.args === "" || at.args.includes(et.args))) {
            matched++;
            matchedExpectedIdx.add(ei);
            found = true;
            const matchIdx = nodes.findIndex(n => n.id === `t${ai + 1}`);
            if (matchIdx >= 0) nodes[matchIdx].type = "matched";
            break;
          }
        }
        if (!found) {
          // Tool present somewhere in the expected path → deviated;
          // otherwise it's an extra call. Restyle node + incoming edge.
          const anyMatch = expectedSteps.some(et => at.tool === et.tool);
          const nodeIdx = nodes.findIndex(n => n.id === `t${ai + 1}`);
          const edgeIdx = edges.findIndex(e => e.target === `t${ai + 1}`);
          if (anyMatch) {
            deviated++;
            if (nodeIdx >= 0) {
              nodes[nodeIdx].type = "deviated";
              nodes[nodeIdx].label = nodes[nodeIdx].label.replace("🔧", "❌");
            }
            if (edgeIdx >= 0) edges[edgeIdx].type = "deviated";
          } else {
            extra++;
            if (nodeIdx >= 0) {
              nodes[nodeIdx].type = "extra";
              nodes[nodeIdx].label = nodes[nodeIdx].label.replace("🔧", "⚡");
            }
            if (edgeIdx >= 0) edges[edgeIdx].type = "extra";
          }
        }
      }

      const skipped = expectedSteps.length - matchedExpectedIdx.size;
      const adherence = expectedSteps.length > 0
        ? Math.round((matched / expectedSteps.length) * 100)
        : 0;

      // Color the end node by adherence band (≥70 ok, ≥40 warn, else bad).
      const endNode = nodes.find(n => n.id === "end_act");
      if (endNode) {
        if (adherence >= 70) {
          endNode.type = "endOk";
          endNode.label = `🟢 END\n${adherence}% adherence`;
        } else if (adherence >= 40) {
          endNode.type = "endWarn";
          endNode.label = `🟡 END\n${adherence}% adherence`;
        } else {
          endNode.type = "endBad";
          endNode.label = `🔴 END\n${adherence}% adherence`;
        }
      }

      traces[testName].pathAdherence = {
        expected: expectedSteps.length,
        actual: actualTools.length,
        matched,
        deviated,
        extra,
        skipped,
        adherence,
      };
    }
  }

  return traces;
}

// ─── Skill Coverage Analysis ─────────────────────────────────────────────────

const SKILLS_PATH = path.resolve(__dirname, "../../plugin/skills");
const TESTS_PATH = path.resolve(__dirname, "..");
/**
 * Recursively find sub-skills within a skill directory.
 * A sub-skill is a directory containing a SKILL.md or a primary .md file
 * (excluding references/ directories).
 */
function findSubSkills(skillDir, prefix = "") {
  const found = [];
  if (!fs.existsSync(skillDir)) return found;

  // Support directories (references, templates, …) are never sub-skills.
  const SUPPORT_DIRS = ["references", "templates", "assets", "scripts", "examples"];

  for (const entry of fs.readdirSync(skillDir, { withFileTypes: true })) {
    if (!entry.isDirectory()) continue;
    if (SUPPORT_DIRS.includes(entry.name)) continue;

    const subPath = path.join(skillDir, entry.name);
    const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;

    // A directory qualifies when it has SKILL.md or some other primary .md
    // (test fixtures and example docs do not count).
    const hasSkillMd = fs.existsSync(path.join(subPath, "SKILL.md"));
    const mdFiles = fs.readdirSync(subPath).filter(f =>
      f.endsWith(".md") && f !== "SKILL.md" && !f.startsWith("TEST_") && !f.startsWith("EXAMPLES")
    );

    if (hasSkillMd || mdFiles.length > 0) {
      found.push({
        name: entry.name,
        path: relPath,
        hasSkillMd,
        primaryMd: hasSkillMd ? "SKILL.md" : mdFiles[0] || null,
      });
    }

    // Depth-first: nested sub-skills follow their parent in the result.
    found.push(...findSubSkills(subPath, relPath));
  }
  return found;
}

/**
 * Check if a test directory exists for a given skill/sub-skill path.
 */
function hasIntegrationTest(skillPath) {
  // Direct match: tests/<skillPath>/integration.test.ts
  if (fs.existsSync(path.join(TESTS_PATH, skillPath, "integration.test.ts"))) return true;

  const segments = skillPath.split("/");
  const leafName = segments[segments.length - 1];
  const parentSkill = segments[0];

  // Alternative test layout patterns (the second mirrors the direct check
  // above and is kept for parity with the original implementation).
  const candidatePaths = [
    // Nested under parent: tests/microsoft-foundry/quota/integration.test.ts
    path.join(TESTS_PATH, parentSkill, leafName, "integration.test.ts"),
    // Skill path with models/deploy nesting: tests/microsoft-foundry/models/deploy/deploy-model
    path.join(TESTS_PATH, skillPath, "integration.test.ts"),
  ];

  // Also search anywhere under the parent for a directory named after the
  // leaf that contains an integration test.
  if (fs.existsSync(path.join(TESTS_PATH, parentSkill))) {
    const searchTree = (dir) => {
      if (!fs.existsSync(dir)) return false;
      for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
        if (!e.isDirectory()) continue;
        const child = path.join(dir, e.name);
        if (e.name === leafName && fs.existsSync(path.join(child, "integration.test.ts"))) return true;
        if (searchTree(child)) return true;
      }
      return false;
    };
    if (searchTree(path.join(TESTS_PATH, parentSkill))) return true;
  }

  return candidatePaths.some(p => fs.existsSync(p));
}
Math.round((coveredSubSkills / totalSubSkills) * 100) + : (hasTopLevelTest ? 100 : 0), + missing: subSkillCoverage + .filter(s => !s.hasTest && !s.hasResults) + .map(s => s.path), + details: subSkillCoverage, + }, + }; + } + + return coverage; +} + +// ─── Main ──────────────────────────────────────────────────────────────────── + +function main() { + const { runDir, junitPath } = parseArgs(process.argv); + + // Resolve test run directory + let testRunName = runDir; + if (!testRunName) { + testRunName = getMostRecentTestRun(); + if (!testRunName) { + console.error("❌ No test runs found in", REPORTS_PATH); + process.exit(1); + } + } + const testRunPath = path.join(REPORTS_PATH, testRunName); + console.log(`📂 Processing test run: ${testRunName}`); + + // Parse JUnit XML + const resolvedJunit = junitPath || path.join(REPORTS_PATH, "junit.xml"); + console.log(`📋 Reading JUnit XML: ${resolvedJunit}`); + const junit = parseJunitXml(resolvedJunit); + + // Load token data + console.log("🔢 Loading token data..."); + const tokenEntries = loadTokenSummary(testRunPath); + console.log(` Found ${tokenEntries.length} token entries`); + + // Build report sections + console.log("📊 Building area summaries..."); + const areas = buildAreaSummaries(junit, tokenEntries); + + console.log("💬 Building token usage details..."); + const tokenUsage = buildTokenUsage(tokenEntries); + + console.log("🔀 Building execution traces..."); + const traces = buildTraces(testRunPath, tokenEntries); + + console.log("📋 Building skill coverage analysis..."); + const coverage = buildCoverage(areas); + + // Compute global summary + const totalTests = junit ? 
// ─── Main ────────────────────────────────────────────────────────────────────

/**
 * Entry point: resolve the test run directory, parse the raw pipeline outputs
 * (JUnit XML, token summary, traces), assemble the quality-report contract,
 * and write skill-quality-report.json into the run directory.
 *
 * Exits with code 1 when no test run can be located.
 */
function main() {
  const { runDir, junitPath } = parseArgs(process.argv);

  // Resolve test run directory: explicit --run wins, else the most recent.
  let testRunName = runDir;
  if (!testRunName) {
    testRunName = getMostRecentTestRun();
    if (!testRunName) {
      console.error("❌ No test runs found in", REPORTS_PATH);
      process.exit(1);
    }
  }
  const testRunPath = path.join(REPORTS_PATH, testRunName);
  console.log(`📂 Processing test run: ${testRunName}`);

  // Parse JUnit XML (explicit --junit wins, else the reports-root default).
  const resolvedJunit = junitPath || path.join(REPORTS_PATH, "junit.xml");
  console.log(`📋 Reading JUnit XML: ${resolvedJunit}`);
  const junit = parseJunitXml(resolvedJunit);

  // Load token data
  console.log("🔢 Loading token data...");
  const tokenEntries = loadTokenSummary(testRunPath);
  console.log(`   Found ${tokenEntries.length} token entries`);

  // Build report sections
  console.log("📊 Building area summaries...");
  const areas = buildAreaSummaries(junit, tokenEntries);

  console.log("💬 Building token usage details...");
  const tokenUsage = buildTokenUsage(tokenEntries);

  console.log("🔀 Building execution traces...");
  const traces = buildTraces(testRunPath, tokenEntries);

  console.log("📋 Building skill coverage analysis...");
  const coverage = buildCoverage(areas);

  // Compute global summary. Prefer the JUnit total (includes skipped tests);
  // fall back to summing the per-area counts.
  const totalTests = junit ? junit.totalTests : areas.reduce((s, a) => s + a.tests, 0);
  const totalPassed = areas.reduce((s, a) => s + a.passed, 0);
  const totalFailed = areas.reduce((s, a) => s + a.failed, 0);
  const totalInputTokens = areas.reduce((s, a) => s + a.totalInputTokens, 0);
  const totalOutputTokens = areas.reduce((s, a) => s + a.totalOutputTokens, 0);
  const totalLLMCalls = areas.reduce((s, a) => s + a.totalLLMCalls, 0);

  // Build the contract consumed by the dashboard (which does zero processing).
  const report = {
    version: CONTRACT_VERSION,
    generatedAt: new Date().toISOString(),
    testRun: testRunName,
    model: tokenEntries.length > 0 ? tokenEntries[0].model : "unknown",

    summary: {
      totalTests,
      passed: totalPassed,
      failed: totalFailed,
      // Pass rate as a percentage with one decimal; null when nothing ran.
      passRate: totalTests > 0 ? Math.round((totalPassed / totalTests) * 1000) / 10 : null,
      totalInputTokens,
      totalOutputTokens,
      totalLLMCalls,
      avgTokensPerTest: totalTests > 0
        ? Math.round((totalInputTokens + totalOutputTokens) / totalTests)
        : 0,
      // Guard against a missing/non-numeric suite time producing NaN.
      totalDurationSec: junit && Number.isFinite(junit.time) ? Math.round(junit.time) : 0,
    },

    areas,
    tokenUsage,
    traces,
    coverage,
  };

  // Write output
  const outputPath = path.join(testRunPath, "skill-quality-report.json");
  fs.writeFileSync(outputPath, JSON.stringify(report, null, 2), "utf-8");
  console.log(`\n✅ Report generated: ${outputPath}`);
  console.log(`   Version: ${CONTRACT_VERSION}`);
  console.log(`   Tests: ${totalTests} (${totalPassed} passed, ${totalFailed} failed)`);
  console.log(`   Areas: ${areas.length}`);
  console.log(`   Traces: ${Object.keys(traces).length}`);
  console.log(`   Total tokens: ${(totalInputTokens + totalOutputTokens).toLocaleString()}`);
  console.log(`   Total LLM calls: ${totalLLMCalls}`);

  // Coverage summary. Kept on one line — the original template literal was
  // split across source lines, embedding a raw newline mid-message.
  const covKeys = Object.keys(coverage);
  const withTests = covKeys.filter(k => coverage[k].hasIntegrationTest).length;
  const totalMissingSubs = covKeys.reduce((s, k) => s + coverage[k].subSkills.missing.length, 0);
  console.log(`   Coverage: ${withTests}/${covKeys.length} skills have integration tests, ${totalMissingSubs} sub-skills missing tests`);
}

main();
+ */ + tokenUsage?: TokenUsage; } /** @@ -113,6 +144,23 @@ function generateMarkdownReport(config: AgentRunConfig, agentMetadata: AgentMeta lines.push(config.prompt); lines.push(""); + // Token usage summary + if (agentMetadata.tokenUsage && agentMetadata.tokenUsage.apiCallCount > 0) { + const t = agentMetadata.tokenUsage; + lines.push("# Token Usage"); + lines.push(""); + lines.push("| Metric | Value |"); + lines.push("|--------|-------|"); + lines.push(`| Model | ${t.model} |`); + lines.push(`| Input Tokens | ${t.inputTokens.toLocaleString()} |`); + lines.push(`| Output Tokens | ${t.outputTokens.toLocaleString()} |`); + lines.push(`| Cache Read | ${t.cacheReadTokens.toLocaleString()} |`); + lines.push(`| Cache Write | ${t.cacheWriteTokens.toLocaleString()} |`); + lines.push(`| API Calls | ${t.apiCallCount} |`); + lines.push(`| API Duration | ${(t.totalApiDurationMs / 1000).toFixed(1)}s |`); + lines.push(""); + } + // Process events in chronological order lines.push("# Assistant"); lines.push(""); @@ -149,7 +197,7 @@ function generateMarkdownReport(config: AgentRunConfig, agentMetadata: AgentMeta } case "assistant.message_delta": { - // Accumulate deltas for streaming - we'll use the final message instead + // Accumulate deltas for streaming - we'll use the final message instead const messageId = event.data.messageId as string; const deltaContent = event.data.deltaContent as string; if (messageId && deltaContent) { @@ -169,7 +217,7 @@ function generateMarkdownReport(config: AgentRunConfig, agentMetadata: AgentMeta } case "assistant.reasoning_delta": { - // Accumulate reasoning deltas + // Accumulate reasoning deltas const reasoningId = event.data.reasoningId as string; const deltaContent = event.data.deltaContent as string; if (reasoningId && deltaContent) { @@ -193,7 +241,7 @@ function generateMarkdownReport(config: AgentRunConfig, agentMetadata: AgentMeta lines.push(`skill: ${skillName}`); lines.push("```"); } else { - // Regular tool call + // Regular tool call 
let argsJson: string; try { argsJson = JSON.stringify(args, null, 2); @@ -293,9 +341,24 @@ function writeMarkdownReport(config: AgentRunConfig, agentMetadata: AgentMetadat const markdown = redactSecrets(generateMarkdownReport(config, agentMetadata)); fs.writeFileSync(filePath, markdown, "utf-8"); + // Write structured agent-metadata.json for machine consumption + const jsonPath = path.join(dir, "agent-metadata.json"); + const jsonData = { + prompt: config.prompt || "", + events: agentMetadata.events, + testComments: agentMetadata.testComments, + tokenUsage: agentMetadata.tokenUsage, + }; + fs.writeFileSync(jsonPath, redactSecrets(JSON.stringify(jsonData, null, 2)), "utf-8"); + if (process.env.DEBUG) { console.log(`Markdown report written to: ${filePath}`); } + + // Write token usage JSON alongside the markdown report + if (agentMetadata.tokenUsage && agentMetadata.tokenUsage.apiCallCount > 0) { + writeTokenUsageJson(config, agentMetadata, dir); + } } catch (error) { // Don't fail the test if report generation fails if (process.env.DEBUG) { @@ -304,6 +367,47 @@ function writeMarkdownReport(config: AgentRunConfig, agentMetadata: AgentMetadat } } +/** + * Write token usage data to a JSON file for dashboard consumption. + * Also appends to a consolidated token-summary.json in the reports root. + */ +function writeTokenUsageJson(config: AgentRunConfig, agentMetadata: AgentMetadata, reportDir: string): void { + try { + const usage = agentMetadata.tokenUsage!; + const testName = getTestName(); + const record = { + testName, + prompt: config.prompt ? 
redactSecrets(config.prompt) : config.prompt, + timestamp: new Date().toISOString(), + model: usage.model, + inputTokens: usage.inputTokens, + outputTokens: usage.outputTokens, + cacheReadTokens: usage.cacheReadTokens, + cacheWriteTokens: usage.cacheWriteTokens, + totalApiDurationMs: usage.totalApiDurationMs, + apiCallCount: usage.apiCallCount, + perCallUsage: usage.perCallUsage, + }; + + // Write per-test token JSON + const tokenFile = path.join(reportDir, "token-usage.json"); + fs.writeFileSync(tokenFile, JSON.stringify(record, null, 2), "utf-8"); + + // Append to consolidated summary at reports root (JSONL for safe concurrent writes) + const testRunDirectoryName = `test-run-${testRunId || TIME_STAMP}`; + const summaryFile = path.join(DEFAULT_REPORT_DIR, testRunDirectoryName, "token-summary.jsonl"); + fs.appendFileSync(summaryFile, JSON.stringify(record) + "\n", "utf-8"); + + if (process.env.DEBUG) { + console.log(`Token usage written to: ${tokenFile}`); + } + } catch (error) { + if (process.env.DEBUG) { + console.error("Failed to write token usage JSON:", error); + } + } +} + /** * Sets up the agent runner with proper per-test cleanup via afterEach. * Call once inside each describe() block. Each describe() gets its own @@ -441,6 +545,62 @@ export function useAgentRunner() { await session.send({ prompt: config.prompt }); await done; + // Extract token usage from assistant.usage events + const tokenUsage: TokenUsage = { + inputTokens: 0, + outputTokens: 0, + cacheReadTokens: 0, + cacheWriteTokens: 0, + totalApiDurationMs: 0, + apiCallCount: 0, + model: modelOverride || "claude-sonnet-4.5", + perCallUsage: [], + }; + + for (const event of agentMetadata.events) { + if (event.type === "assistant.usage") { + tokenUsage.inputTokens += event.data.inputTokens ?? 0; + tokenUsage.outputTokens += event.data.outputTokens ?? 0; + tokenUsage.cacheReadTokens += event.data.cacheReadTokens ?? 0; + tokenUsage.cacheWriteTokens += event.data.cacheWriteTokens ?? 
0; + tokenUsage.totalApiDurationMs += event.data.duration ?? 0; + tokenUsage.apiCallCount++; + tokenUsage.model = event.data.model || tokenUsage.model; + tokenUsage.perCallUsage.push({ + model: event.data.model, + inputTokens: event.data.inputTokens ?? 0, + outputTokens: event.data.outputTokens ?? 0, + durationMs: event.data.duration ?? 0, + initiator: event.data.initiator, + }); + } + // Also capture aggregate from session.shutdown if available + if (event.type === "session.shutdown" && event.data.modelMetrics) { + for (const [model, metrics] of Object.entries(event.data.modelMetrics)) { + tokenUsage.model = model; + // Prefer shutdown totals if usage events were missed + if (tokenUsage.apiCallCount === 0) { + tokenUsage.inputTokens = metrics.usage.inputTokens; + tokenUsage.outputTokens = metrics.usage.outputTokens; + tokenUsage.cacheReadTokens = metrics.usage.cacheReadTokens; + tokenUsage.cacheWriteTokens = metrics.usage.cacheWriteTokens; + tokenUsage.apiCallCount = metrics.requests.count; + } + } + } + } + + agentMetadata.tokenUsage = tokenUsage; + + // Log token usage summary + if (tokenUsage.apiCallCount > 0) { + console.log( + `\n📊 Token Usage: ${tokenUsage.inputTokens.toLocaleString()} in / ${tokenUsage.outputTokens.toLocaleString()} out | ` + + `${tokenUsage.apiCallCount} API calls | ` + + `Duration: ${(tokenUsage.totalApiDurationMs / 1000).toFixed(1)}s\n` + ); + } + // Send follow-up prompts for (const followUpPrompt of config.followUp ?? []) { isComplete = false;