Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions src/stages/3-execution/sdk-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -268,10 +268,14 @@ export function isResultMessage(msg: SDKMessage): msg is SDKResultMessage {
}

/**
 * Type guard for the system init message (the one with plugins).
 *
 * The SDK has multiple system message types (init, status, hook_response, etc.),
 * but only 'init' has the plugins array we need, so every other subtype, and a
 * system message with no subtype at all, is rejected.
 *
 * @param msg - SDK message to inspect.
 * @returns `true` only when `msg.type` is "system" and `subtype` is "init".
 */
export function isSystemMessage(msg: SDKMessage): msg is SDKSystemMessage {
  // `subtype` is read through a cast to an optional-`subtype` shape rather than
  // directly off SDKMessage; a missing subtype compares unequal to "init".
  return (
    msg.type === "system" && (msg as { subtype?: string }).subtype === "init"
  );
}

/**
Expand Down
4 changes: 4 additions & 0 deletions tests/e2e/helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ export function createE2EConfig(options: E2EConfigOptions = {}): EvalConfig {
max_tokens: 512,
reasoning_effort: "none",
semantic_variations: false,
api_timeout_ms: 60000, // 60s timeout (schema default)
temperature: 0.3, // Schema default
...generation,
};

Expand Down Expand Up @@ -144,6 +146,8 @@ export function createE2EConfig(options: E2EConfigOptions = {}): EvalConfig {
num_samples: 1,
aggregate_method: "average",
include_citations: false,
api_timeout_ms: 120000, // 120s timeout (schema default for complex reasoning)
temperature: 0.1, // Schema default for consistent judging
...evaluation,
};

Expand Down
74 changes: 37 additions & 37 deletions tests/e2e/pipeline.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,12 @@ describeE2E("E2E: User Workflows", () => {
expect(scenarioTypes.has("skill")).toBe(true);

// Stage 3: Execution
const execution = await runExecution(
sharedAnalysis,
generation.scenarios,
const execution = await runExecution({
analysis: sharedAnalysis,
scenarios: generation.scenarios,
config,
consoleProgress,
);
progress: consoleProgress,
});

totalE2ECost += execution.total_cost_usd;
e2eTestCount++;
Expand All @@ -158,13 +158,13 @@ describeE2E("E2E: User Workflows", () => {
}

// Stage 4: Evaluation
const evaluation = await runEvaluation(
sharedAnalysis.plugin_name,
generation.scenarios,
execution.results,
const evaluation = await runEvaluation({
pluginName: sharedAnalysis.plugin_name,
scenarios: generation.scenarios,
executions: execution.results,
config,
consoleProgress,
);
progress: consoleProgress,
});

// Verify evaluation metrics
expect(evaluation.metrics).toBeDefined();
Expand Down Expand Up @@ -244,24 +244,24 @@ describeE2E("E2E: User Workflows", () => {
];

// Execute negative scenarios
const execution = await runExecution(
sharedAnalysis,
negativeScenarios,
const execution = await runExecution({
analysis: sharedAnalysis,
scenarios: negativeScenarios,
config,
consoleProgress,
);
progress: consoleProgress,
});

totalE2ECost += execution.total_cost_usd;
e2eTestCount++;

// Evaluate
const evaluation = await runEvaluation(
sharedAnalysis.plugin_name,
negativeScenarios,
execution.results,
const evaluation = await runEvaluation({
pluginName: sharedAnalysis.plugin_name,
scenarios: negativeScenarios,
executions: execution.results,
config,
consoleProgress,
);
progress: consoleProgress,
});

// All negative scenarios should NOT trigger
for (const result of evaluation.results) {
Expand Down Expand Up @@ -306,12 +306,12 @@ describeE2E("E2E: User Workflows", () => {
const generation = await runGeneration(sharedAnalysis, config);

// Should not throw, even with tiny budget
const execution = await runExecution(
sharedAnalysis,
generation.scenarios,
const execution = await runExecution({
analysis: sharedAnalysis,
scenarios: generation.scenarios,
config,
consoleProgress,
);
progress: consoleProgress,
});

// Verify execution completed (may have partial/no results due to budget)
expect(execution).toBeDefined();
Expand Down Expand Up @@ -403,23 +403,23 @@ describeMcp("E2E: MCP Server Pipeline", () => {
expect(generation.scenarios.length).toBeGreaterThan(0);

// Stage 3: Execution
const execution = await runExecution(
const execution = await runExecution({
analysis,
generation.scenarios,
scenarios: generation.scenarios,
config,
consoleProgress,
);
progress: consoleProgress,
});

expect(execution.results.length).toBeGreaterThan(0);

// Stage 4: Evaluation
const evaluation = await runEvaluation(
analysis.plugin_name,
generation.scenarios,
execution.results,
const evaluation = await runEvaluation({
pluginName: analysis.plugin_name,
scenarios: generation.scenarios,
executions: execution.results,
config,
consoleProgress,
);
progress: consoleProgress,
});

expect(evaluation.metrics).toBeDefined();
expect(evaluation.results.length).toBe(generation.scenarios.length);
Expand Down
18 changes: 17 additions & 1 deletion tests/unit/stages/3-execution/sdk-client.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -239,14 +239,30 @@ describe("SDK Message Type Guards", () => {
expect(isSystemMessage(msg)).toBe(true);
});

it("returns true for minimal system message", () => {
it("returns true for minimal system init message", () => {
const msg: SDKMessage = {
type: "system",
subtype: "init",
};

expect(isSystemMessage(msg)).toBe(true);
});

it("returns false for non-init system messages", () => {
// System status messages don't have the plugins array
const statusMsg: SDKMessage = {
type: "system",
subtype: "status",
};
expect(isSystemMessage(statusMsg)).toBe(false);

// System messages without subtype are also not init messages
const minimalMsg: SDKMessage = {
type: "system",
};
expect(isSystemMessage(minimalMsg)).toBe(false);
});

it("returns true for system message with MCP servers", () => {
const msg: SDKSystemMessage = {
type: "system",
Expand Down