Skip to content

Commit f242587

Browse files
committed
feat(runner): add support for running and repairing tests
This commit introduces the ability to run tests against the generated code as part of the evaluation process. A new optional `testCommand` can be set in the environment configuration. If provided, this command will be executed after a successful build. If the tests fail, the tool will attempt to repair the code using the LLM, similar to how build failures are handled. The number of repair attempts is configurable. The report has been updated to display the test results for each run, including whether the tests passed, failed, or passed after repair. The summary view also includes aggregated statistics about the test results.
1 parent 932301a commit f242587

File tree

19 files changed

+685
-4
lines changed

19 files changed

+685
-4
lines changed

docs/environment-reference.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,3 +179,8 @@ Defaults to `<package manager> run build`.
179179

180180
Command used to start a local dev server as a part of the evaluation.
181181
Defaults to `<package manager> run start --port 0`.
182+
183+
### `testCommand`
184+
185+
Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 2 minutes.
186+

report-app/src/app/pages/report-viewer/report-viewer.html

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,20 @@ <h3 class="chart-title">
7676
/>
7777
</div>
7878
</div>
79+
@if (overview.stats.tests) {
80+
<div class="chart-container test-results-details">
81+
<h3 class="chart-title">
82+
<span class="material-symbols-outlined"> quiz </span>
83+
<span>Tests</span>
84+
</h3>
85+
<div class="summary-card-item">
86+
<stacked-bar-chart
87+
[data]="testsAsGraphData(overview.stats.tests)"
88+
[compact]="true"
89+
/>
90+
</div>
91+
</div>
92+
}
7993
@if (overview.stats.runtime) {
8094
<div class="chart-container">
8195
<h3 class="chart-title">
@@ -275,6 +289,19 @@ <h2>Generated applications</h2>
275289
@if (initialAttempt?.buildResult?.status === 'error') {
276290
<span class="status-badge error">Initial build failed</span>
277291
}
292+
293+
<!-- Test status badges -->
294+
@if (result.testResult) {
295+
@if (result.testResult.passed) {
296+
@if ((result.testRepairAttempts || 0) > 0) {
297+
<span class="status-badge warning">Tests passed after repair</span>
298+
} @else {
299+
<span class="status-badge success">Tests passed</span>
300+
}
301+
} @else {
302+
<span class="status-badge error">Tests failed</span>
303+
}
304+
}
278305
</div>
279306
</div>
280307
</expansion-panel-header>
@@ -350,6 +377,29 @@ <h5>
350377
</div>
351378
</div>
352379

380+
@if (result.testResult) {
381+
<div class="app-details-section">
382+
<h4>Test Results</h4>
383+
<div class="test-summary">
384+
@if (result.testResult.passed) {
385+
<span class="status-text success">✔ Tests passed</span>
386+
@if ((result.testRepairAttempts || 0) > 0) {
387+
<span class="status-text">after {{ result.testRepairAttempts }} repair attempt(s)</span>
388+
}
389+
} @else {
390+
<span class="status-text error">✘ Tests failed</span>
391+
}
392+
</div>
393+
394+
@if (result.testResult.output) {
395+
<details class="test-output-button">
396+
<summary class="neutral-button">See Test Output</summary>
397+
<pre class="callout neutral code">{{ result.testResult.output }}</pre>
398+
</details>
399+
}
400+
</div>
401+
}
402+
353403
<div class="app-details-section">
354404
<h4>Additional info</h4>
355405
@for (attempt of result.attemptDetails; track attempt) {

report-app/src/app/pages/report-viewer/report-viewer.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import {
2020
LlmResponseFile,
2121
RunInfo,
2222
RunSummaryBuilds,
23+
RunSummaryTests,
2324
RuntimeStats,
2425
ScoreBucket,
2526
SkippedIndividualAssessment,
@@ -281,6 +282,31 @@ export class ReportViewer {
281282
];
282283
}
283284

285+
protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
286+
return [
287+
{
288+
label: 'Passed',
289+
color: ScoreCssVariable.excellent,
290+
value: tests.successfulInitialTests,
291+
},
292+
{
293+
label: 'Passed after repair',
294+
color: ScoreCssVariable.great,
295+
value: tests.successfulTestsAfterRepair,
296+
},
297+
{
298+
label: 'Failed',
299+
color: ScoreCssVariable.poor,
300+
value: tests.failedTests,
301+
},
302+
{
303+
label: 'No tests run',
304+
color: ScoreCssVariable.neutral,
305+
value: tests.noTestsRun,
306+
},
307+
];
308+
}
309+
284310
protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData {
285311
return buckets.map((b) => ({
286312
label: b.nameWithLabels,

runner/builder/builder-types.ts

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import { ProgressType } from '../progress/progress-logger.js';
2+
import { PackageSummary } from '@safety-web/types';
3+
import {
4+
AgentOutput,
5+
BrowserAgentTaskInput,
6+
} from '../testing/browser-agent/models.js';
7+
import { Result } from 'axe-core';
8+
import { CspViolation } from './auto-csp-types.js';
9+
10+
/**
11+
* Represents the message structure used for communication between
12+
* the main process and the build worker process.
13+
*/
14+
export interface BuildWorkerMessage {
15+
directory: string;
16+
/** Name of the app. */
17+
appName: string;
18+
/** Command used to build the app. */
19+
buildCommand: string;
20+
/** Command used to start a development server. */
21+
serveCommand: string;
22+
/** Command used to run tests for the app. */
23+
testCommand?: string;
24+
/**
25+
* Whether this application should be invoked via Puppeteer and
26+
* runtime errors should be collected and reported.
27+
*/
28+
collectRuntimeErrors?: boolean;
29+
/**
30+
* Whether to take a screenshot of the application after a successful build.
31+
*/
32+
takeScreenshots?: boolean;
33+
/**
34+
* Whether or not to perform Axe testing of the application after a successful build.
35+
*/
36+
includeAxeTesting?: boolean;
37+
38+
/** Whether to enable the auto CSP checks. */
39+
enableAutoCsp?: boolean;
40+
41+
/** User journey browser agent task input */
42+
userJourneyAgentTaskInput?: BrowserAgentTaskInput;
43+
}
44+
45+
export enum BuildResultStatus {
46+
SUCCESS = 'success',
47+
ERROR = 'error',
48+
}
49+
50+
export enum BuildErrorType {
51+
MISSING_DEPENDENCY = 'Missing Dependency', // "[ERROR] Could not resolve"
52+
TYPESCRIPT_ERROR = 'TypeScript Error', // "[ERROR] TS\d+"
53+
ANGULAR_DIAGNOSTIC = 'Angular Diagnostic', // "[ERROR] NG\d+"
54+
OTHER = 'Other',
55+
}
56+
57+
export interface BuildResult {
58+
status: BuildResultStatus;
59+
message: string;
60+
errorType?: BuildErrorType;
61+
screenshotPngUrl?: string;
62+
missingDependency?: string;
63+
runtimeErrors?: string;
64+
/** JSON report from the Safety Web runner, if available. */
65+
safetyWebReportJson?: PackageSummary[];
66+
userJourneyAgentOutput: AgentOutput | null;
67+
cspViolations?: CspViolation[];
68+
axeViolations?: Result[];
69+
}
70+
71+
export interface BuildResultMessage {
72+
type: 'build';
73+
payload: BuildResult;
74+
}
75+
76+
export interface BuildProgressLogMessage {
77+
type: 'log';
78+
payload: {
79+
state: ProgressType;
80+
message: string;
81+
details?: string;
82+
};
83+
}
84+
85+
export type BuilderProgressLogFn = (
86+
state: ProgressType,
87+
message: string,
88+
details?: string
89+
) => void;
90+
91+
export type BuildWorkerResponseMessage =
92+
| BuildResultMessage
93+
| BuildProgressLogMessage;
94+
95+
export enum RepairType {
96+
Build = 'Build',
97+
Axe = 'Axe',
98+
Test = 'Test',
99+
}

runner/configuration/constants.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
2626
*/
2727
export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;
2828

29+
/**
30+
* Number of times we'll try to ask the LLM to repair a test failure,
31+
* providing the test output and the code that causes the problem.
32+
*/
33+
export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;
34+
2935
/** Name of the folder where we store all generated reports */
3036
export const REPORTS_ROOT_DIR = join(rootDir, 'reports');
3137

runner/configuration/environment-local.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
3535
* Defaults to `<package manager> run start --port 0`.
3636
*/
3737
serveCommand: z.string().optional(),
38+
/**
39+
* Command to run when testing the generated code. If omitted, tests are not run.
40+
*/
41+
testCommand: z.string().optional(),
3842
/**
3943
* Whether to skip installing dependencies when running evals in the environment.
4044
* Useful if you're managing dependencies yourself.
@@ -56,6 +60,8 @@ export class LocalEnvironment extends BaseEnvironment {
5660
readonly buildCommand: string;
5761
/** Command to run when starting a development server inside the app. */
5862
readonly serveCommand: string;
63+
/** Command to run when executing tests inside the app. Tests are skipped when this is unset. */
64+
readonly testCommand?: string;
5965
/**
6066
* Absolute path at which files specific to this environment are located. Will be merged in
6167
* with the files from the `projectTemplatePath` to get the final project structure.

runner/orchestration/build-repair.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ import { EvalID, Gateway } from './gateway.js';
2828
* @param abortSignal An AbortSignal to cancel the operation.
2929
* @param workerConcurrencyQueue The queue for managing worker concurrency.
3030
* @param attempts The current attempt number.
31-
* @param repairType The type of repair being performed.
3231
* @returns A promise that resolves to the new BuildResult.
3332
*/
3433
export async function repairAndBuild(

runner/orchestration/gateway.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import {
77
LlmResponse,
88
LlmResponseFile,
99
RootPromptDefinition,
10+
TestResult,
1011
} from '../shared-interfaces.js';
1112
import { BuildResult } from '../workers/builder/builder-types.js';
1213

@@ -35,6 +36,16 @@ export interface Gateway<Env extends Environment> {
3536
abortSignal: AbortSignal
3637
): Promise<LlmResponse>;
3738

39+
repairTest(
40+
id: EvalID,
41+
requestCtx: LlmGenerateFilesContext,
42+
model: string,
43+
errorMessage: string,
44+
appFiles: LlmResponseFile[],
45+
contextFiles: LlmContextFile[],
46+
abortSignal: AbortSignal
47+
): Promise<LlmResponse>;
48+
3849
shouldRetryFailedBuilds(evalID: EvalID): boolean;
3950

4051
tryBuild(
@@ -47,6 +58,16 @@ export interface Gateway<Env extends Environment> {
4758
progress: ProgressLogger
4859
): Promise<BuildResult>;
4960

61+
tryTest(
62+
id: EvalID,
63+
env: Env,
64+
appDirectoryPath: string,
65+
rootPromptDef: RootPromptDefinition,
66+
workerConcurrencyQueue: PQueue,
67+
abortSignal: AbortSignal,
68+
progress: ProgressLogger
69+
): Promise<TestResult | null>;
70+
5071
serveBuild<T>(
5172
id: EvalID,
5273
env: Env,

runner/orchestration/gateways/local_gateway.ts

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import {
1313
LlmContextFile,
1414
LlmResponse,
1515
LlmResponseFile,
16+
TestResult,
1617
} from '../../shared-interfaces.js';
1718
import { generateCodeWithAI } from '../codegen.js';
1819
import { EvalID, Gateway } from '../gateway.js';
@@ -66,6 +67,24 @@ export class LocalGateway implements Gateway<LocalEnvironment> {
6667
);
6768
}
6869

70+
async repairTest(
71+
_id: EvalID,
72+
requestCtx: LlmGenerateFilesContext,
73+
model: string,
74+
errorMessage: string,
75+
appFiles: LlmResponseFile[],
76+
contextFiles: LlmContextFile[],
77+
abortSignal: AbortSignal
78+
): Promise<LlmResponse> {
79+
return await generateCodeWithAI(
80+
this.llm,
81+
model,
82+
requestCtx,
83+
contextFiles,
84+
abortSignal
85+
);
86+
}
87+
6988
tryBuild(
7089
_id: EvalID,
7190
env: LocalEnvironment,
@@ -106,6 +125,46 @@ export class LocalGateway implements Gateway<LocalEnvironment> {
106125
);
107126
}
108127

128+
tryTest(
129+
_id: EvalID,
130+
env: LocalEnvironment,
131+
appDirectoryPath: string,
132+
rootPromptDef: RootPromptDefinition,
133+
workerConcurrencyQueue: PQueue,
134+
abortSignal: AbortSignal,
135+
progress: ProgressLogger
136+
): Promise<TestResult | null> {
137+
if (!env.testCommand) {
138+
return Promise.resolve(null);
139+
}
140+
const testParams = {
141+
directory: appDirectoryPath,
142+
appName: rootPromptDef.name,
143+
testCommand: env.testCommand,
144+
};
145+
146+
return workerConcurrencyQueue.add(
147+
() =>
148+
new Promise<TestResult>((resolve, reject) => {
149+
const child: ChildProcess = fork(
150+
path.resolve(import.meta.dirname, '../../workers/test/worker.js'),
151+
{ signal: abortSignal }
152+
);
153+
child.send(testParams);
154+
155+
child.on('message', async (result: any) => {
156+
await killChildProcessGracefully(child);
157+
resolve(result.payload);
158+
});
159+
child.on('error', async (err) => {
160+
await killChildProcessGracefully(child);
161+
reject(err);
162+
});
163+
}),
164+
{ throwOnTimeout: true }
165+
);
166+
}
167+
109168
async serveBuild<T>(
110169
_id: EvalID,
111170
env: LocalEnvironment,

0 commit comments

Comments
 (0)