Skip to content

Commit c082d8c

Browse files
committed
Auto-rater allows some previous rater results to be passed in as context.
1 parent 1cd74c2 commit c082d8c

File tree

6 files changed

+80
-19
lines changed

6 files changed

+80
-19
lines changed

runner/ratings/autoraters/code-rater.ts

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,20 @@ import { readFileSync } from 'node:fs';
22
import { z } from 'zod';
33
import { prepareContextFilesMessage } from '../../orchestration/codegen.js';
44
import { Environment } from '../../configuration/environment.js';
5-
import { LlmResponseFile } from '../../shared-interfaces.js';
5+
import {
6+
IndividualAssessment,
7+
IndividualAssessmentState,
8+
LlmResponseFile,
9+
SkippedIndividualAssessment,
10+
} from '../../shared-interfaces.js';
611
import {
712
AutoRateResult,
813
getCoefficient,
914
MAX_RATING,
1015
} from './auto-rate-shared.js';
1116
import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
1217
import defaultCodeRaterPrompt from './code-rating-prompt.js';
18+
import { RatingsContext } from '../rating-types.js';
1319

1420
/** Framework-specific hints for the rating prompt. */
1521
const FW_HINTS: Record<string, string | undefined> = {
@@ -33,14 +39,16 @@ const CACHED_RATING_PROMPTS: Record<string, string> = {};
3339
* @param environment Environment in which the rating is running.
3440
* @param files Files to be rated.
3541
* @param appPrompt Prompt to be used for the rating.
42+
* @param ratingsContext Context containing results from previous ratings.
3643
*/
3744
export async function autoRateCode(
3845
llm: GenkitRunner,
3946
abortSignal: AbortSignal,
4047
model: string,
4148
environment: Environment,
4249
files: LlmResponseFile[],
43-
appPrompt: string
50+
appPrompt: string,
51+
ratingsContext: RatingsContext
4452
): Promise<AutoRateResult> {
4553
const contextMessage = prepareContextFilesMessage(
4654
files.map((o) => ({
@@ -61,10 +69,22 @@ export async function autoRateCode(
6169
promptText = defaultCodeRaterPrompt;
6270
}
6371

64-
const prompt = environment.renderPrompt(promptText, null, {
65-
APP_PROMPT: appPrompt,
66-
FRAMEWORK_SPECIFIC_HINTS: FW_HINTS[environment.fullStackFramework.id] ?? '',
67-
}).result;
72+
const safetyRating = ratingsContext['safety-web'];
73+
const safetyWebResultsJson =
74+
safetyRating?.state === IndividualAssessmentState.EXECUTED
75+
? JSON.stringify(safetyRating, null, 2)
76+
: '';
77+
78+
const prompt = environment.renderPrompt(
79+
promptText,
80+
environment.codeRatingPromptPath,
81+
{
82+
APP_PROMPT: appPrompt,
83+
FRAMEWORK_SPECIFIC_HINTS:
84+
FW_HINTS[environment.fullStackFramework.id] ?? '',
85+
SAFETY_WEB_RESULTS_JSON: safetyWebResultsJson,
86+
}
87+
).result;
6888

6989
const result = await llm.generateConstrained({
7090
abortSignal,

runner/ratings/autoraters/rate-files.ts

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
import { greenCheckmark } from '../../reporting/format.js';
2-
import { AutoraterRunInfo, LlmResponseFile } from '../../shared-interfaces.js';
2+
import {
3+
AutoraterRunInfo,
4+
IndividualAssessment,
5+
LlmResponseFile,
6+
SkippedIndividualAssessment,
7+
} from '../../shared-interfaces.js';
38
import { autoRateCode } from './code-rater.js';
49
import { autoRateAppearance } from './visuals-rater.js';
510
import { Environment } from '../../configuration/environment.js';
611
import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
12+
import { RatingsContext } from '../rating-types.js';
713

814
/**
915
* Automatically rates the code inside of a file.
@@ -13,6 +19,7 @@ import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
1319
* @param files Files to be rated.
1420
* @param appPrompt Prompt that should be checked.
1521
* @param screenshotPngUrl URL of the PNG screenshot to use for visual rating (may be `null`).
22+
* @param ratingsContext Context containing results from previous ratings.
1623
*/
1724
export async function autoRateFiles(
1825
llm: GenkitRunner,
@@ -21,7 +28,8 @@ export async function autoRateFiles(
2128
environment: Environment,
2229
files: LlmResponseFile[],
2330
appPrompt: string,
24-
screenshotPngUrl: string | null
31+
screenshotPngUrl: string | null,
32+
ratingsContext: RatingsContext
2533
): Promise<AutoraterRunInfo> {
2634
console.log(`Autorater is using '${model}' model. \n`);
2735

@@ -33,7 +41,8 @@ export async function autoRateFiles(
3341
model,
3442
environment,
3543
files,
36-
appPrompt
44+
appPrompt,
45+
ratingsContext
3746
);
3847
console.log(`${greenCheckmark()} Code scoring is successful.`);
3948

runner/ratings/built-in-ratings/code-quality-rating.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ export const codeQualityRating: LLMBasedRating = {
2121
ctx.model,
2222
ctx.environment,
2323
ctx.outputFiles,
24-
ctx.fullPromptText
24+
ctx.fullPromptText,
25+
ctx.ratingsContext
2526
);
2627

2728
return {

runner/ratings/built-in.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ import {
2020
export function getBuiltInRatings(): Rating[] {
2121
return [
2222
successfulBuildRating,
23+
safetyWebRating,
2324
noRuntimeExceptionsRating,
2425
sufficientCodeSizeRating,
2526
sufficientGeneratedFilesRating,
2627
codeQualityRating,
2728
visualAppearanceRating,
2829
validCssRating,
2930
axeRating,
30-
safetyWebRating,
3131
userJourneysRating,
3232
NoInnerHtmlBindingsRating,
3333
NoDangerouslySetInnerHtmlRating,

runner/ratings/rate-code.ts

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import {
2020
POINTS_FOR_CATEGORIES,
2121
Rating,
2222
CATEGORY_NAMES,
23+
RatingsContext,
2324
} from './rating-types.js';
2425
import { extractEmbeddedCodeFromTypeScript } from './embedded-languages.js';
2526
import { Environment } from '../configuration/environment.js';
@@ -60,6 +61,7 @@ export async function rateGeneratedCode(
6061
let categorizedFiles: CategorizedFiles | null = null;
6162
let totalPoints = 0;
6263
let maxOverallPoints = 0;
64+
const ratingsContext: RatingsContext = {};
6365

6466
// Rating may also invoke LLMs. Track the usage.
6567
const tokenUsage = {
@@ -92,11 +94,16 @@ export async function rateGeneratedCode(
9294
buildResult,
9395
repairAttempts,
9496
outputFiles.length,
95-
axeRepairAttempts
97+
axeRepairAttempts,
98+
ratingsContext
9699
);
97100
} else if (current.kind === RatingKind.PER_FILE) {
98101
categorizedFiles ??= splitFilesIntoCategories(outputFiles);
99-
result = await runPerFileRating(current, categorizedFiles);
102+
result = await runPerFileRating(
103+
current,
104+
categorizedFiles,
105+
ratingsContext
106+
);
100107
} else if (current.kind === RatingKind.LLM_BASED) {
101108
result = await runLlmBasedRating(
102109
environment,
@@ -109,7 +116,8 @@ export async function rateGeneratedCode(
109116
repairAttempts,
110117
axeRepairAttempts,
111118
abortSignal,
112-
autoraterModel
119+
autoraterModel,
120+
ratingsContext
113121
);
114122
} else {
115123
throw new UserFacingError(`Unsupported rating type ${current}`);
@@ -135,6 +143,7 @@ export async function rateGeneratedCode(
135143
);
136144
}
137145

146+
ratingsContext[current.id] = result;
138147
category.assessments.push(result);
139148
}
140149

@@ -173,13 +182,15 @@ function runPerBuildRating(
173182
buildResult: BuildResult,
174183
repairAttempts: number,
175184
generatedFileCount: number,
176-
axeRepairAttempts: number
185+
axeRepairAttempts: number,
186+
ratingsContext: RatingsContext
177187
): IndividualAssessment | SkippedIndividualAssessment {
178188
const rateResult = rating.rate({
179189
buildResult,
180190
repairAttempts,
181191
generatedFileCount,
182192
axeRepairAttempts,
193+
ratingsContext,
183194
});
184195

185196
// If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment.
@@ -197,7 +208,8 @@ function runPerBuildRating(
197208

198209
async function runPerFileRating(
199210
rating: PerFileRating,
200-
categorizedFiles: CategorizedFiles
211+
categorizedFiles: CategorizedFiles,
212+
ratingsContext: RatingsContext
201213
): Promise<IndividualAssessment | SkippedIndividualAssessment> {
202214
const errorMessages: string[] = [];
203215
let contentType: PerFileRatingContentType;
@@ -228,7 +240,7 @@ async function runPerFileRating(
228240
// Remove comments from the code to avoid false-detection of bad patterns.
229241
// Some keywords like `NgModule` can be used in code comments.
230242
const code = removeComments(file.code, contentType);
231-
const result = await rating.rate(code, file.filePath);
243+
const result = await rating.rate(code, file.filePath, ratingsContext);
232244
let coeff: number;
233245

234246
if (typeof result === 'number') {
@@ -272,7 +284,8 @@ async function runLlmBasedRating(
272284
repairAttempts: number,
273285
axeRepairAttempts: number,
274286
abortSignal: AbortSignal,
275-
autoraterModel: string
287+
autoraterModel: string,
288+
ratingsContext: RatingsContext
276289
): Promise<IndividualAssessment | SkippedIndividualAssessment> {
277290
const result = await rating.rate({
278291
environment,
@@ -285,6 +298,7 @@ async function runLlmBasedRating(
285298
repairAttempts,
286299
axeRepairAttempts,
287300
abortSignal,
301+
ratingsContext,
288302
});
289303

290304
if (result.state === RatingState.SKIPPED) {

runner/ratings/rating-types.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import z from 'zod';
22
import { BuildResult } from '../builder/builder-types.js';
33
import type {
4+
IndividualAssessment,
45
LlmResponseFile,
56
PromptDefinition,
7+
SkippedIndividualAssessment,
68
Usage,
79
} from '../shared-interfaces.js';
810
import { Environment } from '../configuration/environment.js';
@@ -62,6 +64,9 @@ const perBuildRatingSchema = z
6264
repairAttempts: z.number(),
6365
axeRepairAttempts: z.number(),
6466
generatedFileCount: z.number(),
67+
ratingsContext: z.record(
68+
z.custom<IndividualAssessment | SkippedIndividualAssessment>()
69+
),
6570
})
6671
)
6772
.returns(z.custom<PerBuildRatingResult>()),
@@ -74,7 +79,13 @@ const perFileRatingSchema = z
7479
kind: z.literal(RatingKind.PER_FILE),
7580
rate: z
7681
.function()
77-
.args(z.string(), z.string().optional())
82+
.args(
83+
z.string(),
84+
z.string().optional(),
85+
z.record(
86+
z.custom<IndividualAssessment | SkippedIndividualAssessment>()
87+
)
88+
)
7889
.returns(z.custom<PerFileRatingResult>()),
7990
filter: z.union([
8091
z
@@ -169,6 +180,11 @@ export interface ExecutedLLMBasedRating {
169180
};
170181
}
171182

183+
export type RatingsContext = Record<
184+
string,
185+
IndividualAssessment | SkippedIndividualAssessment
186+
>;
187+
172188
export interface LLMBasedRatingContext {
173189
environment: Environment;
174190
fullPromptText: string;
@@ -180,6 +196,7 @@ export interface LLMBasedRatingContext {
180196
repairAttempts: number;
181197
axeRepairAttempts: number;
182198
abortSignal: AbortSignal;
199+
ratingsContext: RatingsContext;
183200
}
184201

185202
/** Rating that applies over build results. */

0 commit comments

Comments
 (0)