Allow the code auto-rater to receive results from earlier ratings (currently `safety-web`) as prompt context.

aaronshim · aaronshim · commit c082d8c41a86 · 2025-09-25T21:18:45.000Z
diff --git a/runner/ratings/autoraters/code-rater.ts b/runner/ratings/autoraters/code-rater.ts
@@ -2,14 +2,20 @@ import { readFileSync } from 'node:fs';
 import { z } from 'zod';
 import { prepareContextFilesMessage } from '../../orchestration/codegen.js';
 import { Environment } from '../../configuration/environment.js';
-import { LlmResponseFile } from '../../shared-interfaces.js';
+import {
+  IndividualAssessment,
+  IndividualAssessmentState,
+  LlmResponseFile,
+  SkippedIndividualAssessment,
+} from '../../shared-interfaces.js';
 import {
   AutoRateResult,
   getCoefficient,
   MAX_RATING,
 } from './auto-rate-shared.js';
 import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
 import defaultCodeRaterPrompt from './code-rating-prompt.js';
+import { RatingsContext } from '../rating-types.js';
 
 /** Framework-specific hints for the rating prompt. */
 const FW_HINTS: Record<string, string | undefined> = {
@@ -33,14 +39,16 @@ const CACHED_RATING_PROMPTS: Record<string, string> = {};
  * @param environment Environment in which the rating is running.
  * @param files Files to be rated.
  * @param appPrompt Prompt to be used for the rating.
+ * @param ratingsContext Context containing results from previous ratings.
  */
 export async function autoRateCode(
   llm: GenkitRunner,
   abortSignal: AbortSignal,
   model: string,
   environment: Environment,
   files: LlmResponseFile[],
-  appPrompt: string
+  appPrompt: string,
+  ratingsContext: RatingsContext
 ): Promise<AutoRateResult> {
   const contextMessage = prepareContextFilesMessage(
     files.map((o) => ({
@@ -61,10 +69,22 @@ export async function autoRateCode(
     promptText = defaultCodeRaterPrompt;
   }
 
-  const prompt = environment.renderPrompt(promptText, null, {
-    APP_PROMPT: appPrompt,
-    FRAMEWORK_SPECIFIC_HINTS: FW_HINTS[environment.fullStackFramework.id] ?? '',
-  }).result;
+  const safetyRating = ratingsContext['safety-web'];
+  const safetyWebResultsJson =
+    safetyRating?.state === IndividualAssessmentState.EXECUTED
+      ? JSON.stringify(safetyRating, null, 2)
+      : '';
+
+  const prompt = environment.renderPrompt(
+    promptText,
+    environment.codeRatingPromptPath,
+    {
+      APP_PROMPT: appPrompt,
+      FRAMEWORK_SPECIFIC_HINTS:
+        FW_HINTS[environment.fullStackFramework.id] ?? '',
+      SAFETY_WEB_RESULTS_JSON: safetyWebResultsJson,
+    }
+  ).result;
 
   const result = await llm.generateConstrained({
     abortSignal,
diff --git a/runner/ratings/autoraters/rate-files.ts b/runner/ratings/autoraters/rate-files.ts
@@ -1,9 +1,15 @@
 import { greenCheckmark } from '../../reporting/format.js';
-import { AutoraterRunInfo, LlmResponseFile } from '../../shared-interfaces.js';
+import {
+  AutoraterRunInfo,
+  IndividualAssessment,
+  LlmResponseFile,
+  SkippedIndividualAssessment,
+} from '../../shared-interfaces.js';
 import { autoRateCode } from './code-rater.js';
 import { autoRateAppearance } from './visuals-rater.js';
 import { Environment } from '../../configuration/environment.js';
 import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
+import { RatingsContext } from '../rating-types.js';
 
 /**
  * Automatically rates the code inside of a file.
@@ -13,6 +19,7 @@ import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
  * @param filePath Path to the file to be rated.
  * @param appPrompt Prompt that should be checked.
  * @param screenshotPath Path to the screenshot to use for visual rating.
+ * @param ratingsContext Context containing results from previous ratings.
  */
 export async function autoRateFiles(
   llm: GenkitRunner,
@@ -21,7 +28,8 @@ export async function autoRateFiles(
   environment: Environment,
   files: LlmResponseFile[],
   appPrompt: string,
-  screenshotPngUrl: string | null
+  screenshotPngUrl: string | null,
+  ratingsContext: RatingsContext
 ): Promise<AutoraterRunInfo> {
   console.log(`Autorater is using '${model}' model. \n`);
 
@@ -33,7 +41,8 @@ export async function autoRateFiles(
     model,
     environment,
     files,
-    appPrompt
+    appPrompt,
+    ratingsContext
   );
   console.log(`${greenCheckmark()} Code scoring is successful.`);
 
diff --git a/runner/ratings/built-in-ratings/code-quality-rating.ts b/runner/ratings/built-in-ratings/code-quality-rating.ts
@@ -21,7 +21,8 @@ export const codeQualityRating: LLMBasedRating = {
       ctx.model,
       ctx.environment,
       ctx.outputFiles,
-      ctx.fullPromptText
+      ctx.fullPromptText,
+      ctx.ratingsContext
     );
 
     return {
diff --git a/runner/ratings/built-in.ts b/runner/ratings/built-in.ts
@@ -20,14 +20,14 @@ import {
 export function getBuiltInRatings(): Rating[] {
   return [
     successfulBuildRating,
+    safetyWebRating,
     noRuntimeExceptionsRating,
     sufficientCodeSizeRating,
     sufficientGeneratedFilesRating,
     codeQualityRating,
     visualAppearanceRating,
     validCssRating,
     axeRating,
-    safetyWebRating,
     userJourneysRating,
     NoInnerHtmlBindingsRating,
     NoDangerouslySetInnerHtmlRating,
diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts
@@ -20,6 +20,7 @@ import {
   POINTS_FOR_CATEGORIES,
   Rating,
   CATEGORY_NAMES,
+  RatingsContext,
 } from './rating-types.js';
 import { extractEmbeddedCodeFromTypeScript } from './embedded-languages.js';
 import { Environment } from '../configuration/environment.js';
@@ -60,6 +61,7 @@ export async function rateGeneratedCode(
   let categorizedFiles: CategorizedFiles | null = null;
   let totalPoints = 0;
   let maxOverallPoints = 0;
+  const ratingsContext: RatingsContext = {};
 
   // Rating may also invoke LLMs. Track the usage.
   const tokenUsage = {
@@ -92,11 +94,16 @@ export async function rateGeneratedCode(
           buildResult,
           repairAttempts,
           outputFiles.length,
-          axeRepairAttempts
+          axeRepairAttempts,
+          ratingsContext
         );
       } else if (current.kind === RatingKind.PER_FILE) {
         categorizedFiles ??= splitFilesIntoCategories(outputFiles);
-        result = await runPerFileRating(current, categorizedFiles);
+        result = await runPerFileRating(
+          current,
+          categorizedFiles,
+          ratingsContext
+        );
       } else if (current.kind === RatingKind.LLM_BASED) {
         result = await runLlmBasedRating(
           environment,
@@ -109,7 +116,8 @@ export async function rateGeneratedCode(
           repairAttempts,
           axeRepairAttempts,
           abortSignal,
-          autoraterModel
+          autoraterModel,
+          ratingsContext
         );
       } else {
         throw new UserFacingError(`Unsupported rating type ${current}`);
@@ -135,6 +143,7 @@ export async function rateGeneratedCode(
       );
     }
 
+    ratingsContext[current.id] = result;
     category.assessments.push(result);
   }
 
@@ -173,13 +182,15 @@ function runPerBuildRating(
   buildResult: BuildResult,
   repairAttempts: number,
   generatedFileCount: number,
-  axeRepairAttempts: number
+  axeRepairAttempts: number,
+  ratingsContext: RatingsContext
 ): IndividualAssessment | SkippedIndividualAssessment {
   const rateResult = rating.rate({
     buildResult,
     repairAttempts,
     generatedFileCount,
     axeRepairAttempts,
+    ratingsContext,
   });
 
   // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment.
@@ -197,7 +208,8 @@ function runPerBuildRating(
 
 async function runPerFileRating(
   rating: PerFileRating,
-  categorizedFiles: CategorizedFiles
+  categorizedFiles: CategorizedFiles,
+  ratingsContext: RatingsContext
 ): Promise<IndividualAssessment | SkippedIndividualAssessment> {
   const errorMessages: string[] = [];
   let contentType: PerFileRatingContentType;
@@ -228,7 +240,7 @@ async function runPerFileRating(
       // Remove comments from the code to avoid false-detection of bad patterns.
       // Some keywords like `NgModule` can be used in code comments.
       const code = removeComments(file.code, contentType);
-      const result = await rating.rate(code, file.filePath);
+      const result = await rating.rate(code, file.filePath, ratingsContext);
       let coeff: number;
 
       if (typeof result === 'number') {
@@ -272,7 +284,8 @@ async function runLlmBasedRating(
   repairAttempts: number,
   axeRepairAttempts: number,
   abortSignal: AbortSignal,
-  autoraterModel: string
+  autoraterModel: string,
+  ratingsContext: RatingsContext
 ): Promise<IndividualAssessment | SkippedIndividualAssessment> {
   const result = await rating.rate({
     environment,
@@ -285,6 +298,7 @@ async function runLlmBasedRating(
     repairAttempts,
     axeRepairAttempts,
     abortSignal,
+    ratingsContext,
   });
 
   if (result.state === RatingState.SKIPPED) {
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
@@ -1,8 +1,10 @@
 import z from 'zod';
 import { BuildResult } from '../builder/builder-types.js';
 import type {
+  IndividualAssessment,
   LlmResponseFile,
   PromptDefinition,
+  SkippedIndividualAssessment,
   Usage,
 } from '../shared-interfaces.js';
 import { Environment } from '../configuration/environment.js';
@@ -62,6 +64,9 @@ const perBuildRatingSchema = z
           repairAttempts: z.number(),
           axeRepairAttempts: z.number(),
           generatedFileCount: z.number(),
+          ratingsContext: z.record(
+            z.custom<IndividualAssessment | SkippedIndividualAssessment>()
+          ),
         })
       )
       .returns(z.custom<PerBuildRatingResult>()),
@@ -74,7 +79,13 @@ const perFileRatingSchema = z
     kind: z.literal(RatingKind.PER_FILE),
     rate: z
       .function()
-      .args(z.string(), z.string().optional())
+      .args(
+        z.string(),
+        z.string().optional(),
+        z.record(
+          z.custom<IndividualAssessment | SkippedIndividualAssessment>()
+        )
+      )
       .returns(z.custom<PerFileRatingResult>()),
     filter: z.union([
       z
@@ -169,6 +180,11 @@ export interface ExecutedLLMBasedRating {
   };
 }
 
+export type RatingsContext = Record<
+  string,
+  IndividualAssessment | SkippedIndividualAssessment
+>;
+
 export interface LLMBasedRatingContext {
   environment: Environment;
   fullPromptText: string;
@@ -180,6 +196,7 @@ export interface LLMBasedRatingContext {
   repairAttempts: number;
   axeRepairAttempts: number;
   abortSignal: AbortSignal;
+  ratingsContext: RatingsContext;
 }
 
 /** Rating that applies over build results. */