@@ -3,4 +3,5 @@ export interface JudgeResult {
reasoning: string;
metCriteria: string[];
unmetCriteria: string[];
evidence?: Record<string, string>;
}
18 changes: 18 additions & 0 deletions javascript/src/agents/judge/judge-agent.ts
@@ -63,6 +63,7 @@ ${criteriaList}
<rules>
- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
- For each criterion, provide evidence citing exact values or quotes from the transcript or tool output.
</rules>
`.trim();
}
@@ -91,6 +92,21 @@ function buildFinishTestTool(criteria: string[]): Tool {
)
.strict()
.describe("Strict verdict for each criterion"),
evidence: z
.object(
Object.fromEntries(
criteriaNames.map((name) => [
name,
z
.string()
.describe(
"Evidence supporting the criterion verdict, citing exact values or quotes."
),
])
)
)
.strict()
.describe("Evidence for each criterion verdict"),
reasoning: z
.string()
.describe("Explanation of what the final verdict should be"),
@@ -226,6 +242,7 @@ class JudgeAgent extends JudgeAgentAdapter {
const verdict = args.verdict || "inconclusive";
const reasoning = args.reasoning || "No reasoning provided";
const criteria = args.criteria || {};
const evidence = args.evidence || {};
const criteriaValues = Object.values(criteria);
const metCriteria = cfg.criteria.filter(
(_, i) => criteriaValues[i] === "true"
@@ -239,6 +256,7 @@ class JudgeAgent extends JudgeAgentAdapter {
reasoning,
metCriteria,
unmetCriteria,
evidence,
};
this.logger.debug("finish_test result", result);
return result;
4 changes: 4 additions & 0 deletions javascript/src/agents/types.ts
@@ -42,6 +42,10 @@ export interface FinishTestArgs {
* A record of the criteria and their results.
*/
criteria: Record<string, "true" | "false" | "inconclusive">;
/**
* Evidence for each criterion verdict.
*/
evidence: Record<string, string>;
/**
* The reasoning behind the verdict.
*/
51 changes: 50 additions & 1 deletion python/scenario/judge_agent.py
@@ -298,6 +298,7 @@ async def call(
<rules>
- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criterias.
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
- For each criterion, provide evidence citing exact values or quotes from the transcript or tool output.
</rules>
""",
},
@@ -374,13 +375,29 @@ async def call(
"type": "string",
"description": "Explanation of what the final verdict should be",
},
"evidence": {
"type": "object",
"properties": {
criteria_names[idx]: {
"type": "string",
"description": (
"Evidence supporting the criterion verdict, citing exact values or quotes "
"from the transcript or tool output."
),
}
for idx, criterion in enumerate(self.criteria)
},
"required": criteria_names,
"additionalProperties": False,
"description": "Evidence for each criterion verdict",
},
"verdict": {
"type": "string",
"enum": ["success", "failure", "inconclusive"],
"description": "The final verdict of the test",
},
},
"required": ["criteria", "reasoning", "verdict"],
"required": ["criteria", "reasoning", "evidence", "verdict"],
"additionalProperties": False,
},
},
@@ -433,6 +450,37 @@ async def call(
verdict = args.get("verdict", "inconclusive")
reasoning = args.get("reasoning", "No reasoning provided")
criteria = args.get("criteria", {})
evidence = args.get("evidence", {})

# Handle case where LLM returns criteria as a JSON string instead of dict
# This can happen when the LLM is uncertain about the schema format
# See: https://github.com/langwatch/scenario/issues/161
if isinstance(criteria, str):
try:
criteria = json.loads(criteria)
logger.debug(
"JudgeAgent: Parsed criteria from JSON string to dict"
)
except json.JSONDecodeError:
logger.warning(
f"JudgeAgent: Failed to parse criteria string as JSON: {criteria}. "
"Using empty dict as fallback."
)
criteria = {}

# Ensure criteria is a dict before calling .values()
if not isinstance(criteria, dict):
logger.warning(
f"JudgeAgent: criteria is {type(criteria).__name__}, expected dict. "
"Using empty dict as fallback."
)
criteria = {}
if not isinstance(evidence, dict):
logger.warning(
f"JudgeAgent: evidence is {type(evidence).__name__}, expected dict. "
"Using empty dict as fallback."
)
evidence = {}

passed_criteria = [
self.criteria[idx]
@@ -452,6 +500,7 @@ class JudgeAgent(AgentAdapter):
reasoning=reasoning,
passed_criteria=passed_criteria,
failed_criteria=failed_criteria,
evidence=evidence,
)
except json.JSONDecodeError:
raise Exception(
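The defensive argument handling added above can be exercised on its own. Below is a minimal, hedged sketch of the same fallback logic; the helper name `coerce_to_dict` and the sample payloads are illustrative and are not part of this diff.

```python
import json
import logging
from typing import Any, Dict

logger = logging.getLogger(__name__)


def coerce_to_dict(value: Any, field_name: str) -> Dict[str, str]:
    """Mirror of the fallback logic above: accept a dict, parse a JSON string,
    and fall back to an empty dict for anything else."""
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except json.JSONDecodeError:
            logger.warning("Failed to parse %s as JSON: %r", field_name, value)
            return {}
    if not isinstance(value, dict):
        logger.warning("%s is %s, expected dict", field_name, type(value).__name__)
        return {}
    return value


# The LLM sometimes returns nested objects as JSON strings (the case tracked
# in langwatch/scenario#161), so both shapes should normalize to a dict:
criteria = coerce_to_dict('{"test_criterion": "false"}', "criteria")
evidence = coerce_to_dict({"test_criterion": "Value not found"}, "evidence")
assert criteria == {"test_criterion": "false"}
assert evidence == {"test_criterion": "Value not found"}
```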
26 changes: 26 additions & 0 deletions python/scenario/types.py
@@ -10,6 +10,7 @@
Optional,
TypeAlias,
Union,
TypedDict,
)

from openai.types.chat import (
@@ -36,6 +37,28 @@
# message types with the trace_id field


class FileAttachment(TypedDict, total=False):
"""
Represents a file attached to a conversation message.

This type allows agents to receive file metadata or content
for multimodal analysis.

Attributes:
id: Unique identifier for the file (e.g., S3 ID)
name: Original filename
mime_type: File type (e.g., 'application/pdf')
url: URL where the file can be downloaded
content: Raw text or base64 content of the file
"""

id: str
name: str
mime_type: str
url: str
content: str


class ChatCompletionDeveloperMessageParamWithTrace(ChatCompletionDeveloperMessageParam):
trace_id: Optional[str]

@@ -46,6 +69,7 @@ class ChatCompletionSystemMessageParamWithTrace(ChatCompletionSystemMessageParam

class ChatCompletionUserMessageParamWithTrace(ChatCompletionUserMessageParam):
trace_id: Optional[str]
files: Optional[List[FileAttachment]]


class ChatCompletionAssistantMessageParamWithTrace(ChatCompletionAssistantMessageParam):
@@ -196,6 +220,7 @@ class ScenarioResult(BaseModel):
reasoning: Detailed explanation of why the scenario succeeded or failed
passed_criteria: List of success criteria that were satisfied
failed_criteria: List of success criteria that were not satisfied
evidence: Mapping of criterion identifiers to supporting evidence from the transcript/tool output
total_time: Total execution time in seconds (if measured)
agent_time: Time spent in agent calls in seconds (if measured)

@@ -227,6 +252,7 @@ class ScenarioResult(BaseModel):
reasoning: Optional[str] = None
passed_criteria: List[str] = []
failed_criteria: List[str] = []
evidence: dict[str, str] = {}
total_time: Optional[float] = None
agent_time: Optional[float] = None

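To make the new types concrete, here is a hedged, self-contained sketch of how a `FileAttachment` and the per-criterion `evidence` mapping might be populated. The field values (IDs, URLs, criterion text) are invented for illustration, and the TypedDict is re-declared locally so the snippet runs standalone.

```python
from typing import TypedDict


class FileAttachment(TypedDict, total=False):
    """Same shape as the TypedDict added above, re-declared so this sketch runs standalone."""
    id: str
    name: str
    mime_type: str
    url: str
    content: str


# A user message carrying an attachment, as ChatCompletionUserMessageParamWithTrace
# now allows through its optional `files` field (sample values throughout).
attachment: FileAttachment = {
    "id": "file_123",                                # e.g. an S3 object ID
    "name": "invoice.pdf",
    "mime_type": "application/pdf",
    "url": "https://example.com/files/invoice.pdf",
    "content": "Total due: $42.00",                  # raw text or base64 content
}

user_message = {
    "role": "user",
    "content": "Please check the attached invoice.",
    "trace_id": "trace_abc",
    "files": [attachment],
}

# Per-criterion evidence, matching the new ScenarioResult.evidence mapping.
evidence: dict[str, str] = {
    "agent states the invoice total": "Agent replied: 'The total due is $42.00.'",
}

print(user_message["files"][0]["name"], "->", evidence)
```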
51 changes: 51 additions & 0 deletions python/tests/test_judge_agent.py
@@ -149,3 +149,54 @@ async def test_judge_agent_with_string_default_model_config():
context_scenario.reset(token)
# Cleanup
ScenarioConfig.default_config = None


@pytest.mark.asyncio
async def test_judge_agent_returns_evidence_per_criterion():
"""JudgeAgent should return evidence mapping from finish_test tool call."""
ScenarioConfig.default_config = ScenarioConfig(default_model="openai/gpt-4")

judge = JudgeAgent(criteria=["Test criterion"])

mock_scenario_state = MagicMock()
mock_scenario_state.description = "Test scenario"
mock_scenario_state.current_turn = 1
mock_scenario_state.config.max_turns = 10

agent_input = AgentInput(
thread_id="test",
messages=[{"role": "user", "content": "Hello"}],
new_messages=[],
judgment_request=True,
scenario_state=mock_scenario_state,
)

mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.tool_calls = [MagicMock()]
mock_response.choices[0].message.tool_calls[0].function.name = "finish_test"
mock_response.choices[0].message.tool_calls[
0
].function.arguments = (
'{"verdict": "failure", "reasoning": "No match", '
'"criteria": {"test_criterion": "false"}, '
'"evidence": {"test_criterion": "Value not found in tool output"}}'
)

mock_executor = MagicMock()
mock_executor.config = MagicMock()
mock_executor.config.cache_key = None
token = context_scenario.set(mock_executor)

try:
with patch(
"scenario.judge_agent.litellm.completion", return_value=mock_response
):
result = await judge.call(agent_input)
assert result is not None
assert result.evidence == {
"test_criterion": "Value not found in tool output"
}
finally:
context_scenario.reset(token)
ScenarioConfig.default_config = None