diff --git a/javascript/src/agents/judge/interfaces/judge-result.interface.ts b/javascript/src/agents/judge/interfaces/judge-result.interface.ts
index 187abb52..7b6fb571 100644
--- a/javascript/src/agents/judge/interfaces/judge-result.interface.ts
+++ b/javascript/src/agents/judge/interfaces/judge-result.interface.ts
@@ -3,4 +3,5 @@ export interface JudgeResult {
   reasoning: string;
   metCriteria: string[];
   unmetCriteria: string[];
+  evidence?: Record<string, string>;
 }
diff --git a/javascript/src/agents/judge/judge-agent.ts b/javascript/src/agents/judge/judge-agent.ts
index 94e3430a..31bdcc30 100644
--- a/javascript/src/agents/judge/judge-agent.ts
+++ b/javascript/src/agents/judge/judge-agent.ts
@@ -63,6 +63,7 @@ ${criteriaList}
 
 - Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
 - DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
+- For each criterion, provide evidence citing exact values or quotes from the transcript or tool output.
 `.trim();
 }
 
@@ -91,6 +92,21 @@ function buildFinishTestTool(criteria: string[]): Tool {
       )
       .strict()
       .describe("Strict verdict for each criterion"),
+    evidence: z
+      .object(
+        Object.fromEntries(
+          criteriaNames.map((name) => [
+            name,
+            z
+              .string()
+              .describe(
+                "Evidence supporting the criterion verdict, citing exact values or quotes."
+              )
+          ])
+        )
+      )
+      .strict()
+      .describe("Evidence for each criterion verdict"),
     reasoning: z
       .string()
       .describe("Explanation of what the final verdict should be"),
@@ -226,6 +242,7 @@ class JudgeAgent extends JudgeAgentAdapter {
       const verdict = args.verdict || "inconclusive";
       const reasoning = args.reasoning || "No reasoning provided";
       const criteria = args.criteria || {};
+      const evidence = args.evidence || {};
       const criteriaValues = Object.values(criteria);
       const metCriteria = cfg.criteria.filter(
         (_, i) => criteriaValues[i] === "true"
       );
@@ -239,6 +256,7 @@
         reasoning,
         metCriteria,
         unmetCriteria,
+        evidence,
       };
       this.logger.debug("finish_test result", result);
       return result;
diff --git a/javascript/src/agents/types.ts b/javascript/src/agents/types.ts
index f87a2ee7..ae209044 100644
--- a/javascript/src/agents/types.ts
+++ b/javascript/src/agents/types.ts
@@ -42,6 +42,10 @@ export interface FinishTestArgs {
    * A record of the criteria and their results.
    */
   criteria: Record<string, string>;
+  /**
+   * Evidence for each criterion verdict.
+   */
+  evidence: Record<string, string>;
   /**
    * The reasoning behind the verdict.
    */
diff --git a/python/scenario/judge_agent.py b/python/scenario/judge_agent.py
index 8abbaeaf..b0284de6 100644
--- a/python/scenario/judge_agent.py
+++ b/python/scenario/judge_agent.py
@@ -298,6 +298,7 @@ async def call(
 
 - Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criterias.
 - DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
+- For each criterion, provide evidence citing exact values or quotes from the transcript or tool output.
 """,
             },
 
@@ -374,13 +375,29 @@
                             "type": "string",
                             "description": "Explanation of what the final verdict should be",
                         },
+                        "evidence": {
+                            "type": "object",
+                            "properties": {
+                                criteria_names[idx]: {
+                                    "type": "string",
+                                    "description": (
+                                        "Evidence supporting the criterion verdict, citing exact values or quotes "
+                                        "from the transcript or tool output."
+                                    ),
+                                }
+                                for idx, criterion in enumerate(self.criteria)
+                            },
+                            "required": criteria_names,
+                            "additionalProperties": False,
+                            "description": "Evidence for each criterion verdict",
+                        },
                         "verdict": {
                             "type": "string",
                             "enum": ["success", "failure", "inconclusive"],
                             "description": "The final verdict of the test",
                         },
                     },
-                    "required": ["criteria", "reasoning", "verdict"],
+                    "required": ["criteria", "reasoning", "evidence", "verdict"],
                     "additionalProperties": False,
                 },
             },
@@ -433,6 +450,37 @@
             verdict = args.get("verdict", "inconclusive")
             reasoning = args.get("reasoning", "No reasoning provided")
             criteria = args.get("criteria", {})
+            evidence = args.get("evidence", {})
+
+            # Handle case where LLM returns criteria as a JSON string instead of dict
+            # This can happen when the LLM is uncertain about the schema format
+            # See: https://github.com/langwatch/scenario/issues/161
+            if isinstance(criteria, str):
+                try:
+                    criteria = json.loads(criteria)
+                    logger.debug(
+                        "JudgeAgent: Parsed criteria from JSON string to dict"
+                    )
+                except json.JSONDecodeError:
+                    logger.warning(
+                        f"JudgeAgent: Failed to parse criteria string as JSON: {criteria}. "
+                        "Using empty dict as fallback."
+                    )
+                    criteria = {}
+
+            # Ensure criteria is a dict before calling .values()
+            if not isinstance(criteria, dict):
+                logger.warning(
+                    f"JudgeAgent: criteria is {type(criteria).__name__}, expected dict. "
+                    "Using empty dict as fallback."
+                )
+                criteria = {}
+            if not isinstance(evidence, dict):
+                logger.warning(
+                    f"JudgeAgent: evidence is {type(evidence).__name__}, expected dict. "
+                    "Using empty dict as fallback."
+                )
+                evidence = {}
 
             passed_criteria = [
                 self.criteria[idx]
@@ -452,6 +500,7 @@
                 reasoning=reasoning,
                 passed_criteria=passed_criteria,
                 failed_criteria=failed_criteria,
+                evidence=evidence,
             )
         except json.JSONDecodeError:
             raise Exception(
diff --git a/python/scenario/types.py b/python/scenario/types.py
index b36418d5..258cd03f 100644
--- a/python/scenario/types.py
+++ b/python/scenario/types.py
@@ -10,6 +10,7 @@
     Optional,
     TypeAlias,
     Union,
+    TypedDict,
 )
 
 from openai.types.chat import (
@@ -36,6 +37,28 @@
 # message types with the trace_id field
 
 
+class FileAttachment(TypedDict, total=False):
+    """
+    Represents a file attached to a conversation message.
+
+    This type allows agents to receive file metadata or content
+    for multimodal analysis.
+
+    Attributes:
+        id: Unique identifier for the file (e.g., S3 ID)
+        name: Original filename
+        mime_type: File type (e.g., 'application/pdf')
+        url: URL where the file can be downloaded
+        content: Raw text or base64 content of the file
+    """
+
+    id: str
+    name: str
+    mime_type: str
+    url: str
+    content: str
+
+
 class ChatCompletionDeveloperMessageParamWithTrace(ChatCompletionDeveloperMessageParam):
     trace_id: Optional[str]
 
@@ -46,6 +69,7 @@ class ChatCompletionSystemMessageParamWithTrace(ChatCompletionSystemMessageParam
 
 class ChatCompletionUserMessageParamWithTrace(ChatCompletionUserMessageParam):
     trace_id: Optional[str]
+    files: Optional[List[FileAttachment]]
 
 
 class ChatCompletionAssistantMessageParamWithTrace(ChatCompletionAssistantMessageParam):
@@ -196,6 +220,7 @@ class ScenarioResult(BaseModel):
         reasoning: Detailed explanation of why the scenario succeeded or failed
         passed_criteria: List of success criteria that were satisfied
         failed_criteria: List of success criteria that were not satisfied
+        evidence: Mapping of criterion identifiers to supporting evidence from the transcript/tool output
         total_time: Total execution time in seconds (if measured)
         agent_time: Time spent in agent calls in seconds (if measured)
 
@@ -227,6 +252,7 @@
     reasoning: Optional[str] = None
     passed_criteria: List[str] = []
     failed_criteria: List[str] = []
+    evidence: dict[str, str] = {}
     total_time: Optional[float] = None
     agent_time: Optional[float] = None
 
diff --git a/python/tests/test_judge_agent.py b/python/tests/test_judge_agent.py
index 25d33a8c..aba9a123 100644
--- a/python/tests/test_judge_agent.py
+++ b/python/tests/test_judge_agent.py
@@ -149,3 +149,54 @@ async def test_judge_agent_with_string_default_model_config():
     context_scenario.reset(token)
     # Cleanup
     ScenarioConfig.default_config = None
+
+
+@pytest.mark.asyncio
+async def test_judge_agent_returns_evidence_per_criterion():
+    """JudgeAgent should return evidence mapping from finish_test tool call."""
+    ScenarioConfig.default_config = ScenarioConfig(default_model="openai/gpt-4")
+
+    judge = JudgeAgent(criteria=["Test criterion"])
+
+    mock_scenario_state = MagicMock()
+    mock_scenario_state.description = "Test scenario"
+    mock_scenario_state.current_turn = 1
+    mock_scenario_state.config.max_turns = 10
+
+    agent_input = AgentInput(
+        thread_id="test",
+        messages=[{"role": "user", "content": "Hello"}],
+        new_messages=[],
+        judgment_request=True,
+        scenario_state=mock_scenario_state,
+    )
+
+    mock_response = MagicMock()
+    mock_response.choices = [MagicMock()]
+    mock_response.choices[0].message.tool_calls = [MagicMock()]
+    mock_response.choices[0].message.tool_calls[0].function.name = "finish_test"
+    mock_response.choices[0].message.tool_calls[
+        0
+    ].function.arguments = (
+        '{"verdict": "failure", "reasoning": "No match", '
+        '"criteria": {"test_criterion": "false"}, '
+        '"evidence": {"test_criterion": "Value not found in tool output"}}'
+    )
+
+    mock_executor = MagicMock()
+    mock_executor.config = MagicMock()
+    mock_executor.config.cache_key = None
+    token = context_scenario.set(mock_executor)
+
+    try:
+        with patch(
+            "scenario.judge_agent.litellm.completion", return_value=mock_response
+        ):
+            result = await judge.call(agent_input)
+            assert result is not None
+            assert result.evidence == {
+                "test_criterion": "Value not found in tool output"
+            }
+    finally:
+        context_scenario.reset(token)
+        ScenarioConfig.default_config = None
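Usage sketch: a minimal, hypothetical illustration of how the `evidence` mapping added in this diff could be read off a finished `ScenarioResult`. It only relies on fields visible above (`passed_criteria`, `failed_criteria`, `evidence`); the helper name and output format are illustrative, not part of the library.

from scenario.types import ScenarioResult


def print_criterion_evidence(result: ScenarioResult) -> None:
    # Illustrative helper (not in the library): show each criterion's verdict
    # alongside the quote or value the judge cited for it.
    for criterion in result.passed_criteria:
        print(f"[PASS] {criterion}")
    for criterion in result.failed_criteria:
        print(f"[FAIL] {criterion}")
    # `evidence` keys are the judge's internal criterion identifiers
    # (the finish_test schema's criteria names), not the human-readable criteria.
    for name, quote in result.evidence.items():
        print(f"  evidence[{name}]: {quote}")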