@@ -3,4 +3,5 @@ export interface JudgeResult {
reasoning: string;
metCriteria: string[];
unmetCriteria: string[];
evidence?: Record<string, string>;
}
18 changes: 18 additions & 0 deletions javascript/src/agents/judge/judge-agent.ts
@@ -63,6 +63,7 @@ ${criteriaList}
<rules>
- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
- For each criterion, provide evidence citing exact values or quotes from the transcript or tool output.
</rules>
`.trim();
}
@@ -91,6 +92,21 @@ function buildFinishTestTool(criteria: string[]): Tool {
)
.strict()
.describe("Strict verdict for each criterion"),
evidence: z
.object(
Object.fromEntries(
criteriaNames.map((name) => [
name,
z
.string()
.describe(
"Evidence supporting the criterion verdict, citing exact values or quotes."
),
])
)
)
.strict()
.describe("Evidence for each criterion verdict"),
reasoning: z
.string()
.describe("Explanation of what the final verdict should be"),
@@ -226,6 +242,7 @@ class JudgeAgent extends JudgeAgentAdapter {
const verdict = args.verdict || "inconclusive";
const reasoning = args.reasoning || "No reasoning provided";
const criteria = args.criteria || {};
const evidence = args.evidence || {};
const criteriaValues = Object.values(criteria);
const metCriteria = cfg.criteria.filter(
(_, i) => criteriaValues[i] === "true"
@@ -239,6 +256,7 @@ class JudgeAgent extends JudgeAgentAdapter {
reasoning,
metCriteria,
unmetCriteria,
evidence,
};
this.logger.debug("finish_test result", result);
return result;
4 changes: 4 additions & 0 deletions javascript/src/agents/types.ts
@@ -42,6 +42,10 @@ export interface FinishTestArgs {
* A record of the criteria and their results.
*/
criteria: Record<string, "true" | "false" | "inconclusive">;
/**
* Evidence for each criterion verdict.
*/
evidence: Record<string, string>;
/**
* The reasoning behind the verdict.
*/
51 changes: 50 additions & 1 deletion python/scenario/judge_agent.py
@@ -298,6 +298,7 @@ async def call(
<rules>
- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criterias.
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
- For each criterion, provide evidence citing exact values or quotes from the transcript or tool output.
</rules>
""",
},
@@ -374,13 +375,29 @@ async def call(
"type": "string",
"description": "Explanation of what the final verdict should be",
},
"evidence": {
"type": "object",
"properties": {
criteria_names[idx]: {
"type": "string",
"description": (
"Evidence supporting the criterion verdict, citing exact values or quotes "
"from the transcript or tool output."
),
}
for idx, criterion in enumerate(self.criteria)
},
"required": criteria_names,
"additionalProperties": False,
"description": "Evidence for each criterion verdict",
},
"verdict": {
"type": "string",
"enum": ["success", "failure", "inconclusive"],
"description": "The final verdict of the test",
},
},
"required": ["criteria", "reasoning", "verdict"],
"required": ["criteria", "reasoning", "evidence", "verdict"],
"additionalProperties": False,
},
},
@@ -433,6 +450,37 @@ async def call(
verdict = args.get("verdict", "inconclusive")
reasoning = args.get("reasoning", "No reasoning provided")
criteria = args.get("criteria", {})
evidence = args.get("evidence", {})

# Handle case where LLM returns criteria as a JSON string instead of dict
# This can happen when the LLM is uncertain about the schema format
# See: https://github.com/langwatch/scenario/issues/161
if isinstance(criteria, str):
try:
criteria = json.loads(criteria)
logger.debug(
"JudgeAgent: Parsed criteria from JSON string to dict"
)
except json.JSONDecodeError:
logger.warning(
f"JudgeAgent: Failed to parse criteria string as JSON: {criteria}. "
"Using empty dict as fallback."
)
criteria = {}

# Ensure criteria is a dict before calling .values()
if not isinstance(criteria, dict):
logger.warning(
f"JudgeAgent: criteria is {type(criteria).__name__}, expected dict. "
"Using empty dict as fallback."
)
criteria = {}
if not isinstance(evidence, dict):
logger.warning(
f"JudgeAgent: evidence is {type(evidence).__name__}, expected dict. "
"Using empty dict as fallback."
)
evidence = {}

passed_criteria = [
self.criteria[idx]
@@ -452,6 +500,7 @@ class JudgeAgent(AgentAdapter):
reasoning=reasoning,
passed_criteria=passed_criteria,
failed_criteria=failed_criteria,
evidence=evidence,
)
except json.JSONDecodeError:
raise Exception(
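The defensive argument handling added above can be exercised on its own. Below is a minimal, hedged sketch of the same fallback logic; the helper name `coerce_to_dict` and the sample payloads are illustrative and are not part of this diff.

```python
import json
import logging
from typing import Any, Dict

logger = logging.getLogger(__name__)


def coerce_to_dict(value: Any, field_name: str) -> Dict[str, str]:
    """Mirror of the fallback logic above: accept a dict, parse a JSON string,
    and fall back to an empty dict for anything else."""
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except json.JSONDecodeError:
            logger.warning("Failed to parse %s as JSON: %r", field_name, value)
            return {}
    if not isinstance(value, dict):
        logger.warning("%s is %s, expected dict", field_name, type(value).__name__)
        return {}
    return value


# The LLM sometimes returns nested objects as JSON strings (the case tracked
# in langwatch/scenario#161), so both shapes should normalize to a dict:
criteria = coerce_to_dict('{"test_criterion": "false"}', "criteria")
evidence = coerce_to_dict({"test_criterion": "Value not found"}, "evidence")
assert criteria == {"test_criterion": "false"}
assert evidence == {"test_criterion": "Value not found"}
```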
26 changes: 26 additions & 0 deletions python/scenario/types.py
@@ -10,6 +10,7 @@
Optional,
TypeAlias,
Union,
TypedDict,
)

from openai.types.chat import (
@@ -36,6 +37,28 @@
# message types with the trace_id field


class FileAttachment(TypedDict, total=False):
"""
Represents a file attached to a conversation message.

This type allows agents to receive file metadata or content
for multimodal analysis.

Attributes:
id: Unique identifier for the file (e.g., S3 ID)
name: Original filename
mime_type: File type (e.g., 'application/pdf')
url: URL where the file can be downloaded
content: Raw text or base64 content of the file
"""

id: str
name: str
mime_type: str
url: str
content: str


class ChatCompletionDeveloperMessageParamWithTrace(ChatCompletionDeveloperMessageParam):
trace_id: Optional[str]

@@ -46,6 +69,7 @@ class ChatCompletionSystemMessageParamWithTrace(ChatCompletionSystemMessageParam

class ChatCompletionUserMessageParamWithTrace(ChatCompletionUserMessageParam):
trace_id: Optional[str]
files: Optional[List[FileAttachment]]


class ChatCompletionAssistantMessageParamWithTrace(ChatCompletionAssistantMessageParam):
@@ -196,6 +220,7 @@ class ScenarioResult(BaseModel):
reasoning: Detailed explanation of why the scenario succeeded or failed
passed_criteria: List of success criteria that were satisfied
failed_criteria: List of success criteria that were not satisfied
evidence: Mapping of criterion identifiers to supporting evidence from the transcript/tool output
total_time: Total execution time in seconds (if measured)
agent_time: Time spent in agent calls in seconds (if measured)

@@ -227,6 +252,7 @@ class ScenarioResult(BaseModel):
reasoning: Optional[str] = None
passed_criteria: List[str] = []
failed_criteria: List[str] = []
evidence: dict[str, str] = {}
total_time: Optional[float] = None
agent_time: Optional[float] = None

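To make the new types concrete, here is a hedged, self-contained sketch of how a `FileAttachment` and the per-criterion `evidence` mapping might be populated. The field values (IDs, URLs, criterion text) are invented for illustration, and the TypedDict is re-declared locally so the snippet runs standalone.

```python
from typing import TypedDict


class FileAttachment(TypedDict, total=False):
    """Same shape as the TypedDict added above, re-declared so this sketch runs standalone."""
    id: str
    name: str
    mime_type: str
    url: str
    content: str


# A user message carrying an attachment, as ChatCompletionUserMessageParamWithTrace
# now allows through its optional `files` field (sample values throughout).
attachment: FileAttachment = {
    "id": "file_123",                                # e.g. an S3 object ID
    "name": "invoice.pdf",
    "mime_type": "application/pdf",
    "url": "https://example.com/files/invoice.pdf",
    "content": "Total due: $42.00",                  # raw text or base64 content
}

user_message = {
    "role": "user",
    "content": "Please check the attached invoice.",
    "trace_id": "trace_abc",
    "files": [attachment],
}

# Per-criterion evidence, matching the new ScenarioResult.evidence mapping.
evidence: dict[str, str] = {
    "agent states the invoice total": "Agent replied: 'The total due is $42.00.'",
}

print(user_message["files"][0]["name"], "->", evidence)
```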
51 changes: 51 additions & 0 deletions python/tests/test_judge_agent.py
@@ -149,3 +149,54 @@ async def test_judge_agent_with_string_default_model_config():
context_scenario.reset(token)
# Cleanup
ScenarioConfig.default_config = None


@pytest.mark.asyncio
async def test_judge_agent_returns_evidence_per_criterion():
"""JudgeAgent should return evidence mapping from finish_test tool call."""
ScenarioConfig.default_config = ScenarioConfig(default_model="openai/gpt-4")

judge = JudgeAgent(criteria=["Test criterion"])

mock_scenario_state = MagicMock()
mock_scenario_state.description = "Test scenario"
mock_scenario_state.current_turn = 1
mock_scenario_state.config.max_turns = 10

agent_input = AgentInput(
thread_id="test",
messages=[{"role": "user", "content": "Hello"}],
new_messages=[],
judgment_request=True,
scenario_state=mock_scenario_state,
)

mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.tool_calls = [MagicMock()]
mock_response.choices[0].message.tool_calls[0].function.name = "finish_test"
mock_response.choices[0].message.tool_calls[
0
].function.arguments = (
'{"verdict": "failure", "reasoning": "No match", '
'"criteria": {"test_criterion": "false"}, '
'"evidence": {"test_criterion": "Value not found in tool output"}}'
)

mock_executor = MagicMock()
mock_executor.config = MagicMock()
mock_executor.config.cache_key = None
token = context_scenario.set(mock_executor)

try:
with patch(
"scenario.judge_agent.litellm.completion", return_value=mock_response
):
result = await judge.call(agent_input)
assert result is not None
assert result.evidence == {
"test_criterion": "Value not found in tool output"
}
finally:
context_scenario.reset(token)
ScenarioConfig.default_config = None