diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py index dcf46e6e0..b0fad9223 100644 --- a/openjudge/graders/agent/action/action_alignment.py +++ b/openjudge/graders/agent/action/action_alignment.py @@ -122,12 +122,20 @@ DEFAULT_ACTION_ALIGNMENT_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=ACTION_ALIGNMENT_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=ACTION_ALIGNMENT_PROMPT_ZH, diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py index d5dd53982..84649553a 100644 --- a/openjudge/graders/agent/memory/memory_accuracy.py +++ b/openjudge/graders/agent/memory/memory_accuracy.py @@ -122,12 +122,20 @@ DEFAULT_MEMORY_ACCURACY_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=MEMORY_ACCURACY_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=MEMORY_ACCURACY_PROMPT_ZH, diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py index 6df8196f8..c009176a7 100644 --- a/openjudge/graders/agent/memory/memory_detail_preservation.py +++ b/openjudge/graders/agent/memory/memory_detail_preservation.py @@ -122,12 +122,20 @@ DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=MEMORY_DETAIL_PRESERVATION_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), 
ChatMessage( role="user", content=MEMORY_DETAIL_PRESERVATION_PROMPT_ZH, diff --git a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py index f1c87fe58..5b4a5790a 100644 --- a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py +++ b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py @@ -124,12 +124,20 @@ DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=MEMORY_RETRIEVAL_EFFECTIVENESS_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=MEMORY_RETRIEVAL_EFFECTIVENESS_PROMPT_ZH, diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py index 30ad07a19..a323a4406 100644 --- a/openjudge/graders/agent/plan/plan_feasibility.py +++ b/openjudge/graders/agent/plan/plan_feasibility.py @@ -124,12 +124,20 @@ DEFAULT_PLAN_FEASIBILITY_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=PLAN_FEASIBILITY_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=PLAN_FEASIBILITY_PROMPT_ZH, diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py index aa127e92a..c294ba261 100644 --- a/openjudge/graders/agent/reflection/reflection_accuracy.py +++ b/openjudge/graders/agent/reflection/reflection_accuracy.py @@ -122,12 +122,20 @@ DEFAULT_REFLECTION_ACCURACY_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", 
content=REFLECTION_ACCURACY_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=REFLECTION_ACCURACY_PROMPT_ZH, diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py index c9b140923..0f2346040 100644 --- a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py +++ b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py @@ -255,12 +255,20 @@ DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=REFLECTION_OUTCOME_UNDERSTANDING_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=REFLECTION_OUTCOME_UNDERSTANDING_PROMPT_ZH, diff --git a/openjudge/graders/agent/reflection/reflection_progress_awareness.py b/openjudge/graders/agent/reflection/reflection_progress_awareness.py index 57a3bea04..60bea2a2f 100644 --- a/openjudge/graders/agent/reflection/reflection_progress_awareness.py +++ b/openjudge/graders/agent/reflection/reflection_progress_awareness.py @@ -165,12 +165,20 @@ DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=REFLECTION_PROGRESS_AWARENESS_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=REFLECTION_PROGRESS_AWARENESS_PROMPT_ZH, diff --git a/openjudge/graders/agent/tool/tool_call_accuracy.py b/openjudge/graders/agent/tool/tool_call_accuracy.py index ecf0c3368..526936a94 100644 --- a/openjudge/graders/agent/tool/tool_call_accuracy.py +++ 
b/openjudge/graders/agent/tool/tool_call_accuracy.py @@ -115,12 +115,20 @@ DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=TOOL_CALL_ACCURACY_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=TOOL_CALL_ACCURACY_PROMPT_ZH, diff --git a/openjudge/graders/agent/tool/tool_call_success.py b/openjudge/graders/agent/tool/tool_call_success.py index 776a581e3..ee97f1acb 100644 --- a/openjudge/graders/agent/tool/tool_call_success.py +++ b/openjudge/graders/agent/tool/tool_call_success.py @@ -144,12 +144,20 @@ DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=TOOL_CALL_SUCCESS_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=TOOL_CALL_SUCCESS_PROMPT_ZH, diff --git a/openjudge/graders/agent/tool/tool_parameter_check.py b/openjudge/graders/agent/tool/tool_parameter_check.py index 8426d2fd6..66b3f9014 100644 --- a/openjudge/graders/agent/tool/tool_parameter_check.py +++ b/openjudge/graders/agent/tool/tool_parameter_check.py @@ -121,12 +121,20 @@ DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=TOOL_PARAMETER_CHECK_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=TOOL_PARAMETER_CHECK_PROMPT_ZH, diff --git a/openjudge/graders/agent/tool/tool_selection.py b/openjudge/graders/agent/tool/tool_selection.py index 371a2bd05..fb4b66913 100644 --- 
a/openjudge/graders/agent/tool/tool_selection.py +++ b/openjudge/graders/agent/tool/tool_selection.py @@ -132,12 +132,20 @@ DEFAULT_TOOL_SELECTION_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=TOOL_SELECTION_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=TOOL_SELECTION_PROMPT_ZH, diff --git a/openjudge/graders/agent/trajectory/trajectory_accuracy.py b/openjudge/graders/agent/trajectory/trajectory_accuracy.py index 8804cd95a..7dfb07533 100644 --- a/openjudge/graders/agent/trajectory/trajectory_accuracy.py +++ b/openjudge/graders/agent/trajectory/trajectory_accuracy.py @@ -116,12 +116,20 @@ DEFAULT_TRAJECTORY_ACCURACY_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=TRAJECTORY_ACCURACY_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=TRAJECTORY_ACCURACY_PROMPT_ZH, diff --git a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py index 5c2420612..b2a9b388e 100644 --- a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py +++ b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py @@ -206,12 +206,20 @@ DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=TRAJECTORY_COMPREHENSIVE_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=TRAJECTORY_COMPREHENSIVE_PROMPT_ZH, diff --git a/openjudge/graders/common/correctness.py 
b/openjudge/graders/common/correctness.py index 1eef78691..50df115ca 100644 --- a/openjudge/graders/common/correctness.py +++ b/openjudge/graders/common/correctness.py @@ -181,12 +181,20 @@ DEFAULT_CORRECTNESS_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=CORRECTNESS_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=CORRECTNESS_PROMPT_ZH, diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py index 229ca47c2..82787e477 100644 --- a/openjudge/graders/common/hallucination.py +++ b/openjudge/graders/common/hallucination.py @@ -151,12 +151,20 @@ DEFAULT_HALLUCINATION_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=HALLUCINATION_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=HALLUCINATION_PROMPT_ZH, diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py index b1bb49b48..6c3b77577 100644 --- a/openjudge/graders/common/harmfulness.py +++ b/openjudge/graders/common/harmfulness.py @@ -161,12 +161,20 @@ DEFAULT_HARMFULNESS_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=HARMFULNESS_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=HARMFULNESS_PROMPT_ZH, diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py index f5e4a6e6f..c4b581da8 100644 --- a/openjudge/graders/common/instruction_following.py +++ 
b/openjudge/graders/common/instruction_following.py @@ -164,12 +164,20 @@ DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=INSTRUCTION_FOLLOWING_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=INSTRUCTION_FOLLOWING_PROMPT_ZH, diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py index e5eb1a400..338741c1e 100644 --- a/openjudge/graders/common/relevance.py +++ b/openjudge/graders/common/relevance.py @@ -170,12 +170,20 @@ DEFAULT_RELEVANCE_TEMPLATE = PromptTemplate( messages={ LanguageEnum.EN: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_EN, + ), ChatMessage( role="user", content=RELEVANCE_PROMPT_EN, ), ], LanguageEnum.ZH: [ + ChatMessage( + role="system", + content=LLMGrader.SYSTEM_PROMPT_ZH, + ), ChatMessage( role="user", content=RELEVANCE_PROMPT_ZH, diff --git a/openjudge/graders/llm_grader.py b/openjudge/graders/llm_grader.py index e1aa718ca..24b5eaf86 100644 --- a/openjudge/graders/llm_grader.py +++ b/openjudge/graders/llm_grader.py @@ -56,6 +56,14 @@ class LLMGrader(BaseGrader): # The default template value is just a placeholder. # Extended classes must set proper value to DEFAULT_TEMPLATE DEFAULT_TEMPLATE = PromptTemplate(messages={}) + SYSTEM_PROMPT_EN = """ + You are an evaluation assistant. 
Output ONLY a valid JSON object with this exact structure: + {{"reason":"<brief explanation for the assigned score>","score":<integer>}} + """ + SYSTEM_PROMPT_ZH = """ + 你是一个评估助手。仅输出一个符合此精确结构的有效 JSON 对象: + {{"reason":"<对所评分数的简要解释>","score":<整数>}} + """ def __init__( self, diff --git a/tests/graders/agent/action/test_action_alignment.py b/tests/graders/agent/action/test_action_alignment.py index 926a8c571..29c33cecb 100644 --- a/tests/graders/agent/action/test_action_alignment.py +++ b/tests/graders/agent/action/test_action_alignment.py @@ -65,10 +65,10 @@ def test_initialization(self): assert len(language_template) == 1 assert "zh" in language_template template = language_template["zh"] - assert len(template) == 1 - assert len(template[0]) == 2 - assert template[0]["role"] == "user" - assert template[0]["content"].startswith( + assert len(template) == 2 + assert len(template[1]) == 2 + assert template[1]["role"] == "user" + assert template[1]["content"].startswith( "你是一名分析智能体行为的专家。你的任务是评估智能体是否执行了与其声明的计划或推理一致的动作。" ) @@ -76,10 +76,10 @@ def test_initialization(self): assert len(language_template) == 1 assert "en" in language_template template = language_template["en"] - assert len(template) == 1 - assert len(template[0]) == 2 - assert template[0]["role"] == "user" - assert template[0]["content"].startswith( + assert len(template) == 2 + assert len(template[1]) == 2 + assert template[1]["role"] == "user" + assert template[1]["content"].startswith( "You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent executes an action that aligns with its stated plan or reasoning." 
) diff --git a/tests/graders/common/test_hallucination.py b/tests/graders/common/test_hallucination.py index 9bf64ecb0..1ce7332ad 100644 --- a/tests/graders/common/test_hallucination.py +++ b/tests/graders/common/test_hallucination.py @@ -66,15 +66,15 @@ def test_get_metadata(self): assert "prompt" in meta prompt = meta["prompt"] assert len(prompt) == 2 - assert len(prompt[LanguageEnum.EN.value]) == 1 - assert prompt[LanguageEnum.EN.value][0]["role"] == "user" + assert len(prompt[LanguageEnum.EN.value]) == 2 + assert prompt[LanguageEnum.EN.value][1]["role"] == "user" assert ( "evaluating whether the model response contains hallucinations" - in prompt[LanguageEnum.EN.value][0]["content"] + in prompt[LanguageEnum.EN.value][1]["content"] ) - assert len(prompt[LanguageEnum.ZH.value]) == 1 - assert prompt[LanguageEnum.ZH.value][0]["role"] == "user" - assert "负责评估模型输出是否包含幻觉" in prompt[LanguageEnum.ZH.value][0]["content"] + assert len(prompt[LanguageEnum.ZH.value]) == 2 + assert prompt[LanguageEnum.ZH.value][1]["role"] == "user" + assert "负责评估模型输出是否包含幻觉" in prompt[LanguageEnum.ZH.value][1]["content"] @pytest.mark.asyncio async def test_successful_evaluation(self):