-
Notifications
You must be signed in to change notification settings - Fork 10
Added a GraphEval - custom faithfulness evaluator that uses knowledge graphs #17
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
183e137
3728732
0cc61a2
e5ed989
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,325 @@ | ||
| # DISCLAIMER: some prompts are taken from the research paper https://arxiv.org/pdf/2407.10793. | ||
| # Creation of this module was inspired by that paper, so cheers to authors! | ||
| import json | ||
| from typing import Optional | ||
| from langevals_core.base_evaluator import ( | ||
| BaseEvaluator, | ||
| EvaluatorEntry, | ||
| EvaluationResult, | ||
| SingleEvaluationResult, | ||
| LLMEvaluatorSettings, | ||
| Money, | ||
| ) | ||
| from pydantic import Field | ||
| import litellm | ||
| from litellm import Choices, Message, cast | ||
| from litellm.types.utils import ModelResponse | ||
| from litellm.cost_calculator import completion_cost | ||
| from dotenv import load_dotenv | ||
| import logging | ||
|
|
||
load_dotenv()  # load provider credentials (e.g. ANTHROPIC_API_KEY) from a local .env file — TODO confirm which keys are required
|
|
||
|
|
||
class GraphEvalEntry(EvaluatorEntry):
    """Single evaluation entry: the LLM output to judge plus its source contexts."""

    # Optional original user input; not consumed by the evaluator's LLM calls,
    # kept for traceability.
    input: Optional[str] = Field(default="")
    # The generated answer whose faithfulness is being evaluated.
    output: str
    # Source documents the output must stay faithful to.
    contexts: list[str]
|
|
||
|
|
||
class GraphEvalSettings(LLMEvaluatorSettings):
    """Settings for GraphEval: the two system prompts and the judge model."""

    # System prompt for step 1 (knowledge-graph construction). Wording — including
    # its spacing quirks — follows the GraphEval paper (arXiv 2407.10793); changing
    # it may change extraction quality, so it is kept verbatim.
    kg_construction_prompt: str = Field(
        default="""You are an expert at extracting information in structured formats to build a knowledge graph .
Step 1 - Entity detection: Identify all entities in the raw text . Make sure not to miss any out.
Entities should be basic and simple, they are akin to Wikipedia nodes .
Step 2 - Coreference resolution: Find all expressions in the text that refer to the same entity. Make sure entities are not duplicated. In particular do not include
entities that are more specific versions themselves , e.g. "a detailed view of jupiter's atmosphere " and " jupiter's atmosphere ", only include the most specific version of the entity.
Step 3 - Relation extraction: Identify semantic relationships between the entities you have identified.
Format : Return the knowledge graph as a list of triples , i.e. ["entity 1", "relation 1 - 2", "entity 2"], in Python code """
    )
    # System prompt for step 2 (triple-vs-context verification). The judge must
    # return a single boolean: true only when every triple is SUPPORTED.
    context_to_knowledge_graph_comparison_prompt: str = Field(
        default="""You are an expert at evaluating knowledge graph triples for factual accuracy and faithfulness to the given context. Your task is to:

1. Compare each triple in the knowledge graph against the provided context(s)
2. Check for:
- Direct contradictions with the context
- Incorrect relationships between entities
- Misrepresented or fabricated facts

For each triple, determine if it is:
- SUPPORTED: All information is explicitly stated in or can be directly inferred from the context
- CONTRADICTED: The triple conflicts with information in the context
- UNVERIFIABLE: The triple makes claims that cannot be verified from the context

Return false if any triple is CONTRADICTED or UNVERIFIABLE, true if all triples are SUPPORTED."""
    )
    # Model identifier passed to litellm for both LLM calls.
    model: str = Field(
        default="claude-3-5-sonnet-20240620",
        description="The model to use for evaluation",
    )
|
|
||
|
|
||
class GraphEvalResult(EvaluationResult):
    """Result of a GraphEval run; `passed` carries the faithfulness verdict."""

    # NOTE(review): score is never set by the evaluator and always stays 0.0 —
    # confirm whether it should mirror `passed`.
    score: float = Field(default=0.0)
    passed: Optional[bool] = Field(
        default=True, description="True if the response is faithful, False otherwise"
    )
|
|
||
|
|
||
class GraphEvalEvaluator(
    BaseEvaluator[GraphEvalEntry, GraphEvalSettings, GraphEvalResult]
):
    """
    Allows you to check for hallucinations by utilizing Knowledge Graphs.

    Two LLM calls are made per evaluation:
    1. `_construct_knowledge_graph` extracts entity-relationship triples from
       the output under evaluation.
    2. `_compare_knowledge_graph_with_contexts` checks every triple against the
       provided contexts and returns a single boolean verdict.
    """

    name = "GraphEval"
    category = "custom"
    default_settings = GraphEvalSettings()
    is_guardrail = True  # usable as a pass/fail guardrail

    def evaluate(self, entry: GraphEvalEntry) -> SingleEvaluationResult:
        """Evaluate faithfulness of `entry.output` against `entry.contexts`.

        Fails closed: if either LLM step raises, or the judge returns anything
        other than a boolean, the result is `passed=False`.
        """
        # Initialize before the try-blocks: if _construct_knowledge_graph raises,
        # `knowledge_graph` and `cost` would otherwise be unbound and the code
        # below would crash with UnboundLocalError instead of failing closed.
        cost = 0.0
        knowledge_graph = None
        passed = None

        try:
            knowledge_graph_response = self._construct_knowledge_graph(entry.output)
            cost += completion_cost(knowledge_graph_response) or 0.0
            knowledge_graph = self._get_arguments(
                knowledge_graph_response, value="triples"
            )
        except Exception:
            # logging.exception records the full traceback; the previous
            # logging.error("...", e) form dropped the exception entirely
            # (it was passed as an unused lazy-format argument).
            logging.exception("Caught an exception while creating a knowledge graph")

        try:
            # Only attempt the comparison when step 1 produced a list of triples;
            # _get_arguments returns an error string when the key is missing.
            if isinstance(knowledge_graph, list):
                passed_response = self._compare_knowledge_graph_with_contexts(
                    knowledge_graph=knowledge_graph, contexts=entry.contexts
                )
                cost += completion_cost(passed_response) or 0.0
                passed = self._get_arguments(passed_response, value="result")
        except Exception:
            logging.exception(
                "Caught an exception while comparing knowledge graph with contexts"
            )

        if isinstance(passed, bool):
            return GraphEvalResult(
                passed=passed,
                details=f"The following entity_1-relationship->entity_2 triples were found in the output: {knowledge_graph}",
                cost=Money(amount=cost, currency="USD") if cost else None,
            )
        return GraphEvalResult(
            passed=False,
            details="We could not evaluate faithfulness of the output",
            cost=Money(amount=cost, currency="USD") if cost else None,
        )

    def _construct_knowledge_graph(self, output: str) -> ModelResponse:
        """Call the LLM to extract entity-relationship triples from `output`.

        The model is forced (via `tool_choice`) to answer through the
        `create_knowledge_graph` tool so the triples come back as structured
        JSON arguments rather than free text.
        """
        # NOTE(review): this schema describes each triple as an object with
        # entity_1/relationship/entity_2 keys, while the few-shot examples in
        # the prompt below show plain 3-string lists. The mismatch may confuse
        # the model — align one with the other (see PR discussion).
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "create_knowledge_graph",
                    "description": "Create a knowledge graph from input text",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "triples": {
                                "type": "array",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "entity_1": {
                                            "type": "string",
                                            "description": "First entity in the relationship",
                                        },
                                        "relationship": {
                                            "type": "string",
                                            "description": "Relationship between entities",
                                        },
                                        "entity_2": {
                                            "type": "string",
                                            "description": "Second entity in the relationship",
                                        },
                                    },
                                    "required": [
                                        "entity_1",
                                        "relationship",
                                        "entity_2",
                                    ],
                                },
                                "description": "List of entity-relationship triples that construct a knowledge graph",
                            }
                        },
                        "required": ["triples"],
                    },
                },
            }
        ]
        response = litellm.completion(
            model=self.settings.model,
            messages=[
                {
                    "role": "system",
                    "content": self.settings.kg_construction_prompt,
                },
                {
                    "role": "user",
                    "content": f"""Use the given format to extract information from the following input : <input>{output}</input>. Skip the preamble and output the result as a list within <python></python> tags.
Important Tips
1. Make sure all information is included in the knowledge graph .
2. Each triple must only contain three strings ! None of the strings should be empty .
3. Do not split up related information into separate triples because this could change the meaning.
4. Make sure all brackets and quotation marks are matched .
5. Before adding a triple to the knowledge graph, check that the concatenated triple makes sense as a sentence. If not, discard it.

Here are some example input and output pairs.

## Example 1.
Input:
"The Walt Disney Company, commonly known as Disney, is an American multinational mass media and entertainment conglomerate that is headquartered at the Walt Disney Studios complex in Burbank, California."
Output:
<python>
[['The Walt Disney Company', 'headquartered at', 'Walt Disney Studios complex in Burbank, California'],
['The Walt Disney Company', 'commonly known as', 'Disney'],
['The Walt Disney Company', 'instance of', 'American multinational mass media and entertainment conglomerate']]
</python>

## Example 2.
Input:
"Amanda Jackson was born in Springfield, Ohio, USA on June 1, 1985. She was a basketball player for the U.S women's team."
Output:
<python>
[['Amanda Jackson', 'born in', 'Springfield, Ohio, USA'],
['Amanda Jackson', 'born on', 'June 1, 1985'],
['Amanda Jackson', 'occupation', 'basketball player'],
['Amanda Jackson', 'played for', 'U.S. women's basketball team']]
</python>

## Example 3.
Input:
"Music executive Darius Van Arman was born in Pennsylvania. He attended Gonzaga College High School and is a human being."
Output:
<python>
[['Darius Van Arman', 'occupation', 'Music executive'],
['Darius Van Arman', 'born in', 'Pennsylvania'],
['Darius Van Arman', 'attended', 'Gonzaga College High School'],
['Darius Van Arman', 'instance of', 'human being']]
</python>

## Example 4.
Input: "Italy had 3.6x times more cases of coronavirus than China."
<python>
[['Italy', 'had 3.6x times more cases of coronavirus than', 'China']]
</python>

""",
                },
            ],
            tools=tools,
            tool_choice={
                "type": "function",
                "function": {"name": "create_knowledge_graph"},
            },
        )
        response = cast(ModelResponse, response)
        return response

    def _compare_knowledge_graph_with_contexts(
        self,
        knowledge_graph: list,  # triples as returned by _get_arguments (not plain strings)
        contexts: list[str],
    ) -> ModelResponse:
        """Ask the judge model whether every triple is supported by `contexts`.

        The model is forced to answer through the
        `compare_knowledge_graph_with_contexts` tool, yielding a single
        boolean `result` argument.
        """
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "compare_knowledge_graph_with_contexts",
                    "description": "Check if the knowledge graph triples are faithful and factually accurate to the contexts",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "result": {
                                "type": "boolean",
                                "description": "True if the knowledge graph is faithful and factually accurate to the contexts, False otherwise",
                            }
                        },
                        "required": ["result"],
                    },
                },
            }
        ]

        response = litellm.completion(
            model=self.settings.model,
            messages=[
                {
                    "role": "system",
                    "content": self.settings.context_to_knowledge_graph_comparison_prompt,
                },
                {
                    "role": "user",
                    "content": f"""<knowledge_graph>{knowledge_graph}</knowledge_graph>

<contexts>{contexts}</contexts>
""",
                },
            ],
            tools=tools,
            tool_choice={
                "type": "function",
                "function": {"name": "compare_knowledge_graph_with_contexts"},
            },
        )
        response = cast(ModelResponse, response)
        return response

    def _get_arguments(self, response: ModelResponse, value: str) -> str | bool | list:
        """Extract argument `value` from the first tool call of `response`.

        Returns the parsed argument (a list for "triples", a bool for
        "result") or a descriptive error string when the key is absent —
        callers distinguish the two via isinstance checks.
        """
        choice = cast(Choices, response.choices[0])
        arguments = json.loads(
            cast(Message, choice.message).tool_calls[0].function.arguments  # type: ignore
        )
        return arguments.get(
            value,
            f"{value} was not found in the arguments",
        )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Guard against un-initialised variables & improve logging
`cost` and `knowledge_graph` are first defined inside the try-block. If
`_construct_knowledge_graph` throws, the next block (`if isinstance(knowledge_graph, list):`) will raise an `UnboundLocalError`, and the final `Money(amount=cost, ...)` access will do the same for `cost`. While fixing this, you can also drop the unused
`details` variable and use `logging.exception` to capture the traceback. Do the same in the second
try-block: this prevents hidden crashes and surfaces the real stack-trace during debugging.
📝 Committable suggestion
🧰 Tools
🪛 Ruff (0.8.2)
82-82: Local variable
detailsis assigned to but never usedRemove assignment to unused variable
details(F841)