325 changes: 325 additions & 0 deletions evaluators/langevals/langevals_langevals/grapheval.py
@@ -0,0 +1,325 @@
# DISCLAIMER: some prompts are taken from the research paper https://arxiv.org/pdf/2407.10793.
# Creation of this module was inspired by that paper, so cheers to the authors!
import json
from typing import Optional, cast
from langevals_core.base_evaluator import (
BaseEvaluator,
EvaluatorEntry,
EvaluationResult,
SingleEvaluationResult,
LLMEvaluatorSettings,
Money,
)
from pydantic import Field
import litellm
from litellm import Choices, Message
from litellm.types.utils import ModelResponse
from litellm.cost_calculator import completion_cost
from dotenv import load_dotenv
import logging

load_dotenv()


class GraphEvalEntry(EvaluatorEntry):
input: Optional[str] = Field(default="")
output: str
contexts: list[str]


class GraphEvalSettings(LLMEvaluatorSettings):
kg_construction_prompt: str = Field(
default="""You are an expert at extracting information in structured formats to build a knowledge graph .
Step 1 - Entity detection: Identify all entities in the raw text . Make sure not to miss any out.
Entities should be basic and simple, they are akin to Wikipedia nodes .
Step 2 - Coreference resolution: Find all expressions in the text that refer to the same entity. Make sure entities are not duplicated. In particular do not include
entities that are more specific versions themselves , e.g. "a detailed view of jupiter's atmosphere " and " jupiter's atmosphere ", only include the most specific version of the entity.
Step 3 - Relation extraction: Identify semantic relationships between the entities you have identified.
Format : Return the knowledge graph as a list of triples , i.e. ["entity 1", "relation 1 - 2", "entity 2"], in Python code """
)
context_to_knowledge_graph_comparison_prompt: str = Field(
default="""You are an expert at evaluating knowledge graph triples for factual accuracy and faithfulness to the given context. Your task is to:

1. Compare each triple in the knowledge graph against the provided context(s)
2. Check for:
- Direct contradictions with the context
- Incorrect relationships between entities
- Misrepresented or fabricated facts

For each triple, determine if it is:
- SUPPORTED: All information is explicitly stated in or can be directly inferred from the context
- CONTRADICTED: The triple conflicts with information in the context
- UNVERIFIABLE: The triple makes claims that cannot be verified from the context

Return false if any triple is CONTRADICTED or UNVERIFIABLE, true if all triples are SUPPORTED."""
)
model: str = Field(
default="claude-3-5-sonnet-20240620",
description="The model to use for evaluation",
)


class GraphEvalResult(EvaluationResult):
score: float = Field(default=0.0)
passed: Optional[bool] = Field(
default=True, description="True if the response is faithful, False otherwise"
)


class GraphEvalEvaluator(
BaseEvaluator[GraphEvalEntry, GraphEvalSettings, GraphEvalResult]
):
"""
Detects hallucinations by extracting a knowledge graph from the output and checking its triples against the provided contexts
"""

name = "GraphEval"
category = "custom"
default_settings = GraphEvalSettings()
is_guardrail = True

def evaluate(self, entry: GraphEvalEntry) -> SingleEvaluationResult:
details = None
passed = None
try:
knowledge_graph_response = self._construct_knowledge_graph(entry.output)
cost = completion_cost(knowledge_graph_response) or 0.0
knowledge_graph = self._get_arguments(
knowledge_graph_response, value="triples"
)
except Exception as e:
logging.error("Caught an exception while creating a knowledge graph: ", e)

try:
if isinstance(knowledge_graph, list):
passed_response = self._compare_knowledge_graph_with_contexts(
knowledge_graph=knowledge_graph, contexts=entry.contexts
)
cost += completion_cost(passed_response) or 0.0
passed = self._get_arguments(passed_response, value="result")
except Exception as e:
logging.error(
"Caught an exception while comparing knowledge graph with contexts: ", e
)

if isinstance(passed, bool):
return GraphEvalResult(
passed=passed,
details=f"The following entity_1-relationship->entity_2 triples were found in the output: {knowledge_graph}",
cost=Money(amount=cost, currency="USD") if cost else None,
)
return GraphEvalResult(
passed=False,
details="We could not evaluate faithfulness of the output",
cost=Money(amount=cost, currency="USD") if cost else None,
)
Comment on lines +81 to +115
⚠️ Potential issue

Guard against un-initialised variables & improve logging

cost and knowledge_graph are first defined inside the try-block.
If _construct_knowledge_graph throws, the next block (if isinstance(knowledge_graph, list):) will raise an UnboundLocalError, and the final Money(amount=cost …) access will do the same for cost.

While fixing this, you can also drop the unused details variable and use logging.exception to capture the traceback.

-        details = None
-        passed = None
-        try:
-            knowledge_graph_response = self._construct_knowledge_graph(entry.output)
-            cost = completion_cost(knowledge_graph_response) or 0.0
+        passed: bool | None = None
+        cost: float = 0.0
+        knowledge_graph: list[list[str]] | None = None
+        try:
+            knowledge_graph_response = self._construct_knowledge_graph(entry.output)
+            cost += completion_cost(knowledge_graph_response) or 0.0
             knowledge_graph = self._get_arguments(
                 knowledge_graph_response, value="triples"
             )
-        except Exception as e:
-            logging.error("Caught an exception while creating a knowledge graph: ", e)
+        except Exception:
+            logging.exception("Error while creating a knowledge graph")

Do the same in the second try-block:

-        except Exception as e:
-            logging.error(
-                "Caught an exception while comparing knowledge graph with contexts: ", e
-            )
+        except Exception:
+            logging.exception(
+                "Error while comparing knowledge graph with contexts"
+            )

This prevents hidden crashes and surfaces the real stack-trace during debugging.
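For reference, a minimal sketch of the failure mode described above (hypothetical, simplified names; not code from this PR): a variable that is only assigned inside a try-block that raises stays unbound, so touching it afterwards raises UnboundLocalError.

def evaluate_sketch():
    try:
        cost = 1 / 0  # stand-in for a _construct_knowledge_graph call that raises
    except Exception:
        pass  # exception swallowed, so cost is never assigned
    return cost  # raises UnboundLocalError: local variable 'cost' referenced before assignment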

📝 Committable suggestion

Suggested change
    def evaluate(self, entry: GraphEvalEntry) -> SingleEvaluationResult:
        passed: bool | None = None
        cost: float = 0.0
        knowledge_graph: list[list[str]] | None = None
        try:
            knowledge_graph_response = self._construct_knowledge_graph(entry.output)
            cost += completion_cost(knowledge_graph_response) or 0.0
            knowledge_graph = self._get_arguments(
                knowledge_graph_response, value="triples"
            )
        except Exception:
            logging.exception("Error while creating a knowledge graph")

        try:
            if isinstance(knowledge_graph, list):
                passed_response = self._compare_knowledge_graph_with_contexts(
                    knowledge_graph=knowledge_graph, contexts=entry.contexts
                )
                cost += completion_cost(passed_response) or 0.0
                passed = self._get_arguments(passed_response, value="result")
        except Exception:
            logging.exception(
                "Error while comparing knowledge graph with contexts"
            )

        if isinstance(passed, bool):
            return GraphEvalResult(
                passed=passed,
                details=f"The following entity_1-relationship->entity_2 triples were found in the output: {knowledge_graph}",
                cost=Money(amount=cost, currency="USD") if cost else None,
            )
        return GraphEvalResult(
            passed=False,
            details="We could not evaluate faithfulness of the output",
            cost=Money(amount=cost, currency="USD") if cost else None,
        )
🧰 Tools
🪛 Ruff (0.8.2)

82-82: Local variable details is assigned to but never used

Remove assignment to unused variable details

(F841)


def _construct_knowledge_graph(self, output: str) -> ModelResponse:
tools = [
{
"type": "function",
"function": {
"name": "create_knowledge_graph",
"description": "Create a knowledge graph from input text",
"parameters": {
"type": "object",
"properties": {
"triples": {
"type": "array",
"items": {
"type": "object",
"properties": {
"entity_1": {
"type": "string",
"description": "First entity in the relationship",
},
"relationship": {
"type": "string",
"description": "Relationship between entities",
},
"entity_2": {
"type": "string",
"description": "Second entity in the relationship",
},
},
"required": [
"entity_1",
"relationship",
"entity_2",
],
},
"description": "List of entity-relationship triples that construct a knowledge graph",
}
},
Comment on lines +122 to +153
🛠️ Refactor suggestion

Schema/Example mismatch – triples defined as objects but examples return lists

The JSON schema for triples specifies an array of objects with entity_1, relationship, entity_2 keys.
However, the prompt examples (and downstream code/tests) expect each triple to be a simple 3-item list, e.g.:

["Italy", "had 3.6x times more cases of coronavirus than", "China"]

This inconsistency can confuse the LLM and break self._get_arguments (which assumes a list of lists).

Consider aligning the schema with the expected structure:

-                            "triples": {
-                                "type": "array",
-                                "items": {
-                                    "type": "object",
-                                    "properties": {
-                                        "entity_1": { "type": "string" },
-                                        "relationship": { "type": "string" },
-                                        "entity_2": { "type": "string" }
-                                    },
-                                    "required": ["entity_1","relationship","entity_2"],
-                                },
+                            "triples": {
+                                "type": "array",
+                                "items": {
+                                    "type": "array",
+                                    "items": { "type": "string" },
+                                    "minItems": 3,
+                                    "maxItems": 3
+                                },
                                 "description": "List of [entity_1, relationship, entity_2] triples",
                             }

or, alternatively, update the rest of the pipeline/tests to consume objects instead of lists.
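To make the mismatch concrete, a minimal sketch (hypothetical payloads, not taken from this PR) of what json.loads would see in _get_arguments under each schema:

import json

# Current object schema: each triple arrives as a dict.
object_args = '{"triples": [{"entity_1": "Italy", "relationship": "had 3.6x times more cases of coronavirus than", "entity_2": "China"}]}'
# Suggested list-of-lists schema: each triple is a plain 3-item list, matching the prompt examples.
list_args = '{"triples": [["Italy", "had 3.6x times more cases of coronavirus than", "China"]]}'

print(json.loads(object_args)["triples"])  # [{'entity_1': 'Italy', ...}] -> not what downstream code expects
print(json.loads(list_args)["triples"])    # [['Italy', '...', 'China']]  -> list of lists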

"required": ["triples"],
},
},
}
]
response = litellm.completion(
model=self.settings.model,
messages=[
{
"role": "system",
"content": self.settings.kg_construction_prompt,
},
{
"role": "user",
"content": f"""Use the given format to extract
information from the following input : <input >{ output } </ input >. Skip the preamble and output the result as a list within < python></python> tags.
Important Tips
1. Make sure all information is included in the knowledge graph .
2. Each triple must only contain three strings ! None of the strings should be empty .
3. Do not split up related information into separate triples because this could change the meaning.
4. Make sure all brackets and quotation marks are matched .
5. Before adding a triple to the knowledge graph, checkn the concatenated triple makes sense as a sentence. If not, discard it.


Here are some example input and output pairs.

## Example 1.
Input:
"The Walt Disney Company, commonly known as Disney, is an American multinational mass media and entertainment conglomerate that is headquartered at the Walt Disney Studios complex in Burbank, California."
Output:
<python>
[['The Walt Disney Company', 'headquartered at', 'Walt Disney Studios complex in Burbank, California'],
['The Walt Disney Company', 'commonly known as', 'Disney'],
['The Walt Disney Company', 'instance of', 'American multinational mass media and entertainment conglomerate']]
</python>

## Example 2.
Input:
"Amanda Jackson was born in Springfield, Ohio, USA on June 1, 1985. She was a basketball player for the U.S. women's team."
Output:
<python>
[['Amanda Jackson', 'born in', 'Springfield, Ohio, USA'],
['Amanda Jackson', 'born on', 'June 1, 1985'],
['Amanda Jackson', 'occupation', 'basketball player'],
['Amanda Jackson', 'played for', 'U.S. women's basketball team']]
</python>

## Example 3.
Input:
"Music executive Darius Van Arman was born in Pennsylvania. He attended Gonzaga College High School and is a human being."
Output:
<python>
[['Darius Van Arman', 'occupation', 'Music executive'],
['Darius Van Arman', 'born in', 'Pennsylvania'],
['Darius Van Arman', 'attended', 'Gonzaga College High School'],
['Darius Van Arman', 'instance of', 'human being']]
</python>

## Example 4.
Input: "Italy had 3.6x times more cases of coronavirus than China."
<python>
[['Italy', 'had 3.6x times more cases of coronavirus than', 'China']]
</python>

""",
},
],
tools=tools,
tool_choice={
"type": "function",
"function": {"name": "create_knowledge_graph"},
},
)
response = cast(ModelResponse, response)
return response

def _compare_knowledge_graph_with_contexts(
self,
knowledge_graph: list[str],
contexts: list[str],
) -> ModelResponse:

tools = [
{
"type": "function",
"function": {
"name": "compare_knowledge_graph_with_contexts",
"description": "Check if the knowledge graph triples are faithful and factually accurate to the contexts",
"parameters": {
"type": "object",
"properties": {
"result": {
"type": "boolean",
"description": "True if the knowledge graph is faithful and factually accurate to the contexts, False otherwise",
}
},
"required": ["result"],
},
},
}
]

response = litellm.completion(
model=self.settings.model,
messages=[
{
"role": "system",
"content": self.settings.context_to_knowledge_graph_comparison_prompt,
},
{
"role": "user",
"content": f"""<knowledge_graph>{knowledge_graph}</knowledge_graph>

<contexts>{contexts}</contexts>
""",
},
],
tools=tools,
tool_choice={
"type": "function",
"function": {"name": "compare_knowledge_graph_with_contexts"},
},
)
response = cast(ModelResponse, response)
return response

def _get_arguments(self, response: ModelResponse, value: str) -> str | bool:
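# Pulls a single named argument out of the first tool call's JSON-encoded arguments;
# returns an explanatory string when the requested key is missing.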
choice = cast(Choices, response.choices[0])
arguments = json.loads(
cast(Message, choice.message).tool_calls[0].function.arguments # type: ignore
)
return arguments.get(
value,
f"{value} was not found in the arguments",
)
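
For context, a minimal usage sketch of the new evaluator (not part of this diff): it assumes the module path implied by the file name, that the evaluator takes a settings keyword as other langevals evaluators do, and that provider API keys come from the environment loaded by load_dotenv.

from langevals_langevals.grapheval import (
    GraphEvalEntry,
    GraphEvalEvaluator,
    GraphEvalSettings,
)

# Hypothetical wiring: the settings keyword and the default model are assumptions.
evaluator = GraphEvalEvaluator(settings=GraphEvalSettings())
result = evaluator.evaluate(
    GraphEvalEntry(
        output="Italy had 3.6x times more cases of coronavirus than China.",
        contexts=["Italy reported roughly 3.6 times more coronavirus cases than China."],
    )
)
print(result.passed, result.details)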