Langchain #216
base: main
pyproject.toml
@@ -52,7 +52,12 @@ numpy = "~=1.26"
umap-learn = "~=0.5"
scikit-learn = "~=1.5"
nltk = "~=3.9"
sacrebleu = "^2.4.3"
pytest-testmon = "^2.1.1"
Review comment (marked as resolved by batwood-1): I suspect you don't want
vocos = "~=0.1"
deepeval = "~=2.2.6"
rouge-score = "~=0.1.2"
textstat = "^0.7.4"
Review comment: same as above
iso639 = "~=0.1"
nest-asyncio = "~=1.5"
pylangacq = "~=0.19"
@@ -129,10 +134,7 @@ target-version = "py310"

[tool.ruff.lint]
select = ["ANN", "D", "E", "F", "I"]
ignore = [
    "ANN101", # self should not be annotated.
    "ANN102"  # cls should not be annotated.

Review comment: why this? I suspect ANN101 and ANN102 should be there since we never annotate self

]
ignore = []
fixable = ["ALL"]
unfixable = []
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
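For context on the review comment above: with "ANN" selected and ANN101/ANN102 removed from the ignore list, Ruff's flake8-annotations rules also require self and cls to be annotated, which is consistent with the new metric classes below annotating self explicitly. A minimal illustration with a hypothetical class (not part of this diff); note that newer Ruff releases have deprecated ANN101/ANN102, so whether the ignore entries are even needed depends on the pinned Ruff version:

class Example:
    def plain(self) -> None:  # flagged by ANN101 once the ignore entry is removed
        ...

    def annotated(self: "Example") -> None:  # the style the rest of this PR adopts
        ...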
@@ -163,7 +165,7 @@ skip = [
    "*.cha",
    "*.ipynb"
]
ignore-words-list = ["senselab", "nd", "astroid", "wil", "SER", "te"]
ignore-words-list = ["senselab", "nd", "astroid", "wil", "SER", "te", "ROUGE", "rouge"]

[build-system]
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
New file
@@ -0,0 +1,7 @@
"""senselab project integrates deepeval for evaluating conversations. | ||
|
||
Using an api.py script to interface with deep_eval.py, | ||
which includes a custom ROUGE metric for comprehensive evaluation. | ||
The ScriptLine class standardizes input data, and unit tests ensure accurate functionality, | ||
making Senselab a robust wrapper for deepeval and other tools. | ||
""" |
New file: deep_eval.py
@@ -0,0 +1,75 @@
"""deep_eval.py.""" | ||
|
||
from typing import Dict, List | ||
|
||
from senselab.text.tasks.evaluate_conversation.metrics import ( | ||
BaseMetric, | ||
BiasEvaluation, | ||
ReadabilityScore, | ||
RougeScore, | ||
TextStatistics, | ||
ToxicityEvaluation, | ||
) | ||
from senselab.utils.data_structures.transcript_output import TranscriptOutput | ||
|
||
|
||
def evaluate_transcript_output(transcript_output: TranscriptOutput, selected_metrics: List[str]) -> TranscriptOutput: | ||
"""Evaluate a conversation based on the provided transcript output and metrics. | ||
|
||
Args: | ||
transcript_output (TranscriptOutput): The transcript output to evaluate. | ||
selected_metrics (List[str]): A list of metrics to use for evaluation. Available metrics are: | ||
- "ReadabilityScore" | ||
- "RougeScore" | ||
- "TextStatistics" | ||
- "ToxicityEvaluation" | ||
- "BiasEvaluation" | ||
|
||
Returns: | ||
TranscriptOutput: The transcript output with evaluated metrics for each response. | ||
""" | ||
if not transcript_output: | ||
raise ValueError("transcript output is empty!") | ||
|
||
available_metrics = { | ||
"ReadabilityScore": ReadabilityScore, | ||
"RougeScore": RougeScore, | ||
"TextStatistics": TextStatistics, | ||
"ToxicityEvaluation": ToxicityEvaluation, | ||
"BiasEvaluation": BiasEvaluation, | ||
} | ||
for name in selected_metrics: | ||
if name not in available_metrics: | ||
raise ValueError(f"Metric '{name}' is not available. Choose from {list(available_metrics.keys())}") | ||
|
||
selected_metrics_classes = [available_metrics[name] for name in selected_metrics] | ||
|
||
for i, response in enumerate(transcript_output.data): | ||
if response["speaker"] == "AI": | ||
assert transcript_output.data[i - 1]["speaker"] == "Tutor" | ||
response_reference_pair = (response["text"], transcript_output.data[i - 1]["text"]) | ||
response["metrics"] = pipeline(response_reference_pair, selected_metrics_classes) | ||
|
||
if response["speaker"] == "Tutor": | ||
if i + 1 < len(transcript_output.data) and transcript_output.data[i + 1]["speaker"] == "AI": | ||
response_reference_pair = (response["text"], transcript_output.data[i + 1]["text"]) | ||
response["metrics"] = pipeline(response_reference_pair, selected_metrics_classes) | ||
|
||
return transcript_output | ||
|
||
|
||
def pipeline(response_reference_pair: tuple, selected_metrics_classes: list[type[BaseMetric]]) -> Dict: | ||
"""Run the metric pipeline on a single text-reference pair. | ||
|
||
Args: | ||
response_reference_pair (tuple): A tuple containing the response and reference text. | ||
selected_metrics_classes (list[BaseMetric]): A list of metric classes to be used for evaluation. | ||
|
||
Returns: | ||
Dict: A dictionary containing the results of the computed metrics. | ||
""" | ||
metrics = {} | ||
for metric_class in selected_metrics_classes: | ||
result = metric_class().compute_reference_pair(response_reference_pair) | ||
metrics.update(result) | ||
return metrics |
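A minimal usage sketch of evaluate_transcript_output, assuming deep_eval.py lives alongside metrics.py in senselab.text.tasks.evaluate_conversation and that TranscriptOutput can be built from a data list of speaker/text dicts (the constructor call is hypothetical; only locally computable metrics are selected so no LLM judge is needed):

from senselab.text.tasks.evaluate_conversation.deep_eval import evaluate_transcript_output
from senselab.utils.data_structures.transcript_output import TranscriptOutput

# Hypothetical two-turn transcript: each AI turn is scored against the adjacent Tutor turn.
transcript = TranscriptOutput(
    data=[
        {"speaker": "Tutor", "text": "Plants use sunlight to turn carbon dioxide and water into glucose."},
        {"speaker": "AI", "text": "Photosynthesis converts light, water, and CO2 into glucose and oxygen."},
    ]
)

result = evaluate_transcript_output(transcript, ["ReadabilityScore", "RougeScore", "TextStatistics"])
for turn in result.data:
    print(turn["speaker"], turn.get("metrics"))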
New file: metrics.py
@@ -0,0 +1,275 @@
"""Metrics to assess performance on tutor response. | ||
|
||
Functions named as ``*_score`` return a scalar value to maximize: the higher | ||
the better. | ||
|
||
Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: | ||
the lower the better. | ||
|
||
All other functions are value-independent. | ||
""" | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import Any, Dict | ||
|
||
import textstat | ||
from deepeval.metrics import BiasMetric, PromptAlignmentMetric, ToxicityMetric | ||
from deepeval.test_case import LLMTestCaseParams | ||
from rouge_score import rouge_scorer | ||
|
||
|
class BaseMetric(ABC):
    """Base class for evaluation metrics."""

    @abstractmethod
    def compute_reference_pair(self: "BaseMetric", pair: tuple) -> Dict[str, Any]:
        """Compute the reference pair metrics."""
        raise NotImplementedError


class SingleTextMetric(BaseMetric):
    """Base class for metrics that only need one text."""

    @abstractmethod
    def compute(self: "SingleTextMetric", text: str) -> Dict[str, Any]:
        """Compute metrics for the given text.

        Args:
            text (str): The input text to evaluate.

        Returns:
            Dict[str, Any]: A dictionary containing the computed metrics.
        """
        raise NotImplementedError
    def compute_reference_pair(self: "SingleTextMetric", pair: tuple) -> Dict[str, Any]:
        """Compute the reference pair.

        This method takes a tuple containing two elements and computes the metrics
        for the first element in the tuple. It asserts that the tuple has exactly two elements.

        Args:
            pair (tuple): A tuple containing two elements.

        Returns:
            Dict[str, Any]: The computed metrics for the first element in the tuple.
        """
        assert len(pair) == 2
        return self.compute(pair[0])


class ComparativeMetric(BaseMetric):
    """Base class for metrics that compare two texts."""

    @abstractmethod
    def compute(self: "ComparativeMetric", text: str, reference_text: str) -> Dict[str, Any]:
        """Computes the evaluation metrics for a given text against a reference text.

        Args:
            text (str): The text to be evaluated.
            reference_text (str): The reference text to compare against.

        Returns:
            Dict[str, Any]: A dictionary containing the computed metrics.
        """
        raise NotImplementedError

    def compute_reference_pair(self: "ComparativeMetric", pair: tuple) -> Dict[str, Any]:
        """Compute the reference pair metrics.

        This method takes a tuple containing two elements and computes the metrics
        by calling the `compute` method with the two elements of the tuple.

        Args:
            pair (tuple): A tuple containing exactly two elements.

        Returns:
            Dict[str, Any]: A dictionary containing the computed metrics.

        Raises:
            AssertionError: If the length of the tuple is not equal to 2.
        """
        assert len(pair) == 2
        return self.compute(pair[0], pair[1])
class ReadabilityScore(SingleTextMetric):
    """Class to compute readability scores for a text."""

    def compute(self: "ReadabilityScore", text: str) -> Dict[str, float]:
        """Compute the readability and syntactic complexity scores for the text using Flesch metrics.

        The Flesch Reading Ease score indicates how easy a text is to read.
        Higher scores indicate easier readability, with scores ranging from 0 to 100.
        - 90-100: Very easy to read, easily understood by an average 11-year-old student.
        - 60-70: Plain English, easily understood by 13- to 15-year-old students.
        - 0-30: Very difficult to read, best understood by university graduates.

        The Flesch-Kincaid Grade Level score indicates the grade level required to understand the text.
        Lower scores indicate easier readability, with scores corresponding to U.S. school grade levels.

        Args:
            text: The model response text

        Returns:
            Dictionary containing the computed readability and grade level scores:
            - "readability_score": Flesch Reading Ease score
            - "grade_level_score": Flesch-Kincaid Grade Level score
        """
        return {
            "readability_score": textstat.flesch_reading_ease(text),
            "grade_level_score": textstat.flesch_kincaid_grade(text),
        }
class RougeScore(ComparativeMetric):
    """Class to compute ROUGE scores for a text against a reference text."""

    def compute(self: "RougeScore", text: str, reference_text: str) -> Dict[str, float]:
        """Compute the ROUGE (Recall-Oriented Understudy for Gisting Evaluation) scores for the text.

        ROUGE is a set of metrics for evaluating automatic summarization and machine translation.
        It compares an automatically produced summary or translation against a reference or a set of references.

        The following ROUGE metrics are computed:
        - ROUGE-1: Overlap of unigrams (single words) between the system and reference summaries.
        - ROUGE-2: Overlap of bigrams (two consecutive words) between the system and reference summaries.
        - ROUGE-L: Longest common subsequence (LCS) between the system and reference summaries.

        Args:
            text: The model response text
            reference_text: The human tutor response text

        Returns:
            Dictionary containing the computed ROUGE F1 scores:
            - "rouge-1": ROUGE-1 score
            - "rouge-2": ROUGE-2 score
            - "rouge-l": ROUGE-L score
        """
        scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

        # rouge_scorer's score(target, prediction) takes the reference first and returns
        # Score namedtuples keyed by the names passed to RougeScorer.
        scores = scorer.score(reference_text, text)
        return {
            "rouge-1": scores["rouge1"].fmeasure,
            "rouge-2": scores["rouge2"].fmeasure,
            "rouge-l": scores["rougeL"].fmeasure,
        }
class TextStatistics(SingleTextMetric):
    """Class to compute basic text statistics such as word count and sentence count."""

    def compute(self: "TextStatistics", text: str) -> Dict[str, int]:
        """Compute basic text statistics such as word count and sentence count.

        Args:
            text: The model response text

        Returns:
            Dictionary containing the computed text statistics:
            - "word_count": Number of words in the text
            - "sentence_count": Number of sentences in the text
            - "char_count": Number of characters in the text
        """
        return {
            "word_count": textstat.lexicon_count(text),
            "sentence_count": textstat.sentence_count(text),
            "char_count": textstat.char_count(text),
        }
class ToxicityEvaluation(SingleTextMetric):
    """Evaluates text toxicity using deepeval's ToxicityMetric."""

    def __init__(self: "ToxicityEvaluation") -> None:
        """Initializes the evaluator with a ToxicityMetric instance."""
        self.evaluator = ToxicityMetric()

    def compute(self: "ToxicityEvaluation", text: str) -> Dict[str, Any]:
        """Compute the toxicity score for the given text.

        The toxicity metric measures the presence of harmful, offensive, or
        inappropriate content in the text. Lower scores indicate less toxic content.

        Args:
            text: The model response text

        Returns:
            Dictionary containing the computed toxicity metrics:
            - "toxicity_score": Overall toxicity score between 0 and 1
            - "toxicity_labels": Specific toxicity categories detected
            - "is_toxic": Boolean indicating if the text exceeds toxicity thresholds
        """
        # deepeval metrics are measured on an LLMTestCase; the input field is required,
        # so an empty prompt is passed since only the response text is available here.
        test_case = LLMTestCase(input="", actual_output=text)

        # measure() populates score/success on the metric instance itself.
        self.evaluator.measure(test_case)
        return {
            "toxicity_score": self.evaluator.score,
            "toxicity_labels": getattr(self.evaluator, "toxicity_labels", []),
            "is_toxic": not self.evaluator.success,
        }
class BiasEvaluation(SingleTextMetric):
    """Evaluates text bias using deepeval's BiasMetric."""

    def __init__(self: "BiasEvaluation") -> None:
        """Initializes the BiasMetric evaluator."""
        self.evaluator = BiasMetric()

    def compute(self: "BiasEvaluation", text: str) -> Dict[str, Any]:
        """Compute the bias score for the given text.

        The bias metric detects various forms of bias including gender, racial,
        and other demographic biases in the text. Lower scores indicate less bias.

        Args:
            text: The model response text

        Returns:
            Dictionary containing the computed bias metrics:
            - "bias_score": Overall bias score between 0 and 1
            - "bias_types": Types of bias detected
            - "has_bias": Boolean indicating if the text contains significant bias
        """
        # As above, wrap the text in an LLMTestCase (its input field is required).
        test_case = LLMTestCase(input="", actual_output=text)

        self.evaluator.measure(test_case)
        return {
            "bias_score": self.evaluator.score,
            "bias_types": getattr(self.evaluator, "bias_types", []),
            "has_bias": not self.evaluator.success,
        }
class PromptAlignmentEvaluation(ComparativeMetric):
    """Evaluates how well the response aligns with the original prompt using deepeval's PromptAlignmentMetric."""

    # TODO: this should compare the LLM response to the system instruction, not the tutor response.

    def __init__(self: "PromptAlignmentEvaluation") -> None:
        """Initializes the evaluator with a PromptAlignmentMetric instance."""
        # NOTE: PromptAlignmentMetric typically takes prompt_instructions at construction
        # time; left unset here pending the TODO above.
        self.evaluator = PromptAlignmentMetric()

    def compute(self: "PromptAlignmentEvaluation", text: str, reference_text: str) -> Dict[str, float]:
        """Compute how well the response aligns with the given prompt/reference.

        The prompt alignment metric measures how well the model's response addresses
        and follows the requirements and context of the original prompt.

        Args:
            text: The model response text
            reference_text: The system instruction

        Returns:
            Dictionary containing the computed alignment metrics:
            - "alignment_score": Overall alignment score between 0 and 1
            - "is_aligned": Boolean indicating if the response adequately aligns with the prompt
        """
        test_case = LLMTestCase(input=reference_text, actual_output=text)

        self.evaluator.measure(test_case)
        return {"alignment_score": self.evaluator.score, "is_aligned": self.evaluator.success}
New file
@@ -0,0 +1,3 @@
""".. include:: ./doc.md"""  # noqa: D415

__version__ = "1.0.0"
Review comment: For consistency and easier dependency solving, can you use the tilde as for the other dependencies?
~=2.4
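On the version-constraint suggestion: Poetry's caret ^2.4.3 allows >=2.4.3,<3.0.0, while the PEP 440 compatible-release spec ~=2.4 allows >=2.4,<3.0 and ~=2.4.3 would pin to the 2.4.x series, so ~=2.4 is the closest tilde-style equivalent. A small check of the PEP 440 side using the packaging library (the caret form is Poetry-specific and is not parseable by packaging):

from packaging.specifiers import SpecifierSet
from packaging.version import Version

print(Version("2.9.0") in SpecifierSet("~=2.4"))    # True: ~=2.4 means >=2.4, <3.0
print(Version("2.5.0") in SpecifierSet("~=2.4.3"))  # False: ~=2.4.3 means >=2.4.3, <2.5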