diff --git a/.gitignore b/.gitignore index fbe5c4b..c9119af 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ dist/ downloads/ eggs/ .eggs/ +generation/ lib/ lib64/ parts/ @@ -152,6 +153,12 @@ output/ # local artifacts local/ +# cache artifacts +cache/ + +# notebook cache +docs/notebooks/cache/ + # mypy .mypy_cache/ .dmypy.json @@ -178,3 +185,6 @@ cython_debug/ # PyPI configuration file .pypirc + +# GitHub instructions +.github/instructions/ diff --git a/.semversioner/next-release/minor-20251219235819458131.json b/.semversioner/next-release/minor-20251219235819458131.json new file mode 100644 index 0000000..3e82593 --- /dev/null +++ b/.semversioner/next-release/minor-20251219235819458131.json @@ -0,0 +1,4 @@ +{ + "type": "minor", + "description": "Add assertion generation for data-local and data-global questions with optional validation" +} diff --git a/README.md b/README.md index 85b2989..200a290 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ flowchart LR BenchmarkQED is a suite of tools designed for automated benchmarking of retrieval-augmented generation (RAG) systems. It provides components for query generation, evaluation, and dataset preparation to facilitate reproducible testing at scale. - **AutoQ:** Generates four classes of synthetic queries with variable data scope, ranging from local queries (answered using a small number of text regions) to global queries (requiring reasoning over large portions or the entirety of a dataset). -- **AutoE:** Evaluates RAG answers by comparing them side-by-side on key metrics—relevance, comprehensiveness, diversity, and empowerment—using the LLM-as-a-Judge approach. When ground truth is available, AutoE can also assess correctness, completeness, and other custom metrics. +- **AutoE:** Evaluates RAG answers by comparing them side-by-side on key metrics—relevance, comprehensiveness, diversity, and empowerment—using the LLM-as-a-Judge approach. When ground truth is available, AutoE can also assess correctness, completeness, and other custom metrics. Additionally, AutoE supports assertion-based scoring using either manually-authored assertions or those generated by AutoQ. - **AutoD:** Provides data utilities for sampling and summarizing datasets, ensuring consistent inputs for query synthesis. In addition to the tools, we also release two datasets to support the development and evaluation of RAG systems: diff --git a/benchmark_qed/autoe/__init__.py b/benchmark_qed/autoe/__init__.py index 22abdb2..191d081 100644 --- a/benchmark_qed/autoe/__init__.py +++ b/benchmark_qed/autoe/__init__.py @@ -1,2 +1,19 @@ # Copyright (c) 2025 Microsoft Corporation. 
"""Relative measure module for evaluating the performance of models.""" + +from benchmark_qed.autoe.visualization import ( + get_available_question_sets, + get_available_rag_methods, + plot_assertion_accuracy_by_rag_method, + plot_assertion_score_distribution, + prepare_assertion_summary_data, +) + +__all__ = [ + "get_available_question_sets", + "get_available_rag_methods", + # Assertion-based visualizations + "plot_assertion_accuracy_by_rag_method", + "plot_assertion_score_distribution", + "prepare_assertion_summary_data", +] diff --git a/benchmark_qed/autoe/assertion_scores.py b/benchmark_qed/autoe/assertion_scores.py index 3692b84..1b77070 100644 --- a/benchmark_qed/autoe/assertion_scores.py +++ b/benchmark_qed/autoe/assertion_scores.py @@ -7,14 +7,17 @@ from collections.abc import Callable from pathlib import Path from string import Template -from typing import Any +from typing import Any, cast from uuid import uuid4 +import numpy as np import pandas as pd +from rich import print as rich_print from rich.progress import Progress, TaskID from benchmark_qed.autoe.data_model.assertion import Assertion, AssertionLLMResponse from benchmark_qed.autoe.prompts import assertion as assertion_prompts +from benchmark_qed.cli.utils import print_df from benchmark_qed.config.llm_config import LLMConfig from benchmark_qed.config.utils import load_template_file from benchmark_qed.llm.type.base import ChatModel @@ -30,6 +33,7 @@ def get_assertion_scores( answers: pd.DataFrame, assertions: pd.DataFrame, trials: int, + top_k: int | None = None, assessment_system_prompt: Template | None = None, assessment_user_prompt: Template | None = None, include_score_id_in_prompt: bool = True, @@ -45,9 +49,20 @@ def get_assertion_scores( llm_config (LLMConfig): The LLM configuration to use for scoring. answers (pd.DataFrame): DataFrame containing answers with columns 'question', 'answer'. assertions (pd.DataFrame): DataFrame containing assertions with column 'assertion'. + trials (int): Number of trials to run for each assertion. + top_k (int | None): If specified, only evaluate the top-k assertions per question + (ranked by rank if available, where lower rank = higher importance, + otherwise uses first k assertions). assessment_system_prompt (Template | None): Optional system prompt template for the assessment. assessment_user_prompt (Template | None): Optional user prompt template for the assessment. include_score_id_in_prompt (bool): Whether to include the score ID in the user prompt. + question_id_key (str): Column name for question ID. + question_text_key (str): Column name for question text. + answer_text_key (str): Column name for answer text. + + Returns + ------- + pd.DataFrame: Results with assertion scores and metadata. 
""" pairs = ( answers.merge( @@ -67,6 +82,27 @@ def get_assertion_scores( ) pairs = pairs[["question_id", "question_text", "answer_text", "assertion"]] + # Apply top-k filtering if specified + if top_k is not None and top_k > 0: + # Check if assertions have a 'rank' column for ranking + if "rank" in assertions.columns: + # Rank by rank (ascending - lower rank = higher importance) and take top-k per question + pairs_with_rank = pairs.merge( + assertions[["assertion", "rank"]], on="assertion", how="left" + ) + pairs = ( + pairs_with_rank.sort_values( + ["question_id", "rank"], ascending=[True, True] + ) + .groupby("question_id") + .head(top_k) + .drop(columns=["rank"]) + .reset_index(drop=True) + ) + else: + # If no rank column, just take first k assertions per question + pairs = pairs.groupby("question_id").head(top_k).reset_index(drop=True) + with Progress() as progress: def on_complete_callback(progress_task: TaskID) -> None: @@ -156,3 +192,298 @@ async def evaluate_assertion( "assertion": assertion, "trial": trial, } + + +def load_and_normalize_assertions( + input_dir: str, + question_set: str, + assertions_filename_template: str = "{question_set}_assertions.json", +) -> pd.DataFrame: + """ + Load assertions from JSON file and normalize nested dictionaries. + + Args: + input_dir: Directory containing assertion files + question_set: Name of the question set + assertions_filename_template: Template for assertion filename (default: "{question_set}_assertions.json") + + Returns + ------- + DataFrame with normalized assertion data containing question_id, question_text, assertion, rank + """ + assertions_file = assertions_filename_template.format(question_set=question_set) + assertions_raw = pd.read_json(f"{input_dir}/{assertions_file}") + + # Explode assertions and normalize the nested dictionaries + assertions = assertions_raw.explode("assertions").reset_index(drop=True) + + # Normalize the assertion dictionaries into separate columns + assertion_normalized = pd.json_normalize( + cast(list[dict[str, Any]], assertions["assertions"].tolist()) + ) + assertions = pd.concat( + [ + assertions.drop("assertions", axis=1), + assertion_normalized[["statement", "rank"]], # Keep only statement and rank + ], + axis=1, + ) + + # Rename the statement column to assertion for compatibility + return assertions.rename(columns={"statement": "assertion"}) + + +def evaluate_rag_method( + llm_client: ChatModel, + llm_config: LLMConfig, + generated_rag: str, + question_set: str, + assertions: pd.DataFrame, + input_dir: str, + output_dir: Path, + trials: int, + top_k_assertions: int | None, + pass_threshold: float, + answers_path_template: str = "{input_dir}/{generated_rag}/{question_set}.json", +) -> dict[str, Any] | None: + """ + Evaluate a single RAG method against assertions for a question set. 
+ + Args: + llm_client: LLM client for evaluation + llm_config: LLM configuration + generated_rag: Name of the RAG method + question_set: Name of the question set + assertions: DataFrame with assertions + input_dir: Input directory path + output_dir: Output directory path + trials: Number of evaluation trials + top_k_assertions: Number of top assertions to evaluate (None for all) + pass_threshold: Threshold for assertion pass/fail + answers_path_template: Template for answers file path (default: "{input_dir}/{generated_rag}/{question_set}.json") + + Returns + ------- + Dictionary with evaluation results or None if evaluation failed + """ + question_set_output_dir = output_dir / question_set + if not question_set_output_dir.exists(): + question_set_output_dir.mkdir(parents=True) + + # Define answers path before try block so it's available in except block + answers_path = answers_path_template.format( + input_dir=input_dir, generated_rag=generated_rag, question_set=question_set + ) + + try: + # Load answers for this RAG method and question set + answers = pd.read_json(answers_path) + + # Get assertion scores + assertion_score = get_assertion_scores( + llm_client=llm_client, + llm_config=llm_config, + answers=answers, + assertions=assertions, + trials=trials, + top_k=top_k_assertions, + question_id_key="question_id", + question_text_key="question_text", + answer_text_key="answer", + ) + + # Save detailed scores for this RAG method and question set + assertion_score.to_csv( + question_set_output_dir / f"{generated_rag}_assertion_scores.csv", + index=False, + ) + + # Calculate summary statistics + summary_by_assertion = ( + assertion_score.groupby(["question", "assertion"]) + .agg( + score=("score", lambda x: int(x.mean() > pass_threshold)), + scores=("score", list), + ) + .reset_index() + ) + + summary_by_question = ( + summary_by_assertion.groupby(["question"]) + .agg( + success=("score", lambda x: (x == 1).sum()), + fail=("score", lambda x: (x == 0).sum()), + ) + .reset_index() + ) + + # Calculate overall accuracy score + total_success = summary_by_question["success"].sum() + total_fail = summary_by_question["fail"].sum() + total_assertions = total_success + total_fail + overall_accuracy = ( + total_success / total_assertions if total_assertions > 0 else 0.0 + ) + + # Calculate per-assertion statistics + summary_by_assertion["score_mean"] = summary_by_assertion["scores"].apply( + lambda x: np.mean(x) if len(x) > 0 else 0.0 + ) + summary_by_assertion["score_std"] = summary_by_assertion["scores"].apply( + lambda x: np.std(x) if len(x) > 0 else 0.0 + ) + summary_by_assertion = summary_by_assertion.drop(columns=["scores"]) + + # Save detailed summary for this RAG method and question set + summary_by_question.to_csv( + question_set_output_dir / f"{generated_rag}_summary_by_question.csv", + index=False, + ) + summary_by_assertion.to_csv( + question_set_output_dir / f"{generated_rag}_summary_by_assertion.csv", + index=False, + ) + + # Report failed assertions for this method + failed_assertions: pd.DataFrame = cast( + pd.DataFrame, summary_by_assertion[summary_by_assertion["score"] == 0] + ) + + if len(failed_assertions) > 0: + rich_print( + f" [bold red]{generated_rag} ({question_set}): {len(failed_assertions)} assertions failed[/bold red]" + ) + else: + rich_print( + f" [bold green]{generated_rag} ({question_set}): All assertions passed[/bold green]" + ) + + rich_print( + f" {generated_rag} ({question_set}) - Overall accuracy: {overall_accuracy:.3f} ({total_success}/{total_assertions})" + ) + if 
top_k_assertions is not None: + rich_print( + f" [dim]Using top-{top_k_assertions} assertions per question[/dim]" + ) + + # Return results for summary + return { + "question_set": question_set, + "rag_method": generated_rag, + "total_assertions": total_assertions, + "successful_assertions": total_success, + "failed_assertions": total_fail, + "overall_accuracy": overall_accuracy, + "total_questions": len(summary_by_question), + "top_k_used": top_k_assertions if top_k_assertions is not None else "all", + } + + except FileNotFoundError as e: + rich_print( + f" [bold yellow]Warning: Could not find answers file at {answers_path}: {e}[/bold yellow]" + ) + return None + except (OSError, ValueError, KeyError) as e: + rich_print( + f" [bold red]Error processing {generated_rag}/{question_set}: {e}[/bold red]" + ) + return None + + +def run_assertion_evaluation( + llm_client: ChatModel, + llm_config: LLMConfig, + question_sets: list[str], + generated_rags: list[str], + input_dir: str, + output_dir: Path, + trials: int, + top_k_assertions: int | None, + pass_threshold: float, + assertions_filename_template: str = "{question_set}_assertions.json", + answers_path_template: str = "{input_dir}/{generated_rag}/{question_set}.json", +) -> pd.DataFrame: + """ + Run assertion-based evaluation for multiple question sets and RAG methods. + + Args: + llm_client: LLM client for evaluation + llm_config: LLM configuration + question_sets: List of question set names + generated_rags: List of RAG method names + input_dir: Input directory path + output_dir: Output directory path + trials: Number of evaluation trials + top_k_assertions: Number of top assertions to evaluate (None for all) + pass_threshold: Threshold for assertion pass/fail + assertions_filename_template: Template for assertion filename (default: "{question_set}_assertions.json") + answers_path_template: Template for answers file path (default: "{input_dir}/{generated_rag}/{question_set}.json") + + Returns + ------- + DataFrame with overall results summary + """ + overall_results = [] + + # Loop through each question set + for question_set in question_sets: + rich_print(f"Processing question set: {question_set}") + + # Load and normalize assertions + assertions = load_and_normalize_assertions( + input_dir, question_set, assertions_filename_template + ) + + # Display assertion filtering info + if top_k_assertions is not None: + rich_print(f" Filtering to top {top_k_assertions} assertions per question") + else: + rich_print(" Using all assertions (no filtering)") + + # Loop through each RAG method for this question set + for generated_rag in generated_rags: + rich_print(f" Processing {generated_rag} for {question_set}") + + result = evaluate_rag_method( + llm_client=llm_client, + llm_config=llm_config, + generated_rag=generated_rag, + question_set=question_set, + assertions=assertions, + input_dir=input_dir, + output_dir=output_dir, + trials=trials, + top_k_assertions=top_k_assertions, + pass_threshold=pass_threshold, + answers_path_template=answers_path_template, + ) + + if result is not None: + overall_results.append(result) + + # Create and save overall summary + overall_summary_df = pd.DataFrame(overall_results) + overall_summary_df = overall_summary_df.sort_values( + ["question_set", "overall_accuracy"], ascending=[True, False] + ) + overall_summary_df.to_csv( + output_dir / "assertion_scores_overall_summary.csv", index=False + ) + + # Display summary table + print_df( + overall_summary_df, + "Overall Assertion Scores Summary by Question Set and RAG 
Method", + ) + + # Also create a pivot table for easier comparison + pivot_summary = overall_summary_df.pivot_table( + index="rag_method", columns="question_set", values="overall_accuracy" + ) + pivot_summary.to_csv(output_dir / "assertion_scores_pivot_summary.csv") + print_df( + pivot_summary.reset_index(), + "Assertion Accuracy Comparison (Pivot View)", + ) + + return overall_summary_df diff --git a/benchmark_qed/autoe/prompts/assertion/assertion_system_prompt.txt b/benchmark_qed/autoe/prompts/assertion/assertion_system_prompt.txt index 9838631..8010af9 100644 --- a/benchmark_qed/autoe/prompts/assertion/assertion_system_prompt.txt +++ b/benchmark_qed/autoe/prompts/assertion/assertion_system_prompt.txt @@ -19,9 +19,9 @@ Follow these steps to make your decision: 3. Make a binary decision based on whether the assertion is satisfied ---Important Guidelines--- -- **Be precise**: Look for explicit evidence in the answer that directly relates to the assertion +- **Be precise**: Look for explicit evidence or strong implications in the answer that directly relate to the assertion - **Consider the entire answer**: Any part of the answer can potentially satisfy the assertion -- **Ignore irrelevant factors**: Length, style, or eloquence should not affect your evaluation unless specified in the assertion +- **Ignore irrelevant factors**: Length, style, exact wordings, or eloquence should not affect your evaluation unless specified in the assertion - **Handle ambiguity**: If the assertion or answer is ambiguous, explain the ambiguity in your reasoning and make the most reasonable interpretation - **Binary evaluation**: The assertion is either satisfied (1) or not satisfied (0) - there are no partial scores diff --git a/benchmark_qed/autoe/visualization/__init__.py b/benchmark_qed/autoe/visualization/__init__.py new file mode 100644 index 0000000..4d049c6 --- /dev/null +++ b/benchmark_qed/autoe/visualization/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""AutoE visualization package for assertion-based evaluation results.""" + +# Import assertion visualization functions +from benchmark_qed.autoe.visualization.assertions import ( + get_available_question_sets, + get_available_rag_methods, + plot_assertion_accuracy_by_rag_method, + plot_assertion_score_distribution, + prepare_assertion_summary_data, +) + +# Import utilities for advanced users +from benchmark_qed.autoe.visualization.utils import ( + add_value_labels, + calculate_bar_width, + format_method_name, + format_question_set_name, + get_color_palette, + save_figure, + setup_grid, + setup_plot_style, +) + +__all__ = [ + "add_value_labels", + "calculate_bar_width", + "format_method_name", + "format_question_set_name", + "get_available_question_sets", + "get_available_rag_methods", + "get_color_palette", + # Assertion-based visualizations + "plot_assertion_accuracy_by_rag_method", + "plot_assertion_score_distribution", + "prepare_assertion_summary_data", + "save_figure", + "setup_grid", + # Utilities + "setup_plot_style", +] diff --git a/benchmark_qed/autoe/visualization/assertions.py b/benchmark_qed/autoe/visualization/assertions.py new file mode 100644 index 0000000..11e68b6 --- /dev/null +++ b/benchmark_qed/autoe/visualization/assertions.py @@ -0,0 +1,228 @@ +# Copyright (c) 2025 Microsoft Corporation. 
+"""Visualization functions for AutoE assertion-based evaluation results.""" + +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd +from matplotlib.axes import Axes +from matplotlib.figure import Figure + +from benchmark_qed.autoe.visualization.utils import ( + add_value_labels, + calculate_bar_width, + format_method_name, + format_question_set_name, + get_color_palette, + save_figure, + setup_grid, + setup_plot_style, +) + + +def plot_assertion_accuracy_by_rag_method( + results_df: pd.DataFrame, + output_path: Path | None = None, + figsize: tuple[int, int] = (12, 6), + title: str = "Assertion-based Accuracy by RAG Method and Question Set", + show_values: bool = True, + sort_by_mean: bool = True, + save_dpi: int = 300, +) -> tuple[Figure, Axes]: + """ + Create a grouped bar chart showing assertion-based accuracy by RAG method and question set. + + Args: + results_df: DataFrame containing evaluation results with columns: + - 'rag_method': RAG method names + - 'question_set': Question set names + - 'overall_accuracy': Accuracy values + output_path: Optional path to save the visualization + figsize: Figure size as (width, height) tuple + title: Chart title + show_values: Whether to show accuracy values on bars + sort_by_mean: Whether to sort RAG methods by mean accuracy + save_dpi: DPI for saved image + + Returns + ------- + Tuple of (matplotlib Figure, matplotlib Axes) objects + + Example: + >>> fig, ax = ( + ... plot_assertion_accuracy_by_rag_method( + ... results_df, + ... output_path=Path( + ... "output/chart.png" + ... ), + ... ) + ... ) + """ + # Create pivot table for visualization + pivot_summary = results_df.pivot_table( + index="rag_method", columns="question_set", values="overall_accuracy" + ) + + # Set up consistent plotting style + setup_plot_style() + + # Create the figure and axis + fig, ax = plt.subplots(1, 1, figsize=figsize) + + # Prepare data for plotting + pivot_summary_reset = pivot_summary.reset_index() + + if sort_by_mean: + # Sort by mean accuracy across question sets (lowest to highest) + pivot_summary_reset["mean_accuracy"] = pivot_summary_reset.select_dtypes( + include="number" + ).mean(axis=1) + pivot_summary_reset = pivot_summary_reset.sort_values( + "mean_accuracy", ascending=True + ) + + x = range(len(pivot_summary_reset)) + + # Get the question sets dynamically (exclude non-data columns) + question_set_columns = [ + col + for col in pivot_summary_reset.columns + if col not in ["rag_method", "mean_accuracy"] + ] + + # Calculate appropriate bar width + width = calculate_bar_width(len(question_set_columns)) + + # Get consistent colors + colors = get_color_palette(len(question_set_columns)) + + for i, question_set in enumerate(question_set_columns): + values = pivot_summary_reset[question_set].fillna(0) + bars = ax.bar( + [pos + width * i for pos in x], + values, + width, + label=format_question_set_name(question_set), + color=colors[i], + alpha=0.8, + ) + + # Add value labels on bars using common utility + if show_values: + add_value_labels(ax, bars) + + # Customize the chart + ax.set_xlabel("RAG Methods") + ax.set_ylabel("Assertion-based Accuracy") + ax.set_title(title) + ax.set_xticks([pos + width * (len(question_set_columns) - 1) / 2 for pos in x]) + ax.set_xticklabels( + [format_method_name(method) for method in pivot_summary_reset["rag_method"]], + rotation=45, + ha="right", + ) + ax.legend(loc="upper left") + setup_grid(ax) + ax.set_ylim(0, 1.0) + + plt.tight_layout() + + # Save if output path is provided using common utility + 
if output_path: + save_figure(fig, output_path, dpi=save_dpi) + + return fig, ax + + +def plot_assertion_score_distribution( + results_df: pd.DataFrame, # noqa: ARG001 + output_path: Path | None = None, + figsize: tuple[int, int] = (10, 6), + title: str = "Assertion Score Distribution by RAG Method", +) -> tuple[Figure, Axes]: + """ + Create a box plot showing assertion score distributions by RAG method. + + Args: + results_df: DataFrame containing detailed assertion results + output_path: Optional path to save the visualization + figsize: Figure size as (width, height) tuple + title: Chart title + + Returns + ------- + Tuple of (matplotlib Figure, matplotlib Axes) objects + + Note: + This function is a placeholder for future implementation when + detailed assertion scoring data becomes available. + """ + # Set up consistent plotting style + setup_plot_style() + + # Create the figure and axis + fig, ax = plt.subplots(1, 1, figsize=figsize) + + # Placeholder implementation + ax.text( + 0.5, + 0.5, + "Assertion Score Distribution\n(Future Implementation)", + ha="center", + va="center", + transform=ax.transAxes, + fontsize=14, + ) + ax.set_title(title) + + plt.tight_layout() + + # Save if output path is provided + if output_path: + save_figure(fig, output_path) + + return fig, ax + + +def prepare_assertion_summary_data(results_df: pd.DataFrame) -> pd.DataFrame: + """ + Prepare assertion evaluation results for visualization. + + Args: + results_df: Raw evaluation results DataFrame + + Returns + ------- + Pivot table ready for visualization + """ + return results_df.pivot_table( + index="rag_method", columns="question_set", values="overall_accuracy" + ) + + +def get_available_question_sets(results_df: pd.DataFrame) -> list[str]: + """ + Get list of available question sets from results DataFrame. + + Args: + results_df: Evaluation results DataFrame + + Returns + ------- + List of question set names + """ + return sorted(results_df["question_set"].unique()) + + +def get_available_rag_methods(results_df: pd.DataFrame) -> list[str]: + """ + Get list of available RAG methods from results DataFrame. + + Args: + results_df: Evaluation results DataFrame + + Returns + ------- + List of RAG method names + """ + return sorted(results_df["rag_method"].unique()) diff --git a/benchmark_qed/autoe/visualization/utils.py b/benchmark_qed/autoe/visualization/utils.py new file mode 100644 index 0000000..c0d2320 --- /dev/null +++ b/benchmark_qed/autoe/visualization/utils.py @@ -0,0 +1,140 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""Common utilities for AutoE visualizations.""" + +from collections.abc import Iterable +from pathlib import Path +from typing import Any + +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.axes import Axes +from matplotlib.container import BarContainer +from matplotlib.figure import Figure +from rich import print as rich_print + + +def setup_plot_style() -> None: + """Set up consistent plotting style for all AutoE visualizations.""" + plt.style.use("default") + sns.set_palette("Set2") + + +def get_color_palette(n_colors: int) -> list[tuple[float, float, float]]: + """ + Get a consistent color palette for visualizations. + + Args: + n_colors: Number of colors needed + + Returns + ------- + List of colors from the Set2 palette + """ + return sns.color_palette("Set2", n_colors) + + +def save_figure( + fig: Figure, output_path: Path, dpi: int = 300, bbox_inches: str = "tight" +) -> None: + """ + Save a matplotlib figure with consistent settings. 
+ + Args: + fig: Matplotlib figure to save + output_path: Path where to save the figure + dpi: Resolution for saved image + bbox_inches: Bounding box setting for saved image + """ + fig.savefig(output_path, dpi=dpi, bbox_inches=bbox_inches) + rich_print(f"[bold green]Visualization saved to {output_path.parent}[/bold green]") + + +def format_method_name(method_name: str) -> str: + """ + Format RAG method names for display. + + Args: + method_name: Raw method name (e.g., 'vector_rag') + + Returns + ------- + Formatted name for display (e.g., 'Vector RAG') + """ + return method_name.replace("_", " ").title() + + +def format_question_set_name(question_set: str) -> str: + """ + Format question set names for display. + + Args: + question_set: Raw question set name (e.g., 'activity_local') + + Returns + ------- + Formatted name for display (e.g., 'Activity Local') + """ + return question_set.replace("_", " ").title() + + +def add_value_labels( + ax: Axes, + bars: BarContainer | Iterable[Any], + format_str: str = "{:.3f}", + offset: int = 3, + fontsize: int = 9, + fontweight: str = "bold", +) -> None: + """ + Add value labels on top of bars in a bar chart. + + Args: + ax: Matplotlib axes object + bars: Bar container from matplotlib bar plot + format_str: Format string for values + offset: Vertical offset for labels + fontsize: Font size for labels + fontweight: Font weight for labels + """ + for bar in bars: + height = bar.get_height() + if height > 0: + ax.annotate( + format_str.format(height), + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, offset), + textcoords="offset points", + ha="center", + va="bottom", + fontsize=fontsize, + fontweight=fontweight, + ) + + +def setup_grid(ax: Axes, alpha: float = 0.3, linestyle: str = "--") -> None: + """ + Set up consistent grid styling for plots. + + Args: + ax: Matplotlib axes object + alpha: Grid transparency + linestyle: Grid line style + """ + ax.grid(axis="y", alpha=alpha, linestyle=linestyle) + + +def calculate_bar_width(n_groups: int, max_width: float = 0.8) -> float: + """ + Calculate appropriate bar width for grouped bar charts. 
+ + Args: + n_groups: Number of groups in the chart + max_width: Maximum total width for all bars + + Returns + ------- + Appropriate width for individual bars + """ + if n_groups <= 2: + return 0.35 + return max_width / n_groups diff --git a/benchmark_qed/autoq/cli.py b/benchmark_qed/autoq/cli.py index 3355356..50dd939 100644 --- a/benchmark_qed/autoq/cli.py +++ b/benchmark_qed/autoq/cli.py @@ -20,6 +20,8 @@ ActivityContextPromptConfig, ActivityGlobalPromptConfig, ActivityLocalPromptConfig, + AssertionConfig, + AssertionPromptConfig, DataGlobalPromptConfig, DataLocalPromptConfig, QuestionGenerationConfig, @@ -67,6 +69,9 @@ async def __generate_data_local( random_seed: int, concurrent_requests: int, config: DataLocalPromptConfig, + assertion_config: AssertionConfig, + assertion_prompt_config: AssertionPromptConfig, + llm_params: dict[str, Any], ) -> None: sample_texts_df = pd.read_parquet(f"{output_data_path}/sample_texts.parquet") sample_texts = load_text_units(df=sample_texts_df) @@ -77,9 +82,12 @@ async def __generate_data_local( text_units=sample_texts, concurrent_coroutines=concurrent_requests, random_seed=random_seed, + llm_params=llm_params, generation_system_prompt=config.data_local_gen_system_prompt.template, generation_user_prompt=config.data_local_gen_user_prompt.template, expansion_system_prompt=config.data_local_expansion_system_prompt.template, + assertion_config=assertion_config, + assertion_prompt_config=assertion_prompt_config, ) data_local_question_results = await data_local_generator.agenerate( @@ -115,6 +123,9 @@ async def __generate_data_global( random_seed: int, concurrent_requests: int, config: DataGlobalPromptConfig, + assertion_config: AssertionConfig, + assertion_prompt_config: AssertionPromptConfig, + llm_params: dict[str, Any], ) -> None: if not ( output_data_path / "data_local_questions" / "candidate_questions.json" @@ -134,8 +145,11 @@ async def __generate_data_global( local_questions=local_questions, concurrent_coroutines=concurrent_requests, random_seed=random_seed, + llm_params=llm_params, generation_system_prompt=config.data_global_gen_system_prompt.template, generation_user_prompt=config.data_global_gen_user_prompt.template, + assertion_config=assertion_config, + assertion_prompt_config=assertion_prompt_config, ) data_global_question_results = await data_global_generator.agenerate( @@ -173,6 +187,7 @@ async def __generate_activity_context( oversample_factor: float, concurrent_requests: int, config: ActivityContextPromptConfig, + llm_params: dict[str, Any], use_representative_samples_only: bool = True, skip_warning: bool = False, ) -> None: @@ -195,6 +210,7 @@ async def __generate_activity_context( token_encoder=token_encoder, text_units=sample_texts, concurrent_coroutines=concurrent_requests, + llm_params=llm_params, activity_identification_prompt=config.activity_identification_prompt.template, map_system_prompt=config.data_summary_prompt_config.summary_map_system_prompt.template, map_user_prompt=config.data_summary_prompt_config.summary_map_user_prompt.template, @@ -222,6 +238,7 @@ async def __generate_activity_local( random_seed: int, concurrent_requests: int, config: ActivityLocalPromptConfig, + llm_params: dict[str, Any], ) -> None: activity_context = ActivityContext( **json.loads( @@ -236,6 +253,7 @@ async def __generate_activity_local( activity_context=activity_context, concurrent_coroutines=concurrent_requests, random_seed=random_seed, + llm_params=llm_params, generation_system_prompt=config.activity_local_gen_system_prompt.template, 
generation_user_prompt=config.activity_local_gen_user_prompt.template, ) @@ -273,6 +291,7 @@ async def __generate_activity_global( random_seed: int, concurrent_requests: int, config: ActivityGlobalPromptConfig, + llm_params: dict[str, Any], ) -> None: activity_context = ActivityContext( **json.loads( @@ -287,6 +306,7 @@ async def __generate_activity_global( activity_context=activity_context, concurrent_coroutines=concurrent_requests, random_seed=random_seed, + llm_params=llm_params, generation_system_prompt=config.activity_global_gen_system_prompt.template, generation_user_prompt=config.activity_global_gen_user_prompt.template, ) @@ -398,6 +418,28 @@ def autoq( token_encoder = tiktoken.get_encoding(config.encoding.model_name) loop = asyncio.get_event_loop() + # Log assertion generation status + local_assertions_enabled = ( + config.assertions.local.max_assertions is None + or config.assertions.local.max_assertions > 0 + ) + global_assertions_enabled = ( + config.assertions.global_.max_assertions is None + or config.assertions.global_.max_assertions > 0 + ) + if local_assertions_enabled or global_assertions_enabled: + if ( + config.assertions.local.enable_validation + or config.assertions.global_.enable_validation + ): + rich_print( + f"Assertion generation enabled with validation (local min score: {config.assertions.local.min_validation_score}/5, global min score: {config.assertions.global_.min_validation_score}/5)" + ) + else: + rich_print("Assertion generation enabled (validation disabled)") + else: + rich_print("Assertion generation disabled") + rich_print("Creating clustered sample from the input data...") loop.run_until_complete( __create_clustered_sample( @@ -440,6 +482,7 @@ def autoq( oversample_factor=activity_config.oversample_factor, concurrent_requests=config.concurrent_requests, config=config.activity_questions_prompt_config.activity_context_prompt_config, + llm_params=config.chat_model.call_args, skip_warning=not first_activity, ) ) @@ -456,6 +499,7 @@ def autoq( config=config.activity_questions_prompt_config.activity_local_prompt_config if generation_type == GenerationType.activity_local else config.activity_questions_prompt_config.activity_global_prompt_config, + llm_params=config.chat_model.call_args, ) ) first_activity = False @@ -478,6 +522,9 @@ def autoq( config=config.data_questions_prompt_config.data_local_prompt_config if generation_type == GenerationType.data_local else config.data_questions_prompt_config.data_global_prompt_config, + assertion_config=config.assertions, + assertion_prompt_config=config.assertion_prompts, + llm_params=config.chat_model.call_args, ) ) diff --git a/benchmark_qed/autoq/config.py b/benchmark_qed/autoq/config.py index 49b9c72..e5d35ed 100644 --- a/benchmark_qed/autoq/config.py +++ b/benchmark_qed/autoq/config.py @@ -2,8 +2,9 @@ """Configuration for the autoq question generation process.""" from pathlib import Path +from typing import ClassVar -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field from benchmark_qed.autod import prompts as autod_prompts from benchmark_qed.autod.io.enums import InputDataType @@ -23,6 +24,7 @@ from benchmark_qed.autoq.prompts.data_questions import ( local_questions as data_local_prompts, ) +from benchmark_qed.config import defaults as defs from benchmark_qed.config.llm_config import LLMConfig from benchmark_qed.config.prompt_config import PromptConfig @@ -35,6 +37,7 @@ AUTOQ_DATA_PROMPTS_PATH = Path(autoq_data_prompts.__file__).parent AUTOQ_DATA_GLOBAL_PROMPTS_PATH = 
Path(data_global_prompts.__file__).parent AUTOQ_DATA_LOCAL_PROMPTS_PATH = Path(data_local_prompts.__file__).parent +AUTOQ_ASSERTIONS_PROMPTS_PATH = AUTOQ_DATA_PROMPTS_PATH / "assertions" class InputConfig(BaseModel): @@ -49,13 +52,13 @@ class InputConfig(BaseModel): default=InputDataType.CSV, description="The type of the input data." ) text_column: str = Field( - default="text", description="The column containing the text data." + default=defs.TEXT_COLUMN, description="The column containing the text data." ) metadata_columns: list[str] | None = Field( default=None, description="The columns containing metadata information." ) file_encoding: str = Field( - default="utf-8", description="The encoding of the input files." + default=defs.FILE_ENCODING, description="The encoding of the input files." ) @@ -63,28 +66,140 @@ class QuestionConfig(BaseModel): """Configuration for the question generation process.""" num_questions: int = Field( - default=20, + default=defs.NUM_QUESTIONS, description="Number of questions to generate for each question class.", ) oversample_factor: float = Field( - default=2.0, + default=defs.OVERSAMPLE_FACTOR, description="Factor by which to overgenerate candidate questions before filtering.", ) +class LocalAssertionConfig(BaseModel): + """Configuration for local assertion generation.""" + + max_assertions: int | None = Field( + default=defs.MAX_ASSERTIONS, + description="Maximum number of assertions per question. Set to 0 to disable, or None for unlimited.", + ) + enable_validation: bool = Field( + default=defs.ENABLE_ASSERTION_VALIDATION, + description="Whether to validate assertions against sources for quality filtering.", + ) + min_validation_score: int = Field( + default=defs.MIN_ASSERTION_VALIDATION_SCORE, + description="Minimum score (1-5) for grounding, relevance, and verifiability criteria.", + ) + concurrent_llm_calls: int = Field( + default=defs.ASSERTION_CONCURRENT_LLM_CALLS, + description="Number of concurrent LLM calls for validation.", + ) + max_concurrent_questions: int | None = Field( + default=defs.ASSERTION_MAX_CONCURRENT_LOCAL_QUESTIONS, + description="Maximum questions to process in parallel. Set to 1 for sequential.", + ) + + +class GlobalAssertionConfig(BaseModel): + """Configuration for global assertion generation.""" + + max_assertions: int | None = Field( + default=defs.MAX_ASSERTIONS, + description="Maximum number of assertions per question. Set to 0 to disable, or None for unlimited.", + ) + enable_validation: bool = Field( + default=defs.ENABLE_ASSERTION_VALIDATION, + description="Whether to validate assertions against sources for quality filtering.", + ) + min_validation_score: int = Field( + default=defs.MIN_ASSERTION_VALIDATION_SCORE, + description="Minimum score (1-5) for grounding, relevance, and verifiability criteria.", + ) + batch_size: int = Field( + default=defs.ASSERTION_BATCH_SIZE, + description="Batch size for processing claims in map-reduce assertion generation.", + ) + max_data_tokens: int = Field( + default=defs.ASSERTION_MAX_DATA_TOKENS, + description="Maximum input data tokens for the reduce step.", + ) + concurrent_llm_calls: int = Field( + default=defs.ASSERTION_CONCURRENT_LLM_CALLS, + description="Number of concurrent LLM calls for batch processing and validation.", + ) + max_concurrent_questions: int | None = Field( + default=defs.ASSERTION_MAX_CONCURRENT_GLOBAL_QUESTIONS, + description="Maximum questions to process in parallel. 
Set to 1 for sequential.", + ) + + +class AssertionConfig(BaseModel): + """Configuration for assertion generation (local and global).""" + + local: LocalAssertionConfig = Field( + default_factory=LocalAssertionConfig, + description="Configuration for local assertion generation.", + ) + global_: GlobalAssertionConfig = Field( + default_factory=GlobalAssertionConfig, + alias="global", + description="Configuration for global assertion generation.", + ) + + model_config: ClassVar[ConfigDict] = {"populate_by_name": True} + + +class AssertionPromptConfig(BaseModel): + """Configuration for assertion generation prompts.""" + + local_assertion_gen_prompt: PromptConfig = Field( + default=PromptConfig( + prompt=AUTOQ_ASSERTIONS_PROMPTS_PATH + / "local_claim_assertion_gen_prompt.txt" + ), + description="Prompt for generating local assertions from claims.", + ) + global_assertion_map_prompt: PromptConfig = Field( + default=PromptConfig( + prompt=AUTOQ_ASSERTIONS_PROMPTS_PATH + / "global_claim_assertion_map_prompt.txt" + ), + description="Prompt for the map step in global assertion generation.", + ) + global_assertion_reduce_prompt: PromptConfig = Field( + default=PromptConfig( + prompt=AUTOQ_ASSERTIONS_PROMPTS_PATH + / "global_claim_assertion_reduce_prompt.txt" + ), + description="Prompt for the reduce step in global assertion generation.", + ) + local_validation_prompt: PromptConfig = Field( + default=PromptConfig( + prompt=AUTOQ_ASSERTIONS_PROMPTS_PATH / "local_validation_prompt.txt" + ), + description="Prompt for validating local assertions (fact-focused) against sources.", + ) + global_validation_prompt: PromptConfig = Field( + default=PromptConfig( + prompt=AUTOQ_ASSERTIONS_PROMPTS_PATH / "global_validation_prompt.txt" + ), + description="Prompt for validating global assertions (theme-focused) against sources.", + ) + + class EncodingModelConfig(BaseModel): """Configuration for the encoding model used in question generation.""" model_name: str = Field( - default="o200k_base", + default=defs.ENCODING_MODEL, description="Name of the encoding model to use for chunking documents.", ) chunk_size: int = Field( - default=600, + default=defs.CHUNK_SIZE, description="Size of each text chunk to be processed by the encoding model.", ) chunk_overlap: int = Field( - default=100, + default=defs.CHUNK_OVERLAP, description="Overlap size between consecutive text chunks.", ) @@ -93,15 +208,15 @@ class SamplingConfig(BaseModel): """Configuration for data sampling in question generation.""" num_clusters: int = Field( - default=50, + default=defs.NUM_CLUSTERS, description="Number of clusters to sample from the dataset.", ) num_samples_per_cluster: int = Field( - default=10, + default=defs.NUM_SAMPLES_PER_CLUSTER, description="Number of samples to take from each cluster.", ) random_seed: int = Field( - default=42, + default=defs.RANDOM_SEED, description="Random seed for reproducibility of sampling.", ) @@ -110,15 +225,15 @@ class ActivityQuestionConfig(QuestionConfig): """Configuration for generating activity questions.""" num_personas: int = Field( - default=5, + default=defs.NUM_PERSONAS, description="Number of personas to generate questions for.", ) num_tasks_per_persona: int = Field( - default=5, + default=defs.NUM_TASKS_PER_PERSONA, description="Number of tasks to generate for each persona.", ) num_entities_per_task: int = Field( - default=10, + default=defs.NUM_ENTITIES_PER_TASK, description="Number of entities to include in each task.", ) @@ -346,7 +461,7 @@ class QuestionGenerationConfig(BaseModel): ) 
concurrent_requests: int = Field( - default=8, + default=defs.CONCURRENT_REQUESTS, description="Control for request concurrency. Adjust this based on your model capacity.", ) @@ -379,3 +494,13 @@ class QuestionGenerationConfig(BaseModel): default_factory=DataQuestionsPromptConfig, description="Configuration for data-related prompts.", ) + + assertions: AssertionConfig = Field( + default_factory=AssertionConfig, + description="Configuration for assertion generation.", + ) + + assertion_prompts: AssertionPromptConfig = Field( + default_factory=AssertionPromptConfig, + description="Configuration for assertion generation prompts.", + ) diff --git a/benchmark_qed/autoq/data_model/__init__.py b/benchmark_qed/autoq/data_model/__init__.py index 5bb7e28..5801dc7 100644 --- a/benchmark_qed/autoq/data_model/__init__.py +++ b/benchmark_qed/autoq/data_model/__init__.py @@ -1,2 +1,10 @@ # Copyright (c) 2025 Microsoft Corporation. """Define data models used in AutoQ.""" + +from benchmark_qed.autoq.data_model.enums import QuestionType +from benchmark_qed.autoq.data_model.question import Question + +__all__ = [ + "Question", + "QuestionType", +] diff --git a/benchmark_qed/autoq/io/question.py b/benchmark_qed/autoq/io/question.py index a7047bb..6d4db27 100644 --- a/benchmark_qed/autoq/io/question.py +++ b/benchmark_qed/autoq/io/question.py @@ -2,12 +2,106 @@ """Util functions to save/load questions.""" import json +import logging from dataclasses import asdict from pathlib import Path +from typing import Any from uuid import uuid4 from benchmark_qed.autoq.data_model.question import Question +log: logging.Logger = logging.getLogger(__name__) + + +def _normalize_assertion(assertion: dict[str, Any] | str) -> dict[str, Any]: + """ + Normalize an assertion to a dictionary with consistent structure. + + Assertions can be: + - dict: Standard format with statement, sources, score, etc. 
+ - str: Legacy format (plain text assertion statement) + + Args: + assertion: An assertion dictionary or string + + Returns + ------- + A normalized dictionary with statement, sources, score, reasoning, attributes keys + + """ + if isinstance(assertion, str): + # Legacy format: plain string assertion + return { + "statement": assertion, + "sources": [], + "score": 0, + "reasoning": "", + "attributes": {}, + } + # Standard dict format + return { + "statement": assertion.get("statement", ""), + "sources": assertion.get("sources") or [], + "score": assertion.get("score", 0), + "reasoning": assertion.get("reasoning", ""), + "attributes": assertion.get("attributes") or {}, + } + + +def _save_assertions(questions: list[Question], output_path: Path) -> None: + """Extract and save assertions from questions to a separate JSON file with ranks.""" + questions_with_assertions = [] + + for question in questions: + # Check if question has assertions in its attributes + if not question.attributes: + continue + + assertions = question.attributes.get("assertions", []) + if not assertions: + continue + + # Normalize all assertions to dictionaries + assertion_dicts = [_normalize_assertion(a) for a in assertions] + + # Sort assertions by score (descending) then by source count (descending) to determine ranks + sorted_assertions = sorted( + assertion_dicts, + key=lambda a: (-a.get("score", 0), -len(a.get("sources", []))), + ) + + # Add rank information (rank 1 = highest importance) + ranked_assertions = [] + for rank, assertion in enumerate(sorted_assertions, 1): + sources = assertion.get("sources") or [] + ranked_assertion = { + "statement": assertion["statement"], + "source_count": len(sources), + "score": assertion.get("score", 0), + "reasoning": assertion.get("reasoning", ""), + "rank": rank, + } + + # Include validation scores if available + attributes = assertion.get("attributes") or {} + if attributes and "validation" in attributes: + ranked_assertion["validation"] = attributes["validation"] + + ranked_assertions.append(ranked_assertion) + + questions_with_assertions.append({ + "question_id": question.id, + "question_text": question.text, + "assertions": ranked_assertions, + }) + + # Save assertions to file as a direct list + if questions_with_assertions: + assertions_file = output_path / "assertions.json" + Path(assertions_file).write_text( + json.dumps(questions_with_assertions, indent=4) + ) + def load_questions(file_path: str, question_text_only: bool = False) -> list[Question]: """Read question list from a json file.""" @@ -29,6 +123,7 @@ def save_questions( output_name: str, question_text_only: bool = False, include_embedding: bool = False, + save_assertions: bool = True, ) -> None: """Save question list to a json file.""" if question_text_only: @@ -45,3 +140,7 @@ def save_questions( output_file = output_path_obj / f"{output_name}.json" Path(output_file).write_text(json.dumps(question_list, indent=4)) + + # Save assertions separately if requested + if save_assertions: + _save_assertions(questions, output_path_obj) diff --git a/benchmark_qed/autoq/prompts/activity_questions/global_questions/activity_global_gen_system_prompt.txt b/benchmark_qed/autoq/prompts/activity_questions/global_questions/activity_global_gen_system_prompt.txt index 2117790..0d61dcd 100644 --- a/benchmark_qed/autoq/prompts/activity_questions/global_questions/activity_global_gen_system_prompt.txt +++ b/benchmark_qed/autoq/prompts/activity_questions/global_questions/activity_global_gen_system_prompt.txt @@ -1,6 +1,10 @@ 
---ROLE--- You are a helpful assistant tasked with identifying activity-related global questions targeting a dataset of interest. +---DEFINITION OF GLOBAL QUESTIONS--- +Global questions are sensemaking queries that require holistic understanding of the entire dataset. They focus on themes, patterns, and connections (among people, places, and events) in order to understand trajectories and implications. +Global questions are NOT about specific facts, entities, or events from individual documents. + ---INPUT--- You will be provided with the following information: @@ -12,17 +16,25 @@ You will be provided with the following information: Your goal is to generate a list of global questions based on the descriptions of the dataset, the persona and their task. ---INSTRUCTIONS--- -Each generated question should: +Each generated question MUST: - begin with "Across the dataset, ..."; - require an understanding of the dataset as a whole; - assume the person asking the question only has a general sense of the dataset as context; - be specific to the natures of the dataset, persona, and task; -- be general, abstract, and concise, but not overly broad. For example, a question like "Across the dataset, what are the most important factors?" is too broad and should be more specific to the given context. -- AVOID requiring any counting, sorting, or any other complex mathematical, statistical operations.For example, avoid questions like "Across the dataset, what is the frequency of occurrences of X?" as it requires counting. +- be ABSTRACT, SHORT, SIMPLE, and NATURAL-SOUNDING - like a question a curious human would ask: + - aim for 10-20 words after "Across the dataset," + - ask only one thing at a time + - use simple, direct phrasing + - AVOID jargon, excessive qualifiers, and academic language + - BAD (too long, too many clauses): "Across the dataset, how do shifts in regulatory guidance reshape the practical boundaries of emergency pregnancy care under overlapping state restrictions and federal obligations?" + - GOOD (simple, direct): "Across the dataset, how are changes in abortion regulations affecting access to emergency care?" + - BAD (keyword-heavy): "Across the dataset, what themes emerge in health education policy debates involving parents, educators, school boards, and advocacy groups over curriculum mandates, sex education, and mental health programs?" + - GOOD (abstracted): "Across the dataset, what themes emerge in health education policy debates?" + - BAD (over-specified): "Across the dataset, how are homelessness policy approaches portrayed as shaping access to mental health services across institutional and community settings?" + - GOOD (simple): "Across the dataset, how are homelessness policies linked to mental health service access?" +- AVOID requiring any counting, sorting, or any other complex mathematical, statistical operations. For example, avoid questions like "Across the dataset, what is the frequency of occurrences of X?" as it requires counting. - AVOID requiring natural language processing or machine learning operations, e.g., sentiment analysis, keyword counting. - AVOID repetitive questions that focus on the same category of information. - For example, the two questions below are too similar and only one should be included in the output: - "Across the dataset, explain common legislative actions and their typical societal implications." and "Across the dataset, explain important legislative actions and their societal implications." 
---OUTPUT--- Output questions as a JSON-formatted object with the following structure: diff --git a/benchmark_qed/autoq/prompts/activity_questions/local_questions/activity_local_gen_system_prompt.txt b/benchmark_qed/autoq/prompts/activity_questions/local_questions/activity_local_gen_system_prompt.txt index ab2e5d1..e3795b8 100644 --- a/benchmark_qed/autoq/prompts/activity_questions/local_questions/activity_local_gen_system_prompt.txt +++ b/benchmark_qed/autoq/prompts/activity_questions/local_questions/activity_local_gen_system_prompt.txt @@ -12,13 +12,13 @@ You will be provided with the following information: Generate a set of questions that are relevant to the persona's task and the provided entities. ---INSTRUCTIONS--- -Each question should: +Each question MUST: - make a clear reference to the selected entities and the persona's target task; - be relevant to the persona's task and can be entirely and accurately answered based on the selected entity descriptions, rather than requiring external knowledge; +- be SHORT, CONCISE, and NATURAL-SOUNDING - avoid convoluted phrasing, long qualifying clauses, excessive parentheticals, and over-specification; +- ask only one thing at a time rather than having multiple sub-questions. For example, this question is BAD because it has multiple sub-questions: "What recent tragic incidents related to the fentanyl crisis have occurred in Washington and Oregon, and what measures are officials taking in response to these events?"; - avoid telegraphing the answer to the question. For example, consider this question: "Given the recent incidents in Zanzibar, what are the health implications of consuming sea turtle meat that have resulted in multiple deaths and hospitalizations?". This is a BAD output question because it telegraphs the answer by mentioning the deaths and hospitalizations as a result of consuming sea turtle meat; - stand alone in the absence of the entities and other questions; -- be short and concise; -- ask only one thing at a time rather than having multiple sub-questions. For example, this question is BAD because it has multiple sub-questions: ""What recent tragic incidents related to the fentanyl crisis have occurred in Washington and Oregon, and what measures are officials taking in response to these events?"; - Fully describe all references to people, places, or things. Such references should only be included if including this information materially affects the answer. Ensure that ALL references are clear and unambiguous, e.g., don't refer to "the ", but refer to named or described entities in ways that distinguish them from all other . Do this for ALL references, and ensure that all references are absolute (e.g., summer 2024) rather than relative (e.g., this summer). diff --git a/benchmark_qed/autoq/prompts/data_questions/assertions/__init__.py b/benchmark_qed/autoq/prompts/data_questions/assertions/__init__.py new file mode 100644 index 0000000..73953f2 --- /dev/null +++ b/benchmark_qed/autoq/prompts/data_questions/assertions/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 Microsoft Corporation. 
+"""Assertion generation prompt templates.""" diff --git a/benchmark_qed/autoq/prompts/data_questions/assertions/global_claim_assertion_map_prompt.txt b/benchmark_qed/autoq/prompts/data_questions/assertions/global_claim_assertion_map_prompt.txt new file mode 100644 index 0000000..17e4d8a --- /dev/null +++ b/benchmark_qed/autoq/prompts/data_questions/assertions/global_claim_assertion_map_prompt.txt @@ -0,0 +1,108 @@ +---ROLE--- +You are a helpful assistant that generates assertions for evaluating answer accuracy in global question-answering systems. + + +---GOAL--- +Given a global query and a list of relevant claims, generate assertions that test whether important aspects from the claims are covered in an answer to the query. + +---INSTRUCTIONS--- +Each assertion should be a clear, testable statement that adheres to the following rules: + +1. **Claim-Based Relevance**: Focus on the key topics, themes, and insights present in the provided claims that are relevant to the global query. + +2. **Clear Testability**: Can be verified with a simple YES/NO criteria (either the aspect is addressed in an answer or it is not). + +3. **Appropriate Scope**: Cover the important aspects from the claims without being overly specific about details like exact numbers, dates, locations, or individual entities unless they are central to the claim's meaning. + +4. **Comprehensive Coverage**: Generate assertions that collectively test coverage of the main topics and insights present in the claims. + +5. **Conciseness**: Keep assertions brief and focused. Aim for under 50 words. Focus on the core theme rather than listing multiple examples or detailed specifications. + +6. **Atomic Structure - NO COMPOUND STATEMENTS**: Each assertion must test exactly ONE concept. Never use compound statements that combine multiple facts or examples using words like "including", "such as", "and", or other similar connectors. + - If you find yourself wanting to list examples, create separate assertions instead + - Each assertion should pass this test: "Can this be verified with a single YES/NO answer?" + +7. **Avoid Detailed Lists**: Never enumerate specific subtypes, categories, or examples within assertions. Use broad thematic terms instead of detailed breakdowns. 
+ + +---EXAMPLES--- +**GOOD (Concise)**: +- "The response should address regulatory approaches to environmental protection" +- "The response should discuss implementation challenges in healthcare policy" +- "The response should address stakeholder perspectives on the policy changes" +- "The response should discuss economic impacts of the proposed measures" + +**AVOID (Too Long or Too Specific)**: +- "The response should address regulatory approaches to environmental protection, including water quality standards, hazardous substance containment, air pollution controls, and industrial oversight" +- "The response should address major infectious disease challenges, including outbreaks of respiratory, waterborne, foodborne, and vaccine-preventable diseases, and the public health measures taken to control them" +- "The response should address societal and legal challenges related to reproductive rights, including abortion access, restrictions, and policy responses across regions" +- "The response should discuss the impact of research challenges, including early setbacks and difficulties in demonstrating effectiveness, on the development of medical treatments" +- "The response should address safety measures such as recalls, monitoring, and enforcement actions" +- "The response should mention the exact EPA standard of 70 parts per trillion" +- "The response should discuss the specific meeting on March 15th, 2023" + +**HOW TO FIX COMPOUND STATEMENTS**: +Instead of: "The response should address societal and legal challenges related to reproductive rights, including abortion access, restrictions, and policy responses across regions" +Use separate assertions: +- "The response should address societal challenges related to reproductive rights" +- "The response should address legal challenges related to reproductive rights" +- "The response should discuss abortion access restrictions across regions" +- "The response should address policy responses to reproductive rights issues" + + +---OUTPUT--- +Each assertion in the response should contain the following elements: +- "statement": A clear assertion that begins with "The response should address" or "The response should discuss" followed by a topic or theme from the claims. +- "sources": A list of all source claim IDs that support this assertion (exact string values taken from the ID column in the input claims). +- "score": An integer score between 1-10 indicating how important this aspect is for answering the global query (10 = essential, 1 = least important). +- "reasoning": A brief explanation (1-2 sentences) of why this assertion is relevant to the query and why you assigned this importance score. + +The response should be JSON formatted as follows: +{ + "assertions": [ + {"statement": "The response should address...", "sources": [list of supporting claim IDs], "score": importance score (1-10), "reasoning": "This assertion is relevant because... The score reflects..."}, + {"statement": "The response should discuss...", "sources": [list of supporting claim IDs], "score": importance score (1-10), "reasoning": "This assertion is relevant because... The score reflects..."}, + ] +} + + +---QUERY--- +${query} + + +---INPUT CLAIMS--- +${context_data} + + +---INSTRUCTIONS--- +Each assertion should be a clear, testable statement that adheres to the following rules: + +1. **Claim-Based Relevance**: Focus on the key topics, themes, and insights present in the provided claims that are relevant to the global query. + +2. 
**Clear Testability**: Can be verified with a simple YES/NO criteria (either the aspect is addressed in an answer or it is not). + +3. **Appropriate Scope**: Cover the important aspects from the claims without being overly specific about details like exact numbers, dates, locations, or individual entities unless they are central to the claim's meaning. + +4. **Comprehensive Coverage**: Generate assertions that collectively test coverage of the main topics and insights present in the claims. + +5. **Conciseness**: Keep assertions brief and focused. Aim for under 50 words. Focus on the core theme rather than listing multiple examples or detailed specifications. + +6. **Atomic Structure - NO COMPOUND STATEMENTS**: Each assertion must test exactly ONE concept. Never use compound statements that combine multiple facts or examples using words like "including", "such as", "and", or other similar connectors. + - If you find yourself wanting to list examples, create separate assertions instead + - Each assertion should pass this test: "Can this be verified with a single YES/NO answer?" + +7. **Avoid Detailed Lists**: Never enumerate specific subtypes, categories, or examples within assertions. Use broad thematic terms instead of detailed breakdowns. + +Each assertion in the response should contain the following elements: +- "statement": A clear assertion that begins with "The response should address" or "The response should discuss" followed by a topic or theme from the claims. +- "sources": A list of all source claim IDs that support this assertion (exact string values taken from the ID column in the input claims) +- "score": An integer score between 1-10 indicating how important this aspect is for answering the global query (10 = essential, 1 = least important). +- "reasoning": A brief explanation (1-2 sentences) of why this assertion is relevant to the query and why you assigned this importance score. + +The response should be JSON formatted as follows: +{ + "assertions": [ + {"statement": "The response should address...", "sources": [list of supporting claim IDs], "score": importance score (1-10), "reasoning": "This assertion is relevant because... The score reflects..."}, + {"statement": "The response should discuss...", "sources": [list of supporting claim IDs], "score": importance score (1-10), "reasoning": "This assertion is relevant because... The score reflects..."}, + ] +} \ No newline at end of file diff --git a/benchmark_qed/autoq/prompts/data_questions/assertions/global_claim_assertion_reduce_prompt.txt b/benchmark_qed/autoq/prompts/data_questions/assertions/global_claim_assertion_reduce_prompt.txt new file mode 100644 index 0000000..c127658 --- /dev/null +++ b/benchmark_qed/autoq/prompts/data_questions/assertions/global_claim_assertion_reduce_prompt.txt @@ -0,0 +1,128 @@ +---ROLE--- +You are a helpful assistant that consolidates multiple assertions into fewer, higher-level thematic assertions for global question-answering evaluation. + + +---GOAL--- +Given a global query and a list of initial assertions generated from claims, consolidate them into a smaller set of high-level thematic assertions that comprehensively cover the same topics while reducing redundancy and creating broader thematic categories. + +---INSTRUCTIONS--- +Each consolidated assertion should be a high-level, thematic statement that adheres to the following rules: + +1. **Thematic Grouping**: Group related assertions by theme, topic, or conceptual area to create broader thematic categories. + +2. 
**Comprehensive Coverage**: Ensure that all important aspects from the initial assertions are preserved in the consolidated set - no critical topics should be lost.
+
+3. **Higher-Level Abstraction**: Create assertions that are more abstract and comprehensive than the originals, focusing on major thematic dimensions rather than specific details.
+
+4. **Reduced Redundancy**: Merge overlapping or similar assertions to eliminate duplication while maintaining complete coverage.
+
+5. **Clear Testability**: Each consolidated assertion should be verifiable with a simple YES/NO criteria.
+
+6. **Atomicity**: Each consolidated assertion should test a single, coherent thematic concept. Avoid combining multiple distinct evaluation criteria that should be tested separately.
+
+7. **NO COMPOUND STATEMENTS**: Each assertion must test exactly ONE thematic concept. Never use compound statements that combine multiple themes or examples using words like "including", "such as", "and", or other similar connectors.
+    - If you find yourself wanting to list examples, use broader categorical terms instead
+    - Each assertion should pass this test: "Can this thematic area be verified with a single YES/NO answer?"
+
+8. **Conciseness**: Keep assertions brief and focused. Aim for under 50 words. Avoid long lists of examples or detailed enumerations within the assertion statement.
+
+9. **Avoid Detailed Lists**: Never enumerate specific subtypes, categories, or examples within assertions. Use broad thematic terms instead of detailed breakdowns.
+
+10. **Use Categorical Terms**: When multiple related items would be listed (e.g., 'food and product safety'), use a broader categorical term instead (e.g., 'consumer safety').
+
+11. **Quality Focus**: Create comprehensive, high-quality consolidated assertions that provide thorough thematic coverage. Focus on quality and comprehensive thematic representation rather than quantity.
+
+
+---EXAMPLES---
+**Original Assertions** → **Consolidated Assertion**:
+- "Address economic impacts on businesses" + "Discuss financial consequences for industry" → "The response should address economic and financial impacts across affected sectors"
+- "Discuss environmental regulations in California" + "Address environmental policy in Texas" + "Discuss environmental rules in Florida" → "The response should address environmental regulatory approaches across different states"
+
+**GOOD (Concise)**:
+- "The response should address regulatory approaches to environmental protection"
+- "The response should discuss economic impacts of policy changes"
+- "The response should address stakeholder perspectives on the measures"
+- "The response should discuss major infectious disease challenges and public health responses"
+- "The response should address consumer safety as public health challenges"
+
+
+**AVOID (Too Long)**:
+- "The response should address regulatory approaches to environmental protection, including water quality standards, air pollution controls, hazardous waste management, and oversight of industrial facilities"
+- "The response should discuss the various economic impacts of policy changes on businesses, industries, workers, consumers, and local communities"
+- "The response should address major infectious disease challenges, including outbreaks of respiratory, waterborne, foodborne, and vaccine-preventable diseases, and the public health measures taken to control them"
+- "The response should discuss research challenges, including early setbacks and difficulties in demonstrating effectiveness, in medical development"
+- "The response should address food and product safety as public health challenges and discuss related interventions"
+
+**HOW TO FIX COMPOUND STATEMENTS**:
+Instead of: "The response should discuss research challenges, including early setbacks and difficulties in demonstrating effectiveness"
+Use: "The response should address research challenges in medical treatment development"
+
+Instead of: "The response should address societal and legal challenges related to reproductive rights, including abortion access, restrictions, and policy responses across regions"
+Use: "The response should address challenges related to reproductive rights"
+
+Instead of: "The response should address safety measures, including recalls and enforcement actions"
+Use: "The response should address regulatory safety measures"
+
+
+---OUTPUT---
+Each consolidated assertion should contain the following elements:
+- "statement": A high-level thematic assertion that begins with "The response should address" or "The response should discuss" followed by a broad thematic area.
+- "sources": A list of the original assertion IDs that were consolidated into this thematic assertion.
+- "score": An integer score between 1-10 indicating the importance of this thematic area (typically high since these represent consolidated important themes).
+- "reasoning": A brief explanation (1-2 sentences) of why this thematic assertion is important for the query and why you assigned this importance score.
+
+The response should be JSON formatted as follows:
+{
+    "assertions": [
+        {"statement": "The response should address...", "sources": [list of original assertion IDs], "score": importance score (1-10), "reasoning": "This thematic assertion is important because... 
The score reflects..."},
+        {"statement": "The response should discuss...", "sources": [list of original assertion IDs], "score": importance score (1-10), "reasoning": "This thematic assertion is important because... The score reflects..."}
+    ]
+}
+
+
+---QUERY---
+${question_text}
+
+
+---INITIAL ASSERTIONS---
+${assertions_context}
+
+
+---INSTRUCTIONS---
+Each consolidated assertion should be a high-level, thematic statement that adheres to the following rules:
+
+1. **Thematic Grouping**: Group related assertions by theme, topic, or conceptual area to create broader thematic categories.
+
+2. **Comprehensive Coverage**: Ensure that all important aspects from the initial assertions are preserved in the consolidated set - no critical topics should be lost.
+
+3. **Higher-Level Abstraction**: Create assertions that are more abstract and comprehensive than the originals, focusing on major thematic dimensions rather than specific details.
+
+4. **Reduced Redundancy**: Merge overlapping or similar assertions to eliminate duplication while maintaining complete coverage.
+
+5. **Clear Testability**: Each consolidated assertion should be verifiable with a simple YES/NO criteria.
+
+6. **Atomicity**: Each consolidated assertion should test a single, coherent thematic concept. Avoid combining multiple distinct evaluation criteria that should be tested separately.
+
+7. **NO COMPOUND STATEMENTS**: Each assertion must test exactly ONE thematic concept. Never use compound statements that combine multiple themes or examples using words like "including", "such as", "and", or other similar connectors.
+    - If you find yourself wanting to list examples, use broader categorical terms instead
+    - Each assertion should pass this test: "Can this thematic area be verified with a single YES/NO answer?"
+
+8. **Conciseness**: Keep assertions brief and focused. Aim for under 50 words. Avoid long lists of examples or detailed enumerations within the assertion statement.
+
+9. **Avoid Detailed Lists**: Never enumerate specific subtypes, categories, or examples within assertions. Use broad thematic terms instead of detailed breakdowns.
+
+10. **Use Categorical Terms**: When multiple related items would be listed (e.g., 'food and product safety'), use a broader categorical term instead (e.g., 'consumer safety').
+
+Each consolidated assertion should contain the following elements:
+- "statement": A high-level thematic assertion that begins with "The response should address" or "The response should discuss" followed by a broad thematic area.
+- "sources": A list of the original assertion IDs that were consolidated into this thematic assertion.
+- "score": An integer score between 1-10 indicating the importance of this thematic area (typically high since these represent consolidated important themes).
+- "reasoning": A brief explanation (1-2 sentences) of why this thematic assertion is important for the query and why you assigned this importance score.
+
+The response should be JSON formatted as follows:
+{
+    "assertions": [
+        {"statement": "The response should address...", "sources": ["original assertion 1", "original assertion 2", ...], "score": importance score (1-10), "reasoning": "This thematic assertion is important because... The score reflects..."},
+        {"statement": "The response should discuss...", "sources": ["original assertion 1", "original assertion 2", ...], "score": importance score (1-10), "reasoning": "This thematic assertion is important because... 
The score reflects..."} + ] +} diff --git a/benchmark_qed/autoq/prompts/data_questions/assertions/global_max_assertion_instruction_prompt.txt b/benchmark_qed/autoq/prompts/data_questions/assertions/global_max_assertion_instruction_prompt.txt new file mode 100644 index 0000000..b9af246 --- /dev/null +++ b/benchmark_qed/autoq/prompts/data_questions/assertions/global_max_assertion_instruction_prompt.txt @@ -0,0 +1,2 @@ +---MAX NUMBER OF OUTPUT ASSERTIONS--- +Consolidate into up to ${max_assertions} high-quality thematic assertions. Use fewer if that provides better thematic coherence and avoids forcing unnatural groupings. diff --git a/benchmark_qed/autoq/prompts/data_questions/assertions/global_validation_prompt.txt b/benchmark_qed/autoq/prompts/data_questions/assertions/global_validation_prompt.txt new file mode 100644 index 0000000..046995f --- /dev/null +++ b/benchmark_qed/autoq/prompts/data_questions/assertions/global_validation_prompt.txt @@ -0,0 +1,53 @@ +---ROLE--- +You are an expert evaluator assessing thematic assertions for grounding, relevance, and verifiability. + + +---GOAL--- +Rate the given assertion on three criteria using a 1-5 scale. This assertion was generated for a GLOBAL question (broad, thematic query requiring synthesis across multiple sources to identify patterns, trends, or overarching themes). + + +---QUESTION BEING EVALUATED--- +${question} + + +---ASSERTION TO EVALUATE--- +${assertion} + + +---SOURCE TEXTS--- +${sources} + + +---CRITERIA--- +Rate each criterion from 1 (poor) to 5 (excellent): + +**GROUNDING**: Is the thematic assertion supported by the source texts? +- 1: Theme not present in sources (hallucination) +- 2: Mostly unsupported, significant overreach +- 3: Partially supported, reasonable synthesis but some gaps +- 4: Well supported, theme clearly emerges from sources +- 5: Theme is strongly and consistently present across sources + +**RELEVANCE**: Is the assertion useful for evaluating answers to the global question? +- 1: Off-topic or tests irrelevant themes +- 2: Tangentially related, tests minor aspects +- 3: Somewhat useful, but not a core theme +- 4: Tests an important thematic dimension of the question +- 5: Tests an essential theme a comprehensive answer must address + +**VERIFIABILITY**: Is the thematic assertion clear and checkable against an answer? +- 1: Too vague or abstract to verify +- 2: Ambiguous, unclear what would satisfy it +- 3: Reasonably clear, some interpretation needed +- 4: Clear thematic scope, easy to check for coverage +- 5: Unambiguous theme that can be clearly verified as addressed or not + + +---OUTPUT--- +Respond with a JSON object: +{ + "grounding": 1-5, + "relevance": 1-5, + "verifiability": 1-5, + "reasoning": "Brief explanation" +} diff --git a/benchmark_qed/autoq/prompts/data_questions/assertions/local_claim_assertion_gen_prompt.txt b/benchmark_qed/autoq/prompts/data_questions/assertions/local_claim_assertion_gen_prompt.txt new file mode 100644 index 0000000..7576dab --- /dev/null +++ b/benchmark_qed/autoq/prompts/data_questions/assertions/local_claim_assertion_gen_prompt.txt @@ -0,0 +1,159 @@ +---ROLE--- +You are a helpful assistant that generates assertions for evaluating answer accuracy in question-answering systems. + + +---GOAL--- +Given a user query and potentially relevant claims, generate assertions that can be used as unit tests to verify the accuracy and completeness of any answer to the query. 
+ +---INSTRUCTIONS--- +Each assertion should be a clear, testable statement that adheres to the following rules: + +1. **Claim-Based Relevance**: Focus on the key topics and insights present in the provided claims that should be present in a complete and accurate answer. + +2. **Clear Testability**: Can be verified with a simple YES/NO criteria (either the aspect is addressed in an answer or it is not). + +3. **Appropriate Scope**: + - Cover the important aspects from the claims without being overly specific about details like exact numbers, dates, locations, wordings, quotes, or individual entities unless they are absolutely central to answering the query. + - Focus on concepts, meaning, and role of the claims relative to the query rather than precise phrasing. + - Use terminology that accepts conceptually equivalent expressions (e.g., "respiratory illness" encompasses flu, cold, fatigue, breathing problems; "financial impact" covers costs, expenses, budget effects; "time off" includes leave, unpaid time, medical accommodation). + +4. **Query Context Awareness - Avoid Redundancy**: + - Never generate assertions that merely restate facts already mentioned in the query. + - Focus on substantive information that a complete answer should provide BEYOND what's already stated in the question. + +5. **Comprehensive Coverage**: Generate assertions that collectively test coverage of the main topics and insights present in the claims. + +6. **Conciseness**: Keep assertions brief and focused. Aim for under 50 words. Focus on the core topics/facts rather than listing multiple examples or detailed specifications. + +7. **Atomic Structure - NO COMPOUND STATEMENTS**: Each assertion must test exactly ONE concept or fact. + - Never use compound statements that combine multiple facts or examples using words like "including", "such as", "and", or other similar connectors. + - If you find yourself wanting to list examples, create separate assertions instead + - Each assertion should pass this test: "Can this be verified with a single YES/NO answer?" + +8. **Avoid Detailed Lists**: Never enumerate specific subtypes, categories, or examples within assertions. Use broad thematic terms instead of detailed breakdowns. + + +---EXAMPLES--- +**GOOD (Atomic, Single Facts)**: +- "The response should state that the ballot measure allows abortion restrictions only after fetal viability." +- "The response should mention that fetal viability is generally around 23-24 weeks." + +**BAD (Compound Statements)**: +- "The response should describe that the ballot measure would allow lawmakers to restrict abortion only after fetal viability, generally around 23 or 24 weeks, and would permit abortion later in pregnancy." + +**How to Fix**: Split into separate assertions for each distinct fact. + +**REDUNDANT vs. VALUABLE ASSERTIONS**: + +*Query*: "What amendment was made to Article 34 of the French Constitution regarding abortion rights in March 2024?" + +**BAD (Redundant - restates query)**: +- "The response should address that Article 34 of the French Constitution was amended in March 2024." + +**GOOD (Valuable - tests substantive answer content)**: +- "The response should state that the amendment guarantees the freedom of women to have recourse to an abortion." +- "The response should mention that the law determines the conditions for exercising abortion freedom." + +**SPECIFIC WORDING vs. 
CONCEPTUAL CONTENT**: + +*Query*: "Why did Governor Tony Evers veto the Republican-backed bill to combat PFAS pollution in Wisconsin between February 2024 and April 2024?" + +**BAD (Too specific - focuses on exact quotes)**: +- "The response should indicate that Evers called the bill 'not good enough' and reaffirmed his commitment to addressing PFAS contamination." + +**GOOD (Conceptual - focuses on meaning)**: +- "The response should address that Evers criticized the bill's adequacy for addressing PFAS contamination." +- "The response should mention that Evers expressed continued commitment to combating PFAS pollution." + +**OVERLY SPECIFIC vs. APPROPRIATELY GENERAL**: + +*Query*: "What health issues has Pope Francis experienced from late 2023 to early 2024 that have affected his ability to perform his duties in Vatican City and Rome?" + +**BAD (Overly specific - unnecessary precision)**: +- "The response should mention that Pope Francis experienced a mild flu in late February 2024 that led to the cancellation of several appointments." + +**GOOD (Appropriately general - conceptual focus)**: +- "The response should address that Pope Francis experienced flu symptoms in early 2024 that affected his scheduled activities." + +**INSTITUTIONAL SPECIFICITY vs. POLICY CONCEPTS**: + +*Query*: "How does House Bill 1339, passed by the Georgia Senate in March 2024, propose to change the certificate-of-need requirements for health facilities in Georgia?" + +**BAD (Overly specific - focuses on particular institutions)**: +- "The response should mention that the bill would allow the Morehouse School of Medicine to open a hospital in central Atlanta without a certificate of need." + +**GOOD (Conceptual - focuses on policy change)**: +- "The response should address that the bill creates exemptions allowing certain institutions to open hospitals without certificate-of-need requirements." + +**TERMINOLOGY SPECIFICITY vs. CONCEPTUAL EQUIVALENCE**: + +*Query*: "What health issues has Pope Francis experienced from late 2023 to early 2024 that have affected his ability to perform his duties in Vatican City and Rome?" + +**BAD (Overly specific terminology - requires exact words)**: +- "The response should mention that Pope Francis suffered from fatigue and a persistent cold in early 2024, affecting his ability to deliver speeches." + +**GOOD (Conceptual equivalence - accepts related terminology)**: +- "The response should address that Pope Francis experienced respiratory illness symptoms in early 2024 that affected his ability to perform duties." + +---OUTPUT--- +Each assertion in the response should contain the following elements: +- "statement": A clear assertion that begins with "The response should " followed by an aspect to be included in the answer from the claims. +- "sources": A list of all source claim IDs that support this assertion (exact string values taken from the ID column in the input claims). +- "score": An integer score between 1-10 indicating how important this aspect is for answering the query (10 = essential, 1 = least important). +- "reasoning": A brief explanation (1-2 sentences) of why this assertion is relevant to the query and why you assigned this importance score. + +The response should be JSON formatted as follows: +{ + "assertions": [ + {"statement": "The response should...", "sources": [list of supporting claim IDs], "score": importance score (1-10), "reasoning": "This assertion is relevant because... 
The score reflects..."}, + ] +} + + +---QUERY--- +${query} + + +---INPUT CLAIMS--- +${context_data} + + +---INSTRUCTIONS--- +Each assertion should be a clear, testable statement that adheres to the following rules: + +1. **Claim-Based Relevance**: Focus on the key topics, facts, and insights present in the provided claims that are relevant to the query. + +2. **Clear Testability**: Can be verified with a simple YES/NO criteria (either the aspect is addressed in an answer or it is not). + +3. **Appropriate Scope**: + - Cover the important aspects from the claims without being overly specific about details like exact numbers, dates, locations, wordings, quotes, or individual entities unless they are absolutely central to answering the query. + - Focus on concepts, meaning, and role of the claims relative to the query rather than precise phrasing. + - Use terminology that accepts conceptually equivalent expressions (e.g., "respiratory illness" encompasses flu, cold, fatigue, breathing problems; "financial impact" covers costs, expenses, budget effects; "time off" includes leave, unpaid time, medical accommodation). + +4. **Query Context Awareness - Avoid Redundancy**: + - Never generate assertions that merely restate facts already mentioned in the query. + - Focus on substantive information that a complete answer should provide BEYOND what's already stated in the question. + +5. **Comprehensive Coverage**: Generate assertions that collectively test coverage of the main topics and insights present in the claims. + +6. **Conciseness**: Keep assertions brief and focused. Aim for under 50 words. Focus on the core topics/facts rather than listing multiple examples or detailed specifications. + +7. **Atomic Structure - NO COMPOUND STATEMENTS**: Each assertion must test exactly ONE concept or fact. + - Never use compound statements that combine multiple facts or examples using words like "including", "such as", "and", or other similar connectors. + - If you find yourself wanting to list examples, create separate assertions instead + - Each assertion should pass this test: "Can this be verified with a single YES/NO answer?" + +8. **Avoid Detailed Lists**: Never enumerate specific subtypes, categories, or examples within assertions. Use broad thematic terms instead of detailed breakdowns. + +Each assertion in the response should contain the following elements: +- "statement": A clear assertion that begins with "The response should " followed by an aspect to be included in the answer from the claims. +- "sources": A list of all source claim IDs that support this assertion (exact string values taken from the ID column in the input claims) +- "score": An integer score between 1-10 indicating how important this aspect is for answering the query (10 = essential, 1 = least important). +- "reasoning": A brief explanation (1-2 sentences) of why this assertion is relevant to the query and why you assigned this importance score. + +The response should be JSON formatted as follows: +{ + "assertions": [ + {"statement": "The response should...", "sources": [list of supporting claim IDs], "score": importance score (1-10), "reasoning": "This assertion is relevant because... 
The score reflects..."}, + ] +} \ No newline at end of file diff --git a/benchmark_qed/autoq/prompts/data_questions/assertions/local_max_assertion_instruction_prompt.txt b/benchmark_qed/autoq/prompts/data_questions/assertions/local_max_assertion_instruction_prompt.txt new file mode 100644 index 0000000..68fe3df --- /dev/null +++ b/benchmark_qed/autoq/prompts/data_questions/assertions/local_max_assertion_instruction_prompt.txt @@ -0,0 +1,2 @@ +---MAX NUMBER OF ASSERTIONS--- +Generate up to ${max_assertions} high-quality assertions. Use fewer if that provides better quality and avoids redundancy. diff --git a/benchmark_qed/autoq/prompts/data_questions/assertions/local_validation_prompt.txt b/benchmark_qed/autoq/prompts/data_questions/assertions/local_validation_prompt.txt new file mode 100644 index 0000000..324088d --- /dev/null +++ b/benchmark_qed/autoq/prompts/data_questions/assertions/local_validation_prompt.txt @@ -0,0 +1,53 @@ +---ROLE--- +You are an expert fact-checker evaluating assertions for grounding, relevance, and verifiability. + + +---GOAL--- +Rate the given assertion on three criteria using a 1-5 scale. This assertion was generated for a LOCAL question (specific, fact-focused query about particular events, entities, or details). + + +---QUESTION BEING EVALUATED--- +${question} + + +---ASSERTION TO EVALUATE--- +${assertion} + + +---SOURCE TEXTS--- +${sources} + + +---CRITERIA--- +Rate each criterion from 1 (poor) to 5 (excellent): + +**GROUNDING**: Is the assertion factually supported by the source texts? +- 1: Makes claims not in sources (hallucination) +- 2: Mostly unsupported, significant extrapolation +- 3: Partially supported, some minor extrapolation +- 4: Well supported, minor paraphrasing only +- 5: Directly stated or clearly implied by sources + +**RELEVANCE**: Is the assertion useful for evaluating answers to the question? +- 1: Off-topic or just restates information already in the question +- 2: Tangentially related, or tests trivial details +- 3: Somewhat useful, but not core to the question +- 4: Tests important factual content for the question +- 5: Tests essential specific information a good answer must include + +**VERIFIABILITY**: Is the assertion clear and checkable against an answer? +- 1: Vague, subjective, or impossible to verify +- 2: Ambiguous, hard to check objectively +- 3: Reasonably clear, some interpretation needed +- 4: Clear and concrete, easy to verify +- 5: Unambiguous, binary checkable fact + + +---OUTPUT--- +Respond with a JSON object: +{ + "grounding": 1-5, + "relevance": 1-5, + "verifiability": 1-5, + "reasoning": "Brief explanation" +} diff --git a/benchmark_qed/autoq/prompts/data_questions/global_questions/data_global_gen_system_prompt.txt b/benchmark_qed/autoq/prompts/data_questions/global_questions/data_global_gen_system_prompt.txt index cee6b36..793d9e1 100644 --- a/benchmark_qed/autoq/prompts/data_questions/global_questions/data_global_gen_system_prompt.txt +++ b/benchmark_qed/autoq/prompts/data_questions/global_questions/data_global_gen_system_prompt.txt @@ -1,16 +1,28 @@ You are a helpful assistant tasked with identifying relevant global questions from input questions sharing the same abstract category. +--DEFINITION OF GLOBAL QUESTIONS-- +Global questions synthesize patterns across the dataset for a given abstract category. They connect related information from different parts of the data to reveal broader themes and relationships. +Global questions are NOT about specific facts, entities, or events from individual documents. 
+ --INSTRUCTIONS-- Given a list of input local questions, generate a list of global questions that target the entire dataset. -Each global question should: +Each global question MUST: - begin with 'Across the dataset, ...'. Example: "Across the dataset, what are the most important strategies for improving mental health outcomes?"; -- require an understanding of the dataset as a whole; +- require synthesizing information from across the dataset; - be relevant to the abstract category shared by the input local questions; - be relevant to all the input local questions, not just a subset; -- be general and abstract, short and concise, but not overly broad. For example, a question like "Across the dataset, what are the most important factors?" is too broad and should be more specific to the given input questions. +- stay grounded in the specific topics reflected in the input questions - questions should clearly relate to the subject matter; +- be ABSTRACT, SHORT, SIMPLE, and NATURAL-SOUNDING - like a question a curious human would ask: + - aim for 10-25 words after "Across the dataset," (slightly longer is okay if needed for clarity) + - ask only one thing at a time + - use simple, direct phrasing + - BAD (too long): "Across the dataset, how do shifts in regulatory guidance reshape the practical boundaries of emergency pregnancy care under overlapping state restrictions and federal obligations?" + - GOOD: "Across the dataset, how are changes in abortion regulations affecting access to emergency care?" + - BAD (too abstract): "Across the dataset, what are the most important factors?" + - GOOD (grounded in topic): "Across the dataset, what factors most influence vaccine hesitancy?" - assume the person asking only has a general sense of the dataset as context; -- AVOID requiring any counting, sorting, or any other complex mathematical, statistical operations.For example, avoid questions like "Across the dataset, what is the frequency of occurrences of X?" as it requires counting. +- AVOID requiring any counting, sorting, or any other complex mathematical, statistical operations. For example, avoid questions like "Across the dataset, what is the frequency of occurrences of X?" as it requires counting. - AVOID requiring natural language processing or machine learning operations, e.g., sentiment analysis, keyword counting. The output question set should be distinct and diverse, avoiding repetitive questions. diff --git a/benchmark_qed/autoq/prompts/data_questions/local_questions/data_local_gen_system_prompt.txt b/benchmark_qed/autoq/prompts/data_questions/local_questions/data_local_gen_system_prompt.txt index 6d40811..6c40672 100644 --- a/benchmark_qed/autoq/prompts/data_questions/local_questions/data_local_gen_system_prompt.txt +++ b/benchmark_qed/autoq/prompts/data_questions/local_questions/data_local_gen_system_prompt.txt @@ -7,11 +7,13 @@ Let's think step by step: - Background information may connect entities and concepts from multiple input texts. 2. Step 2: Extract a list of concrete questions that follow from the background information and can be answered based on the input texts. 
-Each question should: +Each question MUST: - focus on specific entities, events, or concepts mentioned in the input texts +- target a SINGLE extractable fact (e.g., a specific number, name, date, reason, or stated position) rather than requiring a long descriptive answer - stand alone in the absence of the background information and other questions - be appropriately scoped to be answerable entirely and accurately using the input texts, rather than requiring external knowledge -- ask only one thing at a time rather than having multiple sub-questions. For example, this question is BAD because it has multiple sub-questions: ""What recent tragic incidents related to the fentanyl crisis have occurred in Washington and Oregon, and what measures are officials taking in response to these events?". +- ask only one thing at a time rather than having multiple sub-questions. For example, this question is BAD because it has multiple sub-questions: "What recent tragic incidents related to the fentanyl crisis have occurred in Washington and Oregon, and what measures are officials taking in response to these events?". +- be CONCISE and NATURAL-SOUNDING - avoid convoluted phrasing, long qualifying clauses, and over-specification Question set should be distinct and diverse, avoiding repetitive questions. --OUTPUT-- diff --git a/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/__init__.py b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/__init__.py new file mode 100644 index 0000000..112a07e --- /dev/null +++ b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/__init__.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""Assertion generation for evaluating answer accuracy in question-answering systems.""" + +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.base import ( + Assertion, + AssertionGenerationResult, + BaseAssertionGenerator, + ClaimDict, +) +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.global_claim_assertion_gen import ( + GlobalClaimAssertionGenerator, +) +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.local_claim_assertion_gen import ( + LocalClaimAssertionGenerator, +) +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.validator import ( + AssertionValidator, + ValidationResult, + ValidationScores, + ValidationSummary, +) + +__all__ = [ + "Assertion", + "AssertionGenerationResult", + "AssertionValidator", + "BaseAssertionGenerator", + "ClaimDict", + "GlobalClaimAssertionGenerator", + "LocalClaimAssertionGenerator", + "ValidationResult", + "ValidationScores", + "ValidationSummary", +] diff --git a/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/base.py b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/base.py new file mode 100644 index 0000000..326a2bc --- /dev/null +++ b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/base.py @@ -0,0 +1,300 @@ +# Copyright (c) 2025 Microsoft Corporation. 
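As a rough usage sketch of the assertion_gen package exports above (assuming an already-configured ChatModel named llm and Question objects whose attributes["claims"] were populated earlier in the AutoQ pipeline; the helper name below is illustrative, not part of this change):

    from benchmark_qed.autoq.question_gen.data_questions.assertion_gen import (
        LocalClaimAssertionGenerator,
    )

    async def add_local_assertions(llm, questions):
        # Generates up to 10 assertions per question from the claims stored in
        # question.attributes["claims"] and writes them back in place to
        # question.attributes["assertions"] / ["assertion_count"].
        generator = LocalClaimAssertionGenerator(llm=llm, max_assertions=10)
        await generator.agenerate_assertions_for_questions(questions)
        return [q.attributes["assertions"] for q in questions]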
+"""Base classes for assertion generation.""" + +from __future__ import annotations + +import asyncio +import logging +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +from pydantic import BaseModel, Field, field_validator +from tqdm.asyncio import tqdm_asyncio + +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.ranking import ( + calculate_rrf_scores, +) +from benchmark_qed.config.defaults import LLM_PARAMS, MAX_ASSERTIONS + +if TYPE_CHECKING: + from benchmark_qed.autoq.data_model.question import Question + from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.validator import ( + AssertionValidator, + ) + from benchmark_qed.llm.type.base import ChatModel + +ClaimDict = dict[str, Any] # Individual claim with statement, score, etc. + +log: logging.Logger = logging.getLogger(__name__) + + +class Assertion(BaseModel): + """Pydantic model representing an assertion for evaluation.""" + + statement: str + """The assertion statement text.""" + + score: int = Field(ge=1, le=10) + """The importance/confidence score (1-10).""" + + sources: list[str] = Field(default_factory=list) + """List of source text chunks that are associated with the assertion.""" + + reasoning: str = "" + """Explanation of why this assertion is relevant to the question.""" + + attributes: dict[str, Any] = Field(default_factory=dict) + """Additional metadata and attributes.""" + + @field_validator("statement") + @classmethod + def statement_not_empty(cls, v: str) -> str: + """Validate that statement is not empty.""" + if not v.strip(): + msg = "Assertion statement cannot be empty" + raise ValueError(msg) + return v + + +class AssertionGenerationResult(BaseModel): + """Pydantic model for assertion generation results.""" + + assertions: list[Assertion] + """The generated assertions.""" + + total_assertions: int + """Total number of assertions generated.""" + + +class BaseAssertionGenerator(ABC): + """ + Base class for generating factual assertions for evaluating answer accuracy in question-answering systems. + + This is a general interface that can be implemented by various types of assertion generators + (claim-based, document-based, context-based, etc.). + + Subclasses should implement specific methods for generating assertions from different input types. + + Supports optional validation of generated assertions using AssertionValidator to filter out + assertions that are not properly grounded in sources or not relevant to the question. + """ + + def __init__( + self, + llm: ChatModel, + llm_params: dict[str, Any] = LLM_PARAMS, + json_mode: bool = True, + max_assertions: int | None = MAX_ASSERTIONS, + validator: AssertionValidator | None = None, + max_concurrent_questions: int | None = None, + ) -> None: + """ + Initialize the assertion generator. + + Parameters + ---------- + llm : ChatModel + The language model to use for generation. + llm_params : dict[str, Any] + Parameters for the LLM. + json_mode : bool + Whether to use JSON mode for LLM responses. + max_assertions : int | None + Maximum number of assertions to generate. None for no limit. + validator : AssertionValidator | None + Optional validator for filtering assertions. If provided, assertions + will be validated after generation and only valid ones returned. + If None, validation is skipped. + max_concurrent_questions : int | None + Maximum number of questions to process concurrently when generating + assertions. If None, processes sequentially. If > 1, processes in parallel. 
+ """ + self.llm = llm + self.llm_params = llm_params + self.json_mode = json_mode + self.max_assertions = max_assertions + self.validator = validator + self.max_concurrent_questions = max_concurrent_questions + + if self.json_mode: + self.llm_params["response_format"] = {"type": "json_object"} + else: + self.llm_params.pop("response_format", None) + + @abstractmethod + async def agenerate_assertions( + self, question_text: str, **kwargs: Any + ) -> AssertionGenerationResult: + """Generate assertions for evaluating answer accuracy based on a question and additional inputs. + + Args: + question_text: The question text to generate assertions for + **kwargs: Additional parameters specific to assertion generator implementation + """ + + async def agenerate_assertions_for_questions( + self, + questions: list[Question], + ) -> None: + """ + Generate assertions for a list of questions and add them to question attributes. + + This method extracts claims from each question's attributes and generates + assertions using the configured assertion generator. Results are stored + as dictionaries in question.attributes["assertions"]. + + Args: + questions: List of Question objects to generate assertions for (modified in place). + Each question should have claims in question.attributes["claims"]. + + Side Effects: + Updates each question's attributes with generated assertions: + - 'assertions': List of assertion dictionaries + - 'assertion_count': Number of assertions generated + """ + if self.max_concurrent_questions is None or self.max_concurrent_questions <= 1: + # Sequential processing + await self._generate_assertions_sequential(questions) + else: + # Parallel processing with semaphore + await self._generate_assertions_parallel(questions) + + async def _generate_assertions_sequential(self, questions: list[Question]) -> None: + """Generate assertions sequentially for each question.""" + from tqdm import tqdm + + for question in tqdm(questions, desc="Generating assertions", unit="question"): + await self._process_single_question(question) + + async def _generate_assertions_parallel(self, questions: list[Question]) -> None: + """Generate assertions in parallel with concurrency limit.""" + semaphore = asyncio.Semaphore(self.max_concurrent_questions or 1) + + async def process_with_semaphore(question: Question) -> None: + async with semaphore: + await self._process_single_question(question) + + tasks = [process_with_semaphore(q) for q in questions] + await tqdm_asyncio.gather(*tasks, desc="Generating assertions", unit="question") + + async def _process_single_question(self, question: Question) -> None: + """Process a single question to generate assertions.""" + try: + # Get claims from question attributes + claims = ( + question.attributes.get("claims", []) if question.attributes else [] + ) + if not claims: + log.warning("No claims found for question: %s", question.text) + return + + result = await self.agenerate_assertions( + question_text=question.text, + claims=claims, + ) + + # Convert assertions to dicts for JSON serialization + assertions = [assertion.model_dump() for assertion in result.assertions] + + # Initialize attributes if they don't exist + if question.attributes is None: + question.attributes = {} + + # Add assertion results to question attributes + question.attributes["assertions"] = assertions + question.attributes["assertion_count"] = len(assertions) + + log.info( + "Generated %s assertions for question: %s", + len(assertions), + question.text, + ) + + except Exception as e: # noqa: 
BLE001 + log.warning( + "Failed to generate assertions for question '%s': %s", + question.text, + e, + ) + # Ensure attributes exist even on failure + if question.attributes is None: + question.attributes = {} + question.attributes["assertions"] = [] + question.attributes["assertion_count"] = 0 + + async def _validate_assertions( + self, + assertions: list[Assertion], + question_text: str, + ) -> list[Assertion]: + """ + Validate assertions using the configured validator. + + If no validator is configured, returns assertions unchanged. + + Args: + assertions: List of assertions to validate + question_text: The question text for context in validation + + Returns + ------- + List of assertions that passed validation, or all assertions if no validator + """ + if not self.validator or not assertions: + return assertions + + log.info("Validating %s assertions...", len(assertions)) + summary = await self.validator.validate_assertions(assertions, question_text) + + log.info( + "Validation complete: %s/%s assertions passed (%.1f%%)", + summary.valid_count, + summary.total_count, + summary.validation_rate * 100, + ) + + return summary.valid_assertions + + def _rank_and_limit_assertions( + self, assertions: list[Assertion], max_assertions: int | None + ) -> list[Assertion]: + """ + Rank assertions using RRF and optionally limit to max_assertions. + + Reciprocal Rank Fusion combines importance score and source count rankings. + + RRF fuses rankings from two criteria: + 1. Importance score (descending: higher scores = better rank) + 2. Source count (descending: more sources = better rank) + + The RRF scores are stored in each assertion's attributes for debugging/analysis. + + Args: + assertions: List of validated assertions + max_assertions: Maximum number of assertions to return, or None for no limit + + Returns + ------- + Top ranked assertions using RRF fusion, optionally limited to max_assertions + """ + if not assertions: + return [] + + # Calculate RRF scores using the utility function + rrf_scores = calculate_rrf_scores( + items=assertions, + score_key_func=lambda a: a.score, + source_count_key_func=lambda a: len(a.sources), + ) + + # Store RRF scores in assertion attributes for debugging/analysis + for assertion in assertions: + assertion.attributes["rrf_score"] = rrf_scores[id(assertion)] + + # Sort by RRF score (descending - higher RRF scores are better) + ranked_assertions = sorted(assertions, key=lambda a: -rrf_scores[id(a)]) + + # Limit to max_assertions if specified + if max_assertions is not None: + return ranked_assertions[:max_assertions] + return ranked_assertions diff --git a/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/global_claim_assertion_gen.py b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/global_claim_assertion_gen.py new file mode 100644 index 0000000..06b5178 --- /dev/null +++ b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/global_claim_assertion_gen.py @@ -0,0 +1,522 @@ +# Copyright (c) 2025 Microsoft Corporation. 
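The calculate_rrf_scores helper used in _rank_and_limit_assertions above is imported from the ranking module, which is not shown here. A minimal sketch of the reciprocal rank fusion idea it is expected to implement, assuming the conventional k = 60 smoothing constant (the real helper's signature and constant may differ):

    def rrf_scores(assertions, k: int = 60) -> dict[int, float]:
        # Rank once by importance score (higher is better) and once by source
        # count (more supporting sources is better), then sum 1 / (k + rank).
        by_score = sorted(assertions, key=lambda a: -a.score)
        by_sources = sorted(assertions, key=lambda a: -len(a.sources))
        scores: dict[int, float] = {}
        for ranking in (by_score, by_sources):
            for rank, assertion in enumerate(ranking, start=1):
                scores[id(assertion)] = scores.get(id(assertion), 0.0) + 1.0 / (k + rank)
        return scores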
+"""Generate assertions for evaluating answer accuracy based on claims for global questions.""" + +from __future__ import annotations + +import asyncio +import json +import logging +from pathlib import Path +from string import Template +from typing import TYPE_CHECKING, Any + +from benchmark_qed.autod.data_processor.text_utils import ( + num_tokens, + try_parse_json_object, +) +from benchmark_qed.autoq.prompts import data_questions +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.base import ( + Assertion, + AssertionGenerationResult, + BaseAssertionGenerator, + ClaimDict, +) +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.local_claim_assertion_gen import ( + LocalClaimAssertionGenerator, +) +from benchmark_qed.config.defaults import ( + ASSERTION_BATCH_SIZE, + ASSERTION_MAX_DATA_TOKENS, + LLM_PARAMS, + MAX_ASSERTIONS, +) +from benchmark_qed.config.utils import load_template_file + +if TYPE_CHECKING: + from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.validator import ( + AssertionValidator, + ) + from benchmark_qed.llm.type.base import ChatModel + +log: logging.Logger = logging.getLogger(__name__) + +ASSERTION_GEN_PROMPTS_PATH = Path(data_questions.__file__).parent + + +class GlobalClaimAssertionGenerator(BaseAssertionGenerator): + """ + Generate factual assertions for evaluating answer accuracy based on claims for global questions. + + This generator is designed for data_global_questions and handles complex batch formats + where each batch contains question text and ClaimExtractionResult objects. + It aggregates claims across multiple question contexts and performs cross-question deduplication. + + Supports optional validation of generated assertions using AssertionValidator. + + Supports automatic batching for large claim sets with parallel processing: + - If batch_size is None or <= 0, processes all claims in a single request + - If claims exceed batch_size, automatically splits into parallel batches + """ + + def __init__( + self, + llm: ChatModel, + llm_params: dict[str, Any] = LLM_PARAMS, + json_mode: bool = True, + map_system_prompt: Template | None = None, + reduce_system_prompt: Template | None = None, + max_assertions: int | None = MAX_ASSERTIONS, + batch_size: int | None = ASSERTION_BATCH_SIZE, + concurrent_coroutines: int = 8, + max_data_tokens: int = ASSERTION_MAX_DATA_TOKENS, + token_encoder: Any | None = None, + validator: AssertionValidator | None = None, + max_concurrent_questions: int | None = None, + ) -> None: + super().__init__( + llm, + llm_params, + json_mode, + max_assertions, + validator, + max_concurrent_questions, + ) + + # Load prompt templates + self.map_prompt: Template = map_system_prompt or load_template_file( + ASSERTION_GEN_PROMPTS_PATH + / "assertions" + / "global_claim_assertion_map_prompt.txt" + ) + if isinstance(self.map_prompt, str): + self.map_prompt = Template(self.map_prompt) + + self.reduce_prompt: Template = reduce_system_prompt or load_template_file( + ASSERTION_GEN_PROMPTS_PATH + / "assertions" + / "global_claim_assertion_reduce_prompt.txt" + ) + if isinstance(self.reduce_prompt, str): + self.reduce_prompt = Template(self.reduce_prompt) + + # Load max assertion instruction template for dynamic count limiting + self._max_assertion_instruction_prompt = load_template_file( + ASSERTION_GEN_PROMPTS_PATH + / "assertions" + / "global_max_assertion_instruction_prompt.txt" + ) + + # Batch processing parameters for complex global processing + self.batch_size = batch_size + 
self.concurrent_coroutines = concurrent_coroutines
+        self._semaphore = asyncio.Semaphore(concurrent_coroutines)
+        self.max_data_tokens = max_data_tokens
+        self.token_encoder = token_encoder
+
+        # Local generator for map phase processing
+        self.local_generator: LocalClaimAssertionGenerator = LocalClaimAssertionGenerator(
+            llm=self.llm,
+            llm_params=self.llm_params,
+            json_mode=self.json_mode,
+            system_prompt=self.map_prompt,
+            max_assertions=None,  # No assertion limiting in map step - focus on quality and comprehensiveness
+        )
+
+    async def agenerate_assertions(
+        self, question_text: str, **kwargs: Any
+    ) -> AssertionGenerationResult:
+        """Generate assertions using a map-reduce approach for global questions.
+
+        Map Step: Divide claims into batches and generate initial assertions
+        Reduce Step: Consolidate initial assertions into high-level global assertions
+        Validation Step (optional): Validate assertions if validator is configured
+
+        Args:
+            question_text: The question text to generate assertions for
+            **kwargs: Additional parameters, expects 'claims' list (can be simple or batch format)
+        """
+        # MAP PHASE
+        claim_batches = self.build_map_context(kwargs.get("claims", []))
+        if not claim_batches:
+            log.warning("No claims provided for assertion generation")
+            return AssertionGenerationResult(assertions=[], total_assertions=0)
+
+        map_responses = await self.generate_map_responses(question_text, claim_batches)
+
+        # REDUCE PHASE
+        reduce_context = self.build_reduce_context(map_responses)
+        final_assertions = await self.generate_reduce_response(
+            question_text, reduce_context
+        )
+
+        # VALIDATION PHASE (optional)
+        final_assertions = await self._validate_assertions(
+            final_assertions, question_text
+        )
+
+        return AssertionGenerationResult(
+            assertions=final_assertions, total_assertions=len(final_assertions)
+        )
+
+    def build_map_context(self, claims: list[ClaimDict]) -> list[list[ClaimDict]]:
+        """Build map context by extracting claims and dividing into batches for parallel processing.
+
+        Args:
+            claims: Entire list of claims
+
+        Returns
+        -------
+        List of claim batches, where each batch is a list of claims
+        """
+        if not claims:
+            return []
+
+        # create batches based on batch_size
+        batch_size = (
+            self.batch_size if self.batch_size and self.batch_size > 0 else len(claims)
+        )
+
+        # Split claims into batches
+        batches = [
+            claims[i : i + batch_size] for i in range(0, len(claims), batch_size)
+        ]
+        log.info(
+            "MAP CONTEXT: Created %s batches from %s simple claims",
+            len(batches),
+            len(claims),
+        )
+        return batches
+
+    async def generate_map_responses(
+        self, question_text: str, claim_batches: list[list[ClaimDict]]
+    ) -> list[list[Assertion]]:
+        """Generate map responses by running LocalClaimAssertionGenerator in parallel on claim batches.
+
+        Args:
+            question_text: The question text for assertion generation
+            claim_batches: List of claim batches to process in parallel
+
+        Returns
+        -------
+        List of assertion lists, one per successfully processed batch
+        """
+        if not claim_batches:
+            return []
+
+        log.info("MAP RESPONSES: Processing %s batches in parallel", len(claim_batches))
+
+        async def process_batch_with_semaphore(
+            batch: list[ClaimDict],
+        ) -> list[Assertion]:
+            async with self._semaphore:
+                # Use the shared local generator for map phase processing
+                result = await self.local_generator.agenerate_assertions(
+                    question_text, claims=batch
+                )
+                return result.assertions
+
+        # Process batches in parallel
+        batch_results = await asyncio.gather(
+            *[process_batch_with_semaphore(batch) for batch in claim_batches],
+            return_exceptions=True,
+        )
+
+        # Collect successful results
+        map_responses = []
+        for i, result in enumerate(batch_results):
+            if isinstance(result, Exception):
+                log.error("Error processing batch %s in MAP step: %s", i, result)
+                continue
+            if isinstance(result, list):
+                map_responses.append(result)
+
+        log.info(
+            "MAP RESPONSES: Successfully processed %s out of %s batches",
+            len(map_responses),
+            len(claim_batches),
+        )
+        return map_responses
+
+    def build_reduce_context(
+        self, map_responses: list[list[Assertion]]
+    ) -> tuple[list[Assertion], str]:
+        """Build reduce context by merging map responses and formatting for consolidation.
+
+        Args:
+            map_responses: List of assertion lists from map phase
+
+        Returns
+        -------
+        Tuple of (unique_assertions, formatted_assertions_text) for reduce phase
+        """
+        # Flatten all initial assertions
+        initial_assertions: list[Assertion] = []
+        for assertion_list in map_responses:
+            initial_assertions.extend(assertion_list)
+
+        if not initial_assertions:
+            return [], ""
+
+        log.info(
+            "REDUCE CONTEXT: Merging %s assertions from %s batches",
+            len(initial_assertions),
+            len(map_responses),
+        )
+
+        # Rank assertions by score (descending) and source count (descending)
+        ranked_assertions = sorted(
+            initial_assertions, key=lambda a: (-a.score, -len(a.sources))
+        )
+
+        log.info(
+            "REDUCE CONTEXT: Ranked %s unique assertions by score and source count",
+            len(ranked_assertions),
+        )
+
+        # Format assertions with token limit
+        formatted_assertions = []
+        selected_assertions = []
+        current_tokens = 0
+
+        for i, assertion in enumerate(ranked_assertions, 1):
+            statement = assertion.statement
+            score = assertion.score
+            assertion_text = (
+                f"ID: assertion_{i}\nStatement: {statement}\nScore: {score}"
+            )
+            assertion_tokens = num_tokens(assertion_text, self.token_encoder)
+
+            # Check if adding this assertion would exceed token limit
+            if current_tokens + assertion_tokens > self.max_data_tokens:
+                log.info(
+                    "REDUCE CONTEXT: Reached token limit at assertion %s, stopping at %s tokens",
+                    i - 1,
+                    current_tokens,
+                )
+                break
+
+            formatted_assertions.append(assertion_text)
+            selected_assertions.append(assertion)
+            current_tokens += assertion_tokens
+
+        formatted_text = "\n".join(formatted_assertions)
+
+        log.info(
+            "REDUCE CONTEXT: Selected %s of %s assertions within %s tokens (limit: %s)",
+            len(selected_assertions),
+            len(ranked_assertions),
+            current_tokens,
+            self.max_data_tokens,
+        )
+        return selected_assertions, formatted_text
+
+    def _build_assertion_mapping(
+        self, assertions: list[Assertion]
+    ) -> dict[str, dict[str, Any]]:
+        """Build mapping from assertion index to assertion data for source resolution.
+ + Args: + assertions: List of assertions to create mapping for + + Returns + ------- + Dictionary mapping assertion IDs to assertion data + """ + assertion_id_to_data = {} + for i, assertion in enumerate(assertions, 1): + assertion_id = str(i) + assertion_id_to_data[f"assertion_{assertion_id}"] = { + "statement": assertion.statement, + "sources": assertion.sources, + "score": assertion.score, + } + return assertion_id_to_data + + def _map_sources_and_aggregate( + self, sources: list[Any], assertion_mapping: dict[str, dict[str, Any]] + ) -> tuple[list[Any], list[Any], list[str]]: + """Map source assertion IDs to statements and aggregate source chunks. + + Args: + sources: List of source IDs from consolidated assertion + assertion_mapping: Mapping from assertion IDs to assertion data + + Returns + ------- + Tuple of (mapped_sources, aggregated_source_chunks, hallucinated_sources) + """ + mapped_sources = [] + aggregated_source_chunks = [] + hallucinated_sources = [] + + if assertion_mapping and sources: + for source in sources: + source_str = str(source).strip() + if source_str in assertion_mapping: + source_assertion = assertion_mapping[source_str] + mapped_sources.append({ + "statement": source_assertion["statement"], + "score": source_assertion["score"], + }) + # Aggregate source chunks from the original assertion + if source_assertion["sources"]: + aggregated_source_chunks.extend(source_assertion["sources"]) + else: + # Track hallucinated source IDs + hallucinated_sources.append(source_str) + + return mapped_sources, aggregated_source_chunks, hallucinated_sources + + def _validate_consolidated_assertions( + self, + consolidated_assertions: list[dict[str, Any]], + assertion_mapping: dict[str, dict[str, Any]], + ) -> list[Assertion]: + """Validate and create Assertion objects from consolidated assertions. + + Args: + consolidated_assertions: Raw consolidated assertions from LLM + assertion_mapping: Mapping from assertion IDs to assertion data + + Returns + ------- + List of validated Assertion objects + """ + validated_assertions = [] + for assertion in consolidated_assertions: + result = self._process_consolidated_assertion(assertion, assertion_mapping) + if result: + validated_assertions.append(result) + return validated_assertions + + def _process_consolidated_assertion( + self, assertion: dict[str, Any], assertion_mapping: dict[str, dict[str, Any]] + ) -> Assertion | None: + """Process a single consolidated assertion and return Assertion object or None if invalid.""" + statement = assertion.get("statement", "").strip() + if not statement: + return None + + sources = assertion.get("sources", []) + mapped_sources, aggregated_chunks, hallucinated = ( + self._map_sources_and_aggregate(sources, assertion_mapping) + ) + + # Discard if all sources are hallucinated + if not mapped_sources: + if hallucinated: + log.warning( + "Discarding assertion with all hallucinated sources: '%s...' (hallucinated: %s)", + statement[:100], + hallucinated, + ) + return None + + # Log partial hallucinations + if hallucinated: + log.warning( + "Assertion has %s hallucinated source(s): %s (keeping %s valid)", + len(hallucinated), + hallucinated, + len(mapped_sources), + ) + + # Log missing source chunks + if not aggregated_chunks: + log.warning( + "Global assertion has 0 source chunks: '%s...' 
(LLM sources: %s, mapping keys: %s)", + statement[:100], + sources, + list(assertion_mapping.keys())[:10], + ) + + try: + return Assertion( + statement=statement, + sources=aggregated_chunks, + score=assertion.get("score", 5), + reasoning=assertion.get("reasoning", ""), + attributes={"source_assertions": mapped_sources}, + ) + except ValueError as e: + log.warning("Skipping invalid consolidated assertion: %s", e) + return None + + async def generate_reduce_response( + self, question_text: str, reduce_context: tuple[list[Assertion], str] + ) -> list[Assertion]: + """Generate reduce response using LLM to consolidate assertions into high-level ones. + + Args: + question_text: The question text for context + reduce_context: Tuple of (unique_assertions, formatted_text) from build_reduce_context + + Returns + ------- + List of consolidated high-level Assertion objects + """ + unique_assertions, formatted_text = reduce_context + + if not unique_assertions: + return [] + + # Use LLM with reduce prompt to consolidate assertions + log.info("REDUCE RESPONSE: Consolidating %s assertions", len(unique_assertions)) + + # Build base prompt + base_prompt = self.reduce_prompt.substitute( + question_text=question_text, assertions_context=formatted_text + ) + + # Dynamically add count instruction if max_assertions is specified + if self.max_assertions is not None and self.max_assertions > 0: + count_instruction = self._max_assertion_instruction_prompt.substitute( + max_assertions=self.max_assertions + ) + prompt_content = base_prompt + "\n\n" + count_instruction + else: + prompt_content = base_prompt + + messages = [{"role": "user", "content": prompt_content}] + + result = await self.llm.chat(messages=messages, **self.llm_params) + response, j = try_parse_json_object(result.output.content) + + if j == {}: + log.warning( + "Failed to parse consolidation response, returning original: %s", + response, + ) + return self._rank_and_limit_assertions( + unique_assertions, self.max_assertions + ) + + parsed_result = json.loads(response) + consolidated_assertions = parsed_result.get("assertions", []) + + if not consolidated_assertions: + log.warning("No consolidated assertions returned, using original") + return self._rank_and_limit_assertions( + unique_assertions, self.max_assertions + ) + + # Build mapping from assertion IDs to assertion data + assertion_mapping = self._build_assertion_mapping(unique_assertions) + + # Validate consolidated assertions with source mapping + validated_assertions = self._validate_consolidated_assertions( + consolidated_assertions, assertion_mapping + ) + + # rank and limit assertions + validated_assertions = self._rank_and_limit_assertions( + validated_assertions, self.max_assertions + ) + + log.info( + "Successfully consolidated %s assertions into %s", + len(unique_assertions), + len(validated_assertions), + ) + return validated_assertions diff --git a/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/local_claim_assertion_gen.py b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/local_claim_assertion_gen.py new file mode 100644 index 0000000..27e287c --- /dev/null +++ b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/local_claim_assertion_gen.py @@ -0,0 +1,298 @@ +# Copyright (c) 2025 Microsoft Corporation. 
+"""Generate assertions for evaluating answer accuracy based on claims for local questions.""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path +from string import Template +from typing import TYPE_CHECKING, Any + +from benchmark_qed.autod.data_processor.text_utils import try_parse_json_object +from benchmark_qed.autoq.prompts import data_questions +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.base import ( + Assertion, + AssertionGenerationResult, + BaseAssertionGenerator, + ClaimDict, +) +from benchmark_qed.config.defaults import LLM_PARAMS, MAX_ASSERTIONS +from benchmark_qed.config.utils import load_template_file + +if TYPE_CHECKING: + from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.validator import ( + AssertionValidator, + ) + from benchmark_qed.llm.type.base import ChatModel + +log: logging.Logger = logging.getLogger(__name__) + +ASSERTION_GEN_PROMPTS_PATH = Path(data_questions.__file__).parent + + +class LocalClaimAssertionGenerator(BaseAssertionGenerator): + """ + Generate factual assertions for evaluating answer accuracy based on claims for local questions. + + This generator is designed for data_local_questions and handles simple claim lists. + Takes a question and a list of relevant claims as input, and generates testable assertions + that can be used as unit tests to verify the accuracy of answers to the question. + + Supports optional validation of generated assertions using AssertionValidator. + + Optimized for simple, direct processing without complex batching overhead. + """ + + def __init__( + self, + llm: ChatModel, + llm_params: dict[str, Any] = LLM_PARAMS, + json_mode: bool = True, + system_prompt: Template | None = None, + max_assertions: int | None = MAX_ASSERTIONS, + validator: AssertionValidator | None = None, + max_concurrent_questions: int | None = None, + ) -> None: + super().__init__( + llm, + llm_params, + json_mode, + max_assertions, + validator, + max_concurrent_questions, + ) + + system_prompt = system_prompt or load_template_file( + ASSERTION_GEN_PROMPTS_PATH + / "assertions" + / "local_claim_assertion_gen_prompt.txt" + ) + if isinstance(system_prompt, str): + system_prompt = Template(system_prompt) + self.system_prompt = system_prompt + + # Load max assertion instruction template for dynamic count limiting + self._max_assertion_instruction_prompt = load_template_file( + ASSERTION_GEN_PROMPTS_PATH + / "assertions" + / "local_max_assertion_instruction_prompt.txt" + ) + + async def agenerate_assertions( + self, question_text: str, **kwargs: Any + ) -> AssertionGenerationResult: + """Generate assertions for data local questions. + + Args: + question_text: The question text to generate assertions for. 
+ **kwargs: Additional parameters: + - claims: List of claims to generate assertions from + """ + claims: list[ClaimDict] = kwargs.get("claims", []) + + if not claims: + log.warning("No claims provided for assertion generation") + return AssertionGenerationResult(assertions=[], total_assertions=0) + + # Process all claims in a single batch for local questions + claims_text = self._build_context(claims) + + if self.system_prompt is None: + msg = "System prompt cannot be None" + raise ValueError(msg) + + # Build base prompt + base_prompt = self.system_prompt.substitute( + query=question_text, context_data=claims_text + ) + + # Dynamically add count instruction if max_assertions is specified + if self.max_assertions is not None and self.max_assertions > 0: + count_instruction = self._max_assertion_instruction_prompt.substitute( + max_assertions=self.max_assertions + ) + prompt_content = base_prompt + "\n\n" + count_instruction + else: + prompt_content = base_prompt + + messages = [ + { + "role": "system", + "content": prompt_content, + }, + ] + + result = await self.llm.chat(messages=messages, **self.llm_params) + log.debug("Assertion results: %s", result) + response, j = try_parse_json_object(result.output.content) + if j == {}: + msg = f"Invalid json response, returning empty assertion list: {response}" + log.warning(msg) + return AssertionGenerationResult(assertions=[], total_assertions=0) + + parsed_assertions = json.loads(response).get("assertions") + if not parsed_assertions or not isinstance(parsed_assertions, list): + log.warning( + "No assertions found in the response, returning empty assertion list" + ) + return AssertionGenerationResult(assertions=[], total_assertions=0) + + # Parse and create Assertion objects from LLM response + assertions = self._parse_assertions(parsed_assertions, claims=claims) + + # Validate assertions with LLM if validator is configured + assertions = await self._validate_assertions(assertions, question_text) + + # Apply ranking and limiting (None means no limit) + assertions = self._rank_and_limit_assertions(assertions, self.max_assertions) + + return AssertionGenerationResult( + assertions=assertions, + total_assertions=len(assertions), + ) + + def _parse_assertions( + self, parsed_assertions: list[dict[str, Any]], **kwargs: Any + ) -> list[Assertion]: + """ + Parse assertions from LLM response and create Assertion objects with claim ID mapping. + + Assertions with hallucinated source IDs (not in claim mapping) are discarded. 
+ + Args: + parsed_assertions: Raw assertions from LLM response as dictionaries + **kwargs: Additional parameters, expects 'claims' for claim mapping + + Returns + ------- + List of validated Assertion objects with mapped claim sources + """ + claims = kwargs.get("claims", []) + + # Build claim ID mapping + claim_id_to_text = ( + {f"claim_{i + 1}": claim for i, claim in enumerate(claims)} + if claims + else {} + ) + + validated_assertions = [] + for assertion in parsed_assertions: + result = self._process_single_assertion(assertion, claim_id_to_text) + if result: + validated_assertions.append(result) + + return validated_assertions + + def _process_single_assertion( + self, assertion: dict[str, Any], claim_id_to_text: dict[str, Any] + ) -> Assertion | None: + """Process a single assertion and return Assertion object or None if invalid.""" + statement = assertion.get("statement", "").strip() + score = assertion.get("score", 5) + sources = assertion.get("sources", []) + + # Validate basic fields + if not statement or not isinstance(score, int) or not (1 <= score <= 10): + return None + + if not claim_id_to_text: + log.warning( + "No claims provided for source mapping, skipping: '%s...'", + statement[:80], + ) + return None + + if not sources: + log.warning("Assertion has no sources, skipping: '%s...'", statement[:80]) + return None + + # Map sources and detect hallucinations + source_claim_texts, source_chunks, hallucinated = self._map_claim_sources( + sources, claim_id_to_text + ) + + # Discard if all sources are hallucinated + if not source_claim_texts: + log.warning( + "Discarding assertion with all hallucinated sources: '%s...' (invalid: %s)", + statement[:80], + hallucinated, + ) + return None + + # Log partial hallucinations + if hallucinated: + log.warning( + "Assertion has %s hallucinated source(s): %s (keeping %s valid)", + len(hallucinated), + hallucinated, + len(source_claim_texts), + ) + + if not source_chunks: + log.warning( + "Assertion has no traceable source texts, skipping: '%s...'", + statement[:80], + ) + return None + + try: + return Assertion( + statement=statement, + score=score, + sources=list(source_chunks), + reasoning=assertion.get("reasoning", ""), + attributes={ + **assertion.get("attributes", {}), + "source_claims": source_claim_texts, + }, + ) + except ValueError as e: + log.warning("Skipping invalid assertion: %s", e) + return None + + def _map_claim_sources( + self, sources: list[Any], claim_id_to_text: dict[str, Any] + ) -> tuple[list[str], set[str], list[str]]: + """Map source IDs to claim texts and aggregate source chunks. + + Returns + ------- + Tuple of (source_claim_texts, source_chunks, hallucinated_sources) + """ + source_claim_texts = [] + source_chunks: set[str] = set() + hallucinated = [] + + for source in sources: + source_str = str(source).strip() + claim = claim_id_to_text.get(source_str) + + if claim: + source_claim_texts.append(claim.get("statement", "")) + source_chunks.update(s["text"] for s in claim.get("sources", [])) + else: + hallucinated.append(source_str) + + return source_claim_texts, source_chunks, hallucinated + + @staticmethod + def _build_context(claims: list[ClaimDict]) -> str: + """Format claims list for the assertion generation prompt.""" + if not claims: + return "No claims provided." 
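+
+        # Each claim is rendered with a stable, 1-based ID so the LLM can cite
+        # it as a source in its response, e.g.:
+        #   ID: claim_1
+        #   Statement: <claim statement>
+        #   Importance Score: <claim score>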
+ + formatted_claims = [] + for i, claim in enumerate(claims): + claim_id = f"claim_{i + 1}" + statement = claim.get("statement", "") + score = claim.get("score", 0) + + formatted_claim = ( + f"ID: {claim_id}\nStatement: {statement}\nImportance Score: {score}\n" + ) + formatted_claims.append(formatted_claim) + + return "\n".join(formatted_claims) diff --git a/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/local_source_assertion_gen.py b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/local_source_assertion_gen.py new file mode 100644 index 0000000..6ec8a53 --- /dev/null +++ b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/local_source_assertion_gen.py @@ -0,0 +1,308 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""Generate assertions for evaluating answer accuracy directly from source texts.""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path +from string import Template +from typing import TYPE_CHECKING, Any + +from benchmark_qed.autod.data_model.text_unit import TextUnit +from benchmark_qed.autod.data_processor.text_utils import try_parse_json_object +from benchmark_qed.autoq.prompts import data_questions +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.base import ( + Assertion, + AssertionGenerationResult, + BaseAssertionGenerator, +) +from benchmark_qed.config.defaults import LLM_PARAMS +from benchmark_qed.config.utils import load_template_file + +if TYPE_CHECKING: + from benchmark_qed.llm.type.base import ChatModel + +log: logging.Logger = logging.getLogger(__name__) + +ASSERTION_GEN_PROMPTS_PATH = Path(data_questions.__file__).parent + + +class LocalSourceAssertionGenerator(BaseAssertionGenerator): + """ + Generate factual assertions directly from source text passages for local questions. + + This generator bypasses the intermediate claim extraction step and generates + assertions directly from the source text units that were used to create the question. + This is a simpler, faster approach that preserves full context from source texts. + + Advantages over claim-based generation: + - Single LLM call instead of two (claim extraction + assertion generation) + - Full context preserved - no intermediate filtering + - Avoids potential claim extraction errors propagating + - Better for cases where source texts are already concise and relevant + + Use this generator when: + - Source texts are relatively short and focused + - You want to minimize LLM calls for cost/latency reasons + - You want to preserve full context without intermediate abstraction + """ + + def __init__( + self, + llm: ChatModel, + llm_params: dict[str, Any] = LLM_PARAMS, + json_mode: bool = True, + system_prompt: Template | None = None, + max_assertions: int | None = 5, + max_concurrent_questions: int | None = None, + ) -> None: + """ + Initialize the LocalSourceAssertionGenerator. + + Parameters + ---------- + llm : ChatModel + The language model to use for assertion generation. + llm_params : dict[str, Any] + Parameters to pass to the LLM. + json_mode : bool + Whether to use JSON mode for structured output. + system_prompt : Template | None + Custom system prompt template. If None, uses default. + max_assertions : int | None + Maximum number of assertions to generate per question. + None means no limit. + max_concurrent_questions : int | None + Maximum number of questions to process concurrently. + None means sequential processing. 
+ """ + super().__init__( + llm, + llm_params, + json_mode, + max_assertions, + max_concurrent_questions=max_concurrent_questions, + ) + + system_prompt = system_prompt or load_template_file( + ASSERTION_GEN_PROMPTS_PATH + / "assertions" + / "local_source_assertion_gen_prompt.txt" + ) + if isinstance(system_prompt, str): + system_prompt = Template(system_prompt) + self.system_prompt = system_prompt + + # Load max assertion instruction template for dynamic count limiting + self._max_assertion_instruction_prompt = load_template_file( + ASSERTION_GEN_PROMPTS_PATH + / "assertions" + / "local_source_max_assertion_instruction_prompt.txt" + ) + + async def agenerate_assertions( + self, question_text: str, **kwargs: Any + ) -> AssertionGenerationResult: + """ + Generate assertions directly from source text units. + + Parameters + ---------- + question_text : str + The question text to generate assertions for. + **kwargs : Any + Additional parameters. Expects 'source_texts' as a list of TextUnit objects + or a list of dicts with 'id' and 'text' keys. + + Returns + ------- + AssertionGenerationResult + Result containing the generated assertions. + """ + source_texts: list[TextUnit | dict[str, Any]] = kwargs.get("source_texts", []) + if not source_texts: + log.warning("No source texts provided for assertion generation") + return AssertionGenerationResult(assertions=[], total_assertions=0) + + # Build context from source texts + context_data = self._build_context(source_texts) + + if self.system_prompt is None: + msg = "System prompt cannot be None" + raise ValueError(msg) + + # Build base prompt + base_prompt = self.system_prompt.substitute( + query=question_text, context_data=context_data + ) + + # Dynamically add count instruction if max_assertions is specified + if self.max_assertions is not None and self.max_assertions > 0: + count_instruction = self._max_assertion_instruction_prompt.substitute( + max_assertions=self.max_assertions + ) + prompt_content = base_prompt + "\n\n" + count_instruction + else: + prompt_content = base_prompt + + messages = [ + { + "role": "system", + "content": prompt_content, + }, + ] + + result = await self.llm.chat(messages=messages, **self.llm_params) + log.debug("Assertion results: %s", result) + response, j = try_parse_json_object(result.output.content) + if j == {}: + msg = f"Invalid json response, returning empty assertion list: {response}" + log.warning(msg) + return AssertionGenerationResult(assertions=[], total_assertions=0) + + parsed_assertions = json.loads(response).get("assertions") + if not parsed_assertions or not isinstance(parsed_assertions, list): + log.warning( + "No assertions found in the response, returning empty assertion list" + ) + return AssertionGenerationResult(assertions=[], total_assertions=0) + + # Validate and clean the assertions + validated_assertions = self._parse_assertions( + parsed_assertions, source_texts=source_texts + ) + + # Apply ranking and limiting (None means no limit) + validated_assertions = self._rank_and_limit_assertions( + validated_assertions, self.max_assertions + ) + + return AssertionGenerationResult( + assertions=validated_assertions, + total_assertions=len(validated_assertions), + ) + + def _parse_assertions( + self, parsed_assertions: list[dict[str, Any]], **kwargs: Any + ) -> list[Assertion]: + """ + Validate assertions and create Assertion objects with source text mapping. + + Parameters + ---------- + parsed_assertions : list[dict[str, Any]] + Raw assertions from LLM response as dictionaries. 
+ **kwargs : Any + Additional parameters, expects 'source_texts' for source mapping. + + Returns + ------- + list[Assertion] + List of validated Assertion objects with mapped source texts. + """ + source_texts: list[TextUnit | dict[str, Any]] = kwargs.get("source_texts", []) + validated_assertions = [] + + # Create source ID to text mapping + source_id_to_text: dict[str, str] = {} + for i, source in enumerate(source_texts): + if isinstance(source, TextUnit): + source_id = source.short_id or f"source_{i + 1}" + source_id_to_text[source_id] = source.text + # Also map the generated ID format + source_id_to_text[f"source_{i + 1}"] = source.text + else: + # Handle dict format + source_id = source.get("id", f"source_{i + 1}") + source_id_to_text[source_id] = source.get("text", "") + source_id_to_text[f"source_{i + 1}"] = source.get("text", "") + + for assertion in parsed_assertions: + statement = assertion.get("statement", "").strip() + score = assertion.get("score", 5) + + if statement != "" and isinstance(score, int) and 1 <= score <= 10: + # Map source IDs to actual source texts + sources = assertion.get("sources", []) + source_chunk_texts: list[str] = [] + + if source_id_to_text and sources: + for source in sources: + source_str = str(source).strip() + if source_str in source_id_to_text: + source_chunk_texts.append(source_id_to_text[source_str]) + else: + # Keep original source if not found in mapping + log.debug("Source ID not found in mapping: %s", source_str) + else: + # If no source texts provided or no sources, use empty list + source_chunk_texts = [] + + # Create Assertion object + try: + assertion_obj = Assertion( + statement=statement, + score=score, + sources=source_chunk_texts, + attributes={ + **assertion.get("attributes", {}), + "source_ids": sources, + }, + ) + # Debug logging for source counts + source_count = len(source_chunk_texts) if source_chunk_texts else 0 + if source_count == 0: + log.warning( + "Source assertion created with 0 sources: '%s...'", + statement[:100], + ) + log.debug(" Original sources: %s", sources) + log.debug( + " Available source IDs: %s", list(source_id_to_text.keys()) + ) + else: + log.debug( + "Source assertion created with %s sources", source_count + ) + + validated_assertions.append(assertion_obj) + except ValueError as e: + log.warning("Skipping invalid assertion: %s", e) + continue + + return validated_assertions + + @staticmethod + def _build_context(source_texts: list[TextUnit | dict[str, Any]]) -> str: + """ + Format source texts for the assertion generation prompt. + + Parameters + ---------- + source_texts : list[TextUnit | dict[str, Any]] + List of source text units or dicts. + + Returns + ------- + str + Formatted context string for the prompt. + """ + if not source_texts: + return "No source texts provided." 
+ + formatted_sources = [] + for i, source in enumerate(source_texts): + source_id = f"source_{i + 1}" + + if isinstance(source, TextUnit): + text = source.text + source_id = source.short_id or source_id + else: + text = source.get("text", "") + source_id = source.get("id", source_id) + + formatted_source = f"ID: {source_id}\nText: {text}\n" + formatted_sources.append(formatted_source) + + return "\n".join(formatted_sources) diff --git a/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/ranking.py b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/ranking.py new file mode 100644 index 0000000..7058ebd --- /dev/null +++ b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/ranking.py @@ -0,0 +1,200 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""Ranking algorithms and utilities for assertion generation.""" + +from collections.abc import Callable +from typing import Any + + +def calculate_rrf_scores( + items: list[Any], + score_key_func: Callable[[Any], float | int], + source_count_key_func: Callable[[Any], int], + k: int = 60, + score_weight: float = 0.3, + source_count_weight: float = 0.7, +) -> dict[int, float]: + """ + Calculate Reciprocal Rank Fusion (RRF) scores for a list of items. + + RRF combines rankings from multiple criteria by computing reciprocal ranks. + Formula: RRF_score = score_weight * (1/(k + score_rank)) + source_count_weight * (1/(k + source_count_rank)) + + This is commonly used in information retrieval to combine different ranking signals. + Higher RRF scores indicate better overall ranking across all criteria. + + Tie Handling: Uses dense ranking where tied items receive the same rank and + subsequent ranks skip appropriately. For example, values [10, 8, 8, 6] get + ranks [1, 2, 2, 4]. + + Args: + items: List of items to rank + score_key_func: Function to extract importance score from item (higher is better) + source_count_key_func: Function to extract source count from item (higher is better) + k: Smoothing constant (default=60, common in literature) + score_weight: Weight for importance score ranking (default=0.5) + source_count_weight: Weight for source count ranking (default=0.5) + + Returns + ------- + Dictionary mapping item id() to RRF score (higher scores = better ranking) + + Example: + >>> documents = [ + ... { + ... "relevance": 0.9, + ... "citations": 5, + ... "title": "Doc A", + ... }, + ... { + ... "relevance": 0.7, + ... "citations": 10, + ... "title": "Doc B", + ... }, + ... { + ... "relevance": 0.8, + ... "citations": 2, + ... "title": "Doc C", + ... }, + ... ] + >>> rrf_scores = calculate_rrf_scores( + ... items=documents, + ... score_key_func=lambda d: d[ + ... "relevance" + ... ], + ... source_count_key_func=lambda d: d[ + ... "citations" + ... ], + ... ) + >>> # Sort by RRF score (descending) + >>> ranked_docs = sorted( + ... documents, + ... key=lambda d: -rrf_scores[ + ... id(d) + ... ], + ... ) + + # Custom weights example (prioritize score over source count): + >>> rrf_scores_weighted = calculate_rrf_scores( + ... items=documents, + ... score_key_func=lambda d: d[ + ... "relevance" + ... ], + ... source_count_key_func=lambda d: d[ + ... "citations" + ... ], + ... score_weight=0.7, + ... source_count_weight=0.3, + ... ) + + # Items with identical scores will have identical RRF scores: + >>> tied_docs = [ + ... { + ... "score": 5, + ... "sources": 2, + ... }, + ... { + ... "score": 5, + ... "sources": 2, + ... }, + ... ] + >>> tied_scores = calculate_rrf_scores( + ... tied_docs, + ... lambda d: d[ + ... 
"score" + ... ], + ... lambda d: d[ + ... "sources" + ... ], + ... ) + >>> # Both documents will have identical RRF scores + """ + if not items: + return {} + + # Create rankings for each criterion with proper tie handling using dense ranking + # 1. Rank by importance score (descending: higher scores = better rank) + score_ranks = calculate_dense_ranks(items, score_key_func, reverse=True) + + # 2. Rank by source count (descending: more sources = better rank) + source_count_ranks = calculate_dense_ranks( + items, source_count_key_func, reverse=True + ) + + # Apply Reciprocal Rank Fusion (RRF) with weights + rrf_scores = {} + for item in items: + item_id = id(item) + score_rank = score_ranks[item_id] + source_rank = source_count_ranks[item_id] + + # Calculate weighted RRF score (higher is better) + rrf_score = (score_weight * (1 / (k + score_rank))) + ( + source_count_weight * (1 / (k + source_rank)) + ) + rrf_scores[item_id] = rrf_score + + return rrf_scores + + +def calculate_dense_ranks( + items: list[Any], key_func: Callable[[Any], float | int], reverse: bool = True +) -> dict[int, int]: + """ + Calculate dense ranks for items, properly handling ties. + + Dense ranking assigns the same rank to tied items and skips subsequent ranks + appropriately. This is the standard ranking method for handling ties in + ranking algorithms. + + Examples + -------- + - Values [10, 8, 8, 6] get ranks [1, 2, 2, 4] + - Values [5, 5, 3] get ranks [1, 1, 3] + - Values [7, 7, 7] get ranks [1, 1, 1] + + Args: + items: List of items to rank + key_func: Function to extract the value to rank by + reverse: If True, higher values get better (lower) ranks (default: True) + + Returns + ------- + Dictionary mapping item id() to rank (1-based) + + Example: + >>> items = [ + ... {"score": 10}, + ... {"score": 8}, + ... {"score": 8}, + ... {"score": 6}, + ... ] + >>> ranks = calculate_dense_ranks( + ... items, + ... lambda x: x[ + ... "score" + ... ], + ... reverse=True, + ... ) + >>> # Results in ranks: [1, 2, 2, 4] for the respective items + """ + if not items: + return {} + + # Sort items by the ranking criterion + sorted_items = sorted(items, key=key_func, reverse=reverse) + + ranks = {} + current_rank = 1 + prev_value = None + + for i, item in enumerate(sorted_items): + value = key_func(item) + + # If this value is different from previous, update rank to current position + 1 + if prev_value is not None and value != prev_value: + current_rank = i + 1 + + ranks[id(item)] = current_rank + prev_value = value + + return ranks diff --git a/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/validator.py b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/validator.py new file mode 100644 index 0000000..dfc7445 --- /dev/null +++ b/benchmark_qed/autoq/question_gen/data_questions/assertion_gen/validator.py @@ -0,0 +1,325 @@ +# Copyright (c) 2025 Microsoft Corporation. 
+"""Assertion validation for quality control and hallucination prevention.""" + +from __future__ import annotations + +import asyncio +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from benchmark_qed.autod.data_processor.text_utils import try_parse_json_object +from benchmark_qed.autoq.prompts import data_questions +from benchmark_qed.config.defaults import LLM_PARAMS, MIN_ASSERTION_VALIDATION_SCORE +from benchmark_qed.config.utils import load_template_file + +if TYPE_CHECKING: + from string import Template + + from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.base import ( + Assertion, + ) + from benchmark_qed.llm.type.base import ChatModel + +log: logging.Logger = logging.getLogger(__name__) + +VALIDATOR_PROMPTS_PATH = Path(data_questions.__file__).parent + + +@dataclass +class ValidationScores: + """Scores from LLM validation.""" + + grounding: int = 3 + """Grounding score (1-5): how well supported by sources.""" + + relevance: int = 3 + """Relevance score (1-5): how useful for evaluating the question.""" + + verifiability: int = 3 + """Verifiability score (1-5): how clear and checkable.""" + + reasoning: str = "" + """Explanation of the validation result.""" + + def is_valid(self, min_score: int = 3) -> bool: + """Check if all scores meet the minimum threshold.""" + return all( + score >= min_score + for score in [self.grounding, self.relevance, self.verifiability] + ) + + def to_dict(self) -> dict[str, Any]: + """Convert validation scores to a dictionary for serialization.""" + return { + "grounding": self.grounding, + "relevance": self.relevance, + "verifiability": self.verifiability, + "reasoning": self.reasoning, + } + + @classmethod + def from_llm_response(cls, parsed: dict[str, Any]) -> ValidationScores: + """Create ValidationScores from parsed LLM response.""" + return cls( + grounding=cls._clamp(int(parsed.get("grounding", 3))), + relevance=cls._clamp(int(parsed.get("relevance", 3))), + verifiability=cls._clamp(int(parsed.get("verifiability", 3))), + reasoning=parsed.get("reasoning", "No reasoning provided"), + ) + + @staticmethod + def _clamp(value: int, min_val: int = 1, max_val: int = 5) -> int: + """Clamp value to range.""" + return max(min_val, min(max_val, value)) + + +@dataclass +class ValidationResult: + """Result of assertion validation.""" + + assertion: Assertion + """The original assertion.""" + + is_valid: bool + """Whether the assertion passed all validation checks.""" + + scores: ValidationScores + """Validation scores from LLM.""" + + error: str | None = None + """Error message if validation failed.""" + + +@dataclass +class ValidationSummary: + """Summary of validation results for a set of assertions.""" + + valid_assertions: list[Assertion] = field(default_factory=list) + """Assertions that passed all validation checks.""" + + invalid_assertions: list[ValidationResult] = field(default_factory=list) + """Assertions that failed validation with details.""" + + @property + def total_count(self) -> int: + """Total number of assertions validated.""" + return len(self.valid_assertions) + len(self.invalid_assertions) + + @property + def valid_count(self) -> int: + """Number of valid assertions.""" + return len(self.valid_assertions) + + @property + def validation_rate(self) -> float: + """Percentage of assertions that passed validation.""" + return self.valid_count / self.total_count if self.total_count > 0 else 0.0 + + +class AssertionValidator: + """ + Validate assertions 
for quality control and hallucination prevention. + + Uses LLM to validate assertions against three criteria: + 1. Grounding - verifies assertions are supported by their source texts + 2. Relevance - checks if assertions are useful for evaluating the question + 3. Verifiability - ensures assertions are clear and testable + + Use this validator after assertion generation to filter out potentially + hallucinated or low-quality assertions before using them for evaluation. + """ + + def __init__( + self, + llm: ChatModel, + llm_params: dict[str, Any] = LLM_PARAMS, + min_criterion_score: int = MIN_ASSERTION_VALIDATION_SCORE, + validation_prompt: Template | None = None, + concurrent_validations: int = 8, + ) -> None: + """ + Initialize the AssertionValidator. + + Parameters + ---------- + llm : ChatModel + Language model for validation. + llm_params : dict[str, Any] + Parameters for the LLM. + min_criterion_score : int + Minimum score (1-5) for grounding, relevance, verifiability. Default 3. + validation_prompt : Template | None + Custom prompt for validation. + concurrent_validations : int + Number of concurrent validations. Default 8. + """ + self.llm = llm + self.llm_params: dict[str, Any] = llm_params.copy() + self.min_criterion_score = min_criterion_score + self.concurrent_validations = concurrent_validations + self._semaphore = asyncio.Semaphore(concurrent_validations) + + # Load validation prompt + if validation_prompt: + self.validation_prompt: Template = validation_prompt + else: + # Default to local validation prompt for backwards compatibility + prompt_path = ( + VALIDATOR_PROMPTS_PATH / "assertions" / "local_validation_prompt.txt" + ) + self.validation_prompt = load_template_file(prompt_path) + + async def validate_assertion( + self, + assertion: Assertion, + question_text: str, + ) -> ValidationResult: + """ + Validate a single assertion against the question and its sources. + + Parameters + ---------- + assertion : Assertion + The assertion to validate. + question_text : str + The question this assertion is meant to evaluate. + + Returns + ------- + ValidationResult + Validation result with scores and validity. 
+ """ + # Use all sources for validation + sources_text = self._format_sources(assertion.sources) + if not sources_text: + return ValidationResult( + assertion=assertion, + is_valid=False, + scores=ValidationScores( + grounding=1, reasoning="No valid sources to validate against" + ), + error="No valid sources", + ) + + # Get LLM validation + async with self._semaphore: + try: + scores = await self._get_llm_validation( + assertion.statement, question_text, sources_text + ) + is_valid = scores.is_valid(self.min_criterion_score) + return ValidationResult( + assertion=assertion, is_valid=is_valid, scores=scores + ) + except Exception as e: # noqa: BLE001 + log.warning("Validation failed for assertion: %s", e) + return ValidationResult( + assertion=assertion, + is_valid=True, # Default to valid on error to avoid losing assertions + scores=ValidationScores(reasoning=f"Validation error: {e}"), + error=str(e), + ) + + async def _get_llm_validation( + self, statement: str, question_text: str, sources_text: str + ) -> ValidationScores: + """Get validation scores from LLM.""" + prompt = self.validation_prompt.substitute( + question=question_text, + assertion=statement, + sources=sources_text, + ) + messages = [{"role": "user", "content": prompt}] + result = await self.llm.chat(messages=messages, **self.llm_params) + _, parsed = try_parse_json_object(result.output.content) + + if not parsed: + log.warning( + "Failed to parse validation response: %s", result.output.content[:200] + ) + return ValidationScores(reasoning="Could not parse validation response") + + return ValidationScores.from_llm_response(parsed) + + @staticmethod + def _format_sources(sources: list[str]) -> str: + """Format and deduplicate sources for the validation prompt.""" + # Filter valid sources and deduplicate while preserving order + valid_sources = [] + seen = set() + for s in sources: + if s and str(s).strip(): + text = str(s).strip() + if text not in seen: + seen.add(text) + valid_sources.append(text) + + if not valid_sources: + return "" + return "\n\n".join( + f"Source {i + 1}: {source}" for i, source in enumerate(valid_sources) + ) + + async def validate_assertions( + self, + assertions: list[Assertion], + question_text: str, + ) -> ValidationSummary: + """ + Validate a list of assertions and return summary. + + Parameters + ---------- + assertions : list[Assertion] + List of assertions to validate. + question_text : str + The question these assertions are meant to evaluate. + + Returns + ------- + ValidationSummary + Summary with valid/invalid assertions and statistics. 
+ """ + if not assertions: + return ValidationSummary() + + log.info( + "Validating %s assertions for question: %s...", + len(assertions), + question_text[:50], + ) + + # Run validations concurrently + results = await asyncio.gather( + *[self.validate_assertion(a, question_text) for a in assertions], + return_exceptions=True, + ) + + summary = ValidationSummary() + for result in results: + if isinstance(result, Exception): + log.error("Validation error: %s", result) + continue + if isinstance(result, ValidationResult): + # Store validation scores in assertion attributes + result.assertion.attributes["validation"] = { + "is_valid": result.is_valid, + "scores": result.scores.to_dict(), + } + if result.error: + result.assertion.attributes["validation"]["error"] = result.error + + if result.is_valid: + summary.valid_assertions.append(result.assertion) + else: + summary.invalid_assertions.append(result) + + log.info( + "Validation complete: %s/%s assertions passed (%.1f%%)", + summary.valid_count, + summary.total_count, + summary.validation_rate * 100, + ) + return summary diff --git a/benchmark_qed/autoq/question_gen/data_questions/global_question_gen.py b/benchmark_qed/autoq/question_gen/data_questions/global_question_gen.py index a159bbc..25180d5 100644 --- a/benchmark_qed/autoq/question_gen/data_questions/global_question_gen.py +++ b/benchmark_qed/autoq/question_gen/data_questions/global_question_gen.py @@ -1,6 +1,8 @@ # Copyright (c) 2025 Microsoft Corporation. """Data-global question generation module.""" +from __future__ import annotations + import asyncio import json import logging @@ -9,13 +11,11 @@ from collections import defaultdict from dataclasses import dataclass from pathlib import Path -from string import Template -from typing import Any +from typing import TYPE_CHECKING, Any from tqdm.asyncio import tqdm_asyncio import benchmark_qed.config.defaults as defs -from benchmark_qed.autod.data_processor.embedding import TextEmbedder from benchmark_qed.autod.data_processor.text_utils import try_parse_json_object from benchmark_qed.autod.sampler.enums import ClusterRepresentativeSelectionType from benchmark_qed.autod.sampler.sampling.kmeans_sampler import KmeansTextSampler @@ -23,12 +23,24 @@ from benchmark_qed.autoq.data_model.question import Question from benchmark_qed.autoq.prompts.data_questions import global_questions from benchmark_qed.autoq.question_gen.base import BaseQuestionGen, QuestionGenResult +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen import ( + AssertionValidator, + GlobalClaimAssertionGenerator, +) from benchmark_qed.autoq.question_gen.data_questions.claim_extractor.global_claim_extractor import ( DataGlobalClaimExtractor, ) from benchmark_qed.autoq.sampler.question_sampler import QuestionSampler from benchmark_qed.config.utils import load_template_file -from benchmark_qed.llm.type.base import ChatModel + +if TYPE_CHECKING: + from string import Template + + import tiktoken + + from benchmark_qed.autod.data_processor.embedding import TextEmbedder + from benchmark_qed.autoq.config import AssertionConfig, AssertionPromptConfig + from benchmark_qed.llm.type.base import ChatModel log: logging.Logger = logging.getLogger(__name__) @@ -46,15 +58,24 @@ class DataGlobalQuestionContext: class DataGlobalQuestionGen(BaseQuestionGen): - """Generate data-global questions for a given dataset from a set of local questions.""" + """ + Generate data-global questions for a given dataset from a set of local questions. 
+ + Supports optional assertion generation after claim extraction to create + testable facts that can be used for answer accuracy evaluation. Configure + assertion generation using the assertion_config parameter. + """ def __init__( self, llm: ChatModel, text_embedder: TextEmbedder, local_questions: list[Question], + token_encoder: tiktoken.Encoding | None = None, question_sampler: QuestionSampler | None = None, claim_extractor_params: dict[str, Any] | None = None, + assertion_config: AssertionConfig | None = None, + assertion_prompt_config: AssertionPromptConfig | None = None, llm_params: dict[str, Any] = defs.LLM_PARAMS, json_mode: bool = True, generation_system_prompt: Template | None = None, @@ -62,8 +83,17 @@ def __init__( concurrent_coroutines: int = 32, random_seed: int = defs.RANDOM_SEED, ) -> None: + # Import here to avoid circular imports + from benchmark_qed.autoq.config import AssertionConfig, AssertionPromptConfig + if claim_extractor_params is None: claim_extractor_params = {} + if assertion_config is None: + assertion_config = AssertionConfig() + if assertion_prompt_config is None: + assertion_prompt_config = AssertionPromptConfig() + + self.assertion_config = assertion_config self.random_seed = random_seed if question_sampler is not None: question_sampler.random_seed = self.random_seed @@ -82,6 +112,37 @@ def __init__( ) super().__init__(llm, llm_params, question_sampler) self.text_embedder = text_embedder + self.token_encoder = token_encoder + + # Assertion generation setup (max_assertions != 0 enables it) + self.assertion_generator: GlobalClaimAssertionGenerator | None = None + global_assertion_config = assertion_config.global_ + max_assertions = global_assertion_config.max_assertions + if max_assertions is None or max_assertions > 0: + # Create validator if validation is enabled + validator = None + if global_assertion_config.enable_validation: + validator = AssertionValidator( + llm=llm, + llm_params=llm_params, + min_criterion_score=global_assertion_config.min_validation_score, + validation_prompt=assertion_prompt_config.global_validation_prompt.template, + concurrent_validations=global_assertion_config.concurrent_llm_calls, + ) + + self.assertion_generator = GlobalClaimAssertionGenerator( + llm=llm, + llm_params=llm_params, + token_encoder=self.token_encoder, + max_assertions=max_assertions, + validator=validator, + batch_size=global_assertion_config.batch_size, + max_data_tokens=global_assertion_config.max_data_tokens, + concurrent_coroutines=global_assertion_config.concurrent_llm_calls, + max_concurrent_questions=global_assertion_config.max_concurrent_questions, + map_system_prompt=assertion_prompt_config.global_assertion_map_prompt.template, + reduce_system_prompt=assertion_prompt_config.global_assertion_reduce_prompt.template, + ) self.json_mode = json_mode if json_mode: @@ -139,6 +200,23 @@ async def agenerate( # select a subset of questions if needed final_questions = self.select(candidate_questions=results, top_k=num_questions) + + # Generate assertions only for the final selected questions + max_assertions = ( + self.assertion_config.global_.max_assertions + if self.assertion_config + else None + ) + if ( + max_assertions is None or max_assertions > 0 + ) and self.assertion_generator is not None: + log.info( + "Generating assertions for %s final questions", len(final_questions) + ) + await self.assertion_generator.agenerate_assertions_for_questions( + final_questions + ) + return QuestionGenResult( selected_questions=final_questions, 
candidate_questions=results, @@ -239,11 +317,27 @@ async def _agenerate_single_chain( question_set.add(question) # extract claims for the question - claim_extraction_result = ( + claim_extraction_results = ( await self.claim_extractor.aextract_claims( question, question_context.local_questions ) ) + + question_attributes = { + "abstract_categories": question_context.category, + "claims": claim_extraction_results.claims, + "claim_count": len(claim_extraction_results.claims), + "reference_coverage": claim_extraction_results.reference_coverage, + "relevant_references_count": claim_extraction_results.relevant_references_count, + "input_questions_count": len(question_context.local_questions), + } + + # Initialize empty assertions - will be generated later for final questions only + question_attributes.update({ + "assertions": [], + "assertion_count": 0, + }) + results.append( Question( id=str(uuid.uuid4()), @@ -251,16 +345,7 @@ async def _agenerate_single_chain( question_type=QuestionType.DATA_GLOBAL, embedding=await self.text_embedder.embed_raw_text(question), references=question_context.local_questions, - attributes={ - "abstract_categories": question_context.category, - "claims": claim_extraction_result.claims, - "claim_count": len(claim_extraction_result.claims), - "reference_coverage": claim_extraction_result.reference_coverage, - "relevant_references_count": claim_extraction_result.relevant_references_count, - "input_questions_count": len( - question_context.local_questions - ), - }, + attributes=question_attributes, ) ) return results diff --git a/benchmark_qed/autoq/question_gen/data_questions/local_question_gen.py b/benchmark_qed/autoq/question_gen/data_questions/local_question_gen.py index 2f3ea8e..e9e1a56 100644 --- a/benchmark_qed/autoq/question_gen/data_questions/local_question_gen.py +++ b/benchmark_qed/autoq/question_gen/data_questions/local_question_gen.py @@ -1,22 +1,20 @@ # Copyright (c) 2025 Microsoft Corporation. 
"""Data-local question generation module.""" +from __future__ import annotations + import asyncio import json import logging import math import uuid from pathlib import Path -from string import Template -from typing import Any +from typing import TYPE_CHECKING, Any from tqdm.asyncio import tqdm_asyncio import benchmark_qed.config.defaults as defs -from benchmark_qed.autod.data_model.text_unit import TextUnit -from benchmark_qed.autod.data_processor.embedding import TextEmbedder from benchmark_qed.autod.data_processor.text_utils import try_parse_json_object -from benchmark_qed.autod.sampler.clustering.cluster import TextCluster from benchmark_qed.autod.sampler.enums import ClusterRepresentativeSelectionType from benchmark_qed.autod.sampler.neighboring.semantic_neighbors import ( compute_intra_inter_references_similarity_ratio, @@ -28,12 +26,24 @@ from benchmark_qed.autoq.data_model.question import Question from benchmark_qed.autoq.prompts.data_questions import local_questions from benchmark_qed.autoq.question_gen.base import BaseQuestionGen, QuestionGenResult +from benchmark_qed.autoq.question_gen.data_questions.assertion_gen import ( + AssertionValidator, + LocalClaimAssertionGenerator, +) from benchmark_qed.autoq.question_gen.data_questions.claim_extractor.local_claim_extractor import ( DataLocalClaimExtractor, ) from benchmark_qed.autoq.sampler.question_sampler import QuestionSampler from benchmark_qed.config.utils import load_template_file -from benchmark_qed.llm.type.base import ChatModel + +if TYPE_CHECKING: + from string import Template + + from benchmark_qed.autod.data_model.text_unit import TextUnit + from benchmark_qed.autod.data_processor.embedding import TextEmbedder + from benchmark_qed.autod.sampler.clustering.cluster import TextCluster + from benchmark_qed.autoq.config import AssertionConfig, AssertionPromptConfig + from benchmark_qed.llm.type.base import ChatModel log: logging.Logger = logging.getLogger(__name__) @@ -41,7 +51,13 @@ class DataLocalQuestionGen(BaseQuestionGen): - """Generate local data questions for a given dataset.""" + """ + Generate local data questions for a given dataset. + + Supports optional assertion generation after claim extraction to create + testable facts that can be used for answer accuracy evaluation. Configure + assertion generation using the assertion_config parameter. + """ def __init__( self, @@ -50,6 +66,8 @@ def __init__( text_units: list[TextUnit], question_sampler: QuestionSampler | None = None, claim_extractor_params: dict[str, Any] | None = None, + assertion_config: AssertionConfig | None = None, + assertion_prompt_config: AssertionPromptConfig | None = None, llm_params: dict[str, Any] = defs.LLM_PARAMS, json_mode: bool = True, generation_system_prompt: Template | None = None, @@ -58,8 +76,52 @@ def __init__( concurrent_coroutines: int = 32, random_seed: int = defs.RANDOM_SEED, ) -> None: + """ + Initialize the DataLocalQuestionGen. + + Parameters + ---------- + llm : ChatModel + The language model to use. + text_embedder : TextEmbedder + Text embedder for computing embeddings. + text_units : list[TextUnit] + The text units to generate questions from. + question_sampler : QuestionSampler | None + Optional sampler for selecting questions. + claim_extractor_params : dict[str, Any] | None + Parameters for the claim extractor. + assertion_config : AssertionConfig | None + Configuration for assertion generation and validation. If None, uses default + configuration from AssertionConfig(). Set max_assertions to 0 to disable. 
+ assertion_prompt_config : AssertionPromptConfig | None + Custom prompts for assertion generation. If None, uses default prompts. + llm_params : dict[str, Any] + Parameters for the LLM. + json_mode : bool + Whether to use JSON mode. + generation_system_prompt : Template | None + Custom system prompt for question generation. + generation_user_prompt : Template | None + Custom user prompt for question generation. + expansion_system_prompt : Template | None + Custom prompt for question expansion. + concurrent_coroutines : int + Number of concurrent coroutines. + random_seed : int + Random seed for reproducibility. + """ + # Import here to avoid circular imports + from benchmark_qed.autoq.config import AssertionConfig, AssertionPromptConfig + if claim_extractor_params is None: claim_extractor_params = {} + if assertion_config is None: + assertion_config = AssertionConfig() + if assertion_prompt_config is None: + assertion_prompt_config = AssertionPromptConfig() + + self.assertion_config = assertion_config self.random_seed = random_seed if question_sampler is not None: question_sampler.random_seed = random_seed @@ -84,6 +146,31 @@ def __init__( llm=llm, **claim_extractor_params ) + # Initialize assertion generator if enabled (max_assertions != 0) + self.assertion_generator: LocalClaimAssertionGenerator | None = None + local_assertion_config = assertion_config.local + max_assertions = local_assertion_config.max_assertions + if max_assertions is None or max_assertions > 0: + # Create validator if validation is enabled + validator = None + if local_assertion_config.enable_validation: + validator = AssertionValidator( + llm=llm, + llm_params=llm_params, + min_criterion_score=local_assertion_config.min_validation_score, + validation_prompt=assertion_prompt_config.local_validation_prompt.template, + concurrent_validations=local_assertion_config.concurrent_llm_calls, + ) + + self.assertion_generator = LocalClaimAssertionGenerator( + llm=llm, + llm_params=llm_params, + max_assertions=max_assertions, + validator=validator, + system_prompt=assertion_prompt_config.local_assertion_gen_prompt.template, + max_concurrent_questions=local_assertion_config.max_concurrent_questions, + ) + self.json_mode = json_mode if json_mode: self.llm_params["response_format"] = {"type": "json_object"} @@ -146,6 +233,23 @@ async def agenerate( msg = f"Generated {len(results)} candidate questions from {len(text_clusters)} clusters." 
log.info(msg) final_questions = self.select(candidate_questions=results, top_k=num_questions) + + # Generate assertions only for the final selected questions + max_assertions = ( + self.assertion_config.local.max_assertions + if self.assertion_config + else None + ) + if ( + max_assertions is None or max_assertions > 0 + ) and self.assertion_generator is not None: + log.info( + "Generating assertions for %s final questions", len(final_questions) + ) + await self.assertion_generator.agenerate_assertions_for_questions( + final_questions + ) + return QuestionGenResult( selected_questions=final_questions, candidate_questions=results, @@ -230,6 +334,9 @@ async def _agenerate_single_chain( ) if claims.reference_coverage > 0: + # Initialize empty assertions - will be generated later for final questions only + assertions = [] + # calculate the similarity of the question to the references ( question_embedding, @@ -278,6 +385,10 @@ async def _agenerate_single_chain( "intra_inter_similarity_ratio": intra_inter_similarity_ratio, "claim_count": len(claims.claims), "claims": claims.claims, + "assertions": assertions or [], + "assertion_count": len(assertions) + if assertions + else 0, }, ) ) diff --git a/benchmark_qed/cli/init_config.py b/benchmark_qed/cli/init_config.py index f0f9b90..ea4f020 100644 --- a/benchmark_qed/cli/init_config.py +++ b/benchmark_qed/cli/init_config.py @@ -21,6 +21,9 @@ from benchmark_qed.autoq.prompts.activity_questions import ( local_questions as activity_local_prompts, ) +from benchmark_qed.autoq.prompts.data_questions import ( + assertions as autoq_assertion_prompts, +) from benchmark_qed.autoq.prompts.data_questions import ( global_questions as data_global_prompts, ) @@ -177,6 +180,35 @@ class ConfigType(StrEnum): prompt: prompts/data_questions/data_local/data_local_expansion_system_prompt.txt data_local_gen_user_prompt: prompt: prompts/data_questions/data_local/data_local_gen_user_prompt.txt + +## Assertion Generation Configuration +assertions: + local: + max_assertions: 20 # Maximum assertions per question. Set to 0 to disable, or null for unlimited. + enable_validation: true # Enable to filter low-quality assertions. + min_validation_score: 3 # Minimum score (1-5) for an assertion to pass validation. + concurrent_llm_calls: 8 # Concurrent LLM calls for validation. + max_concurrent_questions: 8 # Parallel questions for assertion generation. Set to 1 for sequential. + global: + max_assertions: 20 # Maximum assertions per question. Set to 0 to disable, or null for unlimited. + enable_validation: true # Enable to filter low-quality assertions. + min_validation_score: 3 # Minimum score (1-5) for an assertion to pass validation. + batch_size: 50 # Batch size for map-reduce processing. + max_data_tokens: 32000 # Maximum input tokens for the reduce step. + concurrent_llm_calls: 8 # Concurrent LLM calls for batch processing and validation. + max_concurrent_questions: 2 # Parallel questions for assertion generation. Set to 1 for sequential. 
+ +assertion_prompts: + local_assertion_gen_prompt: + prompt: prompts/data_questions/assertions/local_claim_assertion_gen_prompt.txt + global_assertion_map_prompt: + prompt: prompts/data_questions/assertions/global_claim_assertion_map_prompt.txt + global_assertion_reduce_prompt: + prompt: prompts/data_questions/assertions/global_claim_assertion_reduce_prompt.txt + local_validation_prompt: + prompt: prompts/data_questions/assertions/local_validation_prompt.txt + global_validation_prompt: + prompt: prompts/data_questions/assertions/global_validation_prompt.txt """ AUTOE_ASSERTION_CONTENT = f"""## Input Configuration @@ -318,6 +350,10 @@ def init( Path(data_questions_prompts.__file__).parent, prompts_folder / "data_questions", ) + __copy_prompts( + Path(autoq_assertion_prompts.__file__).parent, + prompts_folder / "data_questions" / "assertions", + ) case ConfigType.autoe_pairwise: settings.write_text(AUTOE_PAIRWISE_CONTENT, encoding="utf-8") __copy_prompts(Path(pairwise_prompts.__file__).parent, prompts_folder) diff --git a/benchmark_qed/config/defaults.py b/benchmark_qed/config/defaults.py index 10100ea..ca72ff6 100644 --- a/benchmark_qed/config/defaults.py +++ b/benchmark_qed/config/defaults.py @@ -11,7 +11,7 @@ CHUNK_SIZE = 600 CHUNK_OVERLAP = 100 EMBEDDING_BATCH_SIZE = 32 -NUM_CLUSTERS = 100 +NUM_CLUSTERS = 50 NUM_SAMPLES_PER_CLUSTER = 10 RANDOM_SEED = 42 @@ -26,3 +26,19 @@ # AutoQ defaults NUM_QUESTIONS = 50 OVERSAMPLE_FACTOR = 2.0 +CONCURRENT_REQUESTS = 8 + +# Activity question defaults +NUM_PERSONAS = 5 +NUM_TASKS_PER_PERSONA = 5 +NUM_ENTITIES_PER_TASK = 10 + +# Assertion generation defaults +MAX_ASSERTIONS = 20 +ENABLE_ASSERTION_VALIDATION = True +MIN_ASSERTION_VALIDATION_SCORE = 3 +ASSERTION_BATCH_SIZE = 50 +ASSERTION_MAX_DATA_TOKENS = 32000 +ASSERTION_CONCURRENT_LLM_CALLS = 8 +ASSERTION_MAX_CONCURRENT_LOCAL_QUESTIONS = 8 +ASSERTION_MAX_CONCURRENT_GLOBAL_QUESTIONS = 2 diff --git a/benchmark_qed/config/utils.py b/benchmark_qed/config/utils.py index ac6cfbd..5c60a1d 100644 --- a/benchmark_qed/config/utils.py +++ b/benchmark_qed/config/utils.py @@ -93,4 +93,4 @@ def load_template_file(file_path: Path) -> Template: if not file_path.exists(): msg = f"Template file {file_path} does not exist." raise FileNotFoundError(msg) - return Template(file_path.read_text()) + return Template(file_path.read_text(encoding="utf-8")) diff --git a/docs/cli/autoq.md b/docs/cli/autoq.md index eba81fa..6885e2e 100644 --- a/docs/cli/autoq.md +++ b/docs/cli/autoq.md @@ -76,6 +76,57 @@ Configuration for sampling data from clusters. --- +#### `AssertionConfig` +Configuration for assertion generation with separate settings for local and global questions. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `local` | `LocalAssertionConfig` | _(see below)_ | Configuration for local assertion generation. | +| `global` | `GlobalAssertionConfig` | _(see below)_ | Configuration for global assertion generation. | + +--- + +#### `LocalAssertionConfig` +Configuration for local assertion generation. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `max_assertions` | `int \| None` | `20` | Maximum assertions per question. Set to `0` to disable, or `None` for unlimited. | +| `enable_validation` | `bool` | `True` | Whether to validate assertions against source data. | +| `min_validation_score` | `int` | `3` | Minimum score (1-5) for grounding, relevance, and verifiability. | +| `concurrent_llm_calls` | `int` | `8` | Concurrent LLM calls for validation. 
| +| `max_concurrent_questions` | `int \| None` | `8` | Questions to process in parallel. Set to `1` for sequential. | + +--- + +#### `GlobalAssertionConfig` +Configuration for global assertion generation. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `max_assertions` | `int \| None` | `20` | Maximum assertions per question. Set to `0` to disable, or `None` for unlimited. | +| `enable_validation` | `bool` | `True` | Whether to validate assertions against source data. | +| `min_validation_score` | `int` | `3` | Minimum score (1-5) for grounding, relevance, and verifiability. | +| `batch_size` | `int` | `50` | Batch size for map-reduce claim processing. | +| `max_data_tokens` | `int` | `32000` | Maximum input tokens for the reduce step. | +| `concurrent_llm_calls` | `int` | `8` | Concurrent LLM calls for batch processing and validation. | +| `max_concurrent_questions` | `int \| None` | `2` | Questions to process in parallel. Set to `1` for sequential. | + +--- + +#### `AssertionPromptConfig` +Configuration for assertion generation prompts. Each prompt can be specified as a file path or direct text. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `local_assertion_gen_prompt` | `PromptConfig` | _(default file)_ | Prompt for generating assertions from local claims. | +| `global_assertion_map_prompt` | `PromptConfig` | _(default file)_ | Prompt for the map step in global assertion generation. | +| `global_assertion_reduce_prompt` | `PromptConfig` | _(default file)_ | Prompt for the reduce step in global assertion generation. | +| `local_validation_prompt` | `PromptConfig` | _(default file)_ | Prompt for validating local assertions (fact-focused) against source data. | +| `global_validation_prompt` | `PromptConfig` | _(default file)_ | Prompt for validating global assertions (theme-focused) against source data. | + +--- + #### `QuestionGenerationConfig` Top-level configuration for the entire question generation process. @@ -91,6 +142,8 @@ Top-level configuration for the entire question generation process. | `sampling` | `SamplingConfig` | `SamplingConfig()` | Sampling configuration. | | `chat_model` | `LLMConfig` | `LLMConfig()` | LLM configuration for chat. | | `embedding_model` | `LLMConfig` | `LLMConfig()` | LLM configuration for embeddings. | +| `assertions` | `AssertionConfig` | `AssertionConfig()` | Assertion generation configuration. | +| `assertion_prompts` | `AssertionPromptConfig` | `AssertionPromptConfig()` | Assertion prompt configuration. | --- @@ -150,6 +203,35 @@ activity_global: num_personas: 5 num_tasks_per_persona: 2 num_entities_per_task: 5 + +## Assertion Generation Configuration +assertions: + local: + max_assertions: 20 # Set to 0 to disable, or null/None for unlimited + enable_validation: true # Enable to filter low-quality assertions + min_validation_score: 3 # Minimum score (1-5) to pass validation + concurrent_llm_calls: 8 # Concurrent LLM calls for validation + max_concurrent_questions: 8 # Parallel questions for assertion generation. Set to 1 for sequential. + global: + max_assertions: 20 + enable_validation: true + min_validation_score: 3 + batch_size: 50 # Batch size for map-reduce processing + max_data_tokens: 32000 # Max tokens for reduce step + concurrent_llm_calls: 8 # Concurrent LLM calls for batch processing/validation + max_concurrent_questions: 2 # Parallel questions for assertion generation. Set to 1 for sequential. 
+ +assertion_prompts: + local_assertion_gen_prompt: + prompt: prompts/data_questions/assertions/local_claim_assertion_gen_prompt.txt + global_assertion_map_prompt: + prompt: prompts/data_questions/assertions/global_claim_assertion_map_prompt.txt + global_assertion_reduce_prompt: + prompt: prompts/data_questions/assertions/global_claim_assertion_reduce_prompt.txt + local_validation_prompt: + prompt: prompts/data_questions/assertions/local_validation_prompt.txt + global_validation_prompt: + prompt: prompts/data_questions/assertions/global_validation_prompt.txt ``` ```markdown @@ -159,6 +241,59 @@ OPENAI_API_KEY=your-secret-api-key-here >💡 Note: The api_key field uses an environment variable reference `${OPENAI_API_KEY}`. Make sure to define this variable in a .env file or your environment before running the application. +--- + +## Assertion Generation + +Assertions are testable factual statements derived from extracted claims that can be used as "unit tests" to evaluate the accuracy of RAG system answers. Each question can have multiple assertions that verify specific facts the answer should contain. + +### How Assertions Work + +1. **Claim Extraction**: During question generation, claims (factual statements) are extracted from the source text. +2. **Assertion Generation**: Claims are transformed into testable assertions with clear pass/fail criteria. +3. **Optional Validation**: Assertions can be validated against source data to filter out low-quality assertions. + +### Assertion Types + +- **Local Assertions**: Generated for `data_local` questions from claims extracted from individual text chunks. +- **Global Assertions**: Generated for `data_global` questions using a map-reduce approach across multiple source documents. + +### Validation + +When `enable_validation` is set to `true`, each assertion is scored on three criteria (1-5 scale): + +| Criterion | Description | +|-----------|-------------| +| **Grounding** | Is the assertion factually supported by the source data? | +| **Relevance** | Is the assertion relevant to the question being asked? | +| **Verifiability** | Can the assertion be objectively verified from an answer? | + +Assertions must meet the `min_validation_score` threshold on all three criteria to be included. + +### Controlling Assertion Limits + +To disable assertion generation entirely, set `max_assertions: 0` for both local and global: + +```yaml +assertions: + local: + max_assertions: 0 + global: + max_assertions: 0 +``` + +To generate unlimited assertions (no cap), set `max_assertions: null`: + +```yaml +assertions: + local: + max_assertions: null # or omit to use default of 20 + global: + max_assertions: null +``` + +--- + ## Providing Prompts: File or Text Prompts for question generation can be provided in two ways, as defined by the `PromptConfig` class: diff --git a/docs/developing.md b/docs/developing.md index c1811d4..f58fabe 100644 --- a/docs/developing.md +++ b/docs/developing.md @@ -47,6 +47,13 @@ Follow these steps to generate synthetic queries using AutoQ: ``` This will process your input data and save the generated queries in the `output` directory. + By default, AutoQ also generates **assertions** for data-driven queries. Assertions are testable factual statements that can be used to evaluate answer accuracy. 
You can configure assertion generation in `settings.yaml`: + ```yaml + assertions: + max_assertions: 20 # Set to 0 to disable, or null for unlimited + enable_validation: true # Enable to filter low-quality assertions (can be slow) + ``` + ## Comparing RAG answer pairs Follow these steps to compare RAG answer pairs using the pairwise scoring pipeline: diff --git a/docs/index.md b/docs/index.md index 7416b2b..ff8f114 100644 --- a/docs/index.md +++ b/docs/index.md @@ -47,7 +47,13 @@ The AutoQ component generates four synthetic query classes based on the scope an AutoQ can be configured to generate any number and distribution of synthetic queries along these classes. -> **Note:** AutoQ generates queries only; it does **not** produce reference (ground truth) answers for these queries. +In addition to queries, AutoQ can generate **assertions**—testable factual statements that serve as "unit tests" for evaluating answer accuracy. Assertions are automatically generated for data-driven queries (not available for activity-driven queries), and can optionally be validated against source data to ensure quality. + +> **Note:** AutoQ generates queries and assertions only; it does **not** produce reference (ground truth) answers. Since assertions are generated (and optionally validated) by LLMs, they may contain inaccuracies and should be manually reviewed before use. + +> **Tip: Adding Assertions to Existing Questions** +> +> If you have existing data-local or data-global questions that were generated without assertions, you can generate assertions for them retrospectively. Please refer to the [Assertion Generation Notebook](notebooks/assertion_gen.ipynb) for step-by-step instructions. #### Example @@ -90,7 +96,7 @@ AutoE automates the evaluation of RAG methods using the LLM-as-a-Judge approach. 2. **Reference-based scoring**: When reference answers (such as ground truth or "gold standard" responses) are available, AutoE can evaluate RAG-generated answers against these references using [default metrics](https://github.com/microsoft/benchmark-qed/blob/799b78b6716a8f24fcd354b89a37b429ba1e587a/benchmark_qed/config/model/score.py#L50) like correctness, completeness, or other user-defined criteria on a customizable scoring scale. -3. **Assertion-based scoring**: AutoE can evaluate RAG answers against user-defined assertions. The LLM judge determines whether each answer satisfies the given assertions, providing binary pass/fail scores. This method is particularly useful for testing specific requirements or compliance with predefined criteria. +3. **Assertion-based scoring**: AutoE can evaluate RAG answers against assertions (either user-defined or automatically generated by AutoQ). The LLM judge determines whether each answer satisfies the given assertions, providing binary pass/fail scores. This method is particularly useful for testing specific factual requirements or compliance with predefined criteria. When using AutoQ-generated assertions, this enables automated end-to-end evaluation of answer accuracy for data-local and data-global questions. 
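+
+   For illustration, here is a minimal, hypothetical sketch of the inputs this scoring mode consumes. The column names (`question`, `answer`, `assertion`) are assumptions based on the assertion-scoring utilities added in this change; the exact schema and the linkage between questions and assertions may differ in practice:
+
+   ```python
+   import pandas as pd
+
+   # Answers produced by the RAG system under test (one row per question).
+   answers = pd.DataFrame([
+       {"question": "What caused the July outage?", "answer": "A faulty router update ..."},
+   ])
+
+   # Testable assertions for the same question, either hand-written or generated by AutoQ.
+   assertions = pd.DataFrame([
+       {"assertion": "The answer attributes the outage to a router update."},
+       {"assertion": "The answer states that service was restored within two hours."},
+   ])
+
+   # The LLM judge checks each assertion against the answer and records a
+   # binary pass/fail result (see benchmark_qed.autoe.assertion_scores).
+   ```
+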
> **Choosing the Right LLM Judge** > diff --git a/docs/notebooks/assertion_gen.ipynb b/docs/notebooks/assertion_gen.ipynb new file mode 100644 index 0000000..0407722 --- /dev/null +++ b/docs/notebooks/assertion_gen.ipynb @@ -0,0 +1,351 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9134042d", + "metadata": {}, + "source": [ + "# Assertion Generation for Existing Questions\n", + "\n", + "This notebook demonstrates how to generate assertions for existing data-local and data-global questions that were previously generated without assertions (e.g., when `max_assertions=0` was used or assertions were disabled during question generation).\n", + "\n", + "This is useful when you want to retroactively add assertion-based evaluation capabilities to existing question sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b919966", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (c) 2025 Microsoft Corporation.\n", + "\n", + "import sys\n", + "\n", + "sys.path.insert(1, \"../../../\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "991bef76", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext dotenv\n", + "%dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d056812", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import os\n", + "\n", + "from pydantic import SecretStr\n", + "\n", + "from benchmark_qed.autoq.io.question import (\n", + " load_questions,\n", + " save_questions,\n", + ")\n", + "from benchmark_qed.autoq.question_gen.data_questions.assertion_gen import (\n", + " AssertionValidator,\n", + ")\n", + "from benchmark_qed.config.llm_config import LLMConfig, LLMProvider\n", + "from benchmark_qed.llm.factory import ModelFactory\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "if logging.getLogger(\"httpx\") is not None:\n", + " logging.getLogger(\"httpx\").setLevel(logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "id": "626735ae", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bc61323", + "metadata": {}, + "outputs": [], + "source": [ + "# DATA CONFIGS\n", + "OUTPUT_QUESTIONS_PATH = \"../../output/AP_news/questions\"\n", + "\n", + "# MODEL CONFIGS\n", + "API_KEY = SecretStr(os.getenv(\"OPENAI_API_KEY\", \"\"))\n", + "LLM_MODEL = \"gpt-4.1\"\n", + "LLM_PARAMS = {\n", + " \"temperature\": 0.0,\n", + " \"seed\": 42,\n", + "} # adjust this based on your model. For example, some reasoning models do not support temperature settings\n", + "\n", + "# CONCURRENCY CONFIGS\n", + "CONCURRENT_REQUESTS = 8 # Concurrent LLM calls. 
Adjust based on your model capacity.\n", + "\n", + "# ASSERTION GENERATION CONFIGS\n", + "MAX_ASSERTIONS = 20 # Maximum number of assertions per question\n", + "BATCH_SIZE = 100 # Batch size for processing claims in global assertion generation\n", + "MAX_DATA_TOKENS = (\n", + " 32000 # Maximum input data tokens for the reduce step in global assertions\n", + ")\n", + "ENABLE_VALIDATION = True # Set to True to validate assertions against sources\n", + "MIN_VALIDATION_SCORE = 3 # Minimum score (1-5) for grounding, relevance, verifiability\n", + "\n", + "# Parallelism for assertion generation (adjust based on your model rate limits)\n", + "CONCURRENT_LOCAL_QUESTIONS = 8 # Questions to process in parallel for local assertions\n", + "CONCURRENT_GLOBAL_QUESTIONS = 2 # Questions to process in parallel for global assertions (lower due to internal parallelism, set to 1 for sequential)\n", + "\n", + "llm = ModelFactory.create_chat_model(\n", + " model_config=LLMConfig(\n", + " model=LLM_MODEL,\n", + " api_key=API_KEY,\n", + " llm_provider=LLMProvider.OpenAIChat,\n", + " call_args=LLM_PARAMS,\n", + " )\n", + ")\n", + "\n", + "# Create validator if validation is enabled\n", + "# Validator checks assertions for:\n", + "# - Grounding: Is the assertion supported by source texts?\n", + "# - Relevance: Is it useful for evaluating the question?\n", + "# - Verifiability: Is it clear and testable?\n", + "validator = (\n", + " AssertionValidator(\n", + " llm=llm,\n", + " llm_params=LLM_PARAMS,\n", + " min_criterion_score=MIN_VALIDATION_SCORE,\n", + " concurrent_validations=CONCURRENT_REQUESTS,\n", + " )\n", + " if ENABLE_VALIDATION\n", + " else None\n", + ")\n", + "\n", + "if validator:\n", + " print(f\"Validation enabled (min score: {MIN_VALIDATION_SCORE}/5)\")\n", + "else:\n", + " print(\"Validation disabled\")" + ] + }, + { + "cell_type": "markdown", + "id": "77c3ccb2", + "metadata": {}, + "source": [ + "## Generate Assertions for Existing Data-Local Questions\n", + "\n", + "Generate assertions for data-local questions that were previously created without assertions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b1f44c1", + "metadata": {}, + "outputs": [], + "source": [ + "from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.local_claim_assertion_gen import (\n", + " LocalClaimAssertionGenerator,\n", + ")\n", + "\n", + "# Load existing data-local questions from disk\n", + "# Replace with your actual path to existing questions\n", + "existing_local_questions = load_questions(\n", + " f\"{OUTPUT_QUESTIONS_PATH}/data_local_questions/selected_questions.json\"\n", + ")\n", + "\n", + "print(f\"Loaded {len(existing_local_questions)} existing data-local questions\")\n", + "\n", + "# Filter questions that have claims\n", + "questions_with_claims = [\n", + " q\n", + " for q in existing_local_questions\n", + " if hasattr(q, \"attributes\") and q.attributes and \"claims\" in q.attributes\n", + "]\n", + "questions_without_claims = [\n", + " q\n", + " for q in existing_local_questions\n", + " if not (hasattr(q, \"attributes\") and q.attributes and \"claims\" in q.attributes)\n", + "]\n", + "\n", + "print(\n", + " f\"Questions with claims: {len(questions_with_claims)}, without claims: {len(questions_without_claims)}\"\n", + ")\n", + "\n", + "# Initialize local assertion generator with optional validator\n", + "local_assertion_generator = LocalClaimAssertionGenerator(\n", + " llm=llm,\n", + " max_assertions=MAX_ASSERTIONS,\n", + " validator=validator, # Pass validator for quality filtering (None to skip)\n", + " max_concurrent_questions=CONCURRENT_LOCAL_QUESTIONS, # Process questions in parallel\n", + ")\n", + "\n", + "# Generate assertions for all questions with claims (parallel processing)\n", + "await local_assertion_generator.agenerate_assertions_for_questions(\n", + " questions_with_claims\n", + ")\n", + "\n", + "# Combine back with questions that had no claims\n", + "updated_local_questions = questions_with_claims + questions_without_claims\n", + "\n", + "# Save updated questions with assertions\n", + "save_questions(\n", + " updated_local_questions,\n", + " f\"{OUTPUT_QUESTIONS_PATH}/data_local_questions/\",\n", + " \"selected_questions_with_assertions\",\n", + ")\n", + "\n", + "print(f\"Saved {len(updated_local_questions)} data-local questions with assertions\")" + ] + }, + { + "cell_type": "markdown", + "id": "6585003a", + "metadata": {}, + "source": [ + "## Generate Assertions for Existing Data-Global Questions\n", + "\n", + "Generate assertions for data-global questions that were previously created without assertions. Global assertion generation uses a map-reduce approach, first generating local assertions from referenced questions, then consolidating them into global assertions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1d04ceb", + "metadata": {}, + "outputs": [], + "source": [ + "from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.global_claim_assertion_gen import (\n", + " GlobalClaimAssertionGenerator,\n", + ")\n", + "\n", + "# Load existing data-global questions from disk\n", + "# Replace with your actual path to existing questions\n", + "existing_global_questions = load_questions(\n", + " f\"{OUTPUT_QUESTIONS_PATH}/data_global_questions/selected_questions.json\"\n", + ")\n", + "\n", + "print(f\"Loaded {len(existing_global_questions)} existing data-global questions\")\n", + "\n", + "# Filter questions that have claims\n", + "questions_with_claims = [\n", + " q\n", + " for q in existing_global_questions\n", + " if hasattr(q, \"attributes\") and q.attributes and \"claims\" in q.attributes\n", + "]\n", + "questions_without_claims = [\n", + " q\n", + " for q in existing_global_questions\n", + " if not (hasattr(q, \"attributes\") and q.attributes and \"claims\" in q.attributes)\n", + "]\n", + "\n", + "print(\n", + " f\"Questions with claims: {len(questions_with_claims)}, without claims: {len(questions_without_claims)}\"\n", + ")\n", + "\n", + "# Initialize global assertion generator with optional validator\n", + "global_assertion_generator = GlobalClaimAssertionGenerator(\n", + " llm=llm,\n", + " max_assertions=MAX_ASSERTIONS,\n", + " batch_size=BATCH_SIZE, # Batch size for processing multiple claims\n", + " max_data_tokens=MAX_DATA_TOKENS, # max input data tokens for the reduce step\n", + " concurrent_coroutines=CONCURRENT_REQUESTS,\n", + " validator=validator, # Pass validator for quality filtering (None to skip)\n", + " max_concurrent_questions=CONCURRENT_GLOBAL_QUESTIONS, # Process questions in parallel (lower due to internal parallelism)\n", + ")\n", + "\n", + "# Generate assertions for all questions with claims (parallel processing)\n", + "await global_assertion_generator.agenerate_assertions_for_questions(\n", + " questions_with_claims\n", + ")\n", + "\n", + "# Combine back with questions that had no claims\n", + "updated_global_questions = questions_with_claims + questions_without_claims\n", + "\n", + "# Save updated questions with assertions\n", + "save_questions(\n", + " updated_global_questions,\n", + " f\"{OUTPUT_QUESTIONS_PATH}/data_global_questions/\",\n", + " \"selected_questions_with_assertions\",\n", + ")\n", + "\n", + "print(f\"Saved {len(updated_global_questions)} data-global questions with assertions\")" + ] + }, + { + "cell_type": "markdown", + "id": "7f23ef6c", + "metadata": {}, + "source": [ + "## Notes on Assertion Generation\n", + "\n", + "**When to use this approach:**\n", + "- You have existing questions that were generated with `max_assertions=0` or without assertion generation\n", + "- You want to add evaluation capabilities to previously generated question sets\n", + "- You need to regenerate assertions with different parameters or improved prompts\n", + "\n", + "**Input Requirements:**\n", + "- Questions must have `claims` in their `attributes` field\n", + "- For data-local questions: claims should be a list of claim dictionaries\n", + "- For data-global questions: claims can be in various formats (simple or complex)\n", + "\n", + "**Output Format:**\n", + "- Assertions are added to the question's `attributes.assertions` field\n", + "- Each assertion contains a `statement` that can be used for evaluation\n", + "- Questions without valid claims are left unchanged\n", + "\n", + 
"**Configuration Options:**\n", + "- `MAX_ASSERTIONS`: Maximum number of assertions to generate per question (default: 20)\n", + "- `ENABLE_VALIDATION`: Set to `True` to validate assertions for quality (default: True)\n", + "- `MIN_VALIDATION_SCORE`: Minimum score (1-5) for validation criteria (default: 3)\n", + "- `BATCH_SIZE`: For global questions, controls how many claims are processed together \n", + "- `MAX_DATA_TOKENS`: For global questions, controls the max input data tokens in the reduce step\n", + "- `CONCURRENT_REQUESTS`: Controls parallel LLM calls for batch processing and validation\n", + "\n", + "**Parallelism Settings:**\n", + "- `CONCURRENT_LOCAL_QUESTIONS`: Questions to process in parallel for local assertions (default: 8)\n", + "- `CONCURRENT_GLOBAL_QUESTIONS`: Questions to process in parallel for global assertions (default: 2, lower due to internal parallelism; set to `1` for sequential)\n", + "\n", + "**Validation:**\n", + "When `ENABLE_VALIDATION=True`, each assertion is checked for:\n", + "- **Grounding**: Is the assertion factually supported by source texts?\n", + "- **Relevance**: Is the assertion useful for evaluating answers to the question?\n", + "- **Verifiability**: Is the assertion clear and objectively checkable?\n", + "\n", + "Assertions must score at least `MIN_VALIDATION_SCORE` on all three criteria to pass validation and be included in the final assertion set." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "benchmark-qed", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/docs/notebooks/autoe.ipynb b/docs/notebooks/autoe.ipynb index dca43b8..7642cbb 100644 --- a/docs/notebooks/autoe.ipynb +++ b/docs/notebooks/autoe.ipynb @@ -445,7 +445,7 @@ ], "metadata": { "kernelspec": { - "display_name": "benchmark-qed", + "display_name": "benchmark-qed (3.13.7)", "language": "python", "name": "python3" }, @@ -459,7 +459,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.13.7" } }, "nbformat": 4, diff --git a/docs/notebooks/autoq.ipynb b/docs/notebooks/autoq.ipynb index 7a07c4d..28b2e6d 100644 --- a/docs/notebooks/autoq.ipynb +++ b/docs/notebooks/autoq.ipynb @@ -86,11 +86,13 @@ " - Candidate local questions are generated for each target text region using a two-step (extract+expand) process\n", " - Candidate questions are clustered and ranked using semantic similarity-based metrics to select a smaller subset of best questions.\n", " - Relevant claims are extracted for each question based on the sources texts in the corresponding text region\n", + " - (Optional): *assertions* are generated for each question based on the extracted claims. These assertions can then be used to evaluate assertion-based accuracy of RAG methods.\n", " - Any abstract categories (e.g., themes) reflected by the sample text are captured\n", "3. 
*Data-driven global questions*\n", " - For each abstract category with 2+ local questions, generate a global question\n", " - Relevant claims are extracted for each global question by aggregating relevant claims from the referenced local questions.\n", " - Candidate questions are clustered and ranked using counts of extracted claims' references and input local questions to select a smaller subset of best questions.\n", + " - (Optional): *assertions* are generated for each selected question based on the extracted claims. These assertions can then be used to evaluate assertion-based accuracy of RAG methods.\n", "4. *Activity-driven local questions*\n", " - A dataset summary is generated from the sample texts using AutoD\n", " - A set of {persona, task, relevant entities} is generated based on the dataset summary and sample texts\n", @@ -169,6 +171,19 @@ " 8 # Control for request concurrency. Adjust this based on your model capacity.\n", ")\n", "\n", + "# ASSERTION GENERATION CONFIGS\n", + "MAX_ASSERTIONS = 20 # Maximum number of assertions per question (set to 0 to disable)\n", + "BATCH_SIZE = 100 # Batch size for processing claims in global assertion generation\n", + "MAX_DATA_TOKENS = (\n", + " 32000 # Maximum input data tokens for the reduce step in global assertions\n", + ")\n", + "ENABLE_VALIDATION = True # Set to True to validate assertions against sources\n", + "MIN_VALIDATION_SCORE = 3 # Minimum score (1-5) for grounding, relevance, verifiability\n", + "\n", + "# Parallelism for assertion generation (adjust based on your model rate limits)\n", + "CONCURRENT_LOCAL_QUESTIONS = 8 # Questions to process in parallel for local assertions\n", + "CONCURRENT_GLOBAL_QUESTIONS = 2 # Questions to process in parallel for global assertions (lower due to internal parallelism, set to 1 for sequential)\n", + "\n", "text_embedder = TextEmbedder(\n", " ModelFactory.create_embedding_model(\n", " LLMConfig(\n", @@ -243,6 +258,11 @@ "metadata": {}, "outputs": [], "source": [ + "from benchmark_qed.autoq.config import (\n", + " AssertionConfig,\n", + " GlobalAssertionConfig,\n", + " LocalAssertionConfig,\n", + ")\n", "from benchmark_qed.autoq.question_gen.data_questions.local_question_gen import (\n", " DataLocalQuestionGen,\n", ")\n", @@ -253,12 +273,40 @@ "sample_texts_df = pd.read_parquet(f\"{OUTPUT_DATA_PATH}/sample_texts.parquet\")\n", "sample_texts = load_text_units(df=sample_texts_df)\n", "\n", + "# Configure assertion generation using the assertion generation configs\n", + "# Set max_assertions to an integer to limit assertions, 0 to disable, None for unlimited\n", + "# Set enable_validation=True to validate assertions against source text\n", + "local_assertion_config = LocalAssertionConfig(\n", + " max_assertions=MAX_ASSERTIONS,\n", + " enable_validation=ENABLE_VALIDATION,\n", + " min_validation_score=MIN_VALIDATION_SCORE,\n", + " concurrent_llm_calls=CONCURRENT_REQUESTS,\n", + " max_concurrent_questions=CONCURRENT_LOCAL_QUESTIONS,\n", + ")\n", + "global_assertion_config = GlobalAssertionConfig(\n", + " max_assertions=MAX_ASSERTIONS,\n", + " enable_validation=ENABLE_VALIDATION,\n", + " min_validation_score=MIN_VALIDATION_SCORE,\n", + " batch_size=BATCH_SIZE,\n", + " max_data_tokens=MAX_DATA_TOKENS,\n", + " concurrent_llm_calls=CONCURRENT_REQUESTS,\n", + " max_concurrent_questions=CONCURRENT_GLOBAL_QUESTIONS,\n", + ")\n", + "assertion_config = AssertionConfig(\n", + " local=local_assertion_config,\n", + " **{\n", + " \"global\": global_assertion_config\n", + " }, # Use dict unpacking since 
\"global\" is a keyword\n", + ")\n", + "\n", "data_local_generator = DataLocalQuestionGen(\n", " llm=llm,\n", + " llm_params=LLM_PARAMS,\n", " text_embedder=text_embedder,\n", " text_units=sample_texts,\n", " concurrent_coroutines=CONCURRENT_REQUESTS,\n", " random_seed=RANDOM_SEED,\n", + " assertion_config=assertion_config,\n", ")\n", "\n", "data_local_question_results = await data_local_generator.agenerate(\n", @@ -313,12 +361,15 @@ ")\n", "print(f\"Loaded {len(local_questions)} candidate local questions.\")\n", "\n", + "# Reuse the same assertion config from data local questions\n", "data_global_generator = DataGlobalQuestionGen(\n", " llm=llm,\n", + " llm_params=LLM_PARAMS,\n", " text_embedder=text_embedder,\n", " local_questions=local_questions,\n", " concurrent_coroutines=CONCURRENT_REQUESTS,\n", " random_seed=RANDOM_SEED,\n", + " assertion_config=assertion_config,\n", ")\n", "\n", "data_global_question_results = await data_global_generator.agenerate(\n", @@ -384,6 +435,7 @@ "\n", "activity_generator = ActivityContextGen(\n", " llm=llm,\n", + " llm_params=LLM_PARAMS,\n", " text_embedder=text_embedder,\n", " token_encoder=token_encoder,\n", " text_units=sample_texts,\n", @@ -434,6 +486,7 @@ "\n", "activity_local_generator = ActivityLocalQuestionGen(\n", " llm=llm,\n", + " llm_params=LLM_PARAMS,\n", " text_embedder=text_embedder,\n", " activity_context=activity_context,\n", " concurrent_coroutines=CONCURRENT_REQUESTS,\n", @@ -494,6 +547,7 @@ "\n", "activity_global_generator = ActivityGlobalQuestionGen(\n", " llm=llm,\n", + " llm_params=LLM_PARAMS,\n", " text_embedder=text_embedder,\n", " activity_context=activity_context,\n", " concurrent_coroutines=CONCURRENT_REQUESTS,\n", @@ -541,9 +595,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index da7f5fb..14725db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,8 @@ dependencies = [ "statsmodels>=0.14.4", "tiktoken>=0.9.0", "typer>=0.15.1", + "matplotlib>=3.10.5", + "seaborn>=0.13.2", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index 4e4e3a2..bfb18a4 100644 --- a/uv.lock +++ b/uv.lock @@ -234,6 +234,7 @@ dependencies = [ { name = "azure-identity" }, { name = "fastparquet" }, { name = "json-repair" }, + { name = "matplotlib" }, { name = "nest-asyncio" }, { name = "numpy" }, { name = "openai" }, @@ -244,6 +245,7 @@ dependencies = [ { name = "pyyaml" }, { name = "scikit-learn" }, { name = "scipy" }, + { name = "seaborn" }, { name = "statsmodels" }, { name = "tiktoken" }, { name = "typer" }, @@ -276,6 +278,7 @@ requires-dist = [ { name = "azure-identity", specifier = ">=1.22.0" }, { name = "fastparquet", specifier = ">=2024.11.0" }, { name = "json-repair", specifier = ">=0.44.1" }, + { name = "matplotlib", specifier = ">=3.10.5" }, { name = "nest-asyncio", specifier = ">=1.6.0" }, { name = "numpy", specifier = ">=2.2.0" }, { name = "openai", specifier = ">=1.68.2" }, @@ -286,6 +289,7 @@ requires-dist = [ { name = "pyyaml", specifier = ">=6.0.2" }, { name = "scikit-learn", specifier = ">=1.6.1" }, { name = "scipy", specifier = ">=1.14.1" }, + { name = "seaborn", specifier = ">=0.13.2" }, { name = "statsmodels", specifier = ">=0.14.4" }, { name = "tiktoken", specifier = ">=0.9.0" }, { name = "typer", specifier = ">=0.15.1" }, @@ -463,6 +467,88 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/e6/75/49e5bfe642f71f272236b5b2d2691cf915a7283cc0ceda56357b61daa538/comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3", size = 7180 }, ] +[[package]] +name = "contourpy" +version = "1.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/2e/c4390a31919d8a78b90e8ecf87cd4b4c4f05a5b48d05ec17db8e5404c6f4/contourpy-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:709a48ef9a690e1343202916450bc48b9e51c049b089c7f79a267b46cffcdaa1", size = 288773 }, + { url = "https://files.pythonhosted.org/packages/0d/44/c4b0b6095fef4dc9c420e041799591e3b63e9619e3044f7f4f6c21c0ab24/contourpy-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:23416f38bfd74d5d28ab8429cc4d63fa67d5068bd711a85edb1c3fb0c3e2f381", size = 270149 }, + { url = "https://files.pythonhosted.org/packages/30/2e/dd4ced42fefac8470661d7cb7e264808425e6c5d56d175291e93890cce09/contourpy-1.3.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:929ddf8c4c7f348e4c0a5a3a714b5c8542ffaa8c22954862a46ca1813b667ee7", size = 329222 }, + { url = "https://files.pythonhosted.org/packages/f2/74/cc6ec2548e3d276c71389ea4802a774b7aa3558223b7bade3f25787fafc2/contourpy-1.3.3-cp311-cp311-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9e999574eddae35f1312c2b4b717b7885d4edd6cb46700e04f7f02db454e67c1", size = 377234 }, + { url = "https://files.pythonhosted.org/packages/03/b3/64ef723029f917410f75c09da54254c5f9ea90ef89b143ccadb09df14c15/contourpy-1.3.3-cp311-cp311-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf67e0e3f482cb69779dd3061b534eb35ac9b17f163d851e2a547d56dba0a3a", size = 380555 }, + { url = "https://files.pythonhosted.org/packages/5f/4b/6157f24ca425b89fe2eb7e7be642375711ab671135be21e6faa100f7448c/contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51e79c1f7470158e838808d4a996fa9bac72c498e93d8ebe5119bc1e6becb0db", size = 355238 }, + { url = "https://files.pythonhosted.org/packages/98/56/f914f0dd678480708a04cfd2206e7c382533249bc5001eb9f58aa693e200/contourpy-1.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:598c3aaece21c503615fd59c92a3598b428b2f01bfb4b8ca9c4edeecc2438620", size = 1326218 }, + { url = "https://files.pythonhosted.org/packages/fb/d7/4a972334a0c971acd5172389671113ae82aa7527073980c38d5868ff1161/contourpy-1.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:322ab1c99b008dad206d406bb61d014cf0174df491ae9d9d0fac6a6fda4f977f", size = 1392867 }, + { url = "https://files.pythonhosted.org/packages/75/3e/f2cc6cd56dc8cff46b1a56232eabc6feea52720083ea71ab15523daab796/contourpy-1.3.3-cp311-cp311-win32.whl", hash = "sha256:fd907ae12cd483cd83e414b12941c632a969171bf90fc937d0c9f268a31cafff", size = 183677 }, + { url = "https://files.pythonhosted.org/packages/98/4b/9bd370b004b5c9d8045c6c33cf65bae018b27aca550a3f657cdc99acdbd8/contourpy-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:3519428f6be58431c56581f1694ba8e50626f2dd550af225f82fb5f5814d2a42", size = 225234 }, + { url = 
"https://files.pythonhosted.org/packages/d9/b6/71771e02c2e004450c12b1120a5f488cad2e4d5b590b1af8bad060360fe4/contourpy-1.3.3-cp311-cp311-win_arm64.whl", hash = "sha256:15ff10bfada4bf92ec8b31c62bf7c1834c244019b4a33095a68000d7075df470", size = 193123 }, + { url = "https://files.pythonhosted.org/packages/be/45/adfee365d9ea3d853550b2e735f9d66366701c65db7855cd07621732ccfc/contourpy-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b08a32ea2f8e42cf1d4be3169a98dd4be32bafe4f22b6c4cb4ba810fa9e5d2cb", size = 293419 }, + { url = "https://files.pythonhosted.org/packages/53/3e/405b59cfa13021a56bba395a6b3aca8cec012b45bf177b0eaf7a202cde2c/contourpy-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:556dba8fb6f5d8742f2923fe9457dbdd51e1049c4a43fd3986a0b14a1d815fc6", size = 273979 }, + { url = "https://files.pythonhosted.org/packages/d4/1c/a12359b9b2ca3a845e8f7f9ac08bdf776114eb931392fcad91743e2ea17b/contourpy-1.3.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92d9abc807cf7d0e047b95ca5d957cf4792fcd04e920ca70d48add15c1a90ea7", size = 332653 }, + { url = "https://files.pythonhosted.org/packages/63/12/897aeebfb475b7748ea67b61e045accdfcf0d971f8a588b67108ed7f5512/contourpy-1.3.3-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2e8faa0ed68cb29af51edd8e24798bb661eac3bd9f65420c1887b6ca89987c8", size = 379536 }, + { url = "https://files.pythonhosted.org/packages/43/8a/a8c584b82deb248930ce069e71576fc09bd7174bbd35183b7943fb1064fd/contourpy-1.3.3-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:626d60935cf668e70a5ce6ff184fd713e9683fb458898e4249b63be9e28286ea", size = 384397 }, + { url = "https://files.pythonhosted.org/packages/cc/8f/ec6289987824b29529d0dfda0d74a07cec60e54b9c92f3c9da4c0ac732de/contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d00e655fcef08aba35ec9610536bfe90267d7ab5ba944f7032549c55a146da1", size = 362601 }, + { url = "https://files.pythonhosted.org/packages/05/0a/a3fe3be3ee2dceb3e615ebb4df97ae6f3828aa915d3e10549ce016302bd1/contourpy-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:451e71b5a7d597379ef572de31eeb909a87246974d960049a9848c3bc6c41bf7", size = 1331288 }, + { url = "https://files.pythonhosted.org/packages/33/1d/acad9bd4e97f13f3e2b18a3977fe1b4a37ecf3d38d815333980c6c72e963/contourpy-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:459c1f020cd59fcfe6650180678a9993932d80d44ccde1fa1868977438f0b411", size = 1403386 }, + { url = "https://files.pythonhosted.org/packages/cf/8f/5847f44a7fddf859704217a99a23a4f6417b10e5ab1256a179264561540e/contourpy-1.3.3-cp312-cp312-win32.whl", hash = "sha256:023b44101dfe49d7d53932be418477dba359649246075c996866106da069af69", size = 185018 }, + { url = "https://files.pythonhosted.org/packages/19/e8/6026ed58a64563186a9ee3f29f41261fd1828f527dd93d33b60feca63352/contourpy-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:8153b8bfc11e1e4d75bcb0bff1db232f9e10b274e0929de9d608027e0d34ff8b", size = 226567 }, + { url = "https://files.pythonhosted.org/packages/d1/e2/f05240d2c39a1ed228d8328a78b6f44cd695f7ef47beb3e684cf93604f86/contourpy-1.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:07ce5ed73ecdc4a03ffe3e1b3e3c1166db35ae7584be76f65dbbe28a7791b0cc", size = 193655 }, + { url = "https://files.pythonhosted.org/packages/68/35/0167aad910bbdb9599272bd96d01a9ec6852f36b9455cf2ca67bd4cc2d23/contourpy-1.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:177fb367556747a686509d6fef71d221a4b198a3905fe824430e5ea0fda54eb5", 
size = 293257 }, + { url = "https://files.pythonhosted.org/packages/96/e4/7adcd9c8362745b2210728f209bfbcf7d91ba868a2c5f40d8b58f54c509b/contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d002b6f00d73d69333dac9d0b8d5e84d9724ff9ef044fd63c5986e62b7c9e1b1", size = 274034 }, + { url = "https://files.pythonhosted.org/packages/73/23/90e31ceeed1de63058a02cb04b12f2de4b40e3bef5e082a7c18d9c8ae281/contourpy-1.3.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:348ac1f5d4f1d66d3322420f01d42e43122f43616e0f194fc1c9f5d830c5b286", size = 334672 }, + { url = "https://files.pythonhosted.org/packages/ed/93/b43d8acbe67392e659e1d984700e79eb67e2acb2bd7f62012b583a7f1b55/contourpy-1.3.3-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:655456777ff65c2c548b7c454af9c6f33f16c8884f11083244b5819cc214f1b5", size = 381234 }, + { url = "https://files.pythonhosted.org/packages/46/3b/bec82a3ea06f66711520f75a40c8fc0b113b2a75edb36aa633eb11c4f50f/contourpy-1.3.3-cp313-cp313-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:644a6853d15b2512d67881586bd03f462c7ab755db95f16f14d7e238f2852c67", size = 385169 }, + { url = "https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4debd64f124ca62069f313a9cb86656ff087786016d76927ae2cf37846b006c9", size = 362859 }, + { url = "https://files.pythonhosted.org/packages/33/71/e2a7945b7de4e58af42d708a219f3b2f4cff7386e6b6ab0a0fa0033c49a9/contourpy-1.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a15459b0f4615b00bbd1e91f1b9e19b7e63aea7483d03d804186f278c0af2659", size = 1332062 }, + { url = "https://files.pythonhosted.org/packages/12/fc/4e87ac754220ccc0e807284f88e943d6d43b43843614f0a8afa469801db0/contourpy-1.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca0fdcd73925568ca027e0b17ab07aad764be4706d0a925b89227e447d9737b7", size = 1403932 }, + { url = "https://files.pythonhosted.org/packages/a6/2e/adc197a37443f934594112222ac1aa7dc9a98faf9c3842884df9a9d8751d/contourpy-1.3.3-cp313-cp313-win32.whl", hash = "sha256:b20c7c9a3bf701366556e1b1984ed2d0cedf999903c51311417cf5f591d8c78d", size = 185024 }, + { url = "https://files.pythonhosted.org/packages/18/0b/0098c214843213759692cc638fce7de5c289200a830e5035d1791d7a2338/contourpy-1.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:1cadd8b8969f060ba45ed7c1b714fe69185812ab43bd6b86a9123fe8f99c3263", size = 226578 }, + { url = "https://files.pythonhosted.org/packages/8a/9a/2f6024a0c5995243cd63afdeb3651c984f0d2bc727fd98066d40e141ad73/contourpy-1.3.3-cp313-cp313-win_arm64.whl", hash = "sha256:fd914713266421b7536de2bfa8181aa8c699432b6763a0ea64195ebe28bff6a9", size = 193524 }, + { url = "https://files.pythonhosted.org/packages/c0/b3/f8a1a86bd3298513f500e5b1f5fd92b69896449f6cab6a146a5d52715479/contourpy-1.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:88df9880d507169449d434c293467418b9f6cbe82edd19284aa0409e7fdb933d", size = 306730 }, + { url = "https://files.pythonhosted.org/packages/3f/11/4780db94ae62fc0c2053909b65dc3246bd7cecfc4f8a20d957ad43aa4ad8/contourpy-1.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d06bb1f751ba5d417047db62bca3c8fde202b8c11fb50742ab3ab962c81e8216", size = 287897 }, + { url = "https://files.pythonhosted.org/packages/ae/15/e59f5f3ffdd6f3d4daa3e47114c53daabcb18574a26c21f03dc9e4e42ff0/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:e4e6b05a45525357e382909a4c1600444e2a45b4795163d3b22669285591c1ae", size = 326751 }, + { url = "https://files.pythonhosted.org/packages/0f/81/03b45cfad088e4770b1dcf72ea78d3802d04200009fb364d18a493857210/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ab3074b48c4e2cf1a960e6bbeb7f04566bf36b1861d5c9d4d8ac04b82e38ba20", size = 375486 }, + { url = "https://files.pythonhosted.org/packages/0c/ba/49923366492ffbdd4486e970d421b289a670ae8cf539c1ea9a09822b371a/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c3d53c796f8647d6deb1abe867daeb66dcc8a97e8455efa729516b997b8ed99", size = 388106 }, + { url = "https://files.pythonhosted.org/packages/9f/52/5b00ea89525f8f143651f9f03a0df371d3cbd2fccd21ca9b768c7a6500c2/contourpy-1.3.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50ed930df7289ff2a8d7afeb9603f8289e5704755c7e5c3bbd929c90c817164b", size = 352548 }, + { url = "https://files.pythonhosted.org/packages/32/1d/a209ec1a3a3452d490f6b14dd92e72280c99ae3d1e73da74f8277d4ee08f/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4feffb6537d64b84877da813a5c30f1422ea5739566abf0bd18065ac040e120a", size = 1322297 }, + { url = "https://files.pythonhosted.org/packages/bc/9e/46f0e8ebdd884ca0e8877e46a3f4e633f6c9c8c4f3f6e72be3fe075994aa/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2b7e9480ffe2b0cd2e787e4df64270e3a0440d9db8dc823312e2c940c167df7e", size = 1391023 }, + { url = "https://files.pythonhosted.org/packages/b9/70/f308384a3ae9cd2209e0849f33c913f658d3326900d0ff5d378d6a1422d2/contourpy-1.3.3-cp313-cp313t-win32.whl", hash = "sha256:283edd842a01e3dcd435b1c5116798d661378d83d36d337b8dde1d16a5fc9ba3", size = 196157 }, + { url = "https://files.pythonhosted.org/packages/b2/dd/880f890a6663b84d9e34a6f88cded89d78f0091e0045a284427cb6b18521/contourpy-1.3.3-cp313-cp313t-win_amd64.whl", hash = "sha256:87acf5963fc2b34825e5b6b048f40e3635dd547f590b04d2ab317c2619ef7ae8", size = 240570 }, + { url = "https://files.pythonhosted.org/packages/80/99/2adc7d8ffead633234817ef8e9a87115c8a11927a94478f6bb3d3f4d4f7d/contourpy-1.3.3-cp313-cp313t-win_arm64.whl", hash = "sha256:3c30273eb2a55024ff31ba7d052dde990d7d8e5450f4bbb6e913558b3d6c2301", size = 199713 }, + { url = "https://files.pythonhosted.org/packages/72/8b/4546f3ab60f78c514ffb7d01a0bd743f90de36f0019d1be84d0a708a580a/contourpy-1.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fde6c716d51c04b1c25d0b90364d0be954624a0ee9d60e23e850e8d48353d07a", size = 292189 }, + { url = "https://files.pythonhosted.org/packages/fd/e1/3542a9cb596cadd76fcef413f19c79216e002623158befe6daa03dbfa88c/contourpy-1.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cbedb772ed74ff5be440fa8eee9bd49f64f6e3fc09436d9c7d8f1c287b121d77", size = 273251 }, + { url = "https://files.pythonhosted.org/packages/b1/71/f93e1e9471d189f79d0ce2497007731c1e6bf9ef6d1d61b911430c3db4e5/contourpy-1.3.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22e9b1bd7a9b1d652cd77388465dc358dafcd2e217d35552424aa4f996f524f5", size = 335810 }, + { url = "https://files.pythonhosted.org/packages/91/f9/e35f4c1c93f9275d4e38681a80506b5510e9327350c51f8d4a5a724d178c/contourpy-1.3.3-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a22738912262aa3e254e4f3cb079a95a67132fc5a063890e224393596902f5a4", size = 382871 }, + { url = 
"https://files.pythonhosted.org/packages/b5/71/47b512f936f66a0a900d81c396a7e60d73419868fba959c61efed7a8ab46/contourpy-1.3.3-cp314-cp314-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:afe5a512f31ee6bd7d0dda52ec9864c984ca3d66664444f2d72e0dc4eb832e36", size = 386264 }, + { url = "https://files.pythonhosted.org/packages/04/5f/9ff93450ba96b09c7c2b3f81c94de31c89f92292f1380261bd7195bea4ea/contourpy-1.3.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f64836de09927cba6f79dcd00fdd7d5329f3fccc633468507079c829ca4db4e3", size = 363819 }, + { url = "https://files.pythonhosted.org/packages/3e/a6/0b185d4cc480ee494945cde102cb0149ae830b5fa17bf855b95f2e70ad13/contourpy-1.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1fd43c3be4c8e5fd6e4f2baeae35ae18176cf2e5cced681cca908addf1cdd53b", size = 1333650 }, + { url = "https://files.pythonhosted.org/packages/43/d7/afdc95580ca56f30fbcd3060250f66cedbde69b4547028863abd8aa3b47e/contourpy-1.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6afc576f7b33cf00996e5c1102dc2a8f7cc89e39c0b55df93a0b78c1bd992b36", size = 1404833 }, + { url = "https://files.pythonhosted.org/packages/e2/e2/366af18a6d386f41132a48f033cbd2102e9b0cf6345d35ff0826cd984566/contourpy-1.3.3-cp314-cp314-win32.whl", hash = "sha256:66c8a43a4f7b8df8b71ee1840e4211a3c8d93b214b213f590e18a1beca458f7d", size = 189692 }, + { url = "https://files.pythonhosted.org/packages/7d/c2/57f54b03d0f22d4044b8afb9ca0e184f8b1afd57b4f735c2fa70883dc601/contourpy-1.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:cf9022ef053f2694e31d630feaacb21ea24224be1c3ad0520b13d844274614fd", size = 232424 }, + { url = "https://files.pythonhosted.org/packages/18/79/a9416650df9b525737ab521aa181ccc42d56016d2123ddcb7b58e926a42c/contourpy-1.3.3-cp314-cp314-win_arm64.whl", hash = "sha256:95b181891b4c71de4bb404c6621e7e2390745f887f2a026b2d99e92c17892339", size = 198300 }, + { url = "https://files.pythonhosted.org/packages/1f/42/38c159a7d0f2b7b9c04c64ab317042bb6952b713ba875c1681529a2932fe/contourpy-1.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:33c82d0138c0a062380332c861387650c82e4cf1747aaa6938b9b6516762e772", size = 306769 }, + { url = "https://files.pythonhosted.org/packages/c3/6c/26a8205f24bca10974e77460de68d3d7c63e282e23782f1239f226fcae6f/contourpy-1.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ea37e7b45949df430fe649e5de8351c423430046a2af20b1c1961cae3afcda77", size = 287892 }, + { url = "https://files.pythonhosted.org/packages/66/06/8a475c8ab718ebfd7925661747dbb3c3ee9c82ac834ccb3570be49d129f4/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d304906ecc71672e9c89e87c4675dc5c2645e1f4269a5063b99b0bb29f232d13", size = 326748 }, + { url = "https://files.pythonhosted.org/packages/b4/a3/c5ca9f010a44c223f098fccd8b158bb1cb287378a31ac141f04730dc49be/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca658cd1a680a5c9ea96dc61cdbae1e85c8f25849843aa799dfd3cb370ad4fbe", size = 375554 }, + { url = "https://files.pythonhosted.org/packages/80/5b/68bd33ae63fac658a4145088c1e894405e07584a316738710b636c6d0333/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ab2fd90904c503739a75b7c8c5c01160130ba67944a7b77bbf36ef8054576e7f", size = 388118 }, + { url = "https://files.pythonhosted.org/packages/40/52/4c285a6435940ae25d7410a6c36bda5145839bc3f0beb20c707cda18b9d2/contourpy-1.3.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:b7301b89040075c30e5768810bc96a8e8d78085b47d8be6e4c3f5a0b4ed478a0", size = 352555 }, + { url = "https://files.pythonhosted.org/packages/24/ee/3e81e1dd174f5c7fefe50e85d0892de05ca4e26ef1c9a59c2a57e43b865a/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2a2a8b627d5cc6b7c41a4beff6c5ad5eb848c88255fda4a8745f7e901b32d8e4", size = 1322295 }, + { url = "https://files.pythonhosted.org/packages/3c/b2/6d913d4d04e14379de429057cd169e5e00f6c2af3bb13e1710bcbdb5da12/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fd6ec6be509c787f1caf6b247f0b1ca598bef13f4ddeaa126b7658215529ba0f", size = 1391027 }, + { url = "https://files.pythonhosted.org/packages/93/8a/68a4ec5c55a2971213d29a9374913f7e9f18581945a7a31d1a39b5d2dfe5/contourpy-1.3.3-cp314-cp314t-win32.whl", hash = "sha256:e74a9a0f5e3fff48fb5a7f2fd2b9b70a3fe014a67522f79b7cca4c0c7e43c9ae", size = 202428 }, + { url = "https://files.pythonhosted.org/packages/fa/96/fd9f641ffedc4fa3ace923af73b9d07e869496c9cc7a459103e6e978992f/contourpy-1.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:13b68d6a62db8eafaebb8039218921399baf6e47bf85006fd8529f2a08ef33fc", size = 250331 }, + { url = "https://files.pythonhosted.org/packages/ae/8c/469afb6465b853afff216f9528ffda78a915ff880ed58813ba4faf4ba0b6/contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b", size = 203831 }, + { url = "https://files.pythonhosted.org/packages/a5/29/8dcfe16f0107943fa92388c23f6e05cff0ba58058c4c95b00280d4c75a14/contourpy-1.3.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cd5dfcaeb10f7b7f9dc8941717c6c2ade08f587be2226222c12b25f0483ed497", size = 278809 }, + { url = "https://files.pythonhosted.org/packages/85/a9/8b37ef4f7dafeb335daee3c8254645ef5725be4d9c6aa70b50ec46ef2f7e/contourpy-1.3.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0c1fc238306b35f246d61a1d416a627348b5cf0648648a031e14bb8705fcdfe8", size = 261593 }, + { url = "https://files.pythonhosted.org/packages/0a/59/ebfb8c677c75605cc27f7122c90313fd2f375ff3c8d19a1694bda74aaa63/contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70f9aad7de812d6541d29d2bbf8feb22ff7e1c299523db288004e3157ff4674e", size = 302202 }, + { url = "https://files.pythonhosted.org/packages/3c/37/21972a15834d90bfbfb009b9d004779bd5a07a0ec0234e5ba8f64d5736f4/contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ed3657edf08512fc3fe81b510e35c2012fbd3081d2e26160f27ca28affec989", size = 329207 }, + { url = "https://files.pythonhosted.org/packages/0c/58/bd257695f39d05594ca4ad60df5bcb7e32247f9951fd09a9b8edb82d1daa/contourpy-1.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:3d1a3799d62d45c18bafd41c5fa05120b96a28079f2393af559b843d1a966a77", size = 225315 }, +] + [[package]] name = "coverage" version = "7.9.1" @@ -622,6 +708,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/49/0ab9774f64555a1b50102757811508f5ace451cf5dc0a2d074a4b9deca6a/cryptography-45.0.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:bbc505d1dc469ac12a0a064214879eac6294038d6b24ae9f71faae1448a9608d", size = 3337594 }, ] +[[package]] +name = "cycler" +version = "0.12.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c", size = 7615 } 
+wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321 }, +] + [[package]] name = "debugpy" version = "1.8.14" @@ -739,6 +834,39 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/09/5a/1d0d47e64816002824d4a876644e8c65540fa23f91b701f0daa726931545/fastparquet-2024.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:d20632964e65530374ff7cddd42cc06aa0a1388934903693d6d22592a5ba827b", size = 673266 }, ] +[[package]] +name = "fonttools" +version = "4.59.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8a/27/ec3c723bfdf86f34c5c82bf6305df3e0f0d8ea798d2d3a7cb0c0a866d286/fonttools-4.59.0.tar.gz", hash = "sha256:be392ec3529e2f57faa28709d60723a763904f71a2b63aabe14fee6648fe3b14", size = 3532521 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/96/520733d9602fa1bf6592e5354c6721ac6fc9ea72bc98d112d0c38b967199/fonttools-4.59.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:841b2186adce48903c0fef235421ae21549020eca942c1da773ac380b056ab3c", size = 2782387 }, + { url = "https://files.pythonhosted.org/packages/87/6a/170fce30b9bce69077d8eec9bea2cfd9f7995e8911c71be905e2eba6368b/fonttools-4.59.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9bcc1e77fbd1609198966ded6b2a9897bd6c6bcbd2287a2fc7d75f1a254179c5", size = 2342194 }, + { url = "https://files.pythonhosted.org/packages/b0/b6/7c8166c0066856f1408092f7968ac744060cf72ca53aec9036106f57eeca/fonttools-4.59.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:37c377f7cb2ab2eca8a0b319c68146d34a339792f9420fca6cd49cf28d370705", size = 5032333 }, + { url = "https://files.pythonhosted.org/packages/eb/0c/707c5a19598eafcafd489b73c4cb1c142102d6197e872f531512d084aa76/fonttools-4.59.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fa39475eaccb98f9199eccfda4298abaf35ae0caec676ffc25b3a5e224044464", size = 4974422 }, + { url = "https://files.pythonhosted.org/packages/f6/e7/6d33737d9fe632a0f59289b6f9743a86d2a9d0673de2a0c38c0f54729822/fonttools-4.59.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d3972b13148c1d1fbc092b27678a33b3080d1ac0ca305742b0119b75f9e87e38", size = 5010631 }, + { url = "https://files.pythonhosted.org/packages/63/e1/a4c3d089ab034a578820c8f2dff21ef60daf9668034a1e4fb38bb1cc3398/fonttools-4.59.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a408c3c51358c89b29cfa5317cf11518b7ce5de1717abb55c5ae2d2921027de6", size = 5122198 }, + { url = "https://files.pythonhosted.org/packages/09/77/ca82b9c12fa4de3c520b7760ee61787640cf3fde55ef1b0bfe1de38c8153/fonttools-4.59.0-cp311-cp311-win32.whl", hash = "sha256:6770d7da00f358183d8fd5c4615436189e4f683bdb6affb02cad3d221d7bb757", size = 2214216 }, + { url = "https://files.pythonhosted.org/packages/ab/25/5aa7ca24b560b2f00f260acf32c4cf29d7aaf8656e159a336111c18bc345/fonttools-4.59.0-cp311-cp311-win_amd64.whl", hash = "sha256:84fc186980231a287b28560d3123bd255d3c6b6659828c642b4cf961e2b923d0", size = 2261879 }, + { url = "https://files.pythonhosted.org/packages/e2/77/b1c8af22f4265e951cd2e5535dbef8859efcef4fb8dee742d368c967cddb/fonttools-4.59.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f9b3a78f69dcbd803cf2fb3f972779875b244c1115481dfbdd567b2c22b31f6b", size = 2767562 }, + { url = 
"https://files.pythonhosted.org/packages/ff/5a/aeb975699588176bb357e8b398dfd27e5d3a2230d92b81ab8cbb6187358d/fonttools-4.59.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:57bb7e26928573ee7c6504f54c05860d867fd35e675769f3ce01b52af38d48e2", size = 2335168 }, + { url = "https://files.pythonhosted.org/packages/54/97/c6101a7e60ae138c4ef75b22434373a0da50a707dad523dd19a4889315bf/fonttools-4.59.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4536f2695fe5c1ffb528d84a35a7d3967e5558d2af58b4775e7ab1449d65767b", size = 4909850 }, + { url = "https://files.pythonhosted.org/packages/bd/6c/fa4d18d641054f7bff878cbea14aa9433f292b9057cb1700d8e91a4d5f4f/fonttools-4.59.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:885bde7d26e5b40e15c47bd5def48b38cbd50830a65f98122a8fb90962af7cd1", size = 4955131 }, + { url = "https://files.pythonhosted.org/packages/20/5c/331947fc1377deb928a69bde49f9003364f5115e5cbe351eea99e39412a2/fonttools-4.59.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6801aeddb6acb2c42eafa45bc1cb98ba236871ae6f33f31e984670b749a8e58e", size = 4899667 }, + { url = "https://files.pythonhosted.org/packages/8a/46/b66469dfa26b8ff0baa7654b2cc7851206c6d57fe3abdabbaab22079a119/fonttools-4.59.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:31003b6a10f70742a63126b80863ab48175fb8272a18ca0846c0482968f0588e", size = 5051349 }, + { url = "https://files.pythonhosted.org/packages/2e/05/ebfb6b1f3a4328ab69787d106a7d92ccde77ce66e98659df0f9e3f28d93d/fonttools-4.59.0-cp312-cp312-win32.whl", hash = "sha256:fbce6dae41b692a5973d0f2158f782b9ad05babc2c2019a970a1094a23909b1b", size = 2201315 }, + { url = "https://files.pythonhosted.org/packages/09/45/d2bdc9ea20bbadec1016fd0db45696d573d7a26d95ab5174ffcb6d74340b/fonttools-4.59.0-cp312-cp312-win_amd64.whl", hash = "sha256:332bfe685d1ac58ca8d62b8d6c71c2e52a6c64bc218dc8f7825c9ea51385aa01", size = 2249408 }, + { url = "https://files.pythonhosted.org/packages/f3/bb/390990e7c457d377b00890d9f96a3ca13ae2517efafb6609c1756e213ba4/fonttools-4.59.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:78813b49d749e1bb4db1c57f2d4d7e6db22c253cb0a86ad819f5dc197710d4b2", size = 2758704 }, + { url = "https://files.pythonhosted.org/packages/df/6f/d730d9fcc9b410a11597092bd2eb9ca53e5438c6cb90e4b3047ce1b723e9/fonttools-4.59.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:401b1941ce37e78b8fd119b419b617277c65ae9417742a63282257434fd68ea2", size = 2330764 }, + { url = "https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:efd7e6660674e234e29937bc1481dceb7e0336bfae75b856b4fb272b5093c5d4", size = 4890699 }, + { url = "https://files.pythonhosted.org/packages/b5/57/7969af50b26408be12baa317c6147588db5b38af2759e6df94554dbc5fdb/fonttools-4.59.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51ab1ff33c19e336c02dee1e9fd1abd974a4ca3d8f7eef2a104d0816a241ce97", size = 4952934 }, + { url = "https://files.pythonhosted.org/packages/d6/e2/dd968053b6cf1f46c904f5bd409b22341477c017d8201619a265e50762d3/fonttools-4.59.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a9bf8adc9e1f3012edc8f09b08336272aec0c55bc677422273e21280db748f7c", size = 4892319 }, + { url = 
"https://files.pythonhosted.org/packages/6b/95/a59810d8eda09129f83467a4e58f84205dc6994ebaeb9815406363e07250/fonttools-4.59.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:37e01c6ec0c98599778c2e688350d624fa4770fbd6144551bd5e032f1199171c", size = 5034753 }, + { url = "https://files.pythonhosted.org/packages/a5/84/51a69ee89ff8d1fea0c6997e946657e25a3f08513de8435fe124929f3eef/fonttools-4.59.0-cp313-cp313-win32.whl", hash = "sha256:70d6b3ceaa9cc5a6ac52884f3b3d9544e8e231e95b23f138bdb78e6d4dc0eae3", size = 2199688 }, + { url = "https://files.pythonhosted.org/packages/a0/ee/f626cd372932d828508137a79b85167fdcf3adab2e3bed433f295c596c6a/fonttools-4.59.0-cp313-cp313-win_amd64.whl", hash = "sha256:26731739daa23b872643f0e4072d5939960237d540c35c14e6a06d47d71ca8fe", size = 2248560 }, + { url = "https://files.pythonhosted.org/packages/d0/9c/df0ef2c51845a13043e5088f7bb988ca6cd5bb82d5d4203d6a158aa58cf2/fonttools-4.59.0-py3-none-any.whl", hash = "sha256:241313683afd3baacb32a6bd124d0bce7404bc5280e12e291bae1b9bba28711d", size = 1128050 }, +] + [[package]] name = "frozenlist" version = "1.7.0" @@ -1183,6 +1311,72 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ed/f1/82ea8e783433707cafd9790099a2d19f113c22f32a31c8bb5abdc7a61dbb/jupytext-1.17.2-py3-none-any.whl", hash = "sha256:4f85dc43bb6a24b75491c5c434001ad5ef563932f68f15dd3e1c8ce12a4a426b", size = 164401 }, ] +[[package]] +name = "kiwisolver" +version = "1.4.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/82/59/7c91426a8ac292e1cdd53a63b6d9439abd573c875c3f92c146767dd33faf/kiwisolver-1.4.8.tar.gz", hash = "sha256:23d5f023bdc8c7e54eb65f03ca5d5bb25b601eac4d7f1a042888a1f45237987e", size = 97538 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/ed/c913ee28936c371418cb167b128066ffb20bbf37771eecc2c97edf8a6e4c/kiwisolver-1.4.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a4d3601908c560bdf880f07d94f31d734afd1bb71e96585cace0e38ef44c6d84", size = 124635 }, + { url = "https://files.pythonhosted.org/packages/4c/45/4a7f896f7467aaf5f56ef093d1f329346f3b594e77c6a3c327b2d415f521/kiwisolver-1.4.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:856b269c4d28a5c0d5e6c1955ec36ebfd1651ac00e1ce0afa3e28da95293b561", size = 66717 }, + { url = "https://files.pythonhosted.org/packages/5f/b4/c12b3ac0852a3a68f94598d4c8d569f55361beef6159dce4e7b624160da2/kiwisolver-1.4.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2b9a96e0f326205af81a15718a9073328df1173a2619a68553decb7097fd5d7", size = 65413 }, + { url = "https://files.pythonhosted.org/packages/a9/98/1df4089b1ed23d83d410adfdc5947245c753bddfbe06541c4aae330e9e70/kiwisolver-1.4.8-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5020c83e8553f770cb3b5fc13faac40f17e0b205bd237aebd21d53d733adb03", size = 1343994 }, + { url = "https://files.pythonhosted.org/packages/8d/bf/b4b169b050c8421a7c53ea1ea74e4ef9c335ee9013216c558a047f162d20/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dace81d28c787956bfbfbbfd72fdcef014f37d9b48830829e488fdb32b49d954", size = 1434804 }, + { url = "https://files.pythonhosted.org/packages/66/5a/e13bd341fbcf73325ea60fdc8af752addf75c5079867af2e04cc41f34434/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:11e1022b524bd48ae56c9b4f9296bce77e15a2e42a502cceba602f804b32bb79", size = 1450690 }, + { url = 
"https://files.pythonhosted.org/packages/9b/4f/5955dcb376ba4a830384cc6fab7d7547bd6759fe75a09564910e9e3bb8ea/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b9b4d2892fefc886f30301cdd80debd8bb01ecdf165a449eb6e78f79f0fabd6", size = 1376839 }, + { url = "https://files.pythonhosted.org/packages/3a/97/5edbed69a9d0caa2e4aa616ae7df8127e10f6586940aa683a496c2c280b9/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a96c0e790ee875d65e340ab383700e2b4891677b7fcd30a699146f9384a2bb0", size = 1435109 }, + { url = "https://files.pythonhosted.org/packages/13/fc/e756382cb64e556af6c1809a1bbb22c141bbc2445049f2da06b420fe52bf/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:23454ff084b07ac54ca8be535f4174170c1094a4cff78fbae4f73a4bcc0d4dab", size = 2245269 }, + { url = "https://files.pythonhosted.org/packages/76/15/e59e45829d7f41c776d138245cabae6515cb4eb44b418f6d4109c478b481/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:87b287251ad6488e95b4f0b4a79a6d04d3ea35fde6340eb38fbd1ca9cd35bbbc", size = 2393468 }, + { url = "https://files.pythonhosted.org/packages/e9/39/483558c2a913ab8384d6e4b66a932406f87c95a6080112433da5ed668559/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:b21dbe165081142b1232a240fc6383fd32cdd877ca6cc89eab93e5f5883e1c25", size = 2355394 }, + { url = "https://files.pythonhosted.org/packages/01/aa/efad1fbca6570a161d29224f14b082960c7e08268a133fe5dc0f6906820e/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:768cade2c2df13db52475bd28d3a3fac8c9eff04b0e9e2fda0f3760f20b3f7fc", size = 2490901 }, + { url = "https://files.pythonhosted.org/packages/c9/4f/15988966ba46bcd5ab9d0c8296914436720dd67fca689ae1a75b4ec1c72f/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d47cfb2650f0e103d4bf68b0b5804c68da97272c84bb12850d877a95c056bd67", size = 2312306 }, + { url = "https://files.pythonhosted.org/packages/2d/27/bdf1c769c83f74d98cbc34483a972f221440703054894a37d174fba8aa68/kiwisolver-1.4.8-cp311-cp311-win_amd64.whl", hash = "sha256:ed33ca2002a779a2e20eeb06aea7721b6e47f2d4b8a8ece979d8ba9e2a167e34", size = 71966 }, + { url = "https://files.pythonhosted.org/packages/4a/c9/9642ea855604aeb2968a8e145fc662edf61db7632ad2e4fb92424be6b6c0/kiwisolver-1.4.8-cp311-cp311-win_arm64.whl", hash = "sha256:16523b40aab60426ffdebe33ac374457cf62863e330a90a0383639ce14bf44b2", size = 65311 }, + { url = "https://files.pythonhosted.org/packages/fc/aa/cea685c4ab647f349c3bc92d2daf7ae34c8e8cf405a6dcd3a497f58a2ac3/kiwisolver-1.4.8-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d6af5e8815fd02997cb6ad9bbed0ee1e60014438ee1a5c2444c96f87b8843502", size = 124152 }, + { url = "https://files.pythonhosted.org/packages/c5/0b/8db6d2e2452d60d5ebc4ce4b204feeb16176a851fd42462f66ade6808084/kiwisolver-1.4.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bade438f86e21d91e0cf5dd7c0ed00cda0f77c8c1616bd83f9fc157fa6760d31", size = 66555 }, + { url = "https://files.pythonhosted.org/packages/60/26/d6a0db6785dd35d3ba5bf2b2df0aedc5af089962c6eb2cbf67a15b81369e/kiwisolver-1.4.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b83dc6769ddbc57613280118fb4ce3cd08899cc3369f7d0e0fab518a7cf37fdb", size = 65067 }, + { url = "https://files.pythonhosted.org/packages/c9/ed/1d97f7e3561e09757a196231edccc1bcf59d55ddccefa2afc9c615abd8e0/kiwisolver-1.4.8-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:111793b232842991be367ed828076b03d96202c19221b5ebab421ce8bcad016f", size = 1378443 }, + { url = "https://files.pythonhosted.org/packages/29/61/39d30b99954e6b46f760e6289c12fede2ab96a254c443639052d1b573fbc/kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:257af1622860e51b1a9d0ce387bf5c2c4f36a90594cb9514f55b074bcc787cfc", size = 1472728 }, + { url = "https://files.pythonhosted.org/packages/0c/3e/804163b932f7603ef256e4a715e5843a9600802bb23a68b4e08c8c0ff61d/kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69b5637c3f316cab1ec1c9a12b8c5f4750a4c4b71af9157645bf32830e39c03a", size = 1478388 }, + { url = "https://files.pythonhosted.org/packages/8a/9e/60eaa75169a154700be74f875a4d9961b11ba048bef315fbe89cb6999056/kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:782bb86f245ec18009890e7cb8d13a5ef54dcf2ebe18ed65f795e635a96a1c6a", size = 1413849 }, + { url = "https://files.pythonhosted.org/packages/bc/b3/9458adb9472e61a998c8c4d95cfdfec91c73c53a375b30b1428310f923e4/kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc978a80a0db3a66d25767b03688f1147a69e6237175c0f4ffffaaedf744055a", size = 1475533 }, + { url = "https://files.pythonhosted.org/packages/e4/7a/0a42d9571e35798de80aef4bb43a9b672aa7f8e58643d7bd1950398ffb0a/kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:36dbbfd34838500a31f52c9786990d00150860e46cd5041386f217101350f0d3", size = 2268898 }, + { url = "https://files.pythonhosted.org/packages/d9/07/1255dc8d80271400126ed8db35a1795b1a2c098ac3a72645075d06fe5c5d/kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:eaa973f1e05131de5ff3569bbba7f5fd07ea0595d3870ed4a526d486fe57fa1b", size = 2425605 }, + { url = "https://files.pythonhosted.org/packages/84/df/5a3b4cf13780ef6f6942df67b138b03b7e79e9f1f08f57c49957d5867f6e/kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a66f60f8d0c87ab7f59b6fb80e642ebb29fec354a4dfad687ca4092ae69d04f4", size = 2375801 }, + { url = "https://files.pythonhosted.org/packages/8f/10/2348d068e8b0f635c8c86892788dac7a6b5c0cb12356620ab575775aad89/kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858416b7fb777a53f0c59ca08190ce24e9abbd3cffa18886a5781b8e3e26f65d", size = 2520077 }, + { url = "https://files.pythonhosted.org/packages/32/d8/014b89fee5d4dce157d814303b0fce4d31385a2af4c41fed194b173b81ac/kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:085940635c62697391baafaaeabdf3dd7a6c3643577dde337f4d66eba021b2b8", size = 2338410 }, + { url = "https://files.pythonhosted.org/packages/bd/72/dfff0cc97f2a0776e1c9eb5bef1ddfd45f46246c6533b0191887a427bca5/kiwisolver-1.4.8-cp312-cp312-win_amd64.whl", hash = "sha256:01c3d31902c7db5fb6182832713d3b4122ad9317c2c5877d0539227d96bb2e50", size = 71853 }, + { url = "https://files.pythonhosted.org/packages/dc/85/220d13d914485c0948a00f0b9eb419efaf6da81b7d72e88ce2391f7aed8d/kiwisolver-1.4.8-cp312-cp312-win_arm64.whl", hash = "sha256:a3c44cb68861de93f0c4a8175fbaa691f0aa22550c331fefef02b618a9dcb476", size = 65424 }, + { url = "https://files.pythonhosted.org/packages/79/b3/e62464a652f4f8cd9006e13d07abad844a47df1e6537f73ddfbf1bc997ec/kiwisolver-1.4.8-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:1c8ceb754339793c24aee1c9fb2485b5b1f5bb1c2c214ff13368431e51fc9a09", size = 124156 }, + { url = 
"https://files.pythonhosted.org/packages/8d/2d/f13d06998b546a2ad4f48607a146e045bbe48030774de29f90bdc573df15/kiwisolver-1.4.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a62808ac74b5e55a04a408cda6156f986cefbcf0ada13572696b507cc92fa1", size = 66555 }, + { url = "https://files.pythonhosted.org/packages/59/e3/b8bd14b0a54998a9fd1e8da591c60998dc003618cb19a3f94cb233ec1511/kiwisolver-1.4.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:68269e60ee4929893aad82666821aaacbd455284124817af45c11e50a4b42e3c", size = 65071 }, + { url = "https://files.pythonhosted.org/packages/f0/1c/6c86f6d85ffe4d0ce04228d976f00674f1df5dc893bf2dd4f1928748f187/kiwisolver-1.4.8-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34d142fba9c464bc3bbfeff15c96eab0e7310343d6aefb62a79d51421fcc5f1b", size = 1378053 }, + { url = "https://files.pythonhosted.org/packages/4e/b9/1c6e9f6dcb103ac5cf87cb695845f5fa71379021500153566d8a8a9fc291/kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ddc373e0eef45b59197de815b1b28ef89ae3955e7722cc9710fb91cd77b7f47", size = 1472278 }, + { url = "https://files.pythonhosted.org/packages/ee/81/aca1eb176de671f8bda479b11acdc42c132b61a2ac861c883907dde6debb/kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:77e6f57a20b9bd4e1e2cedda4d0b986ebd0216236f0106e55c28aea3d3d69b16", size = 1478139 }, + { url = "https://files.pythonhosted.org/packages/49/f4/e081522473671c97b2687d380e9e4c26f748a86363ce5af48b4a28e48d06/kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08e77738ed7538f036cd1170cbed942ef749137b1311fa2bbe2a7fda2f6bf3cc", size = 1413517 }, + { url = "https://files.pythonhosted.org/packages/8f/e9/6a7d025d8da8c4931522922cd706105aa32b3291d1add8c5427cdcd66e63/kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5ce1e481a74b44dd5e92ff03ea0cb371ae7a0268318e202be06c8f04f4f1246", size = 1474952 }, + { url = "https://files.pythonhosted.org/packages/82/13/13fa685ae167bee5d94b415991c4fc7bb0a1b6ebea6e753a87044b209678/kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:fc2ace710ba7c1dfd1a3b42530b62b9ceed115f19a1656adefce7b1782a37794", size = 2269132 }, + { url = "https://files.pythonhosted.org/packages/ef/92/bb7c9395489b99a6cb41d502d3686bac692586db2045adc19e45ee64ed23/kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:3452046c37c7692bd52b0e752b87954ef86ee2224e624ef7ce6cb21e8c41cc1b", size = 2425997 }, + { url = "https://files.pythonhosted.org/packages/ed/12/87f0e9271e2b63d35d0d8524954145837dd1a6c15b62a2d8c1ebe0f182b4/kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7e9a60b50fe8b2ec6f448fe8d81b07e40141bfced7f896309df271a0b92f80f3", size = 2376060 }, + { url = "https://files.pythonhosted.org/packages/02/6e/c8af39288edbce8bf0fa35dee427b082758a4b71e9c91ef18fa667782138/kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:918139571133f366e8362fa4a297aeba86c7816b7ecf0bc79168080e2bd79957", size = 2520471 }, + { url = "https://files.pythonhosted.org/packages/13/78/df381bc7b26e535c91469f77f16adcd073beb3e2dd25042efd064af82323/kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e063ef9f89885a1d68dd8b2e18f5ead48653176d10a0e324e3b0030e3a69adeb", size = 2338793 }, + { url = 
"https://files.pythonhosted.org/packages/d0/dc/c1abe38c37c071d0fc71c9a474fd0b9ede05d42f5a458d584619cfd2371a/kiwisolver-1.4.8-cp313-cp313-win_amd64.whl", hash = "sha256:a17b7c4f5b2c51bb68ed379defd608a03954a1845dfed7cc0117f1cc8a9b7fd2", size = 71855 }, + { url = "https://files.pythonhosted.org/packages/a0/b6/21529d595b126ac298fdd90b705d87d4c5693de60023e0efcb4f387ed99e/kiwisolver-1.4.8-cp313-cp313-win_arm64.whl", hash = "sha256:3cd3bc628b25f74aedc6d374d5babf0166a92ff1317f46267f12d2ed54bc1d30", size = 65430 }, + { url = "https://files.pythonhosted.org/packages/34/bd/b89380b7298e3af9b39f49334e3e2a4af0e04819789f04b43d560516c0c8/kiwisolver-1.4.8-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:370fd2df41660ed4e26b8c9d6bbcad668fbe2560462cba151a721d49e5b6628c", size = 126294 }, + { url = "https://files.pythonhosted.org/packages/83/41/5857dc72e5e4148eaac5aa76e0703e594e4465f8ab7ec0fc60e3a9bb8fea/kiwisolver-1.4.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:84a2f830d42707de1d191b9490ac186bf7997a9495d4e9072210a1296345f7dc", size = 67736 }, + { url = "https://files.pythonhosted.org/packages/e1/d1/be059b8db56ac270489fb0b3297fd1e53d195ba76e9bbb30e5401fa6b759/kiwisolver-1.4.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7a3ad337add5148cf51ce0b55642dc551c0b9d6248458a757f98796ca7348712", size = 66194 }, + { url = "https://files.pythonhosted.org/packages/e1/83/4b73975f149819eb7dcf9299ed467eba068ecb16439a98990dcb12e63fdd/kiwisolver-1.4.8-cp313-cp313t-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7506488470f41169b86d8c9aeff587293f530a23a23a49d6bc64dab66bedc71e", size = 1465942 }, + { url = "https://files.pythonhosted.org/packages/c7/2c/30a5cdde5102958e602c07466bce058b9d7cb48734aa7a4327261ac8e002/kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f0121b07b356a22fb0414cec4666bbe36fd6d0d759db3d37228f496ed67c880", size = 1595341 }, + { url = "https://files.pythonhosted.org/packages/ff/9b/1e71db1c000385aa069704f5990574b8244cce854ecd83119c19e83c9586/kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d6d6bd87df62c27d4185de7c511c6248040afae67028a8a22012b010bc7ad062", size = 1598455 }, + { url = "https://files.pythonhosted.org/packages/85/92/c8fec52ddf06231b31cbb779af77e99b8253cd96bd135250b9498144c78b/kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:291331973c64bb9cce50bbe871fb2e675c4331dab4f31abe89f175ad7679a4d7", size = 1522138 }, + { url = "https://files.pythonhosted.org/packages/0b/51/9eb7e2cd07a15d8bdd976f6190c0164f92ce1904e5c0c79198c4972926b7/kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:893f5525bb92d3d735878ec00f781b2de998333659507d29ea4466208df37bed", size = 1582857 }, + { url = "https://files.pythonhosted.org/packages/0f/95/c5a00387a5405e68ba32cc64af65ce881a39b98d73cc394b24143bebc5b8/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b47a465040146981dc9db8647981b8cb96366fbc8d452b031e4f8fdffec3f26d", size = 2293129 }, + { url = "https://files.pythonhosted.org/packages/44/83/eeb7af7d706b8347548313fa3a3a15931f404533cc54fe01f39e830dd231/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:99cea8b9dd34ff80c521aef46a1dddb0dcc0283cf18bde6d756f1e6f31772165", size = 2421538 }, + { url = 
"https://files.pythonhosted.org/packages/05/f9/27e94c1b3eb29e6933b6986ffc5fa1177d2cd1f0c8efc5f02c91c9ac61de/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:151dffc4865e5fe6dafce5480fab84f950d14566c480c08a53c663a0020504b6", size = 2390661 }, + { url = "https://files.pythonhosted.org/packages/d9/d4/3c9735faa36ac591a4afcc2980d2691000506050b7a7e80bcfe44048daa7/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:577facaa411c10421314598b50413aa1ebcf5126f704f1e5d72d7e4e9f020d90", size = 2546710 }, + { url = "https://files.pythonhosted.org/packages/4c/fa/be89a49c640930180657482a74970cdcf6f7072c8d2471e1babe17a222dc/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:be4816dc51c8a471749d664161b434912eee82f2ea66bd7628bd14583a833e85", size = 2349213 }, +] + [[package]] name = "markdown" version = "3.8.2" @@ -1252,6 +1446,70 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739 }, ] +[[package]] +name = "matplotlib" +version = "3.10.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "contourpy" }, + { name = "cycler" }, + { name = "fonttools" }, + { name = "kiwisolver" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "pyparsing" }, + { name = "python-dateutil" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/43/91/f2939bb60b7ebf12478b030e0d7f340247390f402b3b189616aad790c366/matplotlib-3.10.5.tar.gz", hash = "sha256:352ed6ccfb7998a00881692f38b4ca083c691d3e275b4145423704c34c909076", size = 34804044 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/c7/1f2db90a1d43710478bb1e9b57b162852f79234d28e4f48a28cc415aa583/matplotlib-3.10.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:dcfc39c452c6a9f9028d3e44d2d721484f665304857188124b505b2c95e1eecf", size = 8239216 }, + { url = "https://files.pythonhosted.org/packages/82/6d/ca6844c77a4f89b1c9e4d481c412e1d1dbabf2aae2cbc5aa2da4a1d6683e/matplotlib-3.10.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:903352681b59f3efbf4546985142a9686ea1d616bb054b09a537a06e4b892ccf", size = 8102130 }, + { url = "https://files.pythonhosted.org/packages/1d/1e/5e187a30cc673a3e384f3723e5f3c416033c1d8d5da414f82e4e731128ea/matplotlib-3.10.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:080c3676a56b8ee1c762bcf8fca3fe709daa1ee23e6ef06ad9f3fc17332f2d2a", size = 8666471 }, + { url = "https://files.pythonhosted.org/packages/03/c0/95540d584d7d645324db99a845ac194e915ef75011a0d5e19e1b5cee7e69/matplotlib-3.10.5-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4b4984d5064a35b6f66d2c11d668565f4389b1119cc64db7a4c1725bc11adffc", size = 9500518 }, + { url = "https://files.pythonhosted.org/packages/ba/2e/e019352099ea58b4169adb9c6e1a2ad0c568c6377c2b677ee1f06de2adc7/matplotlib-3.10.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3967424121d3a46705c9fa9bdb0931de3228f13f73d7bb03c999c88343a89d89", size = 9552372 }, + { url = "https://files.pythonhosted.org/packages/b7/81/3200b792a5e8b354f31f4101ad7834743ad07b6d620259f2059317b25e4d/matplotlib-3.10.5-cp311-cp311-win_amd64.whl", hash = "sha256:33775bbeb75528555a15ac29396940128ef5613cf9a2d31fb1bfd18b3c0c0903", size = 8100634 }, + { url = 
"https://files.pythonhosted.org/packages/52/46/a944f6f0c1f5476a0adfa501969d229ce5ae60cf9a663be0e70361381f89/matplotlib-3.10.5-cp311-cp311-win_arm64.whl", hash = "sha256:c61333a8e5e6240e73769d5826b9a31d8b22df76c0778f8480baf1b4b01c9420", size = 7978880 }, + { url = "https://files.pythonhosted.org/packages/66/1e/c6f6bcd882d589410b475ca1fc22e34e34c82adff519caf18f3e6dd9d682/matplotlib-3.10.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:00b6feadc28a08bd3c65b2894f56cf3c94fc8f7adcbc6ab4516ae1e8ed8f62e2", size = 8253056 }, + { url = "https://files.pythonhosted.org/packages/53/e6/d6f7d1b59413f233793dda14419776f5f443bcccb2dfc84b09f09fe05dbe/matplotlib-3.10.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee98a5c5344dc7f48dc261b6ba5d9900c008fc12beb3fa6ebda81273602cc389", size = 8110131 }, + { url = "https://files.pythonhosted.org/packages/66/2b/bed8a45e74957549197a2ac2e1259671cd80b55ed9e1fe2b5c94d88a9202/matplotlib-3.10.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a17e57e33de901d221a07af32c08870ed4528db0b6059dce7d7e65c1122d4bea", size = 8669603 }, + { url = "https://files.pythonhosted.org/packages/7e/a7/315e9435b10d057f5e52dfc603cd353167ae28bb1a4e033d41540c0067a4/matplotlib-3.10.5-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97b9d6443419085950ee4a5b1ee08c363e5c43d7176e55513479e53669e88468", size = 9508127 }, + { url = "https://files.pythonhosted.org/packages/7f/d9/edcbb1f02ca99165365d2768d517898c22c6040187e2ae2ce7294437c413/matplotlib-3.10.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ceefe5d40807d29a66ae916c6a3915d60ef9f028ce1927b84e727be91d884369", size = 9566926 }, + { url = "https://files.pythonhosted.org/packages/3b/d9/6dd924ad5616c97b7308e6320cf392c466237a82a2040381163b7500510a/matplotlib-3.10.5-cp312-cp312-win_amd64.whl", hash = "sha256:c04cba0f93d40e45b3c187c6c52c17f24535b27d545f757a2fffebc06c12b98b", size = 8107599 }, + { url = "https://files.pythonhosted.org/packages/0e/f3/522dc319a50f7b0279fbe74f86f7a3506ce414bc23172098e8d2bdf21894/matplotlib-3.10.5-cp312-cp312-win_arm64.whl", hash = "sha256:a41bcb6e2c8e79dc99c5511ae6f7787d2fb52efd3d805fff06d5d4f667db16b2", size = 7978173 }, + { url = "https://files.pythonhosted.org/packages/8d/05/4f3c1f396075f108515e45cb8d334aff011a922350e502a7472e24c52d77/matplotlib-3.10.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:354204db3f7d5caaa10e5de74549ef6a05a4550fdd1c8f831ab9bca81efd39ed", size = 8253586 }, + { url = "https://files.pythonhosted.org/packages/2f/2c/e084415775aac7016c3719fe7006cdb462582c6c99ac142f27303c56e243/matplotlib-3.10.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b072aac0c3ad563a2b3318124756cb6112157017f7431626600ecbe890df57a1", size = 8110715 }, + { url = "https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d52fd5b684d541b5a51fb276b2b97b010c75bee9aa392f96b4a07aeb491e33c7", size = 8669397 }, + { url = "https://files.pythonhosted.org/packages/e8/ec/03f9e003a798f907d9f772eed9b7c6a9775d5bd00648b643ebfb88e25414/matplotlib-3.10.5-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee7a09ae2f4676276f5a65bd9f2bd91b4f9fbaedf49f40267ce3f9b448de501f", size = 9508646 }, + { url = "https://files.pythonhosted.org/packages/91/e7/c051a7a386680c28487bca27d23b02d84f63e3d2a9b4d2fc478e6a42e37e/matplotlib-3.10.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:ba6c3c9c067b83481d647af88b4e441d532acdb5ef22178a14935b0b881188f4", size = 9567424 }, + { url = "https://files.pythonhosted.org/packages/36/c2/24302e93ff431b8f4173ee1dd88976c8d80483cadbc5d3d777cef47b3a1c/matplotlib-3.10.5-cp313-cp313-win_amd64.whl", hash = "sha256:07442d2692c9bd1cceaa4afb4bbe5b57b98a7599de4dabfcca92d3eea70f9ebe", size = 8107809 }, + { url = "https://files.pythonhosted.org/packages/0b/33/423ec6a668d375dad825197557ed8fbdb74d62b432c1ed8235465945475f/matplotlib-3.10.5-cp313-cp313-win_arm64.whl", hash = "sha256:48fe6d47380b68a37ccfcc94f009530e84d41f71f5dae7eda7c4a5a84aa0a674", size = 7978078 }, + { url = "https://files.pythonhosted.org/packages/51/17/521fc16ec766455c7bb52cc046550cf7652f6765ca8650ff120aa2d197b6/matplotlib-3.10.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b80eb8621331449fc519541a7461987f10afa4f9cfd91afcd2276ebe19bd56c", size = 8295590 }, + { url = "https://files.pythonhosted.org/packages/f8/12/23c28b2c21114c63999bae129fce7fd34515641c517ae48ce7b7dcd33458/matplotlib-3.10.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:47a388908e469d6ca2a6015858fa924e0e8a2345a37125948d8e93a91c47933e", size = 8158518 }, + { url = "https://files.pythonhosted.org/packages/81/f8/aae4eb25e8e7190759f3cb91cbeaa344128159ac92bb6b409e24f8711f78/matplotlib-3.10.5-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8b6b49167d208358983ce26e43aa4196073b4702858670f2eb111f9a10652b4b", size = 8691815 }, + { url = "https://files.pythonhosted.org/packages/d0/ba/450c39ebdd486bd33a359fc17365ade46c6a96bf637bbb0df7824de2886c/matplotlib-3.10.5-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a8da0453a7fd8e3da114234ba70c5ba9ef0e98f190309ddfde0f089accd46ea", size = 9522814 }, + { url = "https://files.pythonhosted.org/packages/89/11/9c66f6a990e27bb9aa023f7988d2d5809cb98aa39c09cbf20fba75a542ef/matplotlib-3.10.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:52c6573dfcb7726a9907b482cd5b92e6b5499b284ffacb04ffbfe06b3e568124", size = 9573917 }, + { url = "https://files.pythonhosted.org/packages/b3/69/8b49394de92569419e5e05e82e83df9b749a0ff550d07631ea96ed2eb35a/matplotlib-3.10.5-cp313-cp313t-win_amd64.whl", hash = "sha256:a23193db2e9d64ece69cac0c8231849db7dd77ce59c7b89948cf9d0ce655a3ce", size = 8181034 }, + { url = "https://files.pythonhosted.org/packages/47/23/82dc435bb98a2fc5c20dffcac8f0b083935ac28286413ed8835df40d0baa/matplotlib-3.10.5-cp313-cp313t-win_arm64.whl", hash = "sha256:56da3b102cf6da2776fef3e71cd96fcf22103a13594a18ac9a9b31314e0be154", size = 8023337 }, + { url = "https://files.pythonhosted.org/packages/ac/e0/26b6cfde31f5383503ee45dcb7e691d45dadf0b3f54639332b59316a97f8/matplotlib-3.10.5-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:96ef8f5a3696f20f55597ffa91c28e2e73088df25c555f8d4754931515512715", size = 8253591 }, + { url = "https://files.pythonhosted.org/packages/c1/89/98488c7ef7ea20ea659af7499628c240a608b337af4be2066d644cfd0a0f/matplotlib-3.10.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:77fab633e94b9da60512d4fa0213daeb76d5a7b05156840c4fd0399b4b818837", size = 8112566 }, + { url = "https://files.pythonhosted.org/packages/52/67/42294dfedc82aea55e1a767daf3263aacfb5a125f44ba189e685bab41b6f/matplotlib-3.10.5-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:27f52634315e96b1debbfdc5c416592edcd9c4221bc2f520fd39c33db5d9f202", size = 9513281 }, + { url = 
"https://files.pythonhosted.org/packages/e7/68/f258239e0cf34c2cbc816781c7ab6fca768452e6bf1119aedd2bd4a882a3/matplotlib-3.10.5-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:525f6e28c485c769d1f07935b660c864de41c37fd716bfa64158ea646f7084bb", size = 9780873 }, + { url = "https://files.pythonhosted.org/packages/89/64/f4881554006bd12e4558bd66778bdd15d47b00a1f6c6e8b50f6208eda4b3/matplotlib-3.10.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1f5f3ec4c191253c5f2b7c07096a142c6a1c024d9f738247bfc8e3f9643fc975", size = 9568954 }, + { url = "https://files.pythonhosted.org/packages/06/f8/42779d39c3f757e1f012f2dda3319a89fb602bd2ef98ce8faf0281f4febd/matplotlib-3.10.5-cp314-cp314-win_amd64.whl", hash = "sha256:707f9c292c4cd4716f19ab8a1f93f26598222cd931e0cd98fbbb1c5994bf7667", size = 8237465 }, + { url = "https://files.pythonhosted.org/packages/cf/f8/153fd06b5160f0cd27c8b9dd797fcc9fb56ac6a0ebf3c1f765b6b68d3c8a/matplotlib-3.10.5-cp314-cp314-win_arm64.whl", hash = "sha256:21a95b9bf408178d372814de7baacd61c712a62cae560b5e6f35d791776f6516", size = 8108898 }, + { url = "https://files.pythonhosted.org/packages/9a/ee/c4b082a382a225fe0d2a73f1f57cf6f6f132308805b493a54c8641006238/matplotlib-3.10.5-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:a6b310f95e1102a8c7c817ef17b60ee5d1851b8c71b63d9286b66b177963039e", size = 8295636 }, + { url = "https://files.pythonhosted.org/packages/30/73/2195fa2099718b21a20da82dfc753bf2af58d596b51aefe93e359dd5915a/matplotlib-3.10.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:94986a242747a0605cb3ff1cb98691c736f28a59f8ffe5175acaeb7397c49a5a", size = 8158575 }, + { url = "https://files.pythonhosted.org/packages/f6/e9/a08cdb34618a91fa08f75e6738541da5cacde7c307cea18ff10f0d03fcff/matplotlib-3.10.5-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ff10ea43288f0c8bab608a305dc6c918cc729d429c31dcbbecde3b9f4d5b569", size = 9522815 }, + { url = "https://files.pythonhosted.org/packages/4e/bb/34d8b7e0d1bb6d06ef45db01dfa560d5a67b1c40c0b998ce9ccde934bb09/matplotlib-3.10.5-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f6adb644c9d040ffb0d3434e440490a66cf73dbfa118a6f79cd7568431f7a012", size = 9783514 }, + { url = "https://files.pythonhosted.org/packages/12/09/d330d1e55dcca2e11b4d304cc5227f52e2512e46828d6249b88e0694176e/matplotlib-3.10.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4fa40a8f98428f789a9dcacd625f59b7bc4e3ef6c8c7c80187a7a709475cf592", size = 9573932 }, + { url = "https://files.pythonhosted.org/packages/eb/3b/f70258ac729aa004aca673800a53a2b0a26d49ca1df2eaa03289a1c40f81/matplotlib-3.10.5-cp314-cp314t-win_amd64.whl", hash = "sha256:95672a5d628b44207aab91ec20bf59c26da99de12b88f7e0b1fb0a84a86ff959", size = 8322003 }, + { url = "https://files.pythonhosted.org/packages/5b/60/3601f8ce6d76a7c81c7f25a0e15fde0d6b66226dd187aa6d2838e6374161/matplotlib-3.10.5-cp314-cp314t-win_arm64.whl", hash = "sha256:2efaf97d72629e74252e0b5e3c46813e9eeaa94e011ecf8084a971a31a97f40b", size = 8153849 }, + { url = "https://files.pythonhosted.org/packages/dc/d6/e921be4e1a5f7aca5194e1f016cb67ec294548e530013251f630713e456d/matplotlib-3.10.5-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:160e125da27a749481eaddc0627962990f6029811dbeae23881833a011a0907f", size = 8233224 }, + { url = "https://files.pythonhosted.org/packages/ec/74/a2b9b04824b9c349c8f1b2d21d5af43fa7010039427f2b133a034cb09e59/matplotlib-3.10.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = 
"sha256:ac3d50760394d78a3c9be6b28318fe22b494c4fcf6407e8fd4794b538251899b", size = 8098539 }, + { url = "https://files.pythonhosted.org/packages/fc/66/cd29ebc7f6c0d2a15d216fb572573e8fc38bd5d6dec3bd9d7d904c0949f7/matplotlib-3.10.5-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6c49465bf689c4d59d174d0c7795fb42a21d4244d11d70e52b8011987367ac61", size = 8672192 }, +] + [[package]] name = "matplotlib-inline" version = "0.1.7" @@ -1783,6 +2041,90 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772 }, ] +[[package]] +name = "pillow" +version = "11.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/d0d6dea55cd152ce3d6767bb38a8fc10e33796ba4ba210cbab9354b6d238/pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523", size = 47113069 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/26/77f8ed17ca4ffd60e1dcd220a6ec6d71210ba398cfa33a13a1cd614c5613/pillow-11.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1cd110edf822773368b396281a2293aeb91c90a2db00d78ea43e7e861631b722", size = 5316531 }, + { url = "https://files.pythonhosted.org/packages/cb/39/ee475903197ce709322a17a866892efb560f57900d9af2e55f86db51b0a5/pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c412fddd1b77a75aa904615ebaa6001f169b26fd467b4be93aded278266b288", size = 4686560 }, + { url = "https://files.pythonhosted.org/packages/d5/90/442068a160fd179938ba55ec8c97050a612426fae5ec0a764e345839f76d/pillow-11.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1aa4de119a0ecac0a34a9c8bde33f34022e2e8f99104e47a3ca392fd60e37d", size = 5870978 }, + { url = "https://files.pythonhosted.org/packages/13/92/dcdd147ab02daf405387f0218dcf792dc6dd5b14d2573d40b4caeef01059/pillow-11.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:91da1d88226663594e3f6b4b8c3c8d85bd504117d043740a8e0ec449087cc494", size = 7641168 }, + { url = "https://files.pythonhosted.org/packages/6e/db/839d6ba7fd38b51af641aa904e2960e7a5644d60ec754c046b7d2aee00e5/pillow-11.3.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:643f189248837533073c405ec2f0bb250ba54598cf80e8c1e043381a60632f58", size = 5973053 }, + { url = "https://files.pythonhosted.org/packages/f2/2f/d7675ecae6c43e9f12aa8d58b6012683b20b6edfbdac7abcb4e6af7a3784/pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:106064daa23a745510dabce1d84f29137a37224831d88eb4ce94bb187b1d7e5f", size = 6640273 }, + { url = "https://files.pythonhosted.org/packages/45/ad/931694675ede172e15b2ff03c8144a0ddaea1d87adb72bb07655eaffb654/pillow-11.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd8ff254faf15591e724dc7c4ddb6bf4793efcbe13802a4ae3e863cd300b493e", size = 6082043 }, + { url = "https://files.pythonhosted.org/packages/3a/04/ba8f2b11fc80d2dd462d7abec16351b45ec99cbbaea4387648a44190351a/pillow-11.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:932c754c2d51ad2b2271fd01c3d121daaa35e27efae2a616f77bf164bc0b3e94", size = 6715516 }, + { url = "https://files.pythonhosted.org/packages/48/59/8cd06d7f3944cc7d892e8533c56b0acb68399f640786313275faec1e3b6f/pillow-11.3.0-cp311-cp311-win32.whl", hash = 
"sha256:b4b8f3efc8d530a1544e5962bd6b403d5f7fe8b9e08227c6b255f98ad82b4ba0", size = 6274768 }, + { url = "https://files.pythonhosted.org/packages/f1/cc/29c0f5d64ab8eae20f3232da8f8571660aa0ab4b8f1331da5c2f5f9a938e/pillow-11.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:1a992e86b0dd7aeb1f053cd506508c0999d710a8f07b4c791c63843fc6a807ac", size = 6986055 }, + { url = "https://files.pythonhosted.org/packages/c6/df/90bd886fabd544c25addd63e5ca6932c86f2b701d5da6c7839387a076b4a/pillow-11.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:30807c931ff7c095620fe04448e2c2fc673fcbb1ffe2a7da3fb39613489b1ddd", size = 2423079 }, + { url = "https://files.pythonhosted.org/packages/40/fe/1bc9b3ee13f68487a99ac9529968035cca2f0a51ec36892060edcc51d06a/pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4", size = 5278800 }, + { url = "https://files.pythonhosted.org/packages/2c/32/7e2ac19b5713657384cec55f89065fb306b06af008cfd87e572035b27119/pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69", size = 4686296 }, + { url = "https://files.pythonhosted.org/packages/8e/1e/b9e12bbe6e4c2220effebc09ea0923a07a6da1e1f1bfbc8d7d29a01ce32b/pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d", size = 5871726 }, + { url = "https://files.pythonhosted.org/packages/8d/33/e9200d2bd7ba00dc3ddb78df1198a6e80d7669cce6c2bdbeb2530a74ec58/pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6", size = 7644652 }, + { url = "https://files.pythonhosted.org/packages/41/f1/6f2427a26fc683e00d985bc391bdd76d8dd4e92fac33d841127eb8fb2313/pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7", size = 5977787 }, + { url = "https://files.pythonhosted.org/packages/e4/c9/06dd4a38974e24f932ff5f98ea3c546ce3f8c995d3f0985f8e5ba48bba19/pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024", size = 6645236 }, + { url = "https://files.pythonhosted.org/packages/40/e7/848f69fb79843b3d91241bad658e9c14f39a32f71a301bcd1d139416d1be/pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809", size = 6086950 }, + { url = "https://files.pythonhosted.org/packages/0b/1a/7cff92e695a2a29ac1958c2a0fe4c0b2393b60aac13b04a4fe2735cad52d/pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d", size = 6723358 }, + { url = "https://files.pythonhosted.org/packages/26/7d/73699ad77895f69edff76b0f332acc3d497f22f5d75e5360f78cbcaff248/pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149", size = 6275079 }, + { url = "https://files.pythonhosted.org/packages/8c/ce/e7dfc873bdd9828f3b6e5c2bbb74e47a98ec23cc5c74fc4e54462f0d9204/pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d", size = 6986324 }, + { url = "https://files.pythonhosted.org/packages/16/8f/b13447d1bf0b1f7467ce7d86f6e6edf66c0ad7cf44cf5c87a37f9bed9936/pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = 
"sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542", size = 2423067 }, + { url = "https://files.pythonhosted.org/packages/1e/93/0952f2ed8db3a5a4c7a11f91965d6184ebc8cd7cbb7941a260d5f018cd2d/pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:1c627742b539bba4309df89171356fcb3cc5a9178355b2727d1b74a6cf155fbd", size = 2128328 }, + { url = "https://files.pythonhosted.org/packages/4b/e8/100c3d114b1a0bf4042f27e0f87d2f25e857e838034e98ca98fe7b8c0a9c/pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30b7c02f3899d10f13d7a48163c8969e4e653f8b43416d23d13d1bbfdc93b9f8", size = 2170652 }, + { url = "https://files.pythonhosted.org/packages/aa/86/3f758a28a6e381758545f7cdb4942e1cb79abd271bea932998fc0db93cb6/pillow-11.3.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:7859a4cc7c9295f5838015d8cc0a9c215b77e43d07a25e460f35cf516df8626f", size = 2227443 }, + { url = "https://files.pythonhosted.org/packages/01/f4/91d5b3ffa718df2f53b0dc109877993e511f4fd055d7e9508682e8aba092/pillow-11.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec1ee50470b0d050984394423d96325b744d55c701a439d2bd66089bff963d3c", size = 5278474 }, + { url = "https://files.pythonhosted.org/packages/f9/0e/37d7d3eca6c879fbd9dba21268427dffda1ab00d4eb05b32923d4fbe3b12/pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7db51d222548ccfd274e4572fdbf3e810a5e66b00608862f947b163e613b67dd", size = 4686038 }, + { url = "https://files.pythonhosted.org/packages/ff/b0/3426e5c7f6565e752d81221af9d3676fdbb4f352317ceafd42899aaf5d8a/pillow-11.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2d6fcc902a24ac74495df63faad1884282239265c6839a0a6416d33faedfae7e", size = 5864407 }, + { url = "https://files.pythonhosted.org/packages/fc/c1/c6c423134229f2a221ee53f838d4be9d82bab86f7e2f8e75e47b6bf6cd77/pillow-11.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0f5d8f4a08090c6d6d578351a2b91acf519a54986c055af27e7a93feae6d3f1", size = 7639094 }, + { url = "https://files.pythonhosted.org/packages/ba/c9/09e6746630fe6372c67c648ff9deae52a2bc20897d51fa293571977ceb5d/pillow-11.3.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c37d8ba9411d6003bba9e518db0db0c58a680ab9fe5179f040b0463644bc9805", size = 5973503 }, + { url = "https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8", size = 6642574 }, + { url = "https://files.pythonhosted.org/packages/36/de/d5cc31cc4b055b6c6fd990e3e7f0f8aaf36229a2698501bcb0cdf67c7146/pillow-11.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:023f6d2d11784a465f09fd09a34b150ea4672e85fb3d05931d89f373ab14abb2", size = 6084060 }, + { url = "https://files.pythonhosted.org/packages/d5/ea/502d938cbaeec836ac28a9b730193716f0114c41325db428e6b280513f09/pillow-11.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:45dfc51ac5975b938e9809451c51734124e73b04d0f0ac621649821a63852e7b", size = 6721407 }, + { url = "https://files.pythonhosted.org/packages/45/9c/9c5e2a73f125f6cbc59cc7087c8f2d649a7ae453f83bd0362ff7c9e2aee2/pillow-11.3.0-cp313-cp313-win32.whl", hash = "sha256:a4d336baed65d50d37b88ca5b60c0fa9d81e3a87d4a7930d3880d1624d5b31f3", size = 6273841 }, + { url = 
"https://files.pythonhosted.org/packages/23/85/397c73524e0cd212067e0c969aa245b01d50183439550d24d9f55781b776/pillow-11.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0bce5c4fd0921f99d2e858dc4d4d64193407e1b99478bc5cacecba2311abde51", size = 6978450 }, + { url = "https://files.pythonhosted.org/packages/17/d2/622f4547f69cd173955194b78e4d19ca4935a1b0f03a302d655c9f6aae65/pillow-11.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:1904e1264881f682f02b7f8167935cce37bc97db457f8e7849dc3a6a52b99580", size = 2423055 }, + { url = "https://files.pythonhosted.org/packages/dd/80/a8a2ac21dda2e82480852978416cfacd439a4b490a501a288ecf4fe2532d/pillow-11.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4c834a3921375c48ee6b9624061076bc0a32a60b5532b322cc0ea64e639dd50e", size = 5281110 }, + { url = "https://files.pythonhosted.org/packages/44/d6/b79754ca790f315918732e18f82a8146d33bcd7f4494380457ea89eb883d/pillow-11.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e05688ccef30ea69b9317a9ead994b93975104a677a36a8ed8106be9260aa6d", size = 4689547 }, + { url = "https://files.pythonhosted.org/packages/49/20/716b8717d331150cb00f7fdd78169c01e8e0c219732a78b0e59b6bdb2fd6/pillow-11.3.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1019b04af07fc0163e2810167918cb5add8d74674b6267616021ab558dc98ced", size = 5901554 }, + { url = "https://files.pythonhosted.org/packages/74/cf/a9f3a2514a65bb071075063a96f0a5cf949c2f2fce683c15ccc83b1c1cab/pillow-11.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f944255db153ebb2b19c51fe85dd99ef0ce494123f21b9db4877ffdfc5590c7c", size = 7669132 }, + { url = "https://files.pythonhosted.org/packages/98/3c/da78805cbdbee9cb43efe8261dd7cc0b4b93f2ac79b676c03159e9db2187/pillow-11.3.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f85acb69adf2aaee8b7da124efebbdb959a104db34d3a2cb0f3793dbae422a8", size = 6005001 }, + { url = "https://files.pythonhosted.org/packages/6c/fa/ce044b91faecf30e635321351bba32bab5a7e034c60187fe9698191aef4f/pillow-11.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05f6ecbeff5005399bb48d198f098a9b4b6bdf27b8487c7f38ca16eeb070cd59", size = 6668814 }, + { url = "https://files.pythonhosted.org/packages/7b/51/90f9291406d09bf93686434f9183aba27b831c10c87746ff49f127ee80cb/pillow-11.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a7bc6e6fd0395bc052f16b1a8670859964dbd7003bd0af2ff08342eb6e442cfe", size = 6113124 }, + { url = "https://files.pythonhosted.org/packages/cd/5a/6fec59b1dfb619234f7636d4157d11fb4e196caeee220232a8d2ec48488d/pillow-11.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:83e1b0161c9d148125083a35c1c5a89db5b7054834fd4387499e06552035236c", size = 6747186 }, + { url = "https://files.pythonhosted.org/packages/49/6b/00187a044f98255225f172de653941e61da37104a9ea60e4f6887717e2b5/pillow-11.3.0-cp313-cp313t-win32.whl", hash = "sha256:2a3117c06b8fb646639dce83694f2f9eac405472713fcb1ae887469c0d4f6788", size = 6277546 }, + { url = "https://files.pythonhosted.org/packages/e8/5c/6caaba7e261c0d75bab23be79f1d06b5ad2a2ae49f028ccec801b0e853d6/pillow-11.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:857844335c95bea93fb39e0fa2726b4d9d758850b34075a7e3ff4f4fa3aa3b31", size = 6985102 }, + { url = "https://files.pythonhosted.org/packages/f3/7e/b623008460c09a0cb38263c93b828c666493caee2eb34ff67f778b87e58c/pillow-11.3.0-cp313-cp313t-win_arm64.whl", hash = "sha256:8797edc41f3e8536ae4b10897ee2f637235c94f27404cac7297f7b607dd0716e", size = 2424803 
}, + { url = "https://files.pythonhosted.org/packages/73/f4/04905af42837292ed86cb1b1dabe03dce1edc008ef14c473c5c7e1443c5d/pillow-11.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d9da3df5f9ea2a89b81bb6087177fb1f4d1c7146d583a3fe5c672c0d94e55e12", size = 5278520 }, + { url = "https://files.pythonhosted.org/packages/41/b0/33d79e377a336247df6348a54e6d2a2b85d644ca202555e3faa0cf811ecc/pillow-11.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0b275ff9b04df7b640c59ec5a3cb113eefd3795a8df80bac69646ef699c6981a", size = 4686116 }, + { url = "https://files.pythonhosted.org/packages/49/2d/ed8bc0ab219ae8768f529597d9509d184fe8a6c4741a6864fea334d25f3f/pillow-11.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0743841cabd3dba6a83f38a92672cccbd69af56e3e91777b0ee7f4dba4385632", size = 5864597 }, + { url = "https://files.pythonhosted.org/packages/b5/3d/b932bb4225c80b58dfadaca9d42d08d0b7064d2d1791b6a237f87f661834/pillow-11.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2465a69cf967b8b49ee1b96d76718cd98c4e925414ead59fdf75cf0fd07df673", size = 7638246 }, + { url = "https://files.pythonhosted.org/packages/09/b5/0487044b7c096f1b48f0d7ad416472c02e0e4bf6919541b111efd3cae690/pillow-11.3.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41742638139424703b4d01665b807c6468e23e699e8e90cffefe291c5832b027", size = 5973336 }, + { url = "https://files.pythonhosted.org/packages/a8/2d/524f9318f6cbfcc79fbc004801ea6b607ec3f843977652fdee4857a7568b/pillow-11.3.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93efb0b4de7e340d99057415c749175e24c8864302369e05914682ba642e5d77", size = 6642699 }, + { url = "https://files.pythonhosted.org/packages/6f/d2/a9a4f280c6aefedce1e8f615baaa5474e0701d86dd6f1dede66726462bbd/pillow-11.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7966e38dcd0fa11ca390aed7c6f20454443581d758242023cf36fcb319b1a874", size = 6083789 }, + { url = "https://files.pythonhosted.org/packages/fe/54/86b0cd9dbb683a9d5e960b66c7379e821a19be4ac5810e2e5a715c09a0c0/pillow-11.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:98a9afa7b9007c67ed84c57c9e0ad86a6000da96eaa638e4f8abe5b65ff83f0a", size = 6720386 }, + { url = "https://files.pythonhosted.org/packages/e7/95/88efcaf384c3588e24259c4203b909cbe3e3c2d887af9e938c2022c9dd48/pillow-11.3.0-cp314-cp314-win32.whl", hash = "sha256:02a723e6bf909e7cea0dac1b0e0310be9d7650cd66222a5f1c571455c0a45214", size = 6370911 }, + { url = "https://files.pythonhosted.org/packages/2e/cc/934e5820850ec5eb107e7b1a72dd278140731c669f396110ebc326f2a503/pillow-11.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:a418486160228f64dd9e9efcd132679b7a02a5f22c982c78b6fc7dab3fefb635", size = 7117383 }, + { url = "https://files.pythonhosted.org/packages/d6/e9/9c0a616a71da2a5d163aa37405e8aced9a906d574b4a214bede134e731bc/pillow-11.3.0-cp314-cp314-win_arm64.whl", hash = "sha256:155658efb5e044669c08896c0c44231c5e9abcaadbc5cd3648df2f7c0b96b9a6", size = 2511385 }, + { url = "https://files.pythonhosted.org/packages/1a/33/c88376898aff369658b225262cd4f2659b13e8178e7534df9e6e1fa289f6/pillow-11.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:59a03cdf019efbfeeed910bf79c7c93255c3d54bc45898ac2a4140071b02b4ae", size = 5281129 }, + { url = "https://files.pythonhosted.org/packages/1f/70/d376247fb36f1844b42910911c83a02d5544ebd2a8bad9efcc0f707ea774/pillow-11.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = 
"sha256:f8a5827f84d973d8636e9dc5764af4f0cf2318d26744b3d902931701b0d46653", size = 4689580 }, + { url = "https://files.pythonhosted.org/packages/eb/1c/537e930496149fbac69efd2fc4329035bbe2e5475b4165439e3be9cb183b/pillow-11.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ee92f2fd10f4adc4b43d07ec5e779932b4eb3dbfbc34790ada5a6669bc095aa6", size = 5902860 }, + { url = "https://files.pythonhosted.org/packages/bd/57/80f53264954dcefeebcf9dae6e3eb1daea1b488f0be8b8fef12f79a3eb10/pillow-11.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c96d333dcf42d01f47b37e0979b6bd73ec91eae18614864622d9b87bbd5bbf36", size = 7670694 }, + { url = "https://files.pythonhosted.org/packages/70/ff/4727d3b71a8578b4587d9c276e90efad2d6fe0335fd76742a6da08132e8c/pillow-11.3.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c96f993ab8c98460cd0c001447bff6194403e8b1d7e149ade5f00594918128b", size = 6005888 }, + { url = "https://files.pythonhosted.org/packages/05/ae/716592277934f85d3be51d7256f3636672d7b1abfafdc42cf3f8cbd4b4c8/pillow-11.3.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:41342b64afeba938edb034d122b2dda5db2139b9a4af999729ba8818e0056477", size = 6670330 }, + { url = "https://files.pythonhosted.org/packages/e7/bb/7fe6cddcc8827b01b1a9766f5fdeb7418680744f9082035bdbabecf1d57f/pillow-11.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:068d9c39a2d1b358eb9f245ce7ab1b5c3246c7c8c7d9ba58cfa5b43146c06e50", size = 6114089 }, + { url = "https://files.pythonhosted.org/packages/8b/f5/06bfaa444c8e80f1a8e4bff98da9c83b37b5be3b1deaa43d27a0db37ef84/pillow-11.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bc6ba083b145187f648b667e05a2534ecc4b9f2784c2cbe3089e44868f2b9b", size = 6748206 }, + { url = "https://files.pythonhosted.org/packages/f0/77/bc6f92a3e8e6e46c0ca78abfffec0037845800ea38c73483760362804c41/pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12", size = 6377370 }, + { url = "https://files.pythonhosted.org/packages/4a/82/3a721f7d69dca802befb8af08b7c79ebcab461007ce1c18bd91a5d5896f9/pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db", size = 7121500 }, + { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835 }, + { url = "https://files.pythonhosted.org/packages/9e/e3/6fa84033758276fb31da12e5fb66ad747ae83b93c67af17f8c6ff4cc8f34/pillow-11.3.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7c8ec7a017ad1bd562f93dbd8505763e688d388cde6e4a010ae1486916e713e6", size = 5270566 }, + { url = "https://files.pythonhosted.org/packages/5b/ee/e8d2e1ab4892970b561e1ba96cbd59c0d28cf66737fc44abb2aec3795a4e/pillow-11.3.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9ab6ae226de48019caa8074894544af5b53a117ccb9d3b3dcb2871464c829438", size = 4654618 }, + { url = "https://files.pythonhosted.org/packages/f2/6d/17f80f4e1f0761f02160fc433abd4109fa1548dcfdca46cfdadaf9efa565/pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fe27fb049cdcca11f11a7bfda64043c37b30e6b91f10cb5bab275806c32f6ab3", size = 4874248 }, + { url = 
"https://files.pythonhosted.org/packages/de/5f/c22340acd61cef960130585bbe2120e2fd8434c214802f07e8c03596b17e/pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:465b9e8844e3c3519a983d58b80be3f668e2a7a5db97f2784e7079fbc9f9822c", size = 6583963 }, + { url = "https://files.pythonhosted.org/packages/31/5e/03966aedfbfcbb4d5f8aa042452d3361f325b963ebbadddac05b122e47dd/pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5418b53c0d59b3824d05e029669efa023bbef0f3e92e75ec8428f3799487f361", size = 4957170 }, + { url = "https://files.pythonhosted.org/packages/cc/2d/e082982aacc927fc2cab48e1e731bdb1643a1406acace8bed0900a61464e/pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:504b6f59505f08ae014f724b6207ff6222662aab5cc9542577fb084ed0676ac7", size = 5581505 }, + { url = "https://files.pythonhosted.org/packages/34/e7/ae39f538fd6844e982063c3a5e4598b8ced43b9633baa3a85ef33af8c05c/pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8", size = 6984598 }, +] + [[package]] name = "platformdirs" version = "4.3.8" @@ -2101,6 +2443,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/d4/10bb14004d3c792811e05e21b5e5dcae805aacb739bd12a0540967b99592/pymdown_extensions-10.16-py3-none-any.whl", hash = "sha256:f5dd064a4db588cb2d95229fc4ee63a1b16cc8b4d0e6145c0899ed8723da1df2", size = 266143 }, ] +[[package]] +name = "pyparsing" +version = "3.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/22/f1129e69d94ffff626bdb5c835506b3a5b4f3d070f17ea295e12c2c6f60f/pyparsing-3.2.3.tar.gz", hash = "sha256:b9c13f1ab8b3b542f72e28f634bad4de758ab3ce4546e4301970ad6fa77c38be", size = 1088608 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl", hash = "sha256:a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf", size = 111120 }, +] + [[package]] name = "pyright" version = "1.1.402" @@ -2568,6 +2919,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/c4/231cac7a8385394ebbbb4f1ca662203e9d8c332825ab4f36ffc3ead09a42/scipy-1.16.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f56296fefca67ba605fd74d12f7bd23636267731a72cb3947963e76b8c0a25db", size = 38515076 }, ] +[[package]] +name = "seaborn" +version = "0.13.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "matplotlib" }, + { name = "numpy" }, + { name = "pandas" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/86/59/a451d7420a77ab0b98f7affa3a1d78a313d2f7281a57afb1a34bae8ab412/seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7", size = 1457696 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914 }, +] + [[package]] name = "semversioner" version = "2.0.6"