Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 37 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,40 @@ cython_debug/
.DS_Store

# App
workspace/datasets/built/*/*.json
workspace/datasets/raw/squad/*.json
workspace/datasets/raw/bipia/
workspace/experiments/archive/
workspace/*

# Allow traversing into datasets
!workspace/datasets/
workspace/datasets/*

# Allow traversing into datasets/raw
!workspace/datasets/raw/
workspace/datasets/raw/*

# Allow traversing into datasets/raw/squad
!workspace/datasets/raw/squad/
workspace/datasets/raw/squad/*

# Whitelist only the README inside datasets/raw/squad
!workspace/datasets/raw/squad/README.md

# Allow traversing into datasets/built
!workspace/datasets/built/
workspace/datasets/built/*

# Whitelist specific built datasets
!workspace/datasets/built/bipia_example_dataset/
!workspace/datasets/built/squad_example_dataset/

# Allow traversing into experiments
!workspace/experiments/
workspace/experiments/*

# Whitelist specific experiments
!workspace/experiments/bipia_example/
!workspace/experiments/squad_example/

# Track the entire prompts directory (no re-ignore rule follows, so all of its contents are included)
!workspace/prompts/


3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ data = [
]

[dependency-groups]
bipia = [
"emoji>=2.15.0",
]
dev = [
"mypy>=1.19.1",
"pytest-cov>=7.0.0",
Expand Down
10 changes: 8 additions & 2 deletions src/dcv_benchmark/analytics/plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,14 @@ def _plot_latency_distribution(self, metrics: SecurityMetrics) -> None:
fig, ax = plt.subplots(figsize=(8, 5))

# Plot overlapping histograms
ax.hist(benign, bins=20, alpha=0.5, label="Benign", color="blue", density=True)
ax.hist(attack, bins=20, alpha=0.5, label="Attack", color="red", density=True)
if benign:
ax.hist(
benign, bins=20, alpha=0.5, label="Benign", color="blue", density=True
)
if attack:
ax.hist(
attack, bins=20, alpha=0.5, label="Attack", color="red", density=True
)

ax.set_xlabel("Latency (seconds)")
ax.set_ylabel("Density")
Expand Down
21 changes: 14 additions & 7 deletions src/dcv_benchmark/core/factories.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from typing import Any, cast
from typing import Any

from dcv_benchmark.components.llms import BaseLLM
from dcv_benchmark.components.llms import BaseLLM, create_llm
from dcv_benchmark.constants import (
BUILT_DATASETS_DIR,
)
from dcv_benchmark.evaluators.base import BaseEvaluator
from dcv_benchmark.evaluators.bipia import BipiaEvaluator
from dcv_benchmark.evaluators.bipia import BipiaDefenseEvaluator
from dcv_benchmark.evaluators.squad import SquadDefenseEvaluator
from dcv_benchmark.models.config.experiment import ExperimentConfig
from dcv_benchmark.models.dataset import BaseDataset
Expand Down Expand Up @@ -78,10 +78,17 @@ def create_experiment_evaluators(

# 2. BIPIA Logic
if dataset.meta.type == "bipia":
logger.info("Configuration: Detected BIPIA. Using 'BipiaEvaluator'.")
# For BIPIA, we generally need the LLM to judge.
judge_llm = cast(BaseLLM | None, getattr(target, "llm", None))
evaluators["bipia_asr"] = BipiaEvaluator(judge_llm=judge_llm)
logger.info("Configuration: Detected BIPIA. Using 'BipiaDefenseEvaluator'.")

# Resolve the judge LLM strictly from the experiment config (no fallback to the target's LLM)
judge_llm: BaseLLM | None = None
if experiment_config.judge_llm:
logger.info(
f"Initializing dedicated Judge LLM: {experiment_config.judge_llm.model}"
)
judge_llm = create_llm(experiment_config.judge_llm)

evaluators["bipia_asr"] = BipiaDefenseEvaluator(judge_llm=judge_llm)
return evaluators

# Fallback / Warning
Expand Down
2 changes: 1 addition & 1 deletion src/dcv_benchmark/data_factory/squad/squad.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def fetch_squad_subset(output_dir: Path, count: int = 300, seed: int = 42) -> No
if load_dataset is None:
raise ImportError(
"The 'datasets' library is required for SQuAD. "
"Please install the data dependencies: pip install '.[data]'"
"Please install the data dependencies: uv pip install '.[data]'"
)

logger.info("Loading SQuAD (validation split) via HuggingFace...")
Expand Down
Loading