Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,5 +129,6 @@ cython_debug/
.DS_Store

# App
workspace/experiments/archive/
workspace/datasets/raw/squad/*.json
workspace/datasets/built/*/*.json
workspace/datasets/raw/squad/*.json
workspace/experiments/archive/
63 changes: 63 additions & 0 deletions src/dcv_benchmark/cli/commands/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import argparse
from typing import Any

from dcv_benchmark.cli.data import build_data, download_data


def handle_download(args: argparse.Namespace) -> None:
    """Dispatch the 'data download' subcommand to the data-layer helper.

    Reads ``source`` and ``output_dir`` from the parsed CLI namespace and
    forwards them unchanged to :func:`download_data`.
    """
    source = args.source
    out_dir = args.output_dir
    download_data(source=source, output_dir=out_dir)


def handle_build(args: argparse.Namespace) -> None:
    """Dispatch the 'data build' subcommand to the data-layer helper.

    Forwards the config path, optional dataset name, and the overwrite flag
    from the parsed CLI namespace to :func:`build_data`.
    """
    build_data(
        config_path_str=args.config,
        name=args.name,
        overwrite=args.overwrite,
    )


def register_data_commands(subparsers: Any, parent_parser: Any) -> None:
    """Registers the 'data' subcommand group.

    Wires two subcommands under ``data``:
      * ``download`` -> :func:`handle_download`
      * ``build``    -> :func:`handle_build`
    """
    group = subparsers.add_parser(
        "data",
        help="Data Factory tools",
        parents=[parent_parser],  # Inherit global flags like --debug
        add_help=False,  # Let the parent parser handle help if needed
    )

    commands = group.add_subparsers(dest="data_command", required=True)

    # 'data download': fetch a raw source dataset into the workspace
    download = commands.add_parser(
        "download", help="Fetch raw datasets (SQuAD, BIPIA)", parents=[parent_parser]
    )
    download.add_argument(
        "source",
        choices=["squad", "bipia"],
        help="Name of the source dataset to download",
    )
    download.add_argument(
        "--output-dir",
        "-o",
        help="Override default output directory (workspace/datasets/raw/ ...)",
    )
    download.set_defaults(func=handle_download)

    # 'data build': materialize a dataset from a recipe config
    build = commands.add_parser(
        "build", help="Generate/Inject a dataset from a recipe", parents=[parent_parser]
    )
    build.add_argument("config", help="Path to the dataset configuration file (YAML)")
    build.add_argument(
        "--name", help="Name for the built dataset (overrides config name)"
    )
    build.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing dataset if it exists",
    )
    build.set_defaults(func=handle_build)
38 changes: 38 additions & 0 deletions src/dcv_benchmark/cli/commands/experiment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import argparse
from typing import Any

from dcv_benchmark.cli.experiments import run_experiment


def handle_run(args: argparse.Namespace) -> None:
    """Dispatch the 'experiment run' subcommand to the experiment runner.

    Forwards the config path, optional sample limit, and the debug-traces
    flag from the parsed CLI namespace to :func:`run_experiment`.
    """
    config = args.config
    limit = args.limit
    run_experiment(
        config_path_str=config,
        limit=limit,
        debug_traces=args.debug_traces,
    )


def register_experiment_commands(subparsers: Any, parent_parser: Any) -> None:
    """Registers the 'experiment' subcommand group.

    Wires a single subcommand under ``experiment``:
      * ``run`` -> :func:`handle_run`
    """
    group = subparsers.add_parser(
        "experiment", help="Experiment execution tools", parents=[parent_parser]
    )
    commands = group.add_subparsers(dest="experiment_command", required=True)

    # 'experiment run': execute one experiment described by a YAML config
    run = commands.add_parser(
        "run", help="Execute an experiment from a config file", parents=[parent_parser]
    )
    run.add_argument("config", help="Path to the experiment.yaml configuration file")
    run.add_argument(
        "--limit", type=int, help="Limit execution to N samples (for debugging)"
    )
    run.add_argument(
        "--debug-traces",
        action="store_true",
        help="Enable verbose logging and full-text traces",
    )
    run.set_defaults(func=handle_run)
196 changes: 89 additions & 107 deletions src/dcv_benchmark/cli/data.py
Original file line number Diff line number Diff line change
@@ -1,140 +1,122 @@
import argparse
import sys
from pathlib import Path
from typing import Any

import yaml

from dcv_benchmark.constants import BUILT_DATASETS_DIR
from dcv_benchmark.constants import BUILT_DATASETS_DIR, RAW_DATASETS_DIR
from dcv_benchmark.data_factory.builder import DatasetBuilder
from dcv_benchmark.data_factory.downloader import download_bipia, download_squad
from dcv_benchmark.data_factory.injector import AttackInjector
from dcv_benchmark.data_factory.loaders import SquadLoader
from dcv_benchmark.models.data_factory import DataFactoryConfig
from dcv_benchmark.utils.logger import get_logger, print_dataset_header
from dcv_benchmark.utils.logger import get_logger

logger = get_logger(__name__)


def load_factory_config(path: Path) -> DataFactoryConfig:
"""Helper to load and validate the Data Factory YAML config."""
def download_data(source: str, output_dir: str | None = None) -> None:
"""
Fetches raw datasets (SQuAD, BIPIA) to the workspace.
"""
# Determine output directory
if output_dir:
output_path = Path(output_dir)
else:
output_path = RAW_DATASETS_DIR / source

logger.info(f"Preparing to download '{source}' data to {output_path}...")

try:
with open(path, encoding="utf-8") as f:
raw = yaml.safe_load(f)
return DataFactoryConfig(**raw)
if source == "squad":
download_squad(output_path)
elif source == "bipia":
download_bipia(output_path)
else:
logger.error(f"Unknown source: '{source}'. Options: squad, bipia")
sys.exit(1)

logger.info(f"Download of '{source}' complete.")

except Exception as e:
logger.error(f"Failed to parse config file {path}: {e}")
logger.error(f"Download failed: {e}")
sys.exit(1)


def resolve_data_target(target: str) -> Path:
"""
Resolves the 'target' argument to a config file path.
1. Checks if 'target' is a dataset name (folder in BUILT_DATASETS_DIR).
If so, looks for 'dataset_config.yaml' inside it.
2. Checks if 'target' is a direct file path.
def build_data(
config_path_str: str, name: str | None = None, overwrite: bool = False
) -> None:
"""
# Try Dataset Name
dataset_dir = BUILT_DATASETS_DIR / target
if dataset_dir.exists() and dataset_dir.is_dir():
# It's a valid dataset folder
config_candidate = dataset_dir / "dataset_config.yaml"
if config_candidate.exists():
return config_candidate

# Try File Path
path = Path(target)
if path.exists():
return path

# Error Handling
if dataset_dir.exists() and dataset_dir.is_dir():
raise FileNotFoundError(
f"Dataset folder '{target}' found, but missing 'dataset_config.yaml' "
f"at {dataset_dir / 'dataset_config.yaml'}"
)

raise FileNotFoundError(
f"Target not found. Checked dataset folder '{dataset_dir}' and path '{path}'."
)


def generate_dataset(config: DataFactoryConfig, output_path: Path) -> None:
Generates (injects/builds) a dataset from a recipe config.
"""
Core logic to generate and save a dataset based on the provided config.
"""
print_dataset_header(config.model_dump())

loader = SquadLoader()
injector = AttackInjector(config=config)
config_path = Path(config_path_str)

logger.debug(f"Initializing DatasetBuilder for '{config.dataset_name}'...")
builder = DatasetBuilder(loader=loader, injector=injector, config=config)

logger.info("Starting build process (Indexing -> Retrieving -> Injecting)...")
dataset = builder.build()

logger.info(f"Saving dataset to {output_path}...")
builder.save(dataset, output_path)
logger.info("Done.")
# If directory provided, look for dataset_config.yaml
if config_path.is_dir():
potential = config_path / "dataset_config.yaml"
if potential.exists():
config_path = potential
else:
logger.error(
"Directory provided but 'dataset_config.yaml' not found in "
f"{config_path}"
)
sys.exit(1)

if not config_path.exists():
logger.error(f"Config file not found: {config_path}")
sys.exit(1)

def run_generate_dataset(args: argparse.Namespace) -> None:
"""Handler for the 'data generate' command."""
# Load Config
try:
config_path = resolve_data_target(args.target)
logger.debug(f"Resolved dataset config: {config_path}")
except FileNotFoundError as e:
logger.error(str(e))
with open(config_path, encoding="utf-8") as f:
raw_config = yaml.safe_load(f)
config = DataFactoryConfig(**raw_config)
except Exception as e:
logger.error(f"Failed to load config: {e}")
sys.exit(1)

logger.debug(f"Loading Data Factory config from {config_path}...")
config = load_factory_config(config_path)
# Determine Dataset Name (CLI override > Config > Folder Name)
if name:
dataset_name = name
# Update config to match the build name so metadata is consistent
config.dataset_name = dataset_name
else:
dataset_name = config.dataset_name

try:
# Determine output path
if args.output:
output_path = Path(args.output)
else:
# Default: dataset.json in the same folder as the config
output_path = config_path.parent / "dataset.json"
target_dir = BUILT_DATASETS_DIR / dataset_name

target_dir.mkdir(parents=True, exist_ok=True)
output_file = target_dir / "dataset.json"

# Safety check
if output_path.exists() and not args.force:
logger.error(f"Output file already exists: {output_path}")
logger.error("Use --force to overwrite it.")
# Check Overwrite
if output_file.exists():
if not overwrite:
logger.error(f"Dataset artifact '{output_file}' already exists.")
logger.info("Use --overwrite to replace it.")
sys.exit(1)
else:
logger.warning(f"Overwriting existing dataset artifact at {output_file}...")
output_file.unlink()

generate_dataset(config, output_path)
# Build Dataset
logger.info(f"Building dataset '{dataset_name}' from {config_path}...")

except Exception:
logger.exception("Fatal error during dataset generation")
sys.exit(1)
try:
# Note: We default to SquadLoader as it handles the JSON format.
loader = SquadLoader()
injector = AttackInjector(config)
builder = DatasetBuilder(loader=loader, injector=injector, config=config)

dataset = builder.build()

# Save Artifacts
builder.save(dataset, output_file)

def register_data_cli(subparsers: Any) -> None:
"""Registers the 'data' command group."""
# Create 'data' parent command
data_parser = subparsers.add_parser("data", help="Data Factory commands")
data_subs = data_parser.add_subparsers(dest="data_command", required=True)

# Create 'generate' subcommand
gen_parser = data_subs.add_parser(
"generate", help="Generate a synthetic RAG dataset from a config file."
)
gen_parser.add_argument(
"target",
type=str,
help="Scenario name (e.g. 'canary_naive') or path to config file.",
)
gen_parser.add_argument(
"-o", "--output", type=str, help="Custom output path for the JSON file."
)
gen_parser.add_argument(
"-f",
"--force",
action="store_true",
help="Overwrite the output file if it exists.",
)

# Map this command to the handler function
gen_parser.set_defaults(func=run_generate_dataset)
logger.info(f"Build successful! Artifacts saved to: {target_dir}")

except Exception as e:
logger.exception(f"Build failed: {e}")
# Cleanup partial build
if output_file.exists() and not overwrite:
output_file.unlink()
sys.exit(1)
Loading