Merge branch 'main' of https://github.com/EleutherAI/delphi

SrGonao · SrGonao · commit 2aebc58811d5 · 2025-11-07T14:40:46.000Z
diff --git a/README.md b/README.md
@@ -16,15 +16,15 @@ Install this library as a local editable installation. Run the following command
 
 To run the default pipeline from the command line, use the following command:
 
-`python -m delphi EleutherAI/pythia-160m EleutherAI/Pythia-160m-SST-k32-32k --n_tokens 10_000_000 --max_latents 100 --hookpoints layers.5 --scorers detection --filter_bos --name llama-3-8B`
+`python -m delphi EleutherAI/pythia-160m EleutherAI/Pythia-160m-SST-k32-32k --n_tokens 10_000_000 --max_latents 100 --hookpoints layers.5.mlp --scorers detection --filter_bos --name llama-3-8B`
 
 This command will:
 1. Cache activations for the first 10 million tokens of the default dataset, `EleutherAI/SmolLM2-135M-10B`.
 2. Generate explanations for the first 100 features of the layer 5 MLP (`layers.5.mlp`) using the default explainer model, `hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4`.
 3. Score the explanations using the detection scorer.
 4. Log summary metrics including per-scorer F1 scores and confusion matrices, and produce histograms of the scorer classification accuracies.
 
-The pipeline is highly configurable and can also be called programmatically (see the [end-to-end test](https://github.com/EleutherAI/delphi/blob/main/delphi/tests/e2e.py) for an example).
+The pipeline is highly configurable and can also be called programmatically (see the [end-to-end test](https://github.com/EleutherAI/delphi/blob/main/tests/e2e.py) for an example).
 
 To use experimental features, create a custom pipeline. You can take inspiration from the main pipeline in [delphi.\_\_main\_\_](https://github.com/EleutherAI/delphi/blob/main/delphi/__main__.py).
 
diff --git a/delphi/__main__.py b/delphi/__main__.py
@@ -256,6 +256,7 @@ def scorer_postprocess(result, score_dir):
                 n_examples_shown=run_cfg.num_examples_per_scorer_prompt,
                 verbose=run_cfg.verbose,
                 log_prob=run_cfg.log_probs,
+                fuzz_type=run_cfg.fuzz_type,
             )
         elif scorer_name == "detection":
             scorer = DetectionScorer(
diff --git a/delphi/config.py b/delphi/config.py
@@ -160,6 +160,10 @@ class RunConfig(Serializable):
     )
     """Scorer methods to score latent explanations. Options are 'fuzz', 'detection', and
     'simulation'."""
+    fuzz_type: Literal["default", "active"] = "default"
+    """Type of fuzzing to use for the fuzz scorer. 'default' uses non-activating
+    examples and highlights n_incorrect tokens. 'active' uses activating examples
+    and highlights non-activating tokens."""
 
     name: str = ""
     """The name of the run. Results are saved in a directory with this name."""
diff --git a/delphi/scorers/classifier/fuzz.py b/delphi/scorers/classifier/fuzz.py
@@ -38,6 +38,10 @@ def __init__(
                         it harder for models to generate answers in the correct format.
             log_prob: Whether to use log probabilities to allow for AUC calculation.
             generation_kwargs: Additional generation kwargs.
+            temperature: Which temperature to use for the scorer model.
+            fuzz_type: Which type of fuzzing to use. 'default' uses non-activating
+                examples and highlights n_incorrect tokens. 'active' uses activating
+                examples and highlights non-activating tokens.
         """
         super().__init__(
             client=client,

Original file line number	Diff line number	Diff line change
`@@ -256,6 +256,7 @@ def scorer_postprocess(result, score_dir):`
`256`	`256`	`n_examples_shown=run_cfg.num_examples_per_scorer_prompt,`
`257`	`257`	`verbose=run_cfg.verbose,`
`258`	`258`	`log_prob=run_cfg.log_probs,`
	`259`	`+ fuzz_type=run_cfg.fuzz_type,`
`259`	`260`	`)`
`260`	`261`	`elif scorer_name == "detection":`
`261`	`262`	`scorer = DetectionScorer(`