Merged
Changes shown from 11 of 20 commits:
2149aa0  Release: 2025-10-21  (hannamw, Oct 21, 2025)
b6bf15f  A proposed API enhancement (`AttributionTargets`) that encapsulates t…  (speediedan, Nov 6, 2025)
2b52ba6  slight clarification in a couple comments based on copilot review  (speediedan, Nov 7, 2025)
418b17f  Allow `offload_modules` to handle single module and container offload…  (speediedan, Dec 16, 2025)
45d6eee  use a smaller non-default batch_size for test_gemma_2_2b to expand th…  (speediedan, Dec 16, 2025)
3d8264a  initial changes to adapt original `AttributionTargets` encapsulation …  (speediedan, Jan 13, 2026)
32f79a4  Merge remote-tracking branch 'upstream/main' into attribution-targets  (speediedan, Jan 13, 2026)
f90a7a4  Add pytest markers for long-running and high memory tests; adjust bat…  (speediedan, Jan 15, 2026)
af101cf  Update unembedding matrix handling to auto-detect backend-variant ori…  (speediedan, Jan 15, 2026)
6f93ef4  Merge branch 'main' into attribution-targets  (speediedan, Jan 16, 2026)
2f8eb2b  minor type fix, clarify current vram gating mark  (speediedan, Jan 17, 2026)
6b76d08  adds integration tests, refactors proposed interface incorporating PR…  (speediedan, Feb 10, 2026)
c8b4425  remove marks, add vram skipif conditions  (speediedan, Feb 11, 2026)
f4ad1e9  revert comment hunks to submit in a separate PR  (speediedan, Feb 11, 2026)
787cf18  adjust `test_custom_target_correctness` to adopt our standard attribu…  (speediedan, Feb 12, 2026)
7ea7f14  very rough exploratory draft of attribution_targets_demo.ipynb to sol…  (speediedan, Feb 18, 2026)
be7654d  updates to attribution_targets_demo.ipynb including:  (speediedan, Feb 19, 2026)
440f097  cleanup language and formatting, prettify with banner  (speediedan, Feb 20, 2026)
8219073  streamlined serial backend testing with `models_cpu` fixture and `cle…  (speediedan, Feb 21, 2026)
100af4d  restructured the demo to lead with the simpler target modes and extra…  (speediedan, Feb 21, 2026)
27 changes: 24 additions & 3 deletions circuit_tracer/attribution/attribute.py
@@ -1,7 +1,12 @@
"""
Unified attribution interface that routes to the correct implementation based on the ReplacementModel backend.
Unified attribution interface that routes to the correct backend implementation.

This module provides a unified entry point for computing attribution graphs,
automatically dispatching to either the TransformerLens or NNSight implementation
based on the backend type of the provided ReplacementModel.
"""

from collections.abc import Sequence
from typing import TYPE_CHECKING, Literal

import torch
@@ -19,6 +24,9 @@ def attribute(
prompt: str | torch.Tensor | list[int],
model: "NNSightReplacementModel | TransformerLensReplacementModel",
*,
attribution_targets: (
Sequence[tuple[str, float, torch.Tensor] | int | str] | torch.Tensor | None
Collaborator:

Might be better here to just commit to a format! i.e. input either None, a Sequence of strs, or a sequence of fully specified attribution targets. This cuts down on the number of cases / potential confusion.

Contributor Author:

Makes sense! Supporting the mixed-mode input is probably not worth the marginal complexity and potential user confusion. I've dropped the mixed-mode list[int | str | tuple] entirely.

The revised interface for attribution_targets I'm proposing now is:
Sequence[str] | Sequence[TargetSpec] | torch.Tensor | None

  • Sequence[str] — token strings (AttributionTargets handles tokenization, we warn on multi-token strings and take the final token, lmk if you prefer we error instead or only take the final token if a bool flag is set)
  • Sequence[TargetSpec] — fully specified custom targets with explicit token_str, probability, and unembed vectors. I thought it was worth making the target specification a named tuple for clarity. We still accept regular tuples and create named tuples for the user if preferred but keep the signature readable with this alias:
    TargetSpec = CustomTarget | tuple[str, float, torch.Tensor]
  • torch.Tensor — My reasoning in accepting the torch.Tensor input is that it's convenient for downstream consumers that have pre-tokenized IDs they'd like to attribute without roundtripping through strings. The world model analysis framework I'm building does this, invoking circuit-tracer's attribute via composable operations with pre-tokenized tensor inputs. If you feel strongly about this we can drop it and require users to convert to Sequence[str] or Sequence[TargetSpec], but I think it's a nice-to-have that allows a greater breadth of downstream use cases and doesn't add much complexity.
  • None — auto-select salient logits
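
To make the four accepted modes concrete, here is a minimal sketch of how the dispatch could look. The `CustomTarget` and `TargetSpec` names come from the comment above; the `normalize_targets` helper and its return values are purely illustrative, not the PR's actual implementation.

```python
from typing import NamedTuple, Sequence, Union

import torch


class CustomTarget(NamedTuple):
    """Fully specified attribution target (field names assumed from the discussion)."""
    token_str: str
    probability: float
    unembed_vector: torch.Tensor


# Alias from the comment: plain tuples are accepted and promoted to CustomTarget.
TargetSpec = Union[CustomTarget, tuple[str, float, torch.Tensor]]


def normalize_targets(
    targets: Union[Sequence[str], Sequence[TargetSpec], torch.Tensor, None],
) -> str:
    """Classify which of the four accepted input modes `targets` falls into."""
    if targets is None:
        return "auto-select salient logits"
    if isinstance(targets, torch.Tensor):
        return "pre-tokenized token IDs"
    if all(isinstance(t, str) for t in targets):
        return "token strings (tokenized internally)"
    return "fully specified TargetSpec entries"
```

This keeps the user-facing signature to exactly the four cases listed above, with tuple inputs normalized to the named-tuple form internally.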

Collaborator:

This sounds okay to me - I recognize that having multiple input formats could be convenient (even though I would like to cut down on the number of formats). Re: warning vs. erroring, I'd prefer to error here. If anything, I think I'd want to take the first token, but erroring still seems best.

Contributor Author:

Makes sense! Changed our multi-token string handling from warnings.warn() to raise ValueError().
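
A minimal sketch of the agreed error-on-multi-token behavior; the `resolve_token_id` helper name and the stubbed tokenizer interface are illustrative, not the PR's actual code.

```python
def resolve_token_id(token_str: str, tokenizer) -> int:
    """Resolve a target string to a single token ID, raising ValueError on
    multi-token strings (the behavior agreed on in this thread)."""
    token_ids = tokenizer.encode(token_str, add_special_tokens=False)
    if len(token_ids) != 1:
        raise ValueError(
            f"Attribution target {token_str!r} tokenizes to {len(token_ids)} "
            f"tokens ({token_ids}); targets must be single tokens."
        )
    return token_ids[0]
```
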

) = None,
max_n_logits: int = 10,
desired_logit_prob: float = 0.95,
batch_size: int = 512,
@@ -35,8 +43,19 @@ def attribute(
Args:
prompt: Text, token ids, or tensor - will be tokenized if str.
model: Frozen ``ReplacementModel`` (either nnsight or transformerlens backend)
max_n_logits: Max number of logit nodes.
desired_logit_prob: Keep logits until cumulative prob >= this value.
attribution_targets: Flexible attribution target specification in one of several formats:
- None: Auto-select salient logits based on probability threshold
- torch.Tensor: Tensor of token indices
- Sequence[tuple[str, float, torch.Tensor] | int | str]: Sequence where
each element can be:
* int or str: Token ID/string (auto-resolves probability and
unembed vector)
* tuple[str, float, torch.Tensor]: Fully specified logit spec with
arbitrary string tokens (or functions thereof) that may not be in
vocabulary
max_n_logits: Max number of logit nodes (used when attribution_targets is None).
desired_logit_prob: Keep logits until cumulative prob >= this value
(used when attribution_targets is None).
batch_size: How many source nodes to process per backward pass.
max_feature_nodes: Max number of feature nodes to include in the graph.
offload: Method for offloading model parameters to save memory.
@@ -55,6 +74,7 @@ def attribute(
return attribute_nnsight(
prompt=prompt,
model=model, # type: ignore[arg-type]
attribution_targets=attribution_targets,
max_n_logits=max_n_logits,
desired_logit_prob=desired_logit_prob,
batch_size=batch_size,
@@ -69,6 +89,7 @@ def attribute(
return attribute_transformerlens(
prompt=prompt,
model=model, # type: ignore[arg-type]
attribution_targets=attribution_targets,
max_n_logits=max_n_logits,
desired_logit_prob=desired_logit_prob,
batch_size=batch_size,
59 changes: 40 additions & 19 deletions circuit_tracer/attribution/attribute_nnsight.py
@@ -1,7 +1,7 @@
"""
Build an **attribution graph** that captures the *direct*, *linear* effects
between features and next-token logits for a *prompt-specific*
**local replacement model**.
**local replacement model** using the NNSight backend.

High-level algorithm (matches the 2025 ``Attribution Graphs`` paper):
https://transformer-circuits.pub/2025/attribution-graphs/methods.html
@@ -22,21 +22,25 @@

import logging
import time
from typing import Literal
from collections.abc import Sequence
from typing import Literal, cast

import torch
from tqdm import tqdm

from circuit_tracer.attribution.targets import AttributionTargets
from circuit_tracer.graph import Graph, compute_partial_influences
from circuit_tracer.replacement_model.replacement_model_nnsight import NNSightReplacementModel
from circuit_tracer.utils.disk_offload import offload_modules
from circuit_tracer.utils.salient_logits import compute_salient_logits


def attribute(
prompt: str | torch.Tensor | list[int],
model: NNSightReplacementModel,
*,
attribution_targets: (
Sequence[tuple[str, float, torch.Tensor] | int | str] | torch.Tensor | None
) = None,
max_n_logits: int = 10,
desired_logit_prob: float = 0.95,
batch_size: int = 512,
@@ -45,13 +49,24 @@ def attribute(
verbose: bool = False,
update_interval: int = 4,
) -> Graph:
"""Compute an attribution graph for *prompt*.
"""Compute an attribution graph for *prompt* using NNSight backend.

Args:
prompt: Text, token ids, or tensor - will be tokenized if str.
model: Frozen ``NNSightReplacementModel``
max_n_logits: Max number of logit nodes.
desired_logit_prob: Keep logits until cumulative prob >= this value.
attribution_targets: Flexible attribution target specification in one of several formats:
- None: Auto-select salient logits based on probability threshold
- torch.Tensor: Tensor of token indices
- Sequence[tuple[str, float, torch.Tensor] | int | str]: Sequence where
each element can be:
* int or str: Token ID/string (auto-resolves probability and
unembed vector)
* tuple[str, float, torch.Tensor]: Fully specified logit spec with
arbitrary string tokens (or functions thereof) that may not be in
vocabulary
max_n_logits: Max number of logit nodes (used when attribution_targets is None).
desired_logit_prob: Keep logits until cumulative prob >= this value
(used when attribution_targets is None).
batch_size: How many source nodes to process per backward pass.
max_feature_nodes: Max number of feature nodes to include in the graph.
offload: Method for offloading model parameters to save memory.
@@ -81,6 +96,7 @@ def attribute(
return _run_attribution(
model=model,
prompt=prompt,
attribution_targets=attribution_targets,
max_n_logits=max_n_logits,
desired_logit_prob=desired_logit_prob,
batch_size=batch_size,
@@ -102,6 +118,7 @@
def _run_attribution(
model: NNSightReplacementModel,
prompt,
attribution_targets,
max_n_logits: int,
desired_logit_prob: float,
batch_size: int,
@@ -156,15 +173,21 @@ def _run_attribution(
n_layers, n_pos, _ = activation_matrix.shape
total_active_feats = activation_matrix._nnz()

logit_idx, logit_p, logit_vecs = compute_salient_logits(
ctx.logits[0, -1],
model.unembed_weight, # type: ignore
# Create AttributionTargets using NNSight's unembed_weight accessor
targets = AttributionTargets(
attribution_targets=attribution_targets,
logits=ctx.logits[0, -1],
unembed_proj=cast(torch.Tensor, model.unembed_weight), # NNSight uses unembed_weight
tokenizer=model.tokenizer,
max_n_logits=max_n_logits,
desired_logit_prob=desired_logit_prob,
)
logger.info(
f"Selected {len(logit_idx)} logits with cumulative probability {logit_p.sum().item():.4f}"
)

if attribution_targets is None:
logger.info(
f"Selected {len(targets)} logits with cumulative probability "
f"{targets.logit_probabilities.sum().item():.4f}"
)

if offload:
offload_handles += offload_modules([model.embed_location], offload)
@@ -176,8 +199,7 @@
offload_handles += offload_modules([model.lm_head], offload)

logit_offset = len(feat_layers) + (n_layers + 1) * n_pos
logit_offset = logit_offset
n_logits = len(logit_idx)
n_logits = len(targets)
total_nodes = logit_offset + n_logits

actual_max_feature_nodes = min(max_feature_nodes or total_active_feats, total_active_feats)
@@ -193,8 +215,8 @@
logger.info("Phase 3: Computing logit attributions")
phase3_start = time.time()
i = -1
for i in range(0, len(logit_idx), batch_size):
batch = logit_vecs[i : i + batch_size]
for i in range(0, len(targets), batch_size):
batch = targets.logit_vectors[i : i + batch_size]
rows = ctx.compute_batch(
layers=torch.full((batch.shape[0],), n_layers),
positions=torch.full((batch.shape[0],), n_pos - 1),
@@ -225,7 +247,7 @@ def _run_attribution(
pending = torch.arange(total_active_feats)
else:
influences = compute_partial_influences(
edge_matrix[:st], logit_p, row_to_node_index[:st]
edge_matrix[:st], targets.logit_probabilities, row_to_node_index[:st]
)
feature_rank = torch.argsort(influences[:total_active_feats], descending=True).cpu()
queue_size = min(update_interval * batch_size, actual_max_feature_nodes - n_visited)
@@ -270,8 +292,7 @@ def _run_attribution(
graph = Graph(
input_string=model.tokenizer.decode(input_ids),
input_tokens=input_ids,
logit_tokens=logit_idx,
logit_probabilities=logit_p,
attribution_targets=targets,
active_features=activation_matrix.indices().T,
activation_values=activation_matrix.values(),
selected_features=selected_features,
57 changes: 39 additions & 18 deletions circuit_tracer/attribution/attribute_transformerlens.py
@@ -1,7 +1,7 @@
"""
Build an **attribution graph** that captures the *direct*, *linear* effects
between features and next-token logits for a *prompt-specific*
**local replacement model**.
**local replacement model** using the TransformerLens backend.

High-level algorithm (matches the 2025 ``Attribution Graphs`` paper):
https://transformer-circuits.pub/2025/attribution-graphs/methods.html
@@ -22,23 +22,27 @@

import logging
import time
from collections.abc import Sequence
from typing import Literal

import torch
from tqdm import tqdm

from circuit_tracer.attribution.targets import AttributionTargets
from circuit_tracer.graph import Graph, compute_partial_influences
from circuit_tracer.replacement_model.replacement_model_transformerlens import (
TransformerLensReplacementModel,
)
from circuit_tracer.utils.disk_offload import offload_modules
from circuit_tracer.utils.salient_logits import compute_salient_logits


def attribute(
prompt: str | torch.Tensor | list[int],
model: TransformerLensReplacementModel,
*,
attribution_targets: (
Sequence[tuple[str, float, torch.Tensor] | int | str] | torch.Tensor | None
) = None,
max_n_logits: int = 10,
desired_logit_prob: float = 0.95,
batch_size: int = 512,
@@ -47,13 +51,24 @@ def attribute(
verbose: bool = False,
update_interval: int = 4,
) -> Graph:
"""Compute an attribution graph for *prompt*.
"""Compute an attribution graph for *prompt* using TransformerLens backend.

Args:
prompt: Text, token ids, or tensor - will be tokenized if str.
model: Frozen ``ReplacementModel``
max_n_logits: Max number of logit nodes.
desired_logit_prob: Keep logits until cumulative prob >= this value.
model: Frozen ``TransformerLensReplacementModel``
attribution_targets: Flexible attribution target specification in one of several formats:
- None: Auto-select salient logits based on probability threshold
- torch.Tensor: Tensor of token indices
- Sequence[tuple[str, float, torch.Tensor] | int | str]: Sequence where
each element can be:
* int or str: Token ID/string (auto-resolves probability and
unembed vector)
* tuple[str, float, torch.Tensor]: Fully specified logit spec with
arbitrary string tokens (or functions thereof) that may not be in
vocabulary
max_n_logits: Max number of logit nodes (used when attribution_targets is None).
desired_logit_prob: Keep logits until cumulative prob >= this value
(used when attribution_targets is None).
batch_size: How many source nodes to process per backward pass.
max_feature_nodes: Max number of feature nodes to include in the graph.
offload: Method for offloading model parameters to save memory.
@@ -83,6 +98,7 @@ def attribute(
return _run_attribution(
model=model,
prompt=prompt,
attribution_targets=attribution_targets,
max_n_logits=max_n_logits,
desired_logit_prob=desired_logit_prob,
batch_size=batch_size,
@@ -104,6 +120,7 @@
def _run_attribution(
model,
prompt,
attribution_targets,
max_n_logits,
desired_logit_prob,
batch_size,
@@ -147,21 +164,26 @@ def _run_attribution(
n_layers, n_pos, _ = activation_matrix.shape
total_active_feats = activation_matrix._nnz()

logit_idx, logit_p, logit_vecs = compute_salient_logits(
ctx.logits[0, -1],
model.unembed.W_U,
targets = AttributionTargets(
attribution_targets=attribution_targets,
logits=ctx.logits[0, -1],
unembed_proj=model.unembed.W_U,
tokenizer=model.tokenizer,
max_n_logits=max_n_logits,
desired_logit_prob=desired_logit_prob,
)
logger.info(
f"Selected {len(logit_idx)} logits with cumulative probability {logit_p.sum().item():.4f}"
)

if attribution_targets is None:
logger.info(
f"Selected {len(targets)} logits with cumulative probability "
f"{targets.logit_probabilities.sum().item():.4f}"
)

if offload:
offload_handles += offload_modules([model.unembed, model.embed], offload)

logit_offset = len(feat_layers) + (n_layers + 1) * n_pos
n_logits = len(logit_idx)
n_logits = len(targets)
total_nodes = logit_offset + n_logits

max_feature_nodes = min(max_feature_nodes or total_active_feats, total_active_feats)
@@ -176,8 +198,8 @@
# Phase 3: logit attribution
logger.info("Phase 3: Computing logit attributions")
phase_start = time.time()
for i in range(0, len(logit_idx), batch_size):
batch = logit_vecs[i : i + batch_size]
for i in range(0, len(targets), batch_size):
batch = targets.logit_vectors[i : i + batch_size]
rows = ctx.compute_batch(
layers=torch.full((batch.shape[0],), n_layers),
positions=torch.full((batch.shape[0],), n_pos - 1),
@@ -203,7 +225,7 @@
pending = torch.arange(total_active_feats)
else:
influences = compute_partial_influences(
edge_matrix[:st], logit_p, row_to_node_index[:st]
edge_matrix[:st], targets.logit_probabilities, row_to_node_index[:st]
)
feature_rank = torch.argsort(influences[:total_active_feats], descending=True).cpu()
queue_size = min(update_interval * batch_size, max_feature_nodes - n_visited)
@@ -248,8 +270,7 @@ def _run_attribution(
graph = Graph(
input_string=model.tokenizer.decode(input_ids),
input_tokens=input_ids,
logit_tokens=logit_idx,
logit_probabilities=logit_p,
attribution_targets=targets,
active_features=activation_matrix.indices().T,
activation_values=activation_matrix.values(),
selected_features=selected_features,