Changes from 3 commits
415 changes: 415 additions & 0 deletions benchmarks/filter_debug.py

Large diffs are not rendered by default.

46 changes: 44 additions & 2 deletions cellfinder/core/classify/classify.py
@@ -19,8 +19,8 @@ def main(
     signal_array: types.array,
     background_array: types.array,
     n_free_cpus: int,
-    voxel_sizes: Tuple[int, int, int],
-    network_voxel_sizes: Tuple[int, int, int],
+    voxel_sizes: Tuple[float, float, float],
+    network_voxel_sizes: Tuple[float, float, float],
     batch_size: int,
     cube_height: int,
     cube_width: int,
@@ -35,6 +35,48 @@
     """
     Parameters
     ----------
+    points : List of Cell objects
+        The potential cells to classify.
+    signal_array : numpy.ndarray or dask array
+        3D array representing the signal data in z, y, x order.
+    background_array : numpy.ndarray or dask array
+        3D array representing the background data in z, y, x order.
+    n_free_cpus : int
+        How many CPU cores to leave free.
+    voxel_sizes : 3-tuple of floats
+        Size of your voxels in the z, y, and x dimensions.

[Review comment (Member)]: in microns? or in pixels?

+    network_voxel_sizes : 3-tuple of floats
+        Size of the pre-trained network's voxels in the z, y, and x dimensions.
+    batch_size : int

[Review comment (Member)]: I think it would be good to have "sensible", conservative defaults (they don't need to be perfect), especially for the more computational options (maybe batch_size=1 and pin_memory=False?), because the prospect of having to check GPU/CPU memory and understand what pinning memory means in this context will scare off users.

+        How many potential cells to classify at one time. The GPU/CPU
+        memory must be able to hold this many data cubes at once for the
+        model. Tune it to maximize memory usage without running out.
+        Check your GPU/CPU memory to verify it's not full.
+    cube_height : int
+        The height of the data cube centered on the cell used for
+        classification. Defaults to `50`.
+    cube_width : int
+        The width of the data cube centered on the cell used for
+        classification. Defaults to `50`.
+    cube_depth : int
+        The depth of the data cube centered on the cell used for
+        classification. Defaults to `20`.
+    trained_model : Optional[Path]
+        Trained model file path. Defaults to the pretrained weights in
+        the home directory.
+    model_weights : Optional[Path]
+        Model weights path. Defaults to the pretrained weights in the
+        home directory.
+    network_depth : str
+        The network depth to use during classification. Defaults to `"50"`.
+    max_workers : int
+        The number of sub-processes to use for data loading / processing.
+        Defaults to 8.
+    pin_memory : bool

[Review comment (Member)]: should this be made a keyword argument?

+        Whether torch should pin the memory to be sent to the GPU. This
+        results in faster GPU uploads, but pinned memory cannot be paged
+        out while in use, so only use it if you have enough RAM.
     callback : Callable[int], optional
         A callback function that is called during classification. Called with
         the batch number once that batch has been classified.
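To make the computational options above concrete, here is a minimal, hypothetical call sketch. It is assembled only from the parameters documented in this diff; `candidate_cells`, `signal`, and `background` are placeholder inputs, and the values follow the conservative defaults suggested in the review (batch_size=1, pin_memory=False):

    # Hypothetical usage sketch; argument names come from the docstring above.
    from cellfinder.core.classify import classify

    classified_cells = classify.main(
        points=candidate_cells,        # candidate Cell objects from detection
        signal_array=signal,           # 3D (z, y, x) signal channel
        background_array=background,   # 3D (z, y, x) background channel
        n_free_cpus=2,
        voxel_sizes=(5.0, 2.0, 2.0),          # your data's voxel sizes
        network_voxel_sizes=(5.0, 1.0, 1.0),  # the network's training voxel sizes
        batch_size=1,                  # conservative default per the review
        cube_height=50,
        cube_width=50,
        cube_depth=20,
        pin_memory=False,              # safe default when RAM is limited
        callback=lambda batch: print(f"classified batch {batch}"),
    )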
102 changes: 50 additions & 52 deletions cellfinder/core/detect/detect.py
@@ -49,10 +49,10 @@ def main(
     plane_directory: Optional[str] = None,
     batch_size: Optional[int] = None,
     torch_device: Optional[str] = None,
-    split_ball_xy_size: int = 3,
-    split_ball_z_size: int = 3,
+    split_ball_xy_size: int = 6,
+    split_ball_z_size: int = 15,
     split_ball_overlap_fraction: float = 0.8,
-    split_soma_diameter: int = 7,
+    n_splitting_iter: int = 10,
     *,
     callback: Optional[Callable[[int], None]] = None,
 ) -> List[Cell]:
@@ -61,77 +61,79 @@

     Parameters
     ----------
-    signal_array : numpy.ndarray
-        3D array representing the signal data.
-
+    signal_array : numpy.ndarray or dask array
+        3D array representing the signal data in z, y, x order.
     start_plane : int
-        Index of the starting plane for detection.
-
+        First plane to process (used to process only a subset of the data).
     end_plane : int
-        Index of the ending plane for detection.
-
-    voxel_sizes : Tuple[float, float, float]
-        Tuple of voxel sizes in each dimension (z, y, x).
-
+        Last plane to process (used to process only a subset of the data).
+    voxel_sizes : 3-tuple of floats
+        Size of your voxels in the z, y, and x dimensions.
     soma_diameter : float
-        Diameter of the soma in physical units.
-
-    max_cluster_size : float
-        Maximum size of a cluster in physical units.
-
+        The expected in-plane (xy) soma diameter, in microns.
+    max_cluster_size : int
+        Largest detected cell cluster (in cubic microns) for which
+        splitting will be attempted. Clusters above this size are labeled
+        as artifacts.
     ball_xy_size : float
-        Size of the XY ball used for filtering in physical units.
-
+        The 3d filter's in-plane (xy) ball size, in microns.
     ball_z_size : float
-        Size of the Z ball used for filtering in physical units.
-
+        The 3d filter's axial (z) ball size, in microns.
     ball_overlap_fraction : float
-        Fraction of overlap allowed between balls.
-
+        Fraction of the 3d ball filter that must be filled by foreground
+        voxels, centered on a voxel, for that voxel to be retained.
     soma_spread_factor : float
-        Spread factor for soma size.
-
+        Cell spread factor for determining the largest cell volume before
+        splitting up cell clusters. Structures with spherical volume of
+        diameter `soma_spread_factor * soma_diameter` or less will not be
+        split.
     n_free_cpus : int
-        Number of free CPU cores available for parallel processing.
-
+        How many CPU cores to leave free.
     log_sigma_size : float
-        Size of the sigma for the log filter.
-
+        Gaussian filter width (as a fraction of soma diameter) used during
+        2d in-plane filtering.
     n_sds_above_mean_thresh : float
-        Number of standard deviations above the mean threshold.
-
+        Intensity threshold (the number of standard deviations above the
+        mean) of the filtered 2d planes used to mark pixels as foreground
+        or background.
     outlier_keep : bool, optional
         Whether to keep outliers during detection. Defaults to False.
-
     artifact_keep : bool, optional
         Whether to keep artifacts during detection. Defaults to False.
-
     save_planes : bool, optional
         Whether to save the planes during detection. Defaults to False.
-
     plane_directory : str, optional
         Directory path to save the planes. Defaults to None.
-
-    batch_size : int, optional
-        The number of planes to process in each batch. Defaults to 1.
-        For CPU, there's no benefit for a larger batch size. Only a memory
-        usage increase. For CUDA, the larger the batch size the better the
-        performance. Until it fills up the GPU memory - after which it
-        becomes slower.
-
+    batch_size : int
+        The number of planes of the original data volume to process at
+        once. The GPU/CPU memory must be able to hold this many planes for
+        all the filters. Tune it to maximize memory usage without running
+        out. Check your GPU/CPU memory to verify it's not full.
     torch_device : str, optional
         The device on which to run the computation. If not specified (None),
         "cuda" will be used if a GPU is available, otherwise "cpu".
         You can also manually specify "cuda" or "cpu".
-
+    split_ball_xy_size : int
+        Similar to `ball_xy_size`, except the value to use for the 3d
+        filter during cluster splitting.
+    split_ball_z_size : int
+        Similar to `ball_z_size`, except the value to use for the 3d
+        filter during cluster splitting.
+    split_ball_overlap_fraction : float
+        Similar to `ball_overlap_fraction`, except the value to use for
+        the 3d filter during cluster splitting.
+    n_splitting_iter : int
+        The number of iterations to run the 3d filtering on a cluster.
+        Each iteration reduces the cluster size by the voxels not retained
+        in the previous iteration.
     callback : Callable[int], optional
         A callback function that is called every time a plane has finished
         being processed. Called with the plane number that has finished.

     Returns
     -------
     List[Cell]
-        List of detected cells.
+        List of detected potential cells and artifacts.
     """
start_time = datetime.now()
if torch_device is None:
@@ -187,19 +189,15 @@ def main(
         plane_directory=plane_directory,
         batch_size=batch_size,
         torch_device=torch_device,
+        n_splitting_iter=n_splitting_iter,
     )

     # replicate the settings specific to splitting, before we access anything
     # of the original settings, causing cached properties
     kwargs = dataclasses.asdict(settings)
-    kwargs["ball_z_size_um"] = split_ball_z_size * settings.z_pixel_size
-    kwargs["ball_xy_size_um"] = (
-        split_ball_xy_size * settings.in_plane_pixel_size
-    )
+    kwargs["ball_z_size_um"] = split_ball_z_size
+    kwargs["ball_xy_size_um"] = split_ball_xy_size
     kwargs["ball_overlap_fraction"] = split_ball_overlap_fraction
-    kwargs["soma_diameter_um"] = (
-        split_soma_diameter * settings.in_plane_pixel_size
-    )
     # always run on cpu because copying to gpu overhead is likely slower than
     # any benefit for detection on smallish volumes
     kwargs["torch_device"] = "cpu"
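For orientation, a minimal, hypothetical call sketch for the detection entry point, assembled only from the parameters and defaults documented in this diff; `signal` is a placeholder 3D (z, y, x) array:

    # Hypothetical usage sketch; argument names come from the docstring above.
    from cellfinder.core.detect import detect

    cells = detect.main(
        signal_array=signal,             # placeholder 3D (z, y, x) array
        start_plane=0,
        end_plane=signal.shape[0],       # process the whole stack
        voxel_sizes=(5.0, 2.0, 2.0),     # microns in z, y, x
        soma_diameter=16.0,              # expected in-plane soma diameter (microns)
        max_cluster_size=100_000,        # cubic microns; larger clusters are artifacts
        ball_xy_size=6.0,                # 3d filter in-plane ball size (microns)
        ball_z_size=15.0,                # 3d filter axial ball size (microns)
        ball_overlap_fraction=0.6,
        soma_spread_factor=1.4,
        n_free_cpus=2,
        log_sigma_size=0.2,
        n_sds_above_mean_thresh=10.0,
        batch_size=1,                    # planes per batch; raise on CUDA if memory allows
        callback=lambda plane: print(f"finished plane {plane}"),
    )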
31 changes: 21 additions & 10 deletions cellfinder/core/detect/filters/setup_filters.py
@@ -85,18 +85,23 @@ class DetectionSettings:
     """

     soma_spread_factor: float = 1.4
-    """Spread factor for soma size - how much it may stretch in the images."""
+    """
+    Cell spread factor for determining the largest cell volume before
+    splitting up cell clusters. Structures with spherical volume of
+    diameter `soma_spread_factor * soma_diameter` or less will not be
+    split.
+    """

     soma_diameter_um: float = 16
     """
-    Diameter of a typical soma in um. Bright areas larger than this will be
-    split.
+    Diameter of a typical soma in-plane (xy), in microns.
     """

     max_cluster_size_um3: float = 100_000
     """
-    Maximum size of a cluster (bright area) that will be processed, in um.
-    Larger bright areas are skipped as artifacts.
+    Largest detected cell cluster (in cubic microns) for which splitting
+    will be attempted. Clusters above this size are labeled as artifacts.
     """

     ball_xy_size_um: float = 6
@@ -116,17 +121,21 @@ class DetectionSettings:

     ball_overlap_fraction: float = 0.6
     """
-    Fraction of overlap between a bright area and the spherical kernel,
-    for the area to be considered a single ball.
+    Fraction of the 3d ball filter that must be filled by foreground
+    voxels, centered on a voxel, for that voxel to be retained.
     """

     log_sigma_size: float = 0.2
-    """Size of the sigma for the 2d Gaussian filter."""
+    """
+    Gaussian filter width (as a fraction of soma diameter) used during
+    2d in-plane filtering.
+    """

     n_sds_above_mean_thresh: float = 10
     """
-    Number of standard deviations above the mean intensity to use for a
-    threshold to define bright areas. Below it, it's not considered bright.
+    Intensity threshold (the number of standard deviations above the
+    mean) of the filtered 2d planes used to mark pixels as foreground
+    or background.
     """

     outlier_keep: bool = False
@@ -191,6 +200,8 @@ class DetectionSettings:
     """
     During the structure splitting phase we iteratively shrink the bright areas
    and re-filter with the 3d filter. This is the number of iterations to do.
+    Each iteration reduces the cluster size by the voxels not retained in the
+    previous iteration.

     This is a maximum because we also stop if there are no more structures left
     during any iteration.
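The splitting code in detect.py above replicates these settings by serializing the dataclass and overriding the splitting-specific fields. A short sketch of that pattern, assuming the remaining DetectionSettings fields all have defaults:

    # Hypothetical sketch of the settings-replication pattern used in detect.py.
    import dataclasses

    from cellfinder.core.detect.filters.setup_filters import DetectionSettings

    settings = DetectionSettings()          # assumes all fields have defaults
    kwargs = dataclasses.asdict(settings)
    kwargs["ball_xy_size_um"] = 6           # splitting-time ball sizes (microns)
    kwargs["ball_z_size_um"] = 15
    kwargs["ball_overlap_fraction"] = 0.8
    kwargs["torch_device"] = "cpu"          # splitting runs on CPU (see detect.py)
    splitting_settings = DetectionSettings(**kwargs)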
2 changes: 2 additions & 0 deletions cellfinder/core/detect/filters/volume/structure_splitting.py
@@ -1,3 +1,4 @@
+from copy import copy
 from typing import List, Tuple, Type

 import numpy as np
@@ -224,6 +225,7 @@ def split_cells(
         where M is the number of individual cells and each centre is
         represented by its x, y, and z coordinates.
     """
+    settings = copy(settings)
     # these points are in x, y, z order columnwise, in absolute pixels
     orig_centre = get_structure_centre(cell_points)

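The copy added above means `split_cells` can modify its settings locally without the changes leaking back to the caller's object. A self-contained illustration of this shallow-copy pattern, using a hypothetical stand-in dataclass:

    # Illustration only: `Settings` and its field are hypothetical stand-ins.
    from copy import copy
    from dataclasses import dataclass

    @dataclass
    class Settings:
        ball_xy_size_um: float = 6.0

    original = Settings()
    local = copy(original)          # shallow copy: top-level fields are independent
    local.ball_xy_size_um = 3.0     # adjust only the local copy
    assert original.ball_xy_size_um == 6.0   # caller's settings are unchanged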