diff --git a/cellfinder/core/classify/classify.py b/cellfinder/core/classify/classify.py index 37fc06cc..b9ca1417 100644 --- a/cellfinder/core/classify/classify.py +++ b/cellfinder/core/classify/classify.py @@ -29,6 +29,7 @@ def main( model_weights: Optional[os.PathLike], network_depth: depth_type, max_workers: int = 3, + pin_memory: bool = False, *, callback: Optional[Callable[[int], None]] = None, ) -> List[Cell]: @@ -74,6 +75,12 @@ def main( max_workers: int The number of sub-processes to use for data loading / processing. Defaults to 8. + pin_memory: bool + Pins data to be sent to the GPU to the CPU memory. This allows faster + GPU data speeds, but can only be used if the data used by the GPU can + stay in the CPU RAM while the GPU uses it. I.e. there's enough RAM. + Otherwise, if there's a risk of the RAM being paged, it shouldn't be + used. Defaults to False. callback : Callable[int], optional A callback function that is called during classification. Called with the batch number once that batch has been classified. diff --git a/cellfinder/core/detect/detect.py b/cellfinder/core/detect/detect.py index 076b3ae5..e6c0d398 100644 --- a/cellfinder/core/detect/detect.py +++ b/cellfinder/core/detect/detect.py @@ -49,6 +49,7 @@ def main( plane_directory: Optional[str] = None, batch_size: Optional[int] = None, torch_device: Optional[str] = None, + pin_memory: bool = False, split_ball_xy_size: float = 6, split_ball_z_size: float = 15, split_ball_overlap_fraction: float = 0.8, @@ -116,6 +117,12 @@ def main( The device on which to run the computation. If not specified (None), "cuda" will be used if a GPU is available, otherwise "cpu". You can also manually specify "cuda" or "cpu". + pin_memory: bool + Pins data to be sent to the GPU to the CPU memory. This allows faster + GPU data speeds, but can only be used if the data used by the GPU can + stay in the CPU RAM while the GPU uses it. I.e. there's enough RAM. 
+ Otherwise, if there's a risk of the RAM being paged, it shouldn't be + used. Defaults to False. split_ball_xy_size: float Similar to `ball_xy_size`, except the value to use for the 3d filter during cluster splitting. @@ -192,6 +199,7 @@ def main( plane_directory=plane_directory, batch_size=batch_size, torch_device=torch_device, + pin_memory=pin_memory, n_splitting_iter=n_splitting_iter, ) diff --git a/cellfinder/core/detect/filters/setup_filters.py b/cellfinder/core/detect/filters/setup_filters.py index 1cf13ee8..ef2d0bf1 100644 --- a/cellfinder/core/detect/filters/setup_filters.py +++ b/cellfinder/core/detect/filters/setup_filters.py @@ -189,6 +189,14 @@ class DetectionSettings: to run on the first GPU. """ + pin_memory: bool = False + """ + Pins data to be sent to the GPU to the CPU memory. This allows faster GPU + data speeds, but can only be used if the data used by the GPU can stay in + the CPU RAM while the GPU uses it. I.e. there's enough RAM. Otherwise, if + there's a risk of the RAM being paged, it shouldn't be used. 
+ """ + n_free_cpus: int = 2 """ Number of free CPU cores to keep available and not use during parallel diff --git a/cellfinder/core/detect/filters/volume/volume_filter.py b/cellfinder/core/detect/filters/volume/volume_filter.py index 1cf432bf..0ba8e816 100644 --- a/cellfinder/core/detect/filters/volume/volume_filter.py +++ b/cellfinder/core/detect/filters/volume/volume_filter.py @@ -140,7 +140,7 @@ def _get_filter_buffers( tensor = torch.empty( (batch_size, *self.settings.plane_shape), dtype=torch_dtype, - pin_memory=not cpu, + pin_memory=not cpu and self.settings.pin_memory, device="cpu", ) diff --git a/cellfinder/core/main.py b/cellfinder/core/main.py index 7ff3418b..e1167b3d 100644 --- a/cellfinder/core/main.py +++ b/cellfinder/core/main.py @@ -37,6 +37,7 @@ def main( detected_cells: List[Cell] = None, detection_batch_size: Optional[int] = None, torch_device: Optional[str] = None, + pin_memory: bool = False, split_ball_xy_size: float = 6, split_ball_z_size: float = 15, split_ball_overlap_fraction: float = 0.8, @@ -135,6 +136,12 @@ def main( The device on which to run the computation. If not specified (None), "cuda" will be used if a GPU is available, otherwise "cpu". You can also manually specify "cuda" or "cpu". + pin_memory: bool + Pins data to be sent to the GPU to the CPU memory. This allows faster + GPU data speeds, but can only be used if the data used by the GPU can + stay in the CPU RAM while the GPU uses it. I.e. there's enough RAM. + Otherwise, if there's a risk of the RAM being paged, it shouldn't be + used. Defaults to False. split_ball_xy_size: float Similar to `ball_xy_size`, except the value to use for the 3d filter during cluster splitting. 
@@ -180,6 +187,7 @@ def main( n_sds_above_mean_thresh, batch_size=detection_batch_size, torch_device=torch_device, + pin_memory=pin_memory, callback=detect_callback, split_ball_z_size=split_ball_z_size, split_ball_xy_size=split_ball_xy_size, diff --git a/cellfinder/napari/detect/detect.py b/cellfinder/napari/detect/detect.py index 2503d412..87cbf5c0 100644 --- a/cellfinder/napari/detect/detect.py +++ b/cellfinder/napari/detect/detect.py @@ -263,6 +263,7 @@ def widget( n_free_cpus: int, analyse_local: bool, use_gpu: bool, + pin_memory: bool, debug: bool, reset_button, ) -> None: @@ -336,6 +337,12 @@ def widget( Only analyse planes around the current position use_gpu : bool If True, use GPU for processing (if available); otherwise, use CPU. + pin_memory : bool + Pins data to be sent to the GPU to the CPU memory. This allows + faster GPU data speeds, but can only be used if the data used by + the GPU can stay in the CPU RAM while the GPU uses it. I.e. there's + enough RAM. Otherwise, if there's a risk of the RAM being paged, it + shouldn't be used. Defaults to False. 
debug : bool Increase logging reset_button : @@ -411,7 +418,13 @@ def widget( end_plane = len(signal_image.data) misc_inputs = MiscInputs( - start_plane, end_plane, n_free_cpus, analyse_local, use_gpu, debug + start_plane, + end_plane, + n_free_cpus, + analyse_local, + use_gpu, + pin_memory, + debug, ) worker = Worker( diff --git a/cellfinder/napari/detect/detect_containers.py b/cellfinder/napari/detect/detect_containers.py index 5a130853..2281311e 100644 --- a/cellfinder/napari/detect/detect_containers.py +++ b/cellfinder/napari/detect/detect_containers.py @@ -153,6 +153,7 @@ class MiscInputs(InputContainer): n_free_cpus: int = 2 analyse_local: bool = False use_gpu: bool = field(default_factory=lambda: torch.cuda.is_available()) + pin_memory: bool = False debug: bool = False def as_core_arguments(self) -> dict: @@ -179,5 +180,10 @@ def widget_representation(cls) -> dict: value=cls.defaults()["use_gpu"], enabled=torch.cuda.is_available(), ), + pin_memory=dict( + widget_type="CheckBox", + label="Pin data to memory", + value=cls.defaults()["pin_memory"], + ), debug=dict(value=cls.defaults()["debug"]), )