diff --git a/pymepix/main.py b/pymepix/main.py
index f24096b..8735ebb 100644
--- a/pymepix/main.py
+++ b/pymepix/main.py
@@ -367,7 +367,7 @@ def main():
         "--number_of_processes",
         dest="number_of_processes",
         type=int,
-        default=1,
+        default=4,
-        help="The number of processes used for the centroiding (default: 1 => parallel processing disabled')",
+        help="The number of processes used for the centroiding (default: 4; 1 => parallel processing disabled)",
     )
diff --git a/pymepix/processing/logic/centroid_calculator.py b/pymepix/processing/logic/centroid_calculator.py
index c8f1bdb..67e43b3 100644
--- a/pymepix/processing/logic/centroid_calculator.py
+++ b/pymepix/processing/logic/centroid_calculator.py
@@ -22,6 +22,8 @@
 from threading import current_thread
 from time import time
 
+import os
+
 import numpy as np
 import scipy.ndimage as nd
 from sklearn.cluster import DBSCAN
@@ -159,7 +161,7 @@ class CentroidCalculator(ProcessingStep):
     def __init__(
         self,
         cent_timewalk_lut=None,
-        number_of_processes=4,
+        number_of_processes=1,
         clustering_args={},
         dbscan_clustering = True,
         *args,
@@ -386,12 +388,10 @@ def centroid_chunks_to_centroids(self, chunks):
     def perform_centroiding_dbscan(self, chunks):
         # with Pool(self.number_of_processes) as p:
         #     return p.map(self.calculate_centroids_dbscan, chunks)
-
-        if self.number_of_processes>1:
-#            return Parallel(n_jobs=self.number_of_processes)(delayed(calculate_centroids_dbscan)(c,self.tot_threshold, self._tof_scale, self.epsilon, self.min_samples, self._cent_timewalk_lut) for c in chunks)
-            return Parallel(n_jobs=None)(delayed(calculate_centroids_dbscan)(c,self.tot_threshold, self._tof_scale, self.epsilon, self.min_samples, self._cent_timewalk_lut) for c in chunks)
-
-        return map(self.calculate_centroids_dbscan, chunks)
+
+        return Parallel(n_jobs=self.number_of_processes)(delayed(calculate_centroids_dbscan)(c,self.tot_threshold, self._tof_scale, self.epsilon, self.min_samples, self._cent_timewalk_lut) for c in chunks)
+
+#        return map(self.calculate_centroids_dbscan, chunks)
 
     def perform_centroiding_cluster_stream(self, chunks):
         self.cstream = ClusterStream(self.cs_sensor_size, self.cs_max_dist_tof,\
@@ -448,89 +448,8 @@ def calculate_centroids_cluster_stream(self, chunk):
 
         return None
 
-    def perform_clustering_dbscan(self, shot, x, y, tof):
-        """ The clustering with DBSCAN, which is performed in this function is dependent on the
-        order of the data in rare cases. Therefore, reordering in any means can
-        lead to slightly changed results, which should not be an issue.
-
-        Martin Ester, Hans-Peter Kriegel, Jiirg Sander, Xiaowei Xu: A Density Based Algorithm for
-        Discovering Clusters [p. 229-230] (https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf)
-        A more specific explaination can be found here:
-        https://stats.stackexchange.com/questions/306829/why-is-dbscan-deterministic"""
-        if x.size >= 0:
-            X = np.column_stack(
-                (shot * self.epsilon * 1_000, x, y, tof * self._tof_scale)
-            )
-
-            dist = DBSCAN(
-                eps=self.epsilon,
-                min_samples=self.min_samples,
-                metric="euclidean",
-                n_jobs=1,
-            ).fit(X)
-            return dist.labels_ + 1
-
-        return None
-
-
-    def calculate_centroids_properties(self, shot, x, y, tof, tot, labels):
-        """
-        Calculates the properties of the centroids from labeled data points.
-
-        ATTENTION! The order of the points can have an impact on the result due to errors in
-        the floating point arithmetics.
-
-        Very simple example:
-        arr = np.random.random(100)
-        arr.sum() - np.sort(arr).sum()
-        This example shows that there is a very small difference between the two sums. The inaccuracy of
-        floating point arithmetics can depend on the order of the values.
-        Strongly simplified (3.2 + 3.4) + 2.7
-        and 3.2 + (3.4 + 2.7) can be unequal for floating point numbers.
-
-        Therefore there is no guarantee for strictly equal results. Even after sorting. The error we observed
-        can be about 10^-22 nano seconds.
-        Currently this is issue exists only for the TOF-column as the other columns are integer-based values.
-        """
-        label_index, cluster_size = np.unique(labels, return_counts=True)
-        tot_max = np.array(
-            nd.maximum_position(tot, labels=labels, index=label_index)
-        ).flatten()
-
-        tot_sum = nd.sum(tot, labels=labels, index=label_index)
-        tot_mean = nd.mean(tot, labels=labels, index=label_index)
-        cluster_x = np.array(
-            nd.sum(x * tot, labels=labels, index=label_index) / tot_sum
-        ).flatten()
-        cluster_y = np.array(
-            nd.sum(y * tot, labels=labels, index=label_index) / tot_sum
-        ).flatten()
-        cluster_tof = np.array(
-            nd.sum(tof * tot, labels=labels, index=label_index) / tot_sum
-        ).flatten()
-        cluster_totMax = tot[tot_max]
-        cluster_totAvg = tot_mean
-        cluster_shot = shot[tot_max]
-
-        if self._cent_timewalk_lut is not None:
-            # cluster_tof -= self._timewalk_lut[(cluster_tot / 25).astype(np.int) - 1]
-            # cluster_tof *= 1e6
-            cluster_tof -= (
-                self._cent_timewalk_lut[np.int_(cluster_totMax // 25) - 1] * 1e3
-            )
-            # TODO: should totAvg not also be timewalk corrected?!
-            # cluster_tof *= 1e-6
-
-        return (
-            cluster_shot,
-            cluster_x,
-            cluster_y,
-            cluster_tof,
-            cluster_totAvg,
-            cluster_totMax,
-            cluster_size,
-        )
 
 class CentroidCalculatorPooled(CentroidCalculator):
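Note on the behavioral change in `perform_centroiding_dbscan`: the chunk loop now always goes through joblib instead of branching on `number_of_processes`, which works because `Parallel(n_jobs=1)` simply runs the generator sequentially. A minimal, self-contained sketch of the pattern; the worker below is a hypothetical stand-in for the module-level `calculate_centroids_dbscan`:

```python
from joblib import Parallel, delayed
import numpy as np

def process_chunk(chunk, threshold):
    # Hypothetical stand-in for calculate_centroids_dbscan: keep values
    # at or above the threshold and return their mean (None if empty).
    kept = chunk[chunk >= threshold]
    return kept.mean() if kept.size else None

if __name__ == "__main__":  # guard needed for process-based backends
    chunks = [np.random.rand(1_000) for _ in range(8)]
    # n_jobs > 1 fans the chunks out to worker processes; n_jobs=1 runs
    # the same generator in-process, so the removed
    # `if self.number_of_processes > 1` branch was redundant.
    results = Parallel(n_jobs=4)(delayed(process_chunk)(c, 0.5) for c in chunks)
    print(results)
```

Calling `delayed` on a module-level function rather than a bound method also means only the arrays and scalars are shipped to the workers, not the whole `CentroidCalculator` instance, which is presumably why the clustering helpers were moved out of the class.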
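For context on the DBSCAN step that the removed `perform_clustering_dbscan` documented (and that the module-level worker presumably still performs): the shot index is scaled by `epsilon * 1_000` and stacked with `x`, `y` and the scaled ToF, so hits from different shots sit far apart along the first axis and can never be merged into one cluster. A self-contained sketch; `epsilon`, `min_samples` and `tof_scale` below are illustrative values, not the production defaults:

```python
import numpy as np
from sklearn.cluster import DBSCAN

epsilon, min_samples, tof_scale = 2.0, 3, 1e7  # illustrative only

# Two shots with identical spatial hit patterns.
shot = np.array([0, 0, 0, 1, 1, 1])
x = np.array([10.0, 11.0, 10.5, 10.0, 11.0, 10.5])
y = np.array([20.0, 20.5, 21.0, 20.0, 20.5, 21.0])
tof = np.full(6, 1e-6)

# Scaling the shot index separates shots by 1000 * epsilon in the first
# coordinate, far beyond DBSCAN's eps neighborhood.
X = np.column_stack((shot * epsilon * 1_000, x, y, tof * tof_scale))

# +1 shifts sklearn's noise label (-1) to 0, matching the removed code.
labels = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X).labels_ + 1
print(labels)  # [1 1 1 2 2 2]: one cluster per shot, never across shots
```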
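The floating-point caveat in the removed `calculate_centroids_properties` docstring still applies to whatever replaces it: float addition is not associative, so summing the same ToT-weighted values in a different order (different chunking, different parallel schedule) can shift results in the last bits. A quick demonstration, using the classic 0.1/0.2/0.3 case since it shows the effect reliably:

```python
import numpy as np

# Float addition is not associative: regrouping changes the last bits.
print((0.1 + 0.2) + 0.3)  # 0.6000000000000001
print(0.1 + (0.2 + 0.3))  # 0.6
print((0.1 + 0.2) + 0.3 == 0.1 + (0.2 + 0.3))  # False

# The docstring's array example: summing in sorted order may give a
# slightly different total, typically on the order of 1e-16.
arr = np.random.random(100)
print(arr.sum() - np.sort(arr).sum())
```

If bit-exact totals ever matter for regression tests, `math.fsum` returns a correctly rounded sum that does not depend on input order.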