This repository was archived by the owner on Mar 25, 2024. It is now read-only.

Commit 422bc76
fixed issue with num_processes at centroid_calculator
Andrey Samartsev committed Jan 18, 2024
1 parent: 2491bbe
Showing 2 changed files with 8 additions and 89 deletions.
2 changes: 1 addition & 1 deletion pymepix/main.py
```diff
@@ -367,7 +367,7 @@ def main():
         "--number_of_processes",
         dest="number_of_processes",
         type=int,
-        default=1,
+        default=4,
         help="The number of processes used for the centroiding (default: 1 => parallel processing disabled')",
     )
```

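Note that the unchanged help string above still advertises "default: 1" (and carries a stray quote) even though the default is now 4. Purely as a hypothetical cleanup, not part of this commit, the argument could use argparse's %(default)s placeholder so the help text can never drift out of sync:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--number_of_processes",
    dest="number_of_processes",
    type=int,
    default=4,
    # argparse expands %(default)s when rendering help, so the text
    # always matches whatever default is configured above.
    help="The number of processes used for the centroiding "
    "(default: %(default)s; 1 disables parallel processing)",
)

args = parser.parse_args([])
print(args.number_of_processes)  # -> 4
```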
95 changes: 7 additions & 88 deletions pymepix/processing/logic/centroid_calculator.py
```diff
@@ -22,6 +22,8 @@
 from threading import current_thread
 from time import time
 
+import os
+
 import numpy as np
 import scipy.ndimage as nd
 from sklearn.cluster import DBSCAN
```
```diff
@@ -159,7 +161,7 @@ class CentroidCalculator(ProcessingStep):
     def __init__(
         self,
         cent_timewalk_lut=None,
-        number_of_processes=4,
+        number_of_processes=1,
         clustering_args={},
         dbscan_clustering=True,
         *args,
```
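Taken together with the main.py change above, the two defaults move in opposite directions: the class default drops from 4 to 1 (sequential), while the CLI default rises from 1 to 4, making parallelism opt-in for library users. A small hypothetical sketch, assuming the remaining constructor arguments have workable defaults:

```python
from pymepix.processing.logic.centroid_calculator import CentroidCalculator

# After this commit: sequential centroiding unless explicitly requested.
calc_sequential = CentroidCalculator()                     # 1 process
calc_parallel = CentroidCalculator(number_of_processes=4)  # 4 joblib workers
```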
```diff
@@ -386,12 +388,10 @@ def centroid_chunks_to_centroids(self, chunks):
     def perform_centroiding_dbscan(self, chunks):
         # with Pool(self.number_of_processes) as p:
         #     return p.map(self.calculate_centroids_dbscan, chunks)
-
-        if self.number_of_processes > 1:
-            # return Parallel(n_jobs=self.number_of_processes)(delayed(calculate_centroids_dbscan)(c, self.tot_threshold, self._tof_scale, self.epsilon, self.min_samples, self._cent_timewalk_lut) for c in chunks)
-            return Parallel(n_jobs=None)(delayed(calculate_centroids_dbscan)(c, self.tot_threshold, self._tof_scale, self.epsilon, self.min_samples, self._cent_timewalk_lut) for c in chunks)
-
-        return map(self.calculate_centroids_dbscan, chunks)
+        return Parallel(n_jobs=self.number_of_processes)(delayed(calculate_centroids_dbscan)(c, self.tot_threshold, self._tof_scale, self.epsilon, self.min_samples, self._cent_timewalk_lut) for c in chunks)
+
+        # return map(self.calculate_centroids_dbscan, chunks)
+
 
     def perform_centroiding_cluster_stream(self, chunks):
         self.cstream = ClusterStream(self.cs_sensor_size, self.cs_max_dist_tof,\
```
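The rewritten method now always dispatches through joblib, passing the configured worker count instead of the previous hard-coded n_jobs=None, which joblib treats as a single job outside a parallel-backend context and which is presumably why number_of_processes had no effect before this fix. A minimal, self-contained sketch of the same Parallel/delayed pattern, with process_chunk standing in for the module-level calculate_centroids_dbscan:

```python
from joblib import Parallel, delayed

def process_chunk(chunk):
    # Stand-in for calculate_centroids_dbscan: any picklable function works.
    return sum(chunk)

chunks = [[1, 2], [3, 4], [5, 6]]

# n_jobs=1 runs everything in the calling process; n_jobs=4 fans the
# chunks out to four workers, mirroring number_of_processes above.
results = Parallel(n_jobs=4)(delayed(process_chunk)(c) for c in chunks)
print(results)  # [3, 7, 11]
```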
```diff
@@ -448,89 +448,8 @@ def calculate_centroids_cluster_stream(self, chunk):
 
         return None
 
-    def perform_clustering_dbscan(self, shot, x, y, tof):
-        """The clustering performed with DBSCAN in this function depends in rare cases
-        on the order of the data. Reordering of any kind can therefore lead to slightly
-        different results, which should not be an issue.
-        Martin Ester, Hans-Peter Kriegel, Jörg Sander, Xiaowei Xu: A Density-Based
-        Algorithm for Discovering Clusters [p. 229-230]
-        (https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf)
-        A more specific explanation can be found here:
-        https://stats.stackexchange.com/questions/306829/why-is-dbscan-deterministic"""
-        if x.size >= 0:
-            X = np.column_stack(
-                (shot * self.epsilon * 1_000, x, y, tof * self._tof_scale)
-            )
-
-            dist = DBSCAN(
-                eps=self.epsilon,
-                min_samples=self.min_samples,
-                metric="euclidean",
-                n_jobs=1,
-            ).fit(X)
-
-            return dist.labels_ + 1
-
-        return None
-
```
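The method deleted above stacked the shot number, scaled by epsilon * 1_000, next to x, y, and the scaled ToF before clustering, so hits from different shots sit thousands of eps-units apart and can never fall into the same DBSCAN cluster. A toy demonstration of that trick (made-up values, not pymepix data):

```python
import numpy as np
from sklearn.cluster import DBSCAN

eps = 2.0
shot = np.array([0, 0, 0, 1, 1])  # trigger/shot index of each hit
x = np.array([10.0, 10.5, 11.0, 10.0, 10.5])
y = np.array([20.0, 20.2, 20.1, 20.0, 20.2])
tof = np.array([1.00, 1.10, 1.05, 1.00, 1.10])

# Scaling the shot index by eps * 1_000 separates shots by 2000 units,
# so DBSCAN can never merge hits from different shots into one cluster.
X = np.column_stack((shot * eps * 1_000, x, y, tof))
labels = DBSCAN(eps=eps, min_samples=2).fit(X).labels_ + 1  # 0 marks noise
print(labels)  # [1 1 1 2 2]: one cluster per shot
```

The deletion continues with calculate_centroids_properties: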
```diff
-    def calculate_centroids_properties(self, shot, x, y, tof, tot, labels):
-        """
-        Calculates the properties of the centroids from labeled data points.
-        ATTENTION! The order of the points can have an impact on the result due to
-        errors in floating-point arithmetic.
-        Very simple example:
-        arr = np.random.random(100)
-        arr.sum() - np.sort(arr).sum()
-        This example shows that there is a very small difference between the two sums.
-        The inaccuracy of floating-point arithmetic can depend on the order of the
-        values. Strongly simplified: (3.2 + 3.4) + 2.7 and 3.2 + (3.4 + 2.7) can be
-        unequal for floating-point numbers. Therefore there is no guarantee of
-        strictly equal results, even after sorting. The error we observed can be
-        about 10^-22 nanoseconds.
-
-        Currently this issue exists only for the TOF column, as the other columns
-        hold integer-based values.
-        """
-        label_index, cluster_size = np.unique(labels, return_counts=True)
-        tot_max = np.array(
-            nd.maximum_position(tot, labels=labels, index=label_index)
-        ).flatten()
-
-        tot_sum = nd.sum(tot, labels=labels, index=label_index)
-        tot_mean = nd.mean(tot, labels=labels, index=label_index)
-        cluster_x = np.array(
-            nd.sum(x * tot, labels=labels, index=label_index) / tot_sum
-        ).flatten()
-        cluster_y = np.array(
-            nd.sum(y * tot, labels=labels, index=label_index) / tot_sum
-        ).flatten()
-        cluster_tof = np.array(
-            nd.sum(tof * tot, labels=labels, index=label_index) / tot_sum
-        ).flatten()
-        cluster_totMax = tot[tot_max]
-        cluster_totAvg = tot_mean
-        cluster_shot = shot[tot_max]
-
-        if self._cent_timewalk_lut is not None:
-            # cluster_tof -= self._timewalk_lut[(cluster_tot / 25).astype(np.int) - 1]
-            # cluster_tof *= 1e6
-            cluster_tof -= (
-                self._cent_timewalk_lut[np.int_(cluster_totMax // 25) - 1] * 1e3
-            )
-            # TODO: should totAvg not also be timewalk corrected?!
-            # cluster_tof *= 1e-6
-
-        return (
-            cluster_shot,
-            cluster_x,
-            cluster_y,
-            cluster_tof,
-            cluster_totAvg,
-            cluster_totMax,
-            cluster_size,
-        )
 
 
 class CentroidCalculatorPooled(CentroidCalculator):
```
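The floating-point caveat in the deleted docstring is easy to reproduce. A tiny illustrative sketch (random values, not detector data):

```python
import numpy as np

rng = np.random.default_rng(0)
arr = rng.random(100)

# Floating-point addition is not associative: summing the same values in
# a different order can change the last bits of the result, which is why
# centroid TOFs are only reproducible up to a vanishingly small error.
diff = arr.sum() - np.sort(arr).sum()
print(diff)  # typically a tiny non-zero value, on the order of 1e-16
```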

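The deleted body computed per-cluster properties with scipy.ndimage labelled reductions, e.g. a ToT-weighted mean position per label. A reduced sketch of the same pattern on toy arrays with two clusters:

```python
import numpy as np
import scipy.ndimage as nd

x = np.array([10.0, 11.0, 30.0, 31.0])  # pixel x-positions
tot = np.array([5.0, 1.0, 2.0, 2.0])    # time-over-threshold weights
labels = np.array([1, 1, 2, 2])         # cluster label per hit

label_index, cluster_size = np.unique(labels, return_counts=True)

# ToT-weighted centroid per cluster: sum(x * tot) / sum(tot) per label.
tot_sum = nd.sum(tot, labels=labels, index=label_index)
cluster_x = nd.sum(x * tot, labels=labels, index=label_index) / tot_sum
print(cluster_x)  # [10.166..., 30.5]: brighter pixels pull the centroid
```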