Merge branch 'production'

ear-team · Dec 13, 2022 · 84a5973 · 84a5973
2 parents 1a30fd7 + 65ca74f
commit 84a5973
Show file tree

Hide file tree

Showing 9 changed files with 69 additions and 46 deletions.
diff --git a/bambird/cluster.py b/bambird/cluster.py
@@ -29,10 +29,13 @@
 from sklearn.neighbors import NearestNeighbors
 from sklearn.cluster import DBSCAN
 from sklearn.metrics import confusion_matrix
-# from sklearn.manifold import TSNE
+from sklearn.manifold import TSNE
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
 
+# umap
+import umap 
+
 # Kneed package to find the knee of a curve
 from kneed import KneeLocator
 
@@ -398,7 +401,6 @@ def find_cluster(
                 ).fit(X)
 
                 if verbose:
-                    print("filename {}".format(df_single_categories.filename))
                     print("HDBSCAN eps {} min_points {} Number of soundtypes found for {} : {}".format(eps, min_points,
                             categories, np.unique(cluster.labels_).size))
 
@@ -427,7 +429,7 @@ def find_cluster(
                 # find the cluster ID of the biggest cluster that is not noise
                 try :
                     biggest_cluster_ID = df_cluster.loc[(df_cluster["categories"] == categories) & (
-                                                     df_cluster["cluster_number"] >= 0)]["cluster_number"].value_counts().argmax()
+                                                     df_cluster["cluster_number"] >= 0)]["cluster_number"].value_counts().idxmax()
                     # set by default to 1 the auto_label of the biggest cluster
                     df_cluster.loc[(df_cluster["categories"] == categories) & (
                                     df_cluster["cluster_number"] == biggest_cluster_ID), "auto_label"] = int(1)
@@ -444,25 +446,48 @@ def find_cluster(
                                 df_cluster["cluster_number"] >= 0), "auto_label"] = int(1)
 
 
-            if display:
+            if display:                
                 # display the result in 2D (2D reduction of the dimension)
-                pca = PCA(n_components=2)
-                principalComponents = pca.fit_transform(X)
-                df_PCA = pd.DataFrame(
-                    data=principalComponents,
-                    columns=["principal component 1", "principal component 2"],
+                # compute the dimensionality reduction.
+
+                ##### pca
+                # pca = PCA(n_components=2)
+                # principalComponents = pca.fit_transform(X)
+                # Y = pd.DataFrame(
+
+                ##### tsne
+                # tsne = TSNE(n_components=2, 
+                #             init='pca', 
+                #             n_jobs = -1,
+                #             random_state=cfg.RANDOM_SEED)
+                # Y = tsne.fit_transform(X)
+
+                ##### umap
+                umap_red = umap.UMAP(
+                        n_neighbors=min_points,
+                        n_components=2,
+                        random_state=cfg.RANDOM_SEED)
+                Y = umap_red.fit_transform(X)
+
+
+                df_reducdim = pd.DataFrame(
+                    data=Y,
+                    columns=["dim1", "dim2"],
                 )
-                ax[count].set_xlabel("PC 1", fontsize=10)
-                ax[count].set_ylabel("PC 2", fontsize=10)
+
+                ax[count].set_xlabel("dim 1", fontsize=10)
+                ax[count].set_ylabel("dim 2", fontsize=10)
                 ax[count].set_title(categories, fontsize=12)
 
                 ax[count].scatter(
-                    df_PCA["principal component 1"],
-                    df_PCA["principal component 2"],
+                    df_reducdim["dim1"],
+                    df_reducdim["dim2"],
                     c=cluster.labels_,
                     s=50,
                     alpha=0.8,
-                )
+                )        
+
+
 
         # increment
         count += 1

diff --git a/bambird/config.py b/bambird/config.py
@@ -91,6 +91,7 @@
 }
 
 PARAMS = {
+    'RANDOM_SEED' : RANDOM_SEED,
     'PARAMS_XC' : PARAMS_XC,
     'PARAMS_EXTRACT' : PARAMS_EXTRACT,
     'PARAMS_FEATURES' : PARAMS_FEATURES,
@@ -157,6 +158,7 @@ def load_config(fullfilename = None):
     """    
 
     global PARAMS  
+    global RANDOM_SEED
     global PARAMS_XC
     global PARAMS_EXTRACT
     global PARAMS_FEATURES
@@ -165,6 +167,7 @@ def load_config(fullfilename = None):
     if os.path.isfile(str(fullfilename)): 
         with open(fullfilename) as f:
             PARAMS = yaml.load(f, Loader=_get_loader())
+            RANDOM_SEED = PARAMS['RANDOM_SEED']
             PARAMS_XC = PARAMS['PARAMS_XC']
             PARAMS_EXTRACT = PARAMS['PARAMS_EXTRACT']
             PARAMS_FEATURES = PARAMS['PARAMS_FEATURES']
@@ -176,6 +179,7 @@ def load_config(fullfilename = None):
 
 def get_config() :
     PARAMS = {
+        'RANDOM_SEED' : RANDOM_SEED,
         'PARAMS_XC' : PARAMS_XC,
         'PARAMS_EXTRACT' : PARAMS_EXTRACT,
         'PARAMS_FEATURES' : PARAMS_FEATURES,

diff --git a/bambird/segmentation_extract_rois_full_sig.py b/bambird/segmentation_extract_rois_full_sig.py
@@ -136,15 +136,16 @@ def _centroid_features(Sxx, rois=None, im_rois=None):
         centroid['area_xy'] = Sxx.shape[0] * Sxx.shape[1]
         centroid['duration_x'] = Sxx.shape[1]
         centroid['bandwidth_y'] = Sxx.shape[0]
-        centroid['snr'] = np.percentile(Sxx, 0.99)
+        # centroid['snr'] = np.percentile(Sxx, 0.99)
+        centroid['snr'] = mean_dB(add_dB(Sxx,axis=0))
     else: 
         if im_rois is not None : 
             # real centroid and area
             rprops = measure.regionprops(im_rois, intensity_image=Sxx)
             centroid = [roi.weighted_centroid for roi in rprops]
             area = [roi.area for roi in rprops]
-            snr = [power2dB(np.percentile(roi.image_intensity,99)) for roi in rprops]
-            # snr = [mean_dB(add_dB(roi.image_intensity,axis=0)) for roi in rprops]
+            # snr = [power2dB(np.percentile(roi.image_intensity,99)) for roi in rprops]
+            snr = [power2dB(np.mean(np.sum(roi.image_intensity,axis=0))) for roi in rprops]
         else:
             # rectangular area (overestimation) 
             area = (rois.max_y -rois.min_y) * (rois.max_x -rois.min_x)  
@@ -154,8 +155,8 @@ def _centroid_features(Sxx, rois=None, im_rois=None):
                 im_blobs = maad.rois.rois_to_imblobs(np.zeros(Sxx.shape), row)     
                 rprops = measure.regionprops(im_blobs, intensity_image=Sxx)
                 centroid.append(rprops.pop().weighted_centroid) 
-                snr.append(power2dB(np.percentile(rprops.pop().image_intensity,99)))
-                # snr.append(mean_dB(add_dB(rprops.pop().image_intensity,axis=0))) 
+                # snr.append(power2dB(np.percentile(rprops.pop().image_intensity,99)))
+                snr.append(power2dB(np.mean(np.sum(rprops.pop().image_intensity,axis=0)))) 
 
         centroid = pd.DataFrame(centroid, columns=['centroid_y', 'centroid_x'], index=rois.index)
 
@@ -524,14 +525,6 @@ def extract_rois_full_sig(
         # Keep only events with duration longer than MIN_DURATION
         df_rois = df_rois[((df_rois["max_t"]-df_rois["min_t"])>params["MIN_DURATION"])]
 
-        # 8. remove rois with ratio >max_ratio_xy (they are mostly artefact 
-        # such as wind, ain or clipping)
-        # add ratio x/y
-        df_rois['ratio_yx'] = (df_rois.max_y -df_rois.min_y) / (df_rois.max_x -df_rois.min_x) 
-
-        if params["MAX_RATIO_YX"] is not None :
-            df_rois = df_rois[df_rois['ratio_yx'] < params["MAX_RATIO_YX"]] 
-
         if verbose:
             print("=> AFTER MERGING FOUND {} ROIS".format(len(df_rois)))
 

diff --git a/bambird/version.py b/bambird/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """Version info"""
-__version__ = '0.2.1'
+__version__ = '0.3.0'
diff --git a/config_default.yaml b/config_default.yaml
@@ -31,9 +31,9 @@ PARAMS_EXTRACT:
   LOW_FREQ: 250           # Low frequency in Hz of the bandpass filter applied to the audio
   HIGH_FREQ: 12000        # High frequency in Hz of the bandpass filter applied to the audio
   BUTTER_ORDER: 1         # butterworth filter order to select the bandwidth corresponding to the ROI
-  AUDIO_DURATION: 30      # Max duration of the audio file (in seconds)
+  AUDIO_DURATION: 60      # Max duration of the audio file (in seconds)
   CHUNK_DURATION: 10       # Split the audio signal in chunks of duration = CHUNK_DURATION (in seconds)
-  OVLP: 0               # Define the overlap ratio between each chunk (0=> no overlap, 0.75=> 75% of overlap between 2 consecutive chunks)
+  OVLP: 0.5               # Define the overlap ratio between each chunk (0=> no overlap, 0.75=> 75% of overlap between 2 consecutive chunks)
   # Spectrogram
   MODE_RMBCKG: median     # Mode to compute the remove_background {'mean', 'median'}
   NFFT: 1024              # Number of points used for the SFFT to compute the spectrogram
@@ -42,7 +42,7 @@ PARAMS_EXTRACT:
   MASK_PARAM1: 26
   MASK_PARAM2: 10
   # Select and merge bbox parameters
-  MAX_RATIO_YX: 7        # ratio Y/X between the high (Y in px) and the width (X in px) of the ROI
+  MAX_RATIO_YX: 7         # ratio Y/X between the height (Y in px) and the width (X in px) of the ROI
   MIN_DURATION: 0.1       # minimum event duration in s
   MARGIN_T_LEFT: 0.2      # overlapping time margin in s on the left side of the ROI to merge
   MARGIN_T_RIGHT: 0.2     # overlapping time margin in s on the right side of the ROI to merge
@@ -61,8 +61,8 @@ PARAMS_FEATURES:
   SAMPLE_RATE: 48000      # Sampling frequency in Hz
   # Audio preprocess
   LOW_FREQ: 250           # Low frequency in Hz of the bandpass filter applied to the audio
-  HIGH_FREQ: 11000        # High frequency in Hz of the bandpass filter applied to the audio
-  BUTTER_ORDER: 1         # butterworth filter order to select the bandwidth corresponding to the ROI
+  HIGH_FREQ: 12000        # High frequency in Hz of the bandpass filter applied to the audio
+  BUTTER_ORDER: 5         # butterworth filter order to select the bandwidth corresponding to the ROI
   # Spectrogram
   NFFT: 1024               # Number of points of the spectrogram
   SHAPE_RES: 'high'        # Resolution of the shapes {low, med, high}
@@ -71,7 +71,7 @@ PARAMS_FEATURES:
 # CLUSTERING PARAMETERS
 ################################
 PARAMS_CLUSTER:
-  FEATURES: ['shp', 'centroid_f']   # choose the features used to cluster {'shp', 'centroid_f', 'peak_f', 'duration_t', 'bandwidth_f', 'bandwidth_min_f', 'bandwidth_max_f', 'min_f', 'max_f' }
+  FEATURES: ['shp', 'centroid_f', 'peak_f', 'duration_t', 'bandwidth_f', 'bandwidth_min_f', 'bandwidth_max_f']   # choose the features used to cluster {'shp', 'centroid_f', 'peak_f', 'duration_t', 'bandwidth_f', 'bandwidth_min_f', 'bandwidth_max_f', 'min_f', 'max_f' }
   SCALER: MINMAXSCALER              # scaler method to prepare the features before the clustering {STANDARDSCALER, ROBUSTSCALER, MINMAXSCALER}
   PERCENTAGE_PTS: 5                 # minimum number of ROIs to form a cluster (in % of the total number of ROIs) {number between 0 and 1 or blank}
   MIN_PTS:                          # minimum number of ROIs to form a cluster {integer or blank}

diff --git a/examples/workflow_multiple_species.py b/examples/workflow_multiple_species.py
@@ -32,15 +32,16 @@
 ROIS_NAME       = Path(str(DATASET_NAME) +'_ROIS')
 
 # List of species to build a clean dataset
-SCIENTIC_NAME_LIST = [
-                        "Regulus regulus",
+SCIENTIC_NAME_LIST = [ 
+                        "Columba palumbus",
+                        # "Regulus regulus",
                         "Phylloscopus collybita",
-                        "Anthus triviali", 
+                        # "Anthus triviali", 
                         "Fringilla coelebs", 
-                        "Troglodytes troglodytes", 
-                        "Phoenicurus phoenicurus", 
-                        "Strix aluco", 
-                        "Aegithalos caudatus",
+                        # "Troglodytes troglodytes", 
+                        # "Phoenicurus phoenicurus", 
+                        # "Strix aluco", 
+                        # "Aegithalos caudatus",
                       ]
 
 CONFIG_FILE = '../config_default.yaml' 
@@ -83,7 +84,6 @@
     # ROIS extraction of the full dataset
     df_rois, csv_rois = bambird.multicpu_extract_rois(
                         dataset     = df_xc,
-                        fun         = params['PARAMS_EXTRACT']['FUNC'],
                         params      = params['PARAMS_EXTRACT'],
                         save_path   = TEMP_DIR / ROIS_NAME,
                         overwrite   = True,

diff --git a/examples/workflow_single_file.py b/examples/workflow_single_file.py
@@ -31,9 +31,9 @@
 DATASET_NAME    = Path('WORKFLOW_SINGLE_FILE')
 ROIS_NAME       = Path(str(DATASET_NAME) +'_ROIS')
 
-# Xeno-Canto number (ie. 473724 for a audio of Columba palumbus https://xeno-canto.org/758652)
+# Xeno-Canto number (e.g. 758652 for an audio recording of Columba palumbus: https://xeno-canto.org/758652)
 # Without XC
-XC_NUMBER = 473724
+XC_NUMBER = 758652
 
 CONFIG_FILE = '../config_default.yaml' 
 

diff --git a/examples/workflow_single_species.py b/examples/workflow_single_species.py
@@ -98,7 +98,6 @@
     # ROIS extraction of the full dataset
     df_rois, csv_rois = bambird.multicpu_extract_rois(
                         dataset     = df_xc,
-                        fun         = params['PARAMS_EXTRACT']['FUNC'],
                         params      = params['PARAMS_EXTRACT'],
                         save_path   = TEMP_DIR / ROIS_NAME,
                         overwrite   = True,

diff --git a/setup.py b/setup.py
@@ -47,11 +47,13 @@ def run(self):
       cmdclass={'clean': CleanCommand},
       license_file = 'LICENSE',                     
       python_requires='>=3.5',
-      install_requires = ['scikit-maad>=1.3.12',
+      install_requires = ['scikit-image>=0.19.2',
+                          'scikit-maad>=1.3.12',
                           'librosa>=0.8.0',
                           'scikit-learn>=1.0',
                           'hdbscan',
                           'matplotlib', 
+                          'umap-learn',
                           'tqdm',
                           'kneed',
                           'pyyaml'],