From 1c29da9cc46049a9552c9f9503778b4850cd9520 Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Fri, 9 Dec 2022 01:02:56 +0100 Subject: [PATCH 01/13] Update segmentation_extract_rois_full_sig.py --- bambird/segmentation_extract_rois_full_sig.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/bambird/segmentation_extract_rois_full_sig.py b/bambird/segmentation_extract_rois_full_sig.py index e51c2af..ac94819 100644 --- a/bambird/segmentation_extract_rois_full_sig.py +++ b/bambird/segmentation_extract_rois_full_sig.py @@ -524,14 +524,6 @@ def extract_rois_full_sig( # Keep only events with duration longer than MIN_DURATION df_rois = df_rois[((df_rois["max_t"]-df_rois["min_t"])>params["MIN_DURATION"])] - # 8. remove rois with ratio >max_ratio_xy (they are mostly artefact - # such as wind, ain or clipping) - # add ratio x/y - df_rois['ratio_yx'] = (df_rois.max_y -df_rois.min_y) / (df_rois.max_x -df_rois.min_x) - - if params["MAX_RATIO_YX"] is not None : - df_rois = df_rois[df_rois['ratio_yx'] < params["MAX_RATIO_YX"]] - if verbose: print("=> AFTER MERGING FOUND {} ROIS".format(len(df_rois))) From aed5f6a875453421afb16a8c44bdba0597f9b349 Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Fri, 9 Dec 2022 16:25:37 +0100 Subject: [PATCH 02/13] Update __init__.py and segmentation_extract_rois_full_sig.py --- __init__.py | 0 bambird/segmentation_extract_rois_full_sig.py | 11 ++++++----- 2 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 __init__.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bambird/segmentation_extract_rois_full_sig.py b/bambird/segmentation_extract_rois_full_sig.py index ac94819..a103a78 100644 --- a/bambird/segmentation_extract_rois_full_sig.py +++ b/bambird/segmentation_extract_rois_full_sig.py @@ -136,15 +136,16 @@ def _centroid_features(Sxx, rois=None, im_rois=None): centroid['area_xy'] = Sxx.shape[0] * Sxx.shape[1] centroid['duration_x'] = Sxx.shape[1] 
centroid['bandwidth_y'] = Sxx.shape[0] - centroid['snr'] = np.percentile(Sxx, 0.99) + # centroid['snr'] = np.percentile(Sxx, 0.99) + centroid['snr'] = mean_dB(add_dB(Sxx,axis=0)) else: if im_rois is not None : # real centroid and area rprops = measure.regionprops(im_rois, intensity_image=Sxx) centroid = [roi.weighted_centroid for roi in rprops] area = [roi.area for roi in rprops] - snr = [power2dB(np.percentile(roi.image_intensity,99)) for roi in rprops] - # snr = [mean_dB(add_dB(roi.image_intensity,axis=0)) for roi in rprops] + # snr = [power2dB(np.percentile(roi.image_intensity,99)) for roi in rprops] + snr = [power2dB(np.mean(np.sum(roi.image_intensity,axis=0))) for roi in rprops] else: # rectangular area (overestimation) area = (rois.max_y -rois.min_y) * (rois.max_x -rois.min_x) @@ -154,8 +155,8 @@ def _centroid_features(Sxx, rois=None, im_rois=None): im_blobs = maad.rois.rois_to_imblobs(np.zeros(Sxx.shape), row) rprops = measure.regionprops(im_blobs, intensity_image=Sxx) centroid.append(rprops.pop().weighted_centroid) - snr.append(power2dB(np.percentile(rprops.pop().image_intensity,99))) - # snr.append(mean_dB(add_dB(rprops.pop().image_intensity,axis=0))) + # snr.append(power2dB(np.percentile(rprops.pop().image_intensity,99))) + snr.append(power2dB(np.mean(np.sum(rprops.pop().image_intensity,axis=0)))) centroid = pd.DataFrame(centroid, columns=['centroid_y', 'centroid_x'], index=rois.index) From c908512063650409615a15dc9bdef3ceb925fd52 Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Tue, 13 Dec 2022 16:02:21 +0100 Subject: [PATCH 03/13] Update cluster.py and setup.py --- bambird/cluster.py | 51 +++++++++++++++++++++++++++++++++------------- setup.py | 1 + 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/bambird/cluster.py b/bambird/cluster.py index c6cea99..fd37fb8 100644 --- a/bambird/cluster.py +++ b/bambird/cluster.py @@ -29,10 +29,13 @@ from sklearn.neighbors import NearestNeighbors from sklearn.cluster import DBSCAN from 
sklearn.metrics import confusion_matrix -# from sklearn.manifold import TSNE +from sklearn.manifold import TSNE from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler +# umap +import umap + # Kneed package to find the knee of a curve from kneed import KneeLocator @@ -398,7 +401,6 @@ def find_cluster( ).fit(X) if verbose: - print("filename {}".format(df_single_categories.filename)) print("HDBSCAN eps {} min_points {} Number of soundtypes found for {} : {}".format(eps, min_points, categories, np.unique(cluster.labels_).size)) @@ -427,7 +429,7 @@ def find_cluster( # find the cluster ID of the biggest cluster that is not noise try : biggest_cluster_ID = df_cluster.loc[(df_cluster["categories"] == categories) & ( - df_cluster["cluster_number"] >= 0)]["cluster_number"].value_counts().argmax() + df_cluster["cluster_number"] >= 0)]["cluster_number"].value_counts().idxmax() # set by default to 1 the auto_label of the biggest cluster df_cluster.loc[(df_cluster["categories"] == categories) & ( df_cluster["cluster_number"] == biggest_cluster_ID), "auto_label"] = int(1) @@ -444,25 +446,46 @@ def find_cluster( df_cluster["cluster_number"] >= 0), "auto_label"] = int(1) - if display: + if display: # display the result in 2D (2D reduction of the dimension) - pca = PCA(n_components=2) - principalComponents = pca.fit_transform(X) - df_PCA = pd.DataFrame( - data=principalComponents, - columns=["principal component 1", "principal component 2"], + # compute the dimensionality reduction. 
+ + ##### pca + # pca = PCA(n_components=2) + # principalComponents = pca.fit_transform(X) + # Y = pd.DataFrame( + + ##### tsne + # tsne = TSNE(n_components=2, + # init='pca', + # n_jobs = -1) + # Y = tsne.fit_transform(X) + + ##### umap + umap_red = umap.UMAP(n_neighbors=min_points, + n_components=2) + + Y = umap_red.fit_transform(X) + + + df_reducdim = pd.DataFrame( + data=Y, + columns=["dim1", "dim2"], ) - ax[count].set_xlabel("PC 1", fontsize=10) - ax[count].set_ylabel("PC 2", fontsize=10) + + ax[count].set_xlabel("dim 1", fontsize=10) + ax[count].set_ylabel("dim 2", fontsize=10) ax[count].set_title(categories, fontsize=12) ax[count].scatter( - df_PCA["principal component 1"], - df_PCA["principal component 2"], + df_reducdim["dim1"], + df_reducdim["dim2"], c=cluster.labels_, s=50, alpha=0.8, - ) + ) + + # increment count += 1 diff --git a/setup.py b/setup.py index 7e9c815..a3b47b5 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ def run(self): 'scikit-learn>=1.0', 'hdbscan', 'matplotlib', + 'umap-learn', 'tqdm', 'kneed', 'pyyaml'], From 0879caebd844df636ca6e8bac6e1e637daaa4521 Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Tue, 13 Dec 2022 16:05:08 +0100 Subject: [PATCH 04/13] Update version.py --- bambird/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bambird/version.py b/bambird/version.py index cc2ff8b..ab36371 100644 --- a/bambird/version.py +++ b/bambird/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- """Version info""" -__version__ = '0.2.1' +__version__ = '0.3.0' From 312bf6d02db56b801bfc6108f52db04a04e6d8d6 Mon Sep 17 00:00:00 2001 From: Sylvain HAUPERT <43136040+shaupert@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:06:15 +0100 Subject: [PATCH 05/13] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2229878..4e98133 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ bambird dependencies: - kneed - hdbscan - tqdm +- 
umap-learn **bambird** is hosted on PyPI. To install, run the following command in your Python environment: From 4591b4e8aab2196f55f6088a9078cba8ec71b0c9 Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Tue, 13 Dec 2022 16:20:53 +0100 Subject: [PATCH 06/13] Update __init__.py --- __init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 __init__.py diff --git a/__init__.py b/__init__.py deleted file mode 100644 index e69de29..0000000 From 8d8a6387bdc8c73efbdfe467c4e9f2f85f15dc18 Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Tue, 13 Dec 2022 17:32:57 +0100 Subject: [PATCH 07/13] Update cluster.py --- bambird/cluster.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/bambird/cluster.py b/bambird/cluster.py index fd37fb8..171c795 100644 --- a/bambird/cluster.py +++ b/bambird/cluster.py @@ -456,15 +456,17 @@ def find_cluster( # Y = pd.DataFrame( ##### tsne - # tsne = TSNE(n_components=2, - # init='pca', - # n_jobs = -1) - # Y = tsne.fit_transform(X) + tsne = TSNE(n_components=2, + init='pca', + n_jobs = -1, + random_state=cfg.RANDOM_SEED) + Y = tsne.fit_transform(X) ##### umap - umap_red = umap.UMAP(n_neighbors=min_points, - n_components=2) - + umap_red = umap.UMAP( + n_neighbors=min_points, + n_components=2, + random_state=cfg.RANDOM_SEED) Y = umap_red.fit_transform(X) From 032d0129d8c5727655571bbbe9c72ac40f2902f2 Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Tue, 13 Dec 2022 17:56:14 +0100 Subject: [PATCH 08/13] Update config_default.yaml --- config_default.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/config_default.yaml b/config_default.yaml index 18ed303..1b741c6 100644 --- a/config_default.yaml +++ b/config_default.yaml @@ -31,9 +31,9 @@ PARAMS_EXTRACT: LOW_FREQ: 250 # Low frequency in Hz of the bandpass filter applied to the audio HIGH_FREQ: 12000 # High frequency in Hz of the bandpass filter applied to the audio BUTTER_ORDER: 1 # butterworth 
filter order to select the bandwidth corresponding to the ROI - AUDIO_DURATION: 30 # Max duration of the audio file (in seconds) + AUDIO_DURATION: 60 # Max duration of the audio file (in seconds) CHUNK_DURATION: 10 # Split the audio signal in chunks of duration = CHUNK_DURATION (in seconds) - OVLP: 0 # Define the overlap ratio between each chunk (0=> no overlap, 0.75=> 75% of overlap between 2 consecutive chunks) + OVLP: 0.5 # Define the overlap ratio between each chunk (0=> no overlap, 0.75=> 75% of overlap between 2 consecutive chunks) # Spectrogram MODE_RMBCKG: median # Mode to compute the remove_background {'mean', 'median'} NFFT: 1024 # Number of points used for the SFFT to compute the spectrogram @@ -42,7 +42,7 @@ PARAMS_EXTRACT: MASK_PARAM1: 26 MASK_PARAM2: 10 # Select and merge bbox parameters - MAX_RATIO_YX: 7 # ratio Y/X between the high (Y in px) and the width (X in px) of the ROI + MAX_RATIO_YX: 7 # ratio Y/X between the high (Y in px) and the width (X in px) of the ROI MIN_DURATION: 0.1 # minimum event duration in s MARGIN_T_LEFT: 0.2 # overlapping time margin in s on the left side of the ROI to merge MARGIN_T_RIGHT: 0.2 # overlapping time margin in s on the right side of the ROI to merge @@ -61,8 +61,8 @@ PARAMS_FEATURES: SAMPLE_RATE: 48000 # Sampling frequency in Hz # Audio preprocess LOW_FREQ: 250 # Low frequency in Hz of the bandpass filter applied to the audio - HIGH_FREQ: 11000 # High frequency in Hz of the bandpass filter applied to the audio - BUTTER_ORDER: 1 # butterworth filter order to select the bandwidth corresponding to the ROI + HIGH_FREQ: 12000 # High frequency in Hz of the bandpass filter applied to the audio + BUTTER_ORDER: 5 # butterworth filter order to select the bandwidth corresponding to the ROI # Spectrogram NFFT: 1024 # Number of points of the spectrogram SHAPE_RES: 'high' # Resolution of the shapes {low, med, high} @@ -71,7 +71,7 @@ PARAMS_FEATURES: # CLUSTERING PARAMETERS ################################ PARAMS_CLUSTER: - 
FEATURES: ['shp', 'centroid_f'] # choose the features used to cluster {'shp', 'centroid_f', 'peak_f', 'duration_t', 'bandwidth_f', 'bandwidth_min_f', 'bandwidth_max_f', 'min_f', 'max_f' } + FEATURES: ['shp', 'centroid_f', 'peak_f', 'duration_t', 'bandwidth_f', 'bandwidth_min_f', 'bandwidth_max_f'] # choose the features used to cluster {'shp', 'centroid_f', 'peak_f', 'duration_t', 'bandwidth_f', 'bandwidth_min_f', 'bandwidth_max_f', 'min_f', 'max_f' } SCALER: MINMAXSCALER # scaler method to prepare the features before the clustering {STANDARDSCALER, ROBUSTSCALER, MINMAXSCALER} PERCENTAGE_PTS: 5 # minimum number of ROIs to form a cluster (in % of the total number of ROIs) {number between 0 and 1 or blank} MIN_PTS: # minimum number of ROIs to form a cluster {integer or blank} From 74a49e56bcef277c884a16602c6c53ea957b82ad Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Tue, 13 Dec 2022 23:27:35 +0100 Subject: [PATCH 09/13] Update cluster.py and config.py --- bambird/cluster.py | 10 +++++----- bambird/config.py | 3 +++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bambird/cluster.py b/bambird/cluster.py index 171c795..b4643ae 100644 --- a/bambird/cluster.py +++ b/bambird/cluster.py @@ -456,11 +456,11 @@ def find_cluster( # Y = pd.DataFrame( ##### tsne - tsne = TSNE(n_components=2, - init='pca', - n_jobs = -1, - random_state=cfg.RANDOM_SEED) - Y = tsne.fit_transform(X) + # tsne = TSNE(n_components=2, + # init='pca', + # n_jobs = -1, + # random_state=cfg.RANDOM_SEED) + # Y = tsne.fit_transform(X) ##### umap umap_red = umap.UMAP( diff --git a/bambird/config.py b/bambird/config.py index c9801e4..39968f7 100644 --- a/bambird/config.py +++ b/bambird/config.py @@ -157,6 +157,7 @@ def load_config(fullfilename = None): """ global PARAMS + global RANDOM_SEED global PARAMS_XC global PARAMS_EXTRACT global PARAMS_FEATURES @@ -165,6 +166,7 @@ def load_config(fullfilename = None): if os.path.isfile(str(fullfilename)): with open(fullfilename) as f: PARAMS = 
yaml.load(f, Loader=_get_loader()) + RANDOM_SEED = PARAMS['RANDOM_SEED'] PARAMS_XC = PARAMS['PARAMS_XC'] PARAMS_EXTRACT = PARAMS['PARAMS_EXTRACT'] PARAMS_FEATURES = PARAMS['PARAMS_FEATURES'] @@ -176,6 +178,7 @@ def load_config(fullfilename = None): def get_config() : PARAMS = { + 'RANDOM_SEED' : RANDOM_SEED, 'PARAMS_XC' : PARAMS_XC, 'PARAMS_EXTRACT' : PARAMS_EXTRACT, 'PARAMS_FEATURES' : PARAMS_FEATURES, From b61101c461a6b300b2db79423ae01d9cdddc3b8c Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Tue, 13 Dec 2022 23:33:58 +0100 Subject: [PATCH 10/13] Update config.py --- bambird/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bambird/config.py b/bambird/config.py index 39968f7..2c65988 100644 --- a/bambird/config.py +++ b/bambird/config.py @@ -91,6 +91,7 @@ } PARAMS = { + 'RANDOM_SEED' : RANDOM_SEED, 'PARAMS_XC' : PARAMS_XC, 'PARAMS_EXTRACT' : PARAMS_EXTRACT, 'PARAMS_FEATURES' : PARAMS_FEATURES, From 704fb6f7d0bf2e8b3ecedde331859c7228095fac Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Tue, 13 Dec 2022 23:43:53 +0100 Subject: [PATCH 11/13] Update setup.py --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a3b47b5..c79b2f2 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,8 @@ def run(self): cmdclass={'clean': CleanCommand}, license_file = 'LICENSE', python_requires='>=3.5', - install_requires = ['scikit-maad>=1.3.12', + install_requires = ['scikit-image>=0.19.2', + 'scikit-maad>=1.3.12', 'librosa>=0.8.0', 'scikit-learn>=1.0', 'hdbscan', From 614d930795dba834657ffc599538b9604ff60f64 Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Wed, 14 Dec 2022 00:22:43 +0100 Subject: [PATCH 12/13] Update workflow_multiple_species.py, workflow_single_file.py, and workflow_single_species.py --- examples/workflow_multiple_species.py | 20 ++++++++++---------- examples/workflow_single_file.py | 4 ++-- examples/workflow_single_species.py | 1 - 3 files changed, 12 insertions(+), 13 deletions(-) diff 
--git a/examples/workflow_multiple_species.py b/examples/workflow_multiple_species.py index 73d4da6..7a3d51f 100644 --- a/examples/workflow_multiple_species.py +++ b/examples/workflow_multiple_species.py @@ -32,15 +32,16 @@ ROIS_NAME = Path(str(DATASET_NAME) +'_ROIS') # List of species to build a clean dataset -SCIENTIC_NAME_LIST = [ - "Regulus regulus", +SCIENTIC_NAME_LIST = [ + "Columba palumbus", + # "Regulus regulus", "Phylloscopus collybita", - "Anthus triviali", + # "Anthus triviali", "Fringilla coelebs", - "Troglodytes troglodytes", - "Phoenicurus phoenicurus", - "Strix aluco", - "Aegithalos caudatus", + # "Troglodytes troglodytes", + # "Phoenicurus phoenicurus", + # "Strix aluco", + # "Aegithalos caudatus", ] CONFIG_FILE = '../config_default.yaml' @@ -83,7 +84,6 @@ # ROIS extraction of the full dataset df_rois, csv_rois = bambird.multicpu_extract_rois( dataset = df_xc, - fun = params['PARAMS_EXTRACT']['FUNC'], params = params['PARAMS_EXTRACT'], save_path = TEMP_DIR / ROIS_NAME, overwrite = True, @@ -131,8 +131,8 @@ bambird.overlay_rois( cluster = df_cluster, params = params['PARAMS_EXTRACT'], - column_labels = 'cluster_number', #auto_label cluster_number - unique_labels = np.sort(df_cluster.cluster_number.unique()), + column_labels = 'auto_label', #auto_label cluster_number + unique_labels = np.sort(df_cluster.auto_label.unique()), filename = None, random_seed = None, verbose = True diff --git a/examples/workflow_single_file.py b/examples/workflow_single_file.py index 1de7e83..e20fec5 100644 --- a/examples/workflow_single_file.py +++ b/examples/workflow_single_file.py @@ -31,9 +31,9 @@ DATASET_NAME = Path('WORKFLOW_SINGLE_FILE') ROIS_NAME = Path(str(DATASET_NAME) +'_ROIS') -# Xeno-Canto number (ie. 473724 for a audio of Columba palumbus https://xeno-canto.org/758652) +# Xeno-Canto number (ie. 
758652 for an audio of Columba palumbus https://xeno-canto.org/758652) # Without XC -XC_NUMBER = 473724 +XC_NUMBER = 758652 CONFIG_FILE = '../config_default.yaml' diff --git a/examples/workflow_single_species.py b/examples/workflow_single_species.py index b771c62..762fe16 100644 --- a/examples/workflow_single_species.py +++ b/examples/workflow_single_species.py @@ -98,7 +98,6 @@ # ROIS extraction of the full dataset df_rois, csv_rois = bambird.multicpu_extract_rois( dataset = df_xc, - fun = params['PARAMS_EXTRACT']['FUNC'], params = params['PARAMS_EXTRACT'], save_path = TEMP_DIR / ROIS_NAME, overwrite = True, From 65ca74f13a5384f917e99547dfc0e32e00eee685 Mon Sep 17 00:00:00 2001 From: Sylvain Haupert Date: Wed, 14 Dec 2022 00:23:31 +0100 Subject: [PATCH 13/13] Update workflow_multiple_species.py --- examples/workflow_multiple_species.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/workflow_multiple_species.py b/examples/workflow_multiple_species.py index 7a3d51f..561b802 100644 --- a/examples/workflow_multiple_species.py +++ b/examples/workflow_multiple_species.py @@ -131,8 +131,8 @@ bambird.overlay_rois( cluster = df_cluster, params = params['PARAMS_EXTRACT'], - column_labels = 'auto_label', #auto_label cluster_number - unique_labels = np.sort(df_cluster.auto_label.unique()), + column_labels = 'cluster_number', #auto_label cluster_number + unique_labels = np.sort(df_cluster.cluster_number.unique()), filename = None, random_seed = None, verbose = True