Skip to content

Commit

Permalink
Merge branch 'production'
Browse files Browse the repository at this point in the history
  • Loading branch information
shaupert committed Dec 13, 2022
2 parents 1a30fd7 + 65ca74f commit 84a5973
Show file tree
Hide file tree
Showing 9 changed files with 69 additions and 46 deletions.
53 changes: 39 additions & 14 deletions bambird/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,13 @@
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.metrics import confusion_matrix
# from sklearn.manifold import TSNE
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# umap
import umap

# Kneed package to find the knee of a curve
from kneed import KneeLocator

Expand Down Expand Up @@ -398,7 +401,6 @@ def find_cluster(
).fit(X)

if verbose:
print("filename {}".format(df_single_categories.filename))
print("HDBSCAN eps {} min_points {} Number of soundtypes found for {} : {}".format(eps, min_points,
categories, np.unique(cluster.labels_).size))

Expand Down Expand Up @@ -427,7 +429,7 @@ def find_cluster(
# find the cluster ID of the biggest cluster that is not noise
try :
biggest_cluster_ID = df_cluster.loc[(df_cluster["categories"] == categories) & (
df_cluster["cluster_number"] >= 0)]["cluster_number"].value_counts().argmax()
df_cluster["cluster_number"] >= 0)]["cluster_number"].value_counts().idxmax()
# set by default to 1 the auto_label of the biggest cluster
df_cluster.loc[(df_cluster["categories"] == categories) & (
df_cluster["cluster_number"] == biggest_cluster_ID), "auto_label"] = int(1)
Expand All @@ -444,25 +446,48 @@ def find_cluster(
df_cluster["cluster_number"] >= 0), "auto_label"] = int(1)


if display:
if display:
# display the result in 2D (2D reduction of the dimension)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
df_PCA = pd.DataFrame(
data=principalComponents,
columns=["principal component 1", "principal component 2"],
# compute the dimensionality reduction.

##### pca
# pca = PCA(n_components=2)
# principalComponents = pca.fit_transform(X)
# Y = pd.DataFrame(

##### tsne
# tsne = TSNE(n_components=2,
# init='pca',
# n_jobs = -1,
# random_state=cfg.RANDOM_SEED)
# Y = tsne.fit_transform(X)

##### umap
umap_red = umap.UMAP(
n_neighbors=min_points,
n_components=2,
random_state=cfg.RANDOM_SEED)
Y = umap_red.fit_transform(X)


df_reducdim = pd.DataFrame(
data=Y,
columns=["dim1", "dim2"],
)
ax[count].set_xlabel("PC 1", fontsize=10)
ax[count].set_ylabel("PC 2", fontsize=10)

ax[count].set_xlabel("dim 1", fontsize=10)
ax[count].set_ylabel("dim 2", fontsize=10)
ax[count].set_title(categories, fontsize=12)

ax[count].scatter(
df_PCA["principal component 1"],
df_PCA["principal component 2"],
df_reducdim["dim1"],
df_reducdim["dim2"],
c=cluster.labels_,
s=50,
alpha=0.8,
)
)



# increment
count += 1
Expand Down
4 changes: 4 additions & 0 deletions bambird/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
}

PARAMS = {
'RANDOM_SEED' : RANDOM_SEED,
'PARAMS_XC' : PARAMS_XC,
'PARAMS_EXTRACT' : PARAMS_EXTRACT,
'PARAMS_FEATURES' : PARAMS_FEATURES,
Expand Down Expand Up @@ -157,6 +158,7 @@ def load_config(fullfilename = None):
"""

global PARAMS
global RANDOM_SEED
global PARAMS_XC
global PARAMS_EXTRACT
global PARAMS_FEATURES
Expand All @@ -165,6 +167,7 @@ def load_config(fullfilename = None):
if os.path.isfile(str(fullfilename)):
with open(fullfilename) as f:
PARAMS = yaml.load(f, Loader=_get_loader())
RANDOM_SEED = PARAMS['RANDOM_SEED']
PARAMS_XC = PARAMS['PARAMS_XC']
PARAMS_EXTRACT = PARAMS['PARAMS_EXTRACT']
PARAMS_FEATURES = PARAMS['PARAMS_FEATURES']
Expand All @@ -176,6 +179,7 @@ def load_config(fullfilename = None):

def get_config() :
PARAMS = {
'RANDOM_SEED' : RANDOM_SEED,
'PARAMS_XC' : PARAMS_XC,
'PARAMS_EXTRACT' : PARAMS_EXTRACT,
'PARAMS_FEATURES' : PARAMS_FEATURES,
Expand Down
19 changes: 6 additions & 13 deletions bambird/segmentation_extract_rois_full_sig.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,15 +136,16 @@ def _centroid_features(Sxx, rois=None, im_rois=None):
centroid['area_xy'] = Sxx.shape[0] * Sxx.shape[1]
centroid['duration_x'] = Sxx.shape[1]
centroid['bandwidth_y'] = Sxx.shape[0]
centroid['snr'] = np.percentile(Sxx, 0.99)
# centroid['snr'] = np.percentile(Sxx, 0.99)
centroid['snr'] = mean_dB(add_dB(Sxx,axis=0))
else:
if im_rois is not None :
# real centroid and area
rprops = measure.regionprops(im_rois, intensity_image=Sxx)
centroid = [roi.weighted_centroid for roi in rprops]
area = [roi.area for roi in rprops]
snr = [power2dB(np.percentile(roi.image_intensity,99)) for roi in rprops]
# snr = [mean_dB(add_dB(roi.image_intensity,axis=0)) for roi in rprops]
# snr = [power2dB(np.percentile(roi.image_intensity,99)) for roi in rprops]
snr = [power2dB(np.mean(np.sum(roi.image_intensity,axis=0))) for roi in rprops]
else:
# rectangular area (overestimation)
area = (rois.max_y -rois.min_y) * (rois.max_x -rois.min_x)
Expand All @@ -154,8 +155,8 @@ def _centroid_features(Sxx, rois=None, im_rois=None):
im_blobs = maad.rois.rois_to_imblobs(np.zeros(Sxx.shape), row)
rprops = measure.regionprops(im_blobs, intensity_image=Sxx)
centroid.append(rprops.pop().weighted_centroid)
snr.append(power2dB(np.percentile(rprops.pop().image_intensity,99)))
# snr.append(mean_dB(add_dB(rprops.pop().image_intensity,axis=0)))
# snr.append(power2dB(np.percentile(rprops.pop().image_intensity,99)))
snr.append(power2dB(np.mean(np.sum(rprops.pop().image_intensity,axis=0))))

centroid = pd.DataFrame(centroid, columns=['centroid_y', 'centroid_x'], index=rois.index)

Expand Down Expand Up @@ -524,14 +525,6 @@ def extract_rois_full_sig(
# Keep only events with duration longer than MIN_DURATION
df_rois = df_rois[((df_rois["max_t"]-df_rois["min_t"])>params["MIN_DURATION"])]

# 8. remove rois with ratio > MAX_RATIO_YX (they are mostly artefacts
# such as wind, rain or clipping)
# add ratio y/x
df_rois['ratio_yx'] = (df_rois.max_y -df_rois.min_y) / (df_rois.max_x -df_rois.min_x)

if params["MAX_RATIO_YX"] is not None :
df_rois = df_rois[df_rois['ratio_yx'] < params["MAX_RATIO_YX"]]

if verbose:
print("=> AFTER MERGING FOUND {} ROIS".format(len(df_rois)))

Expand Down
2 changes: 1 addition & 1 deletion bambird/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Version info"""
__version__ = '0.2.1'
__version__ = '0.3.0'
12 changes: 6 additions & 6 deletions config_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ PARAMS_EXTRACT:
LOW_FREQ: 250 # Low frequency in Hz of the bandpass filter applied to the audio
HIGH_FREQ: 12000 # High frequency in Hz of the bandpass filter applied to the audio
BUTTER_ORDER: 1 # butterworth filter order to select the bandwidth corresponding to the ROI
AUDIO_DURATION: 30 # Max duration of the audio file (in seconds)
AUDIO_DURATION: 60 # Max duration of the audio file (in seconds)
CHUNK_DURATION: 10 # Split the audio signal in chunks of duration = CHUNK_DURATION (in seconds)
OVLP: 0 # Define the overlap ratio between each chunk (0=> no overlap, 0.75=> 75% of overlap between 2 consecutive chunks)
OVLP: 0.5 # Define the overlap ratio between each chunk (0=> no overlap, 0.75=> 75% of overlap between 2 consecutive chunks)
# Spectrogram
MODE_RMBCKG: median # Mode to compute the remove_background {'mean', 'median'}
NFFT: 1024 # Number of points used for the SFFT to compute the spectrogram
Expand All @@ -42,7 +42,7 @@ PARAMS_EXTRACT:
MASK_PARAM1: 26
MASK_PARAM2: 10
# Select and merge bbox parameters
MAX_RATIO_YX: 7 # ratio Y/X between the height (Y in px) and the width (X in px) of the ROI
MAX_RATIO_YX: 7 # ratio Y/X between the height (Y in px) and the width (X in px) of the ROI
MIN_DURATION: 0.1 # minimum event duration in s
MARGIN_T_LEFT: 0.2 # overlapping time margin in s on the left side of the ROI to merge
MARGIN_T_RIGHT: 0.2 # overlapping time margin in s on the right side of the ROI to merge
Expand All @@ -61,8 +61,8 @@ PARAMS_FEATURES:
SAMPLE_RATE: 48000 # Sampling frequency in Hz
# Audio preprocess
LOW_FREQ: 250 # Low frequency in Hz of the bandpass filter applied to the audio
HIGH_FREQ: 11000 # High frequency in Hz of the bandpass filter applied to the audio
BUTTER_ORDER: 1 # butterworth filter order to select the bandwidth corresponding to the ROI
HIGH_FREQ: 12000 # High frequency in Hz of the bandpass filter applied to the audio
BUTTER_ORDER: 5 # butterworth filter order to select the bandwidth corresponding to the ROI
# Spectrogram
NFFT: 1024 # Number of points of the spectrogram
SHAPE_RES: 'high' # Resolution of the shapes {low, med, high}
Expand All @@ -71,7 +71,7 @@ PARAMS_FEATURES:
# CLUSTERING PARAMETERS
################################
PARAMS_CLUSTER:
FEATURES: ['shp', 'centroid_f'] # choose the features used to cluster {'shp', 'centroid_f', 'peak_f', 'duration_t', 'bandwidth_f', 'bandwidth_min_f', 'bandwidth_max_f', 'min_f', 'max_f' }
FEATURES: ['shp', 'centroid_f', 'peak_f', 'duration_t', 'bandwidth_f', 'bandwidth_min_f', 'bandwidth_max_f'] # choose the features used to cluster {'shp', 'centroid_f', 'peak_f', 'duration_t', 'bandwidth_f', 'bandwidth_min_f', 'bandwidth_max_f', 'min_f', 'max_f' }
SCALER: MINMAXSCALER # scaler method to prepare the features before the clustering {STANDARDSCALER, ROBUSTSCALER, MINMAXSCALER}
PERCENTAGE_PTS: 5 # minimum number of ROIs to form a cluster (in % of the total number of ROIs) {number between 0 and 1 or blank}
MIN_PTS: # minimum number of ROIs to form a cluster {integer or blank}
Expand Down
16 changes: 8 additions & 8 deletions examples/workflow_multiple_species.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,16 @@
ROIS_NAME = Path(str(DATASET_NAME) +'_ROIS')

# List of species to build a clean dataset
SCIENTIC_NAME_LIST = [
"Regulus regulus",
SCIENTIC_NAME_LIST = [
"Columba palumbus",
# "Regulus regulus",
"Phylloscopus collybita",
"Anthus triviali",
# "Anthus triviali",
"Fringilla coelebs",
"Troglodytes troglodytes",
"Phoenicurus phoenicurus",
"Strix aluco",
"Aegithalos caudatus",
# "Troglodytes troglodytes",
# "Phoenicurus phoenicurus",
# "Strix aluco",
# "Aegithalos caudatus",
]

CONFIG_FILE = '../config_default.yaml'
Expand Down Expand Up @@ -83,7 +84,6 @@
# ROIS extraction of the full dataset
df_rois, csv_rois = bambird.multicpu_extract_rois(
dataset = df_xc,
fun = params['PARAMS_EXTRACT']['FUNC'],
params = params['PARAMS_EXTRACT'],
save_path = TEMP_DIR / ROIS_NAME,
overwrite = True,
Expand Down
4 changes: 2 additions & 2 deletions examples/workflow_single_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@
DATASET_NAME = Path('WORKFLOW_SINGLE_FILE')
ROIS_NAME = Path(str(DATASET_NAME) +'_ROIS')

# Xeno-Canto number (ie. 473724 for a audio of Columba palumbus https://xeno-canto.org/758652)
# Xeno-Canto number (e.g. 758652 for an audio recording of Columba palumbus https://xeno-canto.org/758652)
# Without the "XC" prefix
XC_NUMBER = 473724
XC_NUMBER = 758652

CONFIG_FILE = '../config_default.yaml'

Expand Down
1 change: 0 additions & 1 deletion examples/workflow_single_species.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@
# ROIS extraction of the full dataset
df_rois, csv_rois = bambird.multicpu_extract_rois(
dataset = df_xc,
fun = params['PARAMS_EXTRACT']['FUNC'],
params = params['PARAMS_EXTRACT'],
save_path = TEMP_DIR / ROIS_NAME,
overwrite = True,
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,13 @@ def run(self):
cmdclass={'clean': CleanCommand},
license_file = 'LICENSE',
python_requires='>=3.5',
install_requires = ['scikit-maad>=1.3.12',
install_requires = ['scikit-image>=0.19.2',
'scikit-maad>=1.3.12',
'librosa>=0.8.0',
'scikit-learn>=1.0',
'hdbscan',
'matplotlib',
'umap-learn',
'tqdm',
'kneed',
'pyyaml'],
Expand Down

0 comments on commit 84a5973

Please sign in to comment.