Skip to content

Commit

Permalink
[ENH] Sort out clustering base class (#2251)
Browse files Browse the repository at this point in the history
* remove y from predict

* remove score

* remove score, add fit_predict

* remove score, add fit_predict

* removed score and n_clusters from base class

* fit_predict

* fixed predict proba

* fixed dnn tests

* fixed dnn tests

* fix notebook

* fixed

* removed deep learner n_clusters and assert labels_ exists

* cont

* fix dnns

* pipeline clusterer

* fix pipeline

* revert

* fix

* remove score from dnns

* remove score from notebooks

* remove score from remaining

* fix notebook

* fixed kmeans bug stopping tests working

* fixed

* docstring fix

---------

Co-authored-by: chrisholder <[email protected]>
  • Loading branch information
TonyBagnall and chrisholder authored Nov 20, 2024
1 parent 7209b66 commit 980e8bb
Show file tree
Hide file tree
Showing 29 changed files with 92 additions and 337 deletions.
6 changes: 2 additions & 4 deletions aeon/clustering/_clara.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def __init__(
self.distance_params = distance_params
self.n_samples = n_samples
self.n_sampling_iters = n_sampling_iters
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -148,7 +149,7 @@ def __init__(
self._random_state = None
self._kmedoids_instance = None

super().__init__(n_clusters)
super().__init__()

def _predict(self, X: np.ndarray, y=None) -> np.ndarray:
return self._kmedoids_instance.predict(X)
Expand Down Expand Up @@ -207,9 +208,6 @@ def _fit(self, X: np.ndarray, y=None):
self.n_iter_ = best_pam.n_iter_
self._kmedoids_instance = best_pam

def _score(self, X, y=None):
return -self.inertia_

@classmethod
def _get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.
Expand Down
6 changes: 2 additions & 4 deletions aeon/clustering/_elastic_som.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ def __init__(
self.init = init
self.sigma_decay_function = sigma_decay_function
self.custom_alignment_path = custom_alignment_path
self.n_clusters = n_clusters

self._random_state = None
self._alignment_path_callable = None
Expand All @@ -191,7 +192,7 @@ def __init__(

self.labels_ = None
self.cluster_centers_ = None
super().__init__(n_clusters=n_clusters)
super().__init__()

def _fit(self, X, y=None):
self._check_params(X)
Expand Down Expand Up @@ -219,9 +220,6 @@ def _fit(self, X, y=None):
def _predict(self, X, y=None):
return self._find_bmu(X, self.cluster_centers_)

def _score(self, X, y=None):
raise NotImplementedError("TimeSeriesSOM does not support scoring")

def _find_bmu(self, x, weights):
pairwise_matrix = pairwise_distance(
x,
Expand Down
8 changes: 3 additions & 5 deletions aeon/clustering/_k_means.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ def __init__(
self.distance_params = distance_params
self.average_params = average_params
self.averaging_method = averaging_method
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -203,7 +204,7 @@ def __init__(
self._averaging_method = None
self._average_params = None

super().__init__(n_clusters)
super().__init__()

def _fit(self, X: np.ndarray, y=None):
self._check_params(X)
Expand Down Expand Up @@ -267,7 +268,7 @@ def _fit_one_init(self, X: np.ndarray) -> tuple:
prev_inertia = curr_inertia
prev_labels = curr_labels

if change_in_centres < self.tol:
if change_in_centres < self.tol or (i + 1) == self.max_iter:
break

# Compute new cluster centres
Expand All @@ -281,9 +282,6 @@ def _fit_one_init(self, X: np.ndarray) -> tuple:

return prev_labels, cluster_centres, prev_inertia, i + 1

def _score(self, X, y=None):
return -self.inertia_

def _predict(self, X: np.ndarray, y=None) -> np.ndarray:
if isinstance(self.distance, str):
pairwise_matrix = pairwise_distance(
Expand Down
6 changes: 2 additions & 4 deletions aeon/clustering/_k_medoids.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ def __init__(
self.random_state = random_state
self.distance_params = distance_params
self.method = method
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -184,7 +185,7 @@ def __init__(
self._fit_method = None

self._distance_params = {}
super().__init__(n_clusters)
super().__init__()

def _fit(self, X: np.ndarray, y=None):
self._check_params(X)
Expand All @@ -207,9 +208,6 @@ def _fit(self, X: np.ndarray, y=None):
self.cluster_centers_ = best_centers
self.n_iter_ = best_iters

def _score(self, X, y=None):
return -self.inertia_

def _predict(self, X: np.ndarray, y=None) -> np.ndarray:
if isinstance(self.distance, str):
pairwise_matrix = pairwise_distance(
Expand Down
8 changes: 3 additions & 5 deletions aeon/clustering/_k_shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def __init__(
self.tol = tol
self.verbose = verbose
self.random_state = random_state
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -97,7 +98,7 @@ def __init__(

self._tslearn_k_shapes = None

super().__init__(n_clusters=n_clusters)
super().__init__()

def _fit(self, X, y=None):
"""Fit time series clusterer to training data.
Expand Down Expand Up @@ -130,7 +131,7 @@ def _fit(self, X, y=None):

self._tslearn_k_shapes.fit(_X)
self._cluster_centers = self._tslearn_k_shapes.cluster_centers_
self.labels_ = self._tslearn_k_shapes.labels_
self.labels_ = self._tslearn_k_shapes.predict(_X)
self.inertia_ = self._tslearn_k_shapes.inertia_
self.n_iter_ = self._tslearn_k_shapes.n_iter_

Expand Down Expand Up @@ -179,6 +180,3 @@ def _get_test_params(cls, parameter_set="default"):
"verbose": False,
"random_state": 1,
}

def _score(self, X, y=None):
return np.abs(self.inertia_)
8 changes: 3 additions & 5 deletions aeon/clustering/_k_shapes.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def __init__(
self.tol = tol
self.verbose = verbose
self.random_state = random_state
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -98,7 +99,7 @@ def __init__(

self._tslearn_k_shapes = None

super().__init__(n_clusters=n_clusters)
super().__init__()

def _fit(self, X, y=None):
"""Fit time series clusterer to training data.
Expand Down Expand Up @@ -131,7 +132,7 @@ def _fit(self, X, y=None):

self._tslearn_k_shapes.fit(_X)
self._cluster_centers = self._tslearn_k_shapes.cluster_centers_
self.labels_ = self._tslearn_k_shapes.labels_
self.labels_ = self._tslearn_k_shapes.predict(_X)
self.inertia_ = self._tslearn_k_shapes.inertia_
self.n_iter_ = self._tslearn_k_shapes.n_iter_

Expand Down Expand Up @@ -180,6 +181,3 @@ def _get_test_params(cls, parameter_set="default"):
"verbose": False,
"random_state": 1,
}

def _score(self, X, y=None):
return np.abs(self.inertia_)
6 changes: 2 additions & 4 deletions aeon/clustering/_kernel_k_means.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def __init__(
self.verbose = verbose
self.n_jobs = n_jobs
self.random_state = random_state
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -116,7 +117,7 @@ def __init__(

self._tslearn_kernel_k_means = None

super().__init__(n_clusters=n_clusters)
super().__init__()

def _fit(self, X, y=None):
"""Fit time series clusterer to training data.
Expand Down Expand Up @@ -204,6 +205,3 @@ def _get_test_params(cls, parameter_set="default") -> dict:
"n_jobs": 1,
"random_state": 1,
}

def _score(self, X, y=None) -> float:
return np.abs(self.inertia_)
33 changes: 14 additions & 19 deletions aeon/clustering/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Base class for clustering."""

from typing import Optional

__maintainer__ = []
__all__ = ["BaseClusterer"]

Expand All @@ -28,8 +26,7 @@ class BaseClusterer(BaseCollectionEstimator):
"fit_is_empty": False,
}

def __init__(self, n_clusters: Optional[int] = None):
self.n_clusters = n_clusters
def __init__(self):
# required for compatibility with some sklearn interfaces e.g.
# CalibratedClassifierCV
self._estimator_type = "clusterer"
Expand Down Expand Up @@ -125,6 +122,7 @@ def predict_proba(self, X) -> np.ndarray:
self._check_shape(X)
return self._predict_proba(X)

@final
def fit_predict(self, X, y=None) -> np.ndarray:
"""Compute cluster centers and predict cluster index for each time series.
Expand All @@ -143,11 +141,10 @@ def fit_predict(self, X, y=None) -> np.ndarray:
np.ndarray (1d array of shape (n_cases,))
Index of the cluster each time series in X belongs to.
"""
self.fit(X)
return self.predict(X)
return self._fit_predict(X, y)

def score(self, X, y=None) -> float:
"""Score the quality of the clusterer.
def _fit_predict(self, X, y=None) -> np.ndarray:
"""Fit predict using base methods.
Parameters
----------
Expand All @@ -159,13 +156,11 @@ def score(self, X, y=None) -> float:
Returns
-------
score : float
Score of the clusterer.
np.ndarray (1d array of shape (n_cases,))
Index of the cluster each time series in X belongs to.
"""
self._check_is_fitted()
X = self._preprocess_collection(X, store_metadata=False)
self._check_shape(X)
return self._score(X, y)
self.fit(X)
return self.labels_

def _predict_proba(self, X) -> np.ndarray:
"""Predicts labels probabilities for sequences in X.
Expand Down Expand Up @@ -198,17 +193,17 @@ def _predict_proba(self, X) -> np.ndarray:
for i, u in enumerate(unique):
preds[preds == u] = i
n_cases = len(preds)
n_clusters = self.n_clusters
if hasattr(self, "n_clusters"):
n_clusters = self.n_clusters
else:
n_clusters = len(np.unique(preds))
if n_clusters is None:
n_clusters = int(max(preds)) + 1
dists = np.zeros((X.shape[0], n_clusters))
dists = np.zeros((len(X), n_clusters))
for i in range(n_cases):
dists[i, preds[i]] = 1
return dists

@abstractmethod
def _score(self, X, y=None): ...

@abstractmethod
def _predict(self, X) -> np.ndarray:
"""Predict the closest cluster each sample in X belongs to.
Expand Down
7 changes: 3 additions & 4 deletions aeon/clustering/compose/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,9 @@ def __init__(self, transformers, clusterer, random_state=None):
)

def _fit(self, X, y=None):
return super()._fit(X, y)

def _score(self, X, y=None):
raise NotImplementedError("Pipeline does not support scoring.")
super()._fit(X, y)
self.labels_ = self.steps_[-1][1].labels_
return self

@classmethod
def _get_test_params(cls, parameter_set="default"):
Expand Down
10 changes: 0 additions & 10 deletions aeon/clustering/deep_learning/_ae_abgru.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ class AEAttentionBiGRUClusterer(BaseDeepClusterer):
Parameters
----------
n_clusters : int, default=None
Number of clusters for the deep learning model.
clustering_algorithm : str, default="deprecated"
Use 'estimator' parameter instead.
clustering_params : dict, default=None
Expand Down Expand Up @@ -106,7 +104,6 @@ class AEAttentionBiGRUClusterer(BaseDeepClusterer):

def __init__(
self,
n_clusters=None,
estimator=None,
clustering_algorithm="deprecated",
clustering_params=None,
Expand Down Expand Up @@ -153,7 +150,6 @@ def __init__(
self.random_state = random_state

super().__init__(
n_clusters=n_clusters,
clustering_algorithm=clustering_algorithm,
clustering_params=clustering_params,
estimator=estimator,
Expand Down Expand Up @@ -302,12 +298,6 @@ def _fit(self, X):

return self

def _score(self, X, y=None):
# Transpose to conform to Keras input style.
X = X.transpose(0, 2, 1)
latent_space = self.model_.layers[1].predict(X)
return self._estimator.score(latent_space)

@classmethod
def _get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.
Expand Down
10 changes: 0 additions & 10 deletions aeon/clustering/deep_learning/_ae_bgru.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ class AEBiGRUClusterer(BaseDeepClusterer):
Parameters
----------
n_clusters : int, default=None
Number of clusters for the deep learning model.
clustering_algorithm : str, default="deprecated"
Use 'estimator' parameter instead.
clustering_params : dict, default=None
Expand Down Expand Up @@ -105,7 +103,6 @@ class AEBiGRUClusterer(BaseDeepClusterer):

def __init__(
self,
n_clusters=None,
clustering_algorithm="deprecated",
estimator=None,
clustering_params=None,
Expand Down Expand Up @@ -152,7 +149,6 @@ def __init__(
self.random_state = random_state

super().__init__(
n_clusters=n_clusters,
clustering_algorithm=clustering_algorithm,
clustering_params=clustering_params,
estimator=estimator,
Expand Down Expand Up @@ -300,12 +296,6 @@ def _fit(self, X):

return self

def _score(self, X, y=None):
# Transpose to conform to Keras input style.
X = X.transpose(0, 2, 1)
latent_space = self.model_.layers[1].predict(X)
return self._estimator.score(latent_space)

@classmethod
def _get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.
Expand Down
Loading

0 comments on commit 980e8bb

Please sign in to comment.