[MNT] Unit testing revamp part 4: clustering (#1877)
* classification checks in progress

* rework yield checks

* rework yield checks to allow for class input

* fixes

* fix

* pr testing split

* classification fixes

* regressor tests

* exclude tapnet

* missing value test data

* clusterer tests

* fix

* dummy random state

* dummy fix

* fixes and skips

---------

Co-authored-by: Tony Bagnall <[email protected]>
MatthewMiddlehurst and TonyBagnall authored Sep 18, 2024
1 parent 230c38d commit c6dfffa
Showing 9 changed files with 117 additions and 115 deletions.


28 changes: 19 additions & 9 deletions aeon/clustering/dummy.py
@@ -1,6 +1,7 @@
"""Implements DummyClusterer to be used as Baseline."""

import numpy as np
from sklearn.utils import check_random_state

from aeon.clustering.base import BaseClusterer

@@ -22,10 +23,17 @@ class DummyClusterer(BaseClusterer):
- "random": Assign clusters randomly.
- "uniform": Distribute clusters uniformly among samples.
- "single_cluster": Assign all samples to a single cluster.
n_clusters : int, default=3
The number of clusters to generate. This is relevant for "random"
and "uniform" strategies.
random_state : int, np.random.RandomState instance or None, default=None
Determines random number generation for centroid initialization.
Only used when `strategy` is "random".
If `int`, random_state is the seed used by the random number generator;
If `np.random.RandomState` instance,
random_state is the random number generator;
If `None`, the random number generator is the `RandomState` instance used
by `np.random`.
Attributes
----------
@@ -38,19 +46,19 @@ class DummyClusterer(BaseClusterer):
>>> import numpy as np
>>> X = np.array([[1, 2], [3, 4], [5, 6]])
>>> clusterer = DummyClusterer(strategy="uniform", n_clusters=2)
>>> clusterer._fit(X)
>>> clusterer.fit(X)
DummyClusterer(n_clusters=2, strategy='uniform')
>>> clusterer.labels_
array([0, 1, 0])
>>> clusterer._predict(X)
>>> clusterer.predict(X)
array([0, 1, 0])
"""

def __init__(self, strategy="random", n_clusters=3):
super().__init__()
def __init__(self, strategy="random", n_clusters=3, random_state=None):
self.strategy = strategy
self.n_clusters = n_clusters
self.labels_ = None
self.random_state = random_state

super().__init__(n_clusters=n_clusters)

def _fit(self, X, y=None):
"""
@@ -72,7 +80,8 @@ def _fit(self, X, y=None):
n_samples = X.shape[0]

if self.strategy == "random":
self.labels_ = np.random.randint(0, self.n_clusters, n_samples)
rng = check_random_state(self.random_state)
self.labels_ = rng.randint(self.n_clusters, size=n_samples)
elif self.strategy == "uniform":
self.labels_ = np.tile(
np.arange(self.n_clusters), n_samples // self.n_clusters + 1
@@ -103,7 +112,8 @@ def _predict(self, X, y=None) -> np.ndarray:
"""
n_samples = X.shape[0]
if self.strategy == "random":
return np.random.randint(0, self.n_clusters, n_samples)
rng = check_random_state(self.random_state)
return rng.randint(self.n_clusters, size=n_samples)
elif self.strategy == "uniform":
return np.tile(
np.arange(self.n_clusters), n_samples // self.n_clusters + 1
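For reference, a minimal standalone sketch of the seeded "random" strategy introduced above (plain numpy/scikit-learn, not the aeon class itself): `check_random_state` turns an int seed, a `RandomState` instance, or `None` into a generator, which is what makes the dummy clusterer reproducible in tests. The helper name below is illustrative.

```python
# Minimal sketch of the seeded "random" strategy, reproduced standalone.
import numpy as np
from sklearn.utils import check_random_state


def random_labels(n_samples, n_clusters, random_state=None):
    # check_random_state accepts an int seed, a RandomState instance, or None
    rng = check_random_state(random_state)
    return rng.randint(n_clusters, size=n_samples)


a = random_labels(6, 3, random_state=42)
b = random_labels(6, 3, random_state=42)
assert np.array_equal(a, b)  # same seed -> identical labels
```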
2 changes: 1 addition & 1 deletion aeon/clustering/feature_based/_catch22.py
@@ -209,7 +209,7 @@ def _predict_proba(self, X) -> np.ndarray:
n_clusters = self.n_clusters
if n_clusters is None:
n_clusters = int(max(preds)) + 1
dists = np.zeros((X.shape[0], n_clusters))
dists = np.zeros((len(X), n_clusters))
for i in range(n_cases):
dists[i, preds[i]] = 1
return dists
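The one-line change above swaps `X.shape[0]` for `len(X)`; both give the number of cases for a 3D numpy array, but `len(X)` also works when `X` is a list of per-case 2D arrays. For context, a standalone sketch of the one-hot "probability" matrix that `_predict_proba` builds from hard assignments (the helper name is illustrative, not aeon's API):

```python
# Standalone sketch of the one-hot probability construction, assuming
# `preds` holds hard cluster assignments for each case.
import numpy as np


def one_hot_probabilities(preds, n_clusters=None):
    if n_clusters is None:
        n_clusters = int(max(preds)) + 1
    dists = np.zeros((len(preds), n_clusters))
    dists[np.arange(len(preds)), preds] = 1  # probability 1 on the assigned cluster
    return dists


print(one_hot_probabilities(np.array([0, 2, 1, 2])))
```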
43 changes: 0 additions & 43 deletions aeon/clustering/tests/test_all_clusterers.py

This file was deleted.

84 changes: 84 additions & 0 deletions aeon/testing/estimator_checking/_yield_clustering_checks.py
@@ -0,0 +1,84 @@
"""Tests for all clusterers."""

from functools import partial

import numpy as np

from aeon.base._base import _clone_estimator
from aeon.clustering.deep_learning import BaseDeepClusterer
from aeon.testing.testing_data import FULL_TEST_DATA_DICT


def _yield_clustering_checks(estimator_class, estimator_instances, datatypes):
"""Yield all clustering checks for an aeon clusterer."""
# only class required
yield partial(check_clusterer_tags_consistent, estimator_class=estimator_class)
yield partial(
check_clusterer_does_not_override_final_methods, estimator_class=estimator_class
)

# test class instances
for i, estimator in enumerate(estimator_instances):
# data type irrelevant
if isinstance(estimator, BaseDeepClusterer):
yield partial(
check_clustering_random_state_deep_learning,
estimator=estimator,
datatype=datatypes[i][0],
)


def check_clusterer_tags_consistent(estimator_class):
"""Test all estimators capability tags reflect their capabilities."""
# Test the tag X_inner_type is consistent with capability:unequal_length
unequal_length = estimator_class.get_class_tag("capability:unequal_length")
valid_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"}
if unequal_length: # one of X_inner_types must be capable of storing unequal length
internal_types = estimator_class.get_class_tag("X_inner_type")
if isinstance(internal_types, str):
assert internal_types in valid_types
else: # must be a list
assert bool(set(internal_types) & valid_types)
# Test can actually fit/predict with multivariate if tag is set
multivariate = estimator_class.get_class_tag("capability:multivariate")
if multivariate:
X = np.random.random((10, 2, 10))
inst = estimator_class.create_test_instance(parameter_set="default")
inst.fit(X)
inst.predict(X)
inst.predict_proba(X)


def check_clusterer_does_not_override_final_methods(estimator_class):
"""Test does not override final methods."""
assert "fit" not in estimator_class.__dict__
assert "predict" not in estimator_class.__dict__


def check_clustering_random_state_deep_learning(estimator, datatype):
"""Test Deep Clusterer seeding."""
random_state = 42

deep_clr1 = _clone_estimator(estimator, random_state=random_state)
deep_clr1.fit(FULL_TEST_DATA_DICT[datatype]["train"][0])

layers1 = deep_clr1.training_model_.layers[1:]

deep_clr2 = _clone_estimator(estimator, random_state=random_state)
deep_clr2.fit(FULL_TEST_DATA_DICT[datatype]["train"][0])

layers2 = deep_clr2.training_model_.layers[1:]

assert len(layers1) == len(layers2)

for i in range(len(layers1)):
weights1 = layers1[i].get_weights()
weights2 = layers2[i].get_weights()

assert len(weights1) == len(weights2)

for j in range(len(weights1)):
_weight1 = np.asarray(weights1[j])
_weight2 = np.asarray(weights2[j])

np.testing.assert_almost_equal(_weight1, _weight2, 4)
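The generator above yields `functools.partial` objects with their arguments already bound, so a runner only needs to call each one. A hedged usage sketch with the `DummyClusterer` touched in this PR; the collection loop and the `"numpy3D"` datatype string are illustrative assumptions, not aeon's actual test harness:

```python
# Illustrative only: collect and run the clustering checks for one clusterer.
from aeon.clustering.dummy import DummyClusterer
from aeon.testing.estimator_checking._yield_clustering_checks import (
    _yield_clustering_checks,
)

est = DummyClusterer.create_test_instance()
checks = list(
    _yield_clustering_checks(DummyClusterer, [est], datatypes=[["numpy3D"]])
)
for check in checks:
    check()  # each item is a functools.partial with its arguments bound
```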
9 changes: 9 additions & 0 deletions aeon/testing/estimator_checking/_yield_estimator_checks.py
@@ -15,12 +15,16 @@
from aeon.base._base import _clone_estimator
from aeon.classification import BaseClassifier
from aeon.classification.deep_learning.base import BaseDeepClassifier
from aeon.clustering import BaseClusterer
from aeon.clustering.deep_learning.base import BaseDeepClusterer
from aeon.regression import BaseRegressor
from aeon.regression.deep_learning.base import BaseDeepRegressor
from aeon.testing.estimator_checking._yield_classification_checks import (
_yield_classification_checks,
)
from aeon.testing.estimator_checking._yield_clustering_checks import (
_yield_clustering_checks,
)
from aeon.testing.estimator_checking._yield_regression_checks import (
_yield_regression_checks,
)
@@ -88,6 +92,11 @@ def _yield_all_aeon_checks(
estimator_class, estimator_instances, datatypes
)

if issubclass(estimator_class, BaseClusterer):
yield from _yield_clustering_checks(
estimator_class, estimator_instances, datatypes
)


def _yield_estimator_checks(estimator_class, estimator_instances, datatypes):
"""Yield all general checks for an aeon estimator."""
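The new branch is plain generator composition keyed on `issubclass`. A standalone sketch of the pattern with stub names (not aeon's real check generators):

```python
# Standalone sketch of the issubclass-based dispatch; all names are stubs.
class BaseClusterer:
    pass


class MyClusterer(BaseClusterer):
    pass


def _yield_general_checks(cls):
    yield f"general:{cls.__name__}"


def _yield_clustering_checks_stub(cls):
    yield f"clustering:{cls.__name__}"


def _yield_all_checks(cls):
    # every estimator gets the general checks ...
    yield from _yield_general_checks(cls)
    # ... and clusterers additionally get the clustering-specific ones
    if issubclass(cls, BaseClusterer):
        yield from _yield_clustering_checks_stub(cls)


print(list(_yield_all_checks(MyClusterer)))
# ['general:MyClusterer', 'clustering:MyClusterer']
```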
2 changes: 1 addition & 1 deletion aeon/testing/test_all_estimators.py
@@ -200,7 +200,7 @@ def _all_estimators(self):
estimator_types=getattr(self, "estimator_type_filter", None),
return_names=False,
exclude_estimators=EXCLUDE_ESTIMATORS,
exclude_estimator_types=["classifier", "regressor"],
exclude_estimator_types=["classifier", "regressor", "clusterer"],
)

# subsample estimators by OS & python version
2 changes: 2 additions & 0 deletions aeon/testing/test_config.py
@@ -46,6 +46,8 @@
# needs investigation
"SASTClassifier": ["check_fit_deterministic"],
"RSASTClassifier": ["check_fit_deterministic"],
"AEFCNClusterer": ["check_fit_updates_state"],
"AEResNetClusterer": ["check_fit_updates_state"],
}

# We use estimator tags in addition to class hierarchies to further distinguish
2 changes: 1 addition & 1 deletion aeon/testing/tests/test_all_estimators.py
@@ -9,7 +9,7 @@
from aeon.utils.sampling import random_partition

ALL_ESTIMATORS = all_estimators(
estimator_types=["classifier", "regressor"],
estimator_types=["classifier", "regressor", "clusterer"],
return_names=False,
)

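With `"clusterer"` added to `estimator_types`, the revamped suite now also collects clusterers. A hedged sketch of the discovery call, assuming the `aeon.registry.all_estimators` import path used by aeon at this point:

```python
# Hedged sketch: list the clusterers the revamped test suite will now collect.
from aeon.registry import all_estimators

clusterers = all_estimators(estimator_types=["clusterer"], return_names=False)
print([cls.__name__ for cls in clusterers])
```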
