Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MNT] Unit testing revamp part 4: clustering #1877

Merged
merged 25 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ab45778
classification checks in progress
MatthewMiddlehurst Jul 8, 2024
a57186f
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/c…
MatthewMiddlehurst Jul 8, 2024
bf357d1
merge
MatthewMiddlehurst Jul 9, 2024
3c209bc
rework yield checks
MatthewMiddlehurst Jul 11, 2024
0d8440e
rework yield checks to allow for class input
MatthewMiddlehurst Jul 11, 2024
55a2810
fixes
MatthewMiddlehurst Jul 11, 2024
18f1b14
fix
MatthewMiddlehurst Jul 11, 2024
f1fec2c
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/c…
MatthewMiddlehurst Jul 11, 2024
c1688ea
pr testing split
MatthewMiddlehurst Jul 11, 2024
7b8f7c9
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/r…
MatthewMiddlehurst Jul 30, 2024
254bfed
classification fixes
MatthewMiddlehurst Jul 30, 2024
e8ea3c9
regressor tests
MatthewMiddlehurst Jul 31, 2024
e61f8e5
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/r…
MatthewMiddlehurst Jul 31, 2024
833d8fc
exclude tapnet
MatthewMiddlehurst Jul 31, 2024
b437f3c
missing value test data
MatthewMiddlehurst Jul 31, 2024
e1296bd
clusterer tests
MatthewMiddlehurst Jul 31, 2024
f9bdf97
fix
MatthewMiddlehurst Jul 31, 2024
a7817d0
Merge branch 'mm/regression-testing' of https://github.com/aeon-toolk…
MatthewMiddlehurst Jul 31, 2024
e1c8481
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/c…
MatthewMiddlehurst Aug 15, 2024
4ab5a71
dummy random state
MatthewMiddlehurst Aug 15, 2024
47d598e
dummy fix
MatthewMiddlehurst Aug 15, 2024
9c5b1c5
fixes and skips
MatthewMiddlehurst Aug 15, 2024
37215ad
Merge branch 'main' into mm/clustering-testing
MatthewMiddlehurst Sep 4, 2024
b781247
Merge branch 'main' into mm/clustering-testing
TonyBagnall Sep 12, 2024
0d8cb32
Merge branch 'main' into mm/clustering-testing
TonyBagnall Sep 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

28 changes: 19 additions & 9 deletions aeon/clustering/dummy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Implements DummyClusterer to be used as Baseline."""

import numpy as np
from sklearn.utils import check_random_state

from aeon.clustering.base import BaseClusterer

Expand All @@ -22,10 +23,17 @@ class DummyClusterer(BaseClusterer):
- "random": Assign clusters randomly.
- "uniform": Distribute clusters uniformly among samples.
- "single_cluster": Assign all samples to a single cluster.

n_clusters : int, default=3
The number of clusters to generate. This is relevant for "random"
and "uniform" strategies.
random_state : int, np.random.RandomState instance or None, default=None
Determines random number generation for centroid initialization.
Only used when `strategy` is "random".
If `int`, random_state is the seed used by the random number generator;
If `np.random.RandomState` instance,
random_state is the random number generator;
If `None`, the random number generator is the `RandomState` instance used
by `np.random`.

Attributes
----------
Expand All @@ -38,19 +46,19 @@ class DummyClusterer(BaseClusterer):
>>> import numpy as np
>>> X = np.array([[1, 2], [3, 4], [5, 6]])
>>> clusterer = DummyClusterer(strategy="uniform", n_clusters=2)
>>> clusterer._fit(X)
>>> clusterer.fit(X)
DummyClusterer(n_clusters=2, strategy='uniform')
>>> clusterer.labels_
array([0, 1, 0])
>>> clusterer._predict(X)
>>> clusterer.predict(X)
array([0, 1, 0])
"""

def __init__(self, strategy="random", n_clusters=3):
super().__init__()
def __init__(self, strategy="random", n_clusters=3, random_state=None):
self.strategy = strategy
self.n_clusters = n_clusters
self.labels_ = None
self.random_state = random_state

super().__init__(n_clusters=n_clusters)

def _fit(self, X, y=None):
"""
Expand All @@ -72,7 +80,8 @@ def _fit(self, X, y=None):
n_samples = X.shape[0]

if self.strategy == "random":
self.labels_ = np.random.randint(0, self.n_clusters, n_samples)
rng = check_random_state(self.random_state)
self.labels_ = rng.randint(self.n_clusters, size=n_samples)
elif self.strategy == "uniform":
self.labels_ = np.tile(
np.arange(self.n_clusters), n_samples // self.n_clusters + 1
Expand Down Expand Up @@ -103,7 +112,8 @@ def _predict(self, X, y=None) -> np.ndarray:
"""
n_samples = X.shape[0]
if self.strategy == "random":
return np.random.randint(0, self.n_clusters, n_samples)
rng = check_random_state(self.random_state)
return rng.randint(self.n_clusters, size=n_samples)
elif self.strategy == "uniform":
return np.tile(
np.arange(self.n_clusters), n_samples // self.n_clusters + 1
Expand Down
2 changes: 1 addition & 1 deletion aeon/clustering/feature_based/_catch22.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def _predict_proba(self, X) -> np.ndarray:
n_clusters = self.n_clusters
if n_clusters is None:
n_clusters = int(max(preds)) + 1
dists = np.zeros((X.shape[0], n_clusters))
dists = np.zeros((len(X), n_clusters))
for i in range(n_cases):
dists[i, preds[i]] = 1
return dists
Expand Down
43 changes: 0 additions & 43 deletions aeon/clustering/tests/test_all_clusterers.py

This file was deleted.

84 changes: 84 additions & 0 deletions aeon/testing/estimator_checking/_yield_clustering_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Tests for all clusterers."""

from functools import partial

import numpy as np

from aeon.base._base import _clone_estimator
from aeon.clustering.deep_learning import BaseDeepClusterer
from aeon.testing.testing_data import FULL_TEST_DATA_DICT


def _yield_clustering_checks(estimator_class, estimator_instances, datatypes):
    """Yield all clustering checks for an aeon clusterer."""
    # checks that need only the estimator class
    for class_check in (
        check_clusterer_tags_consistent,
        check_clusterer_does_not_override_final_methods,
    ):
        yield partial(class_check, estimator_class=estimator_class)

    # checks that operate on concrete instances
    for idx, inst in enumerate(estimator_instances):
        # deep clusterers get an extra seeding-determinism check;
        # the data type used is irrelevant, so take the first one
        if isinstance(inst, BaseDeepClusterer):
            yield partial(
                check_clustering_random_state_deep_learning,
                estimator=inst,
                datatype=datatypes[idx][0],
            )


def check_clusterer_tags_consistent(estimator_class):
    """Test all estimators capability tags reflect their capabilities."""
    # containers able to hold unequal-length collections
    unequal_capable_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"}

    # if the class claims unequal-length support, at least one of its
    # X_inner_type containers must be able to store unequal-length data
    if estimator_class.get_class_tag("capability:unequal_length"):
        inner = estimator_class.get_class_tag("X_inner_type")
        if isinstance(inner, str):
            assert inner in unequal_capable_types
        else:  # list of type strings
            assert any(t in unequal_capable_types for t in inner)

    # if the class claims multivariate support, it must actually be able to
    # fit/predict on multivariate input
    if estimator_class.get_class_tag("capability:multivariate"):
        X = np.random.random((10, 2, 10))
        instance = estimator_class.create_test_instance(parameter_set="default")
        instance.fit(X)
        instance.predict(X)
        instance.predict_proba(X)


def check_clusterer_does_not_override_final_methods(estimator_class):
    """Test does not override final methods."""
    # fit/predict are final on BaseClusterer; subclasses implement _fit/_predict
    for final_method in ("fit", "predict"):
        assert final_method not in estimator_class.__dict__


def check_clustering_random_state_deep_learning(estimator, datatype):
    """Test Deep Clusterer seeding."""
    seed = 42
    train_X = FULL_TEST_DATA_DICT[datatype]["train"][0]

    # fit two clones with the same seed; their learned weights must match
    first = _clone_estimator(estimator, random_state=seed)
    first.fit(train_X)
    second = _clone_estimator(estimator, random_state=seed)
    second.fit(train_X)

    # skip the input layer, compare the trainable layers pairwise
    first_layers = first.training_model_.layers[1:]
    second_layers = second.training_model_.layers[1:]
    assert len(first_layers) == len(second_layers)

    for layer_a, layer_b in zip(first_layers, second_layers):
        weights_a = layer_a.get_weights()
        weights_b = layer_b.get_weights()
        assert len(weights_a) == len(weights_b)

        for w_a, w_b in zip(weights_a, weights_b):
            np.testing.assert_almost_equal(np.asarray(w_a), np.asarray(w_b), 4)
9 changes: 9 additions & 0 deletions aeon/testing/estimator_checking/_yield_estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@
from aeon.base._base import _clone_estimator
from aeon.classification import BaseClassifier
from aeon.classification.deep_learning.base import BaseDeepClassifier
from aeon.clustering import BaseClusterer
from aeon.clustering.deep_learning.base import BaseDeepClusterer
from aeon.regression import BaseRegressor
from aeon.regression.deep_learning.base import BaseDeepRegressor
from aeon.testing.estimator_checking._yield_classification_checks import (
_yield_classification_checks,
)
from aeon.testing.estimator_checking._yield_clustering_checks import (
_yield_clustering_checks,
)
from aeon.testing.estimator_checking._yield_regression_checks import (
_yield_regression_checks,
)
Expand Down Expand Up @@ -88,6 +92,11 @@ def _yield_all_aeon_checks(
estimator_class, estimator_instances, datatypes
)

if issubclass(estimator_class, BaseClusterer):
yield from _yield_clustering_checks(
estimator_class, estimator_instances, datatypes
)


def _yield_estimator_checks(estimator_class, estimator_instances, datatypes):
"""Yield all general checks for an aeon estimator."""
Expand Down
2 changes: 1 addition & 1 deletion aeon/testing/test_all_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def _all_estimators(self):
estimator_types=getattr(self, "estimator_type_filter", None),
return_names=False,
exclude_estimators=EXCLUDE_ESTIMATORS,
exclude_estimator_types=["classifier", "regressor"],
exclude_estimator_types=["classifier", "regressor", "clusterer"],
)

# subsample estimators by OS & python version
Expand Down
2 changes: 2 additions & 0 deletions aeon/testing/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@
# needs investigation
"SASTClassifier": ["check_fit_deterministic"],
"RSASTClassifier": ["check_fit_deterministic"],
"AEFCNClusterer": ["check_fit_updates_state"],
"AEResNetClusterer": ["check_fit_updates_state"],
}

# We use estimator tags in addition to class hierarchies to further distinguish
Expand Down
2 changes: 1 addition & 1 deletion aeon/testing/tests/test_all_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from aeon.utils.sampling import random_partition

ALL_ESTIMATORS = all_estimators(
estimator_types=["classifier", "regressor"],
estimator_types=["classifier", "regressor", "clusterer"],
return_names=False,
)

Expand Down