Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Shapelet visualization tools #1715

Merged
merged 16 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ aeon/transformations/theta.py @GuzalBulatova

aeon/utils/numba/ @baraline @MatthewMiddlehurst

aeon/visualisation/ @baraline

.github/ @aeon-toolkit/aeon-infrastructure-workgroup
build_tools/ @aeon-toolkit/aeon-infrastructure-workgroup

Expand Down
7 changes: 6 additions & 1 deletion aeon/classification/shapelet_based/_rdst.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ class RDSTClassifier(BaseClassifier):
If True, restrict the value of the shapelet dilation parameter to be prime
values. This can greatly speed-up the algorithm for long time series and/or
short shapelet length, possibly at the cost of some accuracy.
distance : str, default="manhattan"
Name of the distance function to be used. By default this is the
manhattan distance. Other distances from the aeon distance modules can be used.
estimator : BaseEstimator or None, default=None
Base estimator for the ensemble, can be supplied a sklearn `BaseEstimator`. If
`None` a default `RidgeClassifierCV` classifier is used with standard scaling.
Expand Down Expand Up @@ -134,6 +137,7 @@ def __init__(
use_prime_dilations: bool = False,
estimator=None,
save_transformed_data: bool = False,
distance: str = "manhattan",
n_jobs: int = 1,
random_state: Union[int, Type[np.random.RandomState], None] = None,
) -> None:
Expand All @@ -143,7 +147,7 @@ def __init__(
self.threshold_percentiles = threshold_percentiles
self.alpha_similarity = alpha_similarity
self.use_prime_dilations = use_prime_dilations

self.distance = distance
self.estimator = estimator
self.save_transformed_data = save_transformed_data
self.random_state = random_state
Expand Down Expand Up @@ -184,6 +188,7 @@ def _fit(self, X, y):
use_prime_dilations=self.use_prime_dilations,
n_jobs=self.n_jobs,
random_state=self.random_state,
distance=self.distance,
)
if self.estimator is None:
self._estimator = make_pipeline(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from numba.typed import List
from sklearn.preprocessing import LabelEncoder

from aeon.distances import manhattan_distance
from aeon.distances import get_distance_function
from aeon.transformations.collection import BaseCollectionTransformer
from aeon.utils.numba.general import (
AEON_NUMBA_STD_THRESHOLD,
Expand Down Expand Up @@ -83,6 +83,9 @@ class RandomDilatedShapeletTransform(BaseCollectionTransformer):
If True, restrict the value of the shapelet dilation parameter to be prime
values. This can greatly speed up the algorithm for long time series and/or
short shapelet length, possibly at the cost of some accuracy.
distance : str, default="manhattan"
Name of the distance function to be used. By default this is the
manhattan distance. Other distances from the aeon distance modules can be used.
n_jobs : int, default=1
The number of threads used for both `fit` and `transform`.
random_state : int or None, default=None
Expand Down Expand Up @@ -153,6 +156,7 @@ def __init__(
alpha_similarity=0.5,
use_prime_dilations=False,
random_state=None,
distance="manhattan",
n_jobs=1,
):
self.max_shapelets = max_shapelets
Expand All @@ -162,6 +166,7 @@ def __init__(
self.alpha_similarity = alpha_similarity
self.use_prime_dilations = use_prime_dilations
self.random_state = random_state
self.distance = distance
self.n_jobs = n_jobs

super().__init__()
Expand All @@ -183,7 +188,8 @@ def _fit(self, X, y=None):
self : RandomDilatedShapeletTransform
This estimator.
"""
# Numba does not yet support new random numpy API with generator
self.distance_func = get_distance_function(self.distance)

if isinstance(self.random_state, int):
self._random_state = np.int32(self.random_state)
else:
Expand Down Expand Up @@ -218,6 +224,7 @@ def _fit(self, X, y=None):
self.alpha_similarity,
self.use_prime_dilations,
self._random_state,
self.distance_func,
)
if len(self.shapelets_[0]) == 0:
raise RuntimeError(
Expand Down Expand Up @@ -259,7 +266,11 @@ def _transform(self, X, y=None):
"calling transform."
)

X_new = dilated_shapelet_transform(X, self.shapelets_)
X_new = dilated_shapelet_transform(
X,
self.shapelets_,
self.distance_func,
)
if np.isinf(X_new).any() or np.isnan(X_new).any():
warnings.warn(
"Some invalid values (inf or nan) where converted from to 0 during the"
Expand Down Expand Up @@ -482,6 +493,7 @@ def random_dilated_shapelet_extraction(
alpha_similarity,
use_prime_dilations,
seed,
distance,
):
"""Randomly generate a set of shapelets given the input parameters.

Expand Down Expand Up @@ -518,6 +530,10 @@ def random_dilated_shapelet_extraction(
short shapelet length, possibly at the cost of some accuracy.
seed : int
Seed for random number generation.
distance : CPUDispatcher
A Numba function used to compute the distance between two multidimensional
time series of shape (n_channels, length). Used as the distance function between
shapelets and candidate subsequences.

Returns
-------
Expand Down Expand Up @@ -641,7 +657,7 @@ def random_dilated_shapelet_extraction(
X[id_test], length, dilation
)
X_subs = normalize_subsequences(X_subs, X_means, X_stds)
x_dist = compute_shapelet_dist_vector(X_subs, _val, length)
x_dist = compute_shapelet_dist_vector(X_subs, _val, length, distance)

lower_bound = np.percentile(x_dist, threshold_percentiles[0])
upper_bound = np.percentile(x_dist, threshold_percentiles[1])
Expand Down Expand Up @@ -669,7 +685,7 @@ def random_dilated_shapelet_extraction(


@njit(fastmath=True, cache=True, parallel=True)
def dilated_shapelet_transform(X, shapelets):
def dilated_shapelet_transform(X, shapelets, distance):
"""Perform the shapelet transform with a set of shapelets and a set of time series.

Parameters
Expand All @@ -692,6 +708,10 @@ def dilated_shapelet_transform(X, shapelets):
Means of the shapelets
- stds : array, shape (n_shapelets, n_channels)
Standard deviation of the shapelets
distance : CPUDispatcher
A Numba function used to compute the distance between two multidimensional
time series of shape (n_channels, length).


Returns
-------
Expand Down Expand Up @@ -728,7 +748,7 @@ def dilated_shapelet_transform(X, shapelets):
for i_shp in idx_no_norm:
X_new[i_x, (n_ft * i_shp) : (n_ft * i_shp + n_ft)] = (
compute_shapelet_features(
X_subs, values[i_shp], length, threshold[i_shp]
X_subs, values[i_shp], length, threshold[i_shp], distance
)
)

Expand All @@ -739,7 +759,7 @@ def dilated_shapelet_transform(X, shapelets):
for i_shp in idx_norm:
X_new[i_x, (n_ft * i_shp) : (n_ft * i_shp + n_ft)] = (
compute_shapelet_features(
X_subs, values[i_shp], length, threshold[i_shp]
X_subs, values[i_shp], length, threshold[i_shp], distance
)
)
return X_new
Expand Down Expand Up @@ -808,7 +828,7 @@ def get_all_subsequences(X, length, dilation):


@njit(fastmath=True, cache=True)
def compute_shapelet_features(X_subs, values, length, threshold):
def compute_shapelet_features(X_subs, values, length, threshold, distance):
"""Extract the features from a shapelet distance vector.

Given a shapelet and a time series, extract three features from the resulting
Expand All @@ -826,10 +846,11 @@ def compute_shapelet_features(X_subs, values, length, threshold):
The value array of the shapelet
length : int
Length of the shapelet
values : array, shape (n_channels, length)
The resulting subsequence
threshold : float
The threshold parameter of the shapelet
distance : CPUDispatcher
A Numba function used to compute the distance between two multidimensional
time series of shape (n_channels, length).

Returns
-------
Expand All @@ -843,7 +864,7 @@ def compute_shapelet_features(X_subs, values, length, threshold):
n_subsequences = X_subs.shape[0]

for i_sub in prange(n_subsequences):
_dist = manhattan_distance(X_subs[i_sub], values[:, :length])
_dist = distance(X_subs[i_sub], values[:, :length])
if _dist < _min:
_min = _dist
_argmin = i_sub
Expand All @@ -854,7 +875,7 @@ def compute_shapelet_features(X_subs, values, length, threshold):


@njit(fastmath=True, cache=True)
def compute_shapelet_dist_vector(X_subs, values, length):
def compute_shapelet_dist_vector(X_subs, values, length, distance):
"""Extract the features from a shapelet distance vector.

Given a shapelet and a time series, extract three features from the resulting
Expand All @@ -872,20 +893,17 @@ def compute_shapelet_dist_vector(X_subs, values, length):
The value array of the shapelet
length : int
Length of the shapelet
dilation : int
Dilation of the shapelet
values : array, shape (n_channels, length)
The resulting subsequence
threshold : float
The threshold parameter of the shapelet
distance : CPUDispatcher
A Numba function used to compute the distance between two multidimensional
time series of shape (n_channels, length).

Returns
-------
min, argmin, shapelet occurence
The three computed features as float dtypes
dist_vector : array, shape = (n_timestamps-(length-1)*dilation)
The distance vector between the shapelets and candidate subsequences
"""
n_subsequences = X_subs.shape[0]
dist_vector = np.zeros(n_subsequences)
for i_sub in prange(n_subsequences):
dist_vector[i_sub] = manhattan_distance(X_subs[i_sub], values[:, :length])
dist_vector[i_sub] = distance(X_subs[i_sub], values[:, :length])
return dist_vector
3 changes: 1 addition & 2 deletions aeon/transformations/collection/shapelet_based/_rsast.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,6 @@ def _fit(self, X, y):
# 2--calculate PACF and ACF for each TS chosen in each class

for i, c in enumerate(classes):

X_c = X_[y == c]

cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int)
Expand Down Expand Up @@ -313,7 +312,7 @@ def _transform(self, X, y=None):

Returns
-------
X_transformed: np.ndarray shape (n_cases, n_timepoints),
X_transformed: np.ndarray shape (n_cases, n_kernels),
The transformed data
"""
X_ = np.reshape(X, (X.shape[0], X.shape[-1]))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,9 @@ def test_compute_shapelet_features(dtype):
dilation = 1
threshold = 0.01
X_subs = get_all_subsequences(X, length, dilation)
_min, _argmin, SO = compute_shapelet_features(X_subs, values, length, threshold)
_min, _argmin, SO = compute_shapelet_features(
X_subs, values, length, threshold, manhattan_distance
)

# On some occasions, float32 precision with fastmath returns things like
# 2.1835059227370834e-07 instead of 0
Expand All @@ -155,7 +157,9 @@ def test_compute_shapelet_features(dtype):
dilation = 2
threshold = 0.1
X_subs = get_all_subsequences(X, length, dilation)
_min, _argmin, SO = compute_shapelet_features(X_subs, values, length, threshold)
_min, _argmin, SO = compute_shapelet_features(
X_subs, values, length, threshold, manhattan_distance
)

assert_almost_equal(_min, 0.0, decimal=4)
assert _argmin == 7.0
Expand All @@ -164,7 +168,9 @@ def test_compute_shapelet_features(dtype):
dilation = 4
threshold = 2
X_subs = get_all_subsequences(X, length, dilation)
_min, _argmin, SO = compute_shapelet_features(X_subs, values, length, threshold)
_min, _argmin, SO = compute_shapelet_features(
X_subs, values, length, threshold, manhattan_distance
)

assert_almost_equal(_min, 0.0, decimal=4)
assert _argmin == 3.0
Expand All @@ -179,7 +185,9 @@ def test_compute_shapelet_dist_vector(dtype):
for dilation in [1, 3, 5]:
values = np.random.rand(3, length).astype(dtype)
X_subs = get_all_subsequences(X, length, dilation)
d_vect = compute_shapelet_dist_vector(X_subs, values, length)
d_vect = compute_shapelet_dist_vector(
X_subs, values, length, manhattan_distance
)
true_vect = np.zeros(X.shape[1] - (length - 1) * dilation)
for i_sub in range(true_vect.shape[0]):
_idx = [i_sub + j * dilation for j in range(length)]
Expand Down
8 changes: 8 additions & 0 deletions aeon/visualisation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,18 @@
"plot_series_with_profiles",
"plot_cluster_algorithm",
"plot_temporal_importance_curves",
"ShapeletVisualizer",
"ShapeletTransformerVisualizer",
"ShapeletClassifierVisualizer",
]

from aeon.visualisation.estimator._clasp import plot_series_with_profiles
from aeon.visualisation.estimator._clustering import plot_cluster_algorithm
from aeon.visualisation.estimator._shapelets import (
ShapeletClassifierVisualizer,
ShapeletTransformerVisualizer,
ShapeletVisualizer,
)
from aeon.visualisation.estimator._temporal_importance_curves import (
plot_temporal_importance_curves,
)
Expand Down
Loading