From 8b34cbc39bbc0c80d21512dca21202688ef32aa3 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 1 Aug 2024 13:33:49 +0100 Subject: [PATCH 1/3] comments and lower bounding --- aeon/distances/_dft_sfa_mindist.py | 2 +- aeon/distances/_sfa_mindist.py | 2 +- aeon/distances/tests/test_symbolic_mindist.py | 4 +- .../collection/dictionary_based/_sfa.py | 86 ++++++++--------- .../collection/dictionary_based/_sfa_fast.py | 95 +++++++++---------- 5 files changed, 85 insertions(+), 104 deletions(-) diff --git a/aeon/distances/_dft_sfa_mindist.py b/aeon/distances/_dft_sfa_mindist.py index 1e6fe51d17..1502564532 100644 --- a/aeon/distances/_dft_sfa_mindist.py +++ b/aeon/distances/_dft_sfa_mindist.py @@ -49,7 +49,7 @@ def dft_sfa_mindist( ... alphabet_size=8, ... window_size=x.shape[-1], ... norm=True, - ... lower_bounding_distances=True # This must be set! + ... lower_bounding=True # This must be set! ... ) >>> transform.fit(x) SFAFast(...) diff --git a/aeon/distances/_sfa_mindist.py b/aeon/distances/_sfa_mindist.py index d7f4cfe45d..0649f8cf9b 100644 --- a/aeon/distances/_sfa_mindist.py +++ b/aeon/distances/_sfa_mindist.py @@ -47,7 +47,7 @@ def sfa_mindist(x: np.ndarray, y: np.ndarray, breakpoints: np.ndarray) -> float: ... alphabet_size=8, ... window_size=x.shape[-1], ... norm=True, - ... lower_bounding_distances=True # This must be set! + ... lower_bounding=True # This must be set! ... ) >>> transform.fit(x) SFAFast(...) diff --git a/aeon/distances/tests/test_symbolic_mindist.py b/aeon/distances/tests/test_symbolic_mindist.py index 6aa518db4e..d18f5d18a0 100644 --- a/aeon/distances/tests/test_symbolic_mindist.py +++ b/aeon/distances/tests/test_symbolic_mindist.py @@ -70,7 +70,7 @@ def test_sfa_mindist(): binning_method=histogram_type, norm=True, variance=True, - lower_bounding_distances=True, # This must be set! + lower_bounding=True, # This must be set! 
) sfa_old = SFA( @@ -79,7 +79,7 @@ def test_sfa_mindist(): window_size=X_train.shape[-1], binning_method=histogram_type, norm=True, - lower_bounding_distances=True, # This must be set! + lower_bounding=True, # This must be set! ) transforms = [sfa_old, sfa_fast] diff --git a/aeon/transformations/collection/dictionary_based/_sfa.py b/aeon/transformations/collection/dictionary_based/_sfa.py index df3f51604a..b623f75adc 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa.py +++ b/aeon/transformations/collection/dictionary_based/_sfa.py @@ -3,7 +3,7 @@ Configurable SFA transform for discretising time series into words. """ -__maintainer__ = [] +__maintainer__ = ["patrickzib", "MatthewMiddlehurst"] __all__ = ["SFA"] import math @@ -21,13 +21,14 @@ from aeon.transformations.collection import BaseCollectionTransformer -# The binning methods to use: equi-depth, equi-width, information gain or kmeans +# The binning methods to use binning_methods = { "equi-depth", "equi-width", "information-gain", "information-gain-mae", "kmeans", + "quantile", } @@ -40,47 +41,42 @@ class SFA(BaseCollectionTransformer): shorten the series with DFT discretise the shortened series into bins set by MFC form a word from these discrete values - by default SFA produces a single word per series (window_size=0) - if a window is used, it forms a histogram of counts of words. + SFA returns a dictionary of word counts for each series + + This is a slower but more flexible version of the SFA transform, which can store + greater than 64 bit words. This is at the cost of efficiency, however. Parameters ---------- - word_length: int, default = 8 - length of word to shorten window to (using DFT) - - alphabet_size: int, default = 4 - number of values to discretise each value to - - window_size: int, default = 12 - size of window for sliding. 
Input series - length for whole series transform - - norm: boolean, default = False - mean normalise words by dropping first fourier coefficient - - binning_method: {"equi-depth", "equi-width", "information-gain", - "information-gain-mae", "kmeans"}, default="equi-depth" - the binning method used to derive the breakpoints. - - anova: boolean, default = False - If True, the Fourier coefficient selection is done via a one-way - ANOVA test. If False, the first Fourier coefficients are selected. - Only applicable if labels are given - - bigrams: boolean, default = False - whether to create bigrams of SFA words - - skip_grams: boolean, default = False - whether to create skip-grams of SFA words - - remove_repeat_words: boolean, default = False - whether to use numerosity reduction (default False) + word_length : int, default=8 + Length of word to shorten window to (using DFT). + alphabet_size : int, default=4 + Number of values to discretise each value to. + window_size : int, default=12 + Size of window for sliding. Input series length for whole series transform. + norm : boolean, default=False + Mean normalise words by dropping first Fourier coefficient. + binning_method : str, default="equi-depth" + The binning method used to derive the breakpoints. One of {"equi-depth", + "equi-width", "information-gain", "information-gain-mae", "kmeans", + "quantile"}. + anova : boolean, default=False + If True, the Fourier coefficient selection is done via a one-way ANOVA test. + If False, the first Fourier coefficients are selected. Only applicable if + labels are given. + bigrams : boolean, default=False + Whether to create bigrams of SFA words. + skip_grams : boolean, default=False + Whether to create skip-grams of SFA words. + levels : int, default=1 + Number of spatial pyramid levels. + remove_repeat_words : boolean, default=False + Whether to use numerosity reduction. 
lower_bounding_distances : boolean, default = None If set to True, the FFT is normed to allow for ED lower bounding. - levels: int, default = 1 - Number of spatial pyramid levels + save_words: boolean, default = False whether to save the words generated for each series (default False) @@ -123,10 +119,9 @@ def __init__( anova=False, bigrams=False, skip_grams=False, - remove_repeat_words=False, levels=1, - lower_bounding=True, - lower_bounding_distances=None, + remove_repeat_words=False, + lower_bounding=False, save_words=False, keep_binning_dft=False, use_fallback_dft=False, @@ -151,13 +146,8 @@ def __init__( self.norm = norm self.lower_bounding = lower_bounding - self.lower_bounding_distances = lower_bounding_distances - self.inverse_sqrt_win_size = ( - 1.0 / math.sqrt(window_size) - if (not lower_bounding or lower_bounding_distances) - else 1.0 - ) + self.inverse_sqrt_win_size = 1.0 / math.sqrt(window_size) self.remove_repeat_words = remove_repeat_words @@ -626,7 +616,7 @@ def _binning_dft(self, series, num_windows_per_inst): self.dft_length, self.norm, self.inverse_sqrt_win_size, - self.lower_bounding or self.lower_bounding_distances, + self.lower_bounding, ) if self._use_fallback_dft else self._fast_fourier_transform(row) @@ -664,7 +654,7 @@ def _fast_fourier_transform(self, series): dft = np.empty((length,), dtype=reals.dtype) dft[0::2] = reals[: np.uint32(length / 2)] dft[1::2] = imags[: np.uint32(length / 2)] - if self.lower_bounding or self.lower_bounding_distances: + if self.lower_bounding: dft[1::2] = dft[1::2] * -1 # lower bounding dft *= self.inverse_sqrt_win_size / std return dft[start:] @@ -741,7 +731,7 @@ def _mft(self, series): self.dft_length, self.norm, self.inverse_sqrt_win_size, - self.lower_bounding or self.lower_bounding_distances, + self.lower_bounding, apply_normalising_factor=False, cut_start_if_norm=False, ) diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py 
b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index 044217158e..cf8bbcc41e 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -1,25 +1,17 @@ """Symbolic Fourier Approximation (SFA) Transformer. -Configurable SFA transform for discretising time series into words. - +Efficient but rigid SFA transform for discretising time series into words. """ -__maintainer__ = [] +__maintainer__ = ["patrickzib", "MatthewMiddlehurst"] __all__ = ["SFAFast"] import math import sys -from warnings import simplefilter import numpy as np import pandas as pd -from numba import ( - NumbaPendingDeprecationWarning, - NumbaTypeSafetyWarning, - njit, - objmode, - prange, -) +from numba import njit, objmode, prange from numba.core import types from numba.typed import Dict from scipy.sparse import csr_matrix @@ -30,7 +22,7 @@ from aeon.transformations.collection import BaseCollectionTransformer -# The binning methods to use: equi-depth, equi-width, information gain or kmeans +# The binning methods to use binning_methods = { "equi-depth", "equi-width", @@ -40,9 +32,6 @@ "quantile", } -simplefilter(action="ignore", category=NumbaPendingDeprecationWarning) -simplefilter(action="ignore", category=NumbaTypeSafetyWarning) - class SFAFast(BaseCollectionTransformer): """Symbolic Fourier Approximation (SFA) Transformer. @@ -53,37 +42,44 @@ class SFAFast(BaseCollectionTransformer): shorten the series with DFT discretise the shortened series into bins set by MFC form a word from these discrete values - by default SFA produces a single word per series (window_size=0) - if a window is used, it forms a histogram of counts of words. + SFA returns an array of word counts for each series consisting of a column for + each word found in fit. 
+ + This is a faster but more rigid version of the SFA transform, which can only use + up to 64 bit words and does not store the actual words found in its transformed + array. Parameters ---------- - word_length : int, default = 8 + word_length : int, default=8 Length of word to shorten window to (using DFT). - alphabet_size : int, default = 4 + alphabet_size : int, default=4 Number of values to discretise each value to. - window_size : int, default = 12 + window_size : int, default=12 Size of window for sliding. Input series length for whole series transform. - norm : boolean, default = False + norm : boolean, default=False Mean normalise words by dropping first fourier coefficient. binning_method : str, default="equi-depth" The binning method used to derive the breakpoints. One of {"equi-depth", - "equi-width", "information-gain", "information-gain-mae", "kmeans"}, - anova : boolean, default = False + "equi-width", "information-gain", "information-gain-mae", "kmeans", + "quantile"}. + anova : boolean, default=False If True, the Fourier coefficient selection is done via a one-way ANOVA test. If False, the first Fourier coefficients are selected. Only applicable if labels are given. - variance : boolean, default = False + variance : boolean, default=False If True, the Fourier coefficient selection is done via the largest variance. If False, the first Fourier coefficients are selected. Only applicable if labels are given. + bigrams : boolean, default=False + Whether to create bigrams of SFA words. + skip_grams : boolean, default=False + Whether to create skip-grams of SFA words. dilation : int, default = 0 When set to dilation > 1, adds dilation to the sliding window operation. - save_words : boolean, default = False - whether to save the words generated for each series (default False) - bigrams : boolean, default = False - Whether to create bigrams of SFA words. 
- feature_selection : {"chi2", "chi2_top_k", "none", "random"}, default: none + remove_repeat_words : boolean, default=False + Whether to use numerosity reduction. + feature_selection : {"chi2", "chi2_top_k", "random", None}, default=None Sets the feature selections strategy to be used. Large amounts of memory may be needed depending on the setting of bigrams (true is more) or alpha (larger is more). @@ -92,19 +88,20 @@ class SFAFast(BaseCollectionTransformer): dropping values based on p-value. 'random' reduces the number to at most 'max_feature_count', by randomly selecting features. - 'none' does not apply any feature selection and yields large bag of words, - p_threshold : int, default=0.05 (disabled by default) + None does not apply any feature selection and yields large bag of words, + p_threshold : float, default=0.05 If feature_selection=chi2 is chosen, feature selection is applied based on the chi-squared test. This is the p-value threshold to use for chi-squared test on bag-of-words (lower means more strict). 1 indicates that the test should not be performed. - max_feature_count : int, default=256 + max_feature_count : int, default=256 If feature_selection=random is chosen, this parameter defines the number of randomly chosen unique words used. - skip_grams : boolean, default = False - Whether to create skip-grams of SFA words. - remove_repeat_words : boolean, default = False - Whether to use numerosity reduction. + + + + save_words : boolean, default = False + whether to save the words generated for each series (default False) lower_bounding_distances : boolean, default = None If set to True, the FFT is normed to allow for ED lower bounding. 
return_sparse : boolean, default=True @@ -150,8 +147,7 @@ def __init__( bigrams=False, skip_grams=False, remove_repeat_words=False, - lower_bounding=True, - lower_bounding_distances=None, + lower_bounding=False, save_words=False, dilation=0, first_difference=False, @@ -174,13 +170,8 @@ def __init__( self.norm = norm self.lower_bounding = lower_bounding - self.lower_bounding_distances = lower_bounding_distances - self.inverse_sqrt_win_size = ( - 1.0 / math.sqrt(window_size) - if (not lower_bounding or lower_bounding_distances) - else 1.0 - ) + self.inverse_sqrt_win_size = 1.0 / math.sqrt(window_size) self.remove_repeat_words = remove_repeat_words @@ -287,7 +278,7 @@ def _fit_transform(self, X, y=None): self.bigrams, self.skip_grams, self.inverse_sqrt_win_size, - self.lower_bounding or self.lower_bounding_distances, + self.lower_bounding, ) if self.remove_repeat_words: @@ -348,12 +339,12 @@ def _transform(self, X, y=None): self.bigrams, self.skip_grams, self.inverse_sqrt_win_size, - self.lower_bounding or self.lower_bounding_distances, + self.lower_bounding, ) - # only save at fit - # if self.save_words: - # self.words = words + # TODO only save at fit? 
+ if self.save_words: + self.words = words # transform: applies the feature selection strategy empty_dict = Dict.empty( @@ -406,7 +397,7 @@ def transform_mft(self, X): self.anova, self.variance, self.inverse_sqrt_win_size, - self.lower_bounding or self.lower_bounding_distances, + self.lower_bounding, ) def transform_to_bag(self, words, word_len, y=None): @@ -512,7 +503,7 @@ def _binning(self, X, y=None): self.dft_length, self.norm, self.inverse_sqrt_win_size, - self.lower_bounding or self.lower_bounding_distances, + self.lower_bounding, ) if y is not None: @@ -703,7 +694,7 @@ def transform_words(self, X): self.anova, self.variance, self.inverse_sqrt_win_size, - self.lower_bounding or self.lower_bounding_distances, + self.lower_bounding, self.word_length, self.alphabet_size, self.breakpoints, From 6323ab3faca7ee579bb6fffd08047ba22f2e7afb Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 1 Aug 2024 20:27:31 +0100 Subject: [PATCH 2/3] SFA lower bounding --- .../collection/dictionary_based/_sfa.py | 2 +- .../collection/dictionary_based/_sfa_fast.py | 2 +- .../dictionary_based/tests/test_sfa.py | 18 +++++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_sfa.py b/aeon/transformations/collection/dictionary_based/_sfa.py index b623f75adc..cee55e65b9 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa.py +++ b/aeon/transformations/collection/dictionary_based/_sfa.py @@ -121,7 +121,7 @@ def __init__( skip_grams=False, levels=1, remove_repeat_words=False, - lower_bounding=False, + lower_bounding=True, save_words=False, keep_binning_dft=False, use_fallback_dft=False, diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index cf8bbcc41e..c1a70db1d4 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ 
-147,7 +147,7 @@ def __init__( bigrams=False, skip_grams=False, remove_repeat_words=False, - lower_bounding=False, + lower_bounding=True, save_words=False, dilation=0, first_difference=False, diff --git a/aeon/transformations/collection/dictionary_based/tests/test_sfa.py b/aeon/transformations/collection/dictionary_based/tests/test_sfa.py index 7304f38ee8..df0cd7f2a6 100644 --- a/aeon/transformations/collection/dictionary_based/tests/test_sfa.py +++ b/aeon/transformations/collection/dictionary_based/tests/test_sfa.py @@ -35,7 +35,8 @@ def test_transformer(binning_method): @pytest.mark.parametrize("use_fallback_dft", [True, False]) @pytest.mark.parametrize("norm", [True, False]) -def test_dft_mft(use_fallback_dft, norm): +@pytest.mark.parametrize("lower_bounding", [True, False]) +def test_dft_mft(use_fallback_dft, norm, lower_bounding): """Test the DFT and MFT of the SFA transformer.""" # load training data X = np.random.rand(10, 1, 150) @@ -54,10 +55,13 @@ def test_dft_mft(use_fallback_dft, norm): window_size=window_size, norm=norm, use_fallback_dft=use_fallback_dft, + lower_bounding=lower_bounding, ).fit(X, y) if use_fallback_dft: - dft = p._discrete_fourier_transform(X_tab[0], word_length, norm, 1, True) + dft = p._discrete_fourier_transform( + X_tab[0], word_length, norm, p.inverse_sqrt_win_size, lower_bounding + ) else: dft = p._fast_fourier_transform(X_tab[0]) @@ -80,7 +84,11 @@ def test_dft_mft(use_fallback_dft, norm): for i in range(len(X_tab[0]) - window_size + 1): if use_fallback_dft: dft = p._discrete_fourier_transform( - X_tab[0, i : window_size + i], word_length, norm, 1, True + X_tab[0, i : window_size + i], + word_length, + norm, + p.inverse_sqrt_win_size, + lower_bounding, ) else: dft = p._fast_fourier_transform(X_tab[0, i : window_size + i]) @@ -91,9 +99,6 @@ def test_dft_mft(use_fallback_dft, norm): assert len(mft[0]) == word_length -test_dft_mft(True, True) - - @pytest.mark.parametrize("binning_method", ["equi-depth", "information-gain"]) def 
test_sfa_anova(binning_method): """Test the SFA transformer with ANOVA one-sided test.""" @@ -131,7 +136,6 @@ def test_sfa_anova(binning_method): _ = p2.transform(X, y) -# @pytest.mark.parametrize("word_length", [6, 7]) @pytest.mark.parametrize("alphabet_size", [4, 5]) @pytest.mark.parametrize("window_size", [5, 6]) From 6f47dcb9e630b80b08dc70d2784f56059a9c7b80 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 1 Aug 2024 20:49:04 +0100 Subject: [PATCH 3/3] SFA lower bounding --- .../collection/dictionary_based/tests/test_sfa.py | 1 + 1 file changed, 1 insertion(+) diff --git a/aeon/transformations/collection/dictionary_based/tests/test_sfa.py b/aeon/transformations/collection/dictionary_based/tests/test_sfa.py index df0cd7f2a6..a7eb4e07ad 100644 --- a/aeon/transformations/collection/dictionary_based/tests/test_sfa.py +++ b/aeon/transformations/collection/dictionary_based/tests/test_sfa.py @@ -78,6 +78,7 @@ def test_dft_mft(use_fallback_dft, norm, lower_bounding): window_size=window_size, norm=norm, use_fallback_dft=use_fallback_dft, + lower_bounding=lower_bounding, ).fit(X, y) mft = p._mft(X_tab[0])