diff --git a/docs/examples.md b/docs/examples.md index 7b4b269b2f..6c6f418fff 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -183,6 +183,17 @@ Overview of Transformations ::: +:::{grid-item-card} +:img-top: examples/transformations/img/preprocessing.png +:class-img-top: aeon-card-image-m +:link: /examples/transformations/preprocessing.ipynb +:link-type: ref +:text-align: center + +Preprocessing time series + +::: + :::{grid-item-card} :img-top: examples/transformations/img/tsfresh.png :class-img-top: aeon-card-image-m @@ -238,17 +249,6 @@ SAST transform ::: -:::{grid-item-card} -:img-top: examples/transformations/img/interpolation.png -:class-img-top: aeon-card-image-m -:link: /examples/transformations/interpolation.ipynb -:link-type: ref -:text-align: center - -Interpolation - -::: - :::{grid-item-card} :img-top: examples/transformations/img/signature.png :class-img-top: aeon-card-image-m @@ -260,17 +260,6 @@ Signature method ::: -:::{grid-item-card} -:img-top: examples/transformations/img/theta.png -:class-img-top: aeon-card-image-m -:link: /examples/transformations/theta_transform.ipynb -:link-type: ref -:text-align: center - -Theta transform - -::: - :::: ## Segmentation @@ -343,7 +332,7 @@ Using aeon distances with scikit-learn :::: -## Similarity search +## Similarity Search ::::{grid} 2 3 4 4 :gutter: 1 diff --git a/examples/transformations/img/preprocessing.png b/examples/transformations/img/preprocessing.png new file mode 100644 index 0000000000..db183dde2e Binary files /dev/null and b/examples/transformations/img/preprocessing.png differ diff --git a/examples/transformations/preprocessing.ipynb b/examples/transformations/preprocessing.ipynb index 6b82b3d226..9d3bbab247 100644 --- a/examples/transformations/preprocessing.ipynb +++ b/examples/transformations/preprocessing.ipynb @@ -6,11 +6,12 @@ "# Preprocessing time series with aeon\n", "\n", "It is common to need to preprocess time series data before applying machine learning\n", - "algorithms. 
Transformers in `aeon` can be used to preprocess collections of time\n", - "series. This notebook demonstrates three common use cases\n", + "algorithms. Some algorithms can handle characteristics such as unequal length series or missing values internally; otherwise, `aeon` transformers can be used to preprocess collections of time\n", + "series into a standard format. This notebook demonstrates three common use cases:\n", "\n", - "1. Rescaling time series\n", - "2. Resizing time series\n" + "1. [Rescaling time series](#Rescaling-time-series)\n", + "2. [Resizing time series](#Resizing-time-series)\n", + "3. [Dealing with missing values](#Missing-Values)\n" ], "metadata": { "collapsed": false @@ -34,17 +35,6 @@ }, { "cell_type": "code", - "execution_count": 2, - "outputs": [ - { - "data": { - "text/plain": "array([[561.875 ],\n [604.95833333],\n [629.16666667],\n [801.45833333],\n [540.75 ]])" - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "import numpy as np\n", "\n", @@ -54,28 +44,59 @@ "np.mean(X, axis=-1)[0:5]" ], "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 3, + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-11-12T16:32:56.353179Z", + "start_time": "2024-11-12T16:32:56.335570Z" + } + }, "outputs": [ { "data": { - "text/plain": "array([[428.95224215],\n [483.35481095],\n [514.90052977],\n [629.00847763],\n [389.10059218]])" + "text/plain": [ + "array([[561.875 ],\n", + " [604.95833333],\n", + " [629.16666667],\n", + " [801.45833333],\n", + " [540.75 ]])" + ] }, - "execution_count": 3, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], + "execution_count": 31 + }, + { + "cell_type": "code", "source": [ "np.std(X, axis=-1)[0:5]" ], "metadata": { - "collapsed": false - } + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-11-12T16:32:56.916707Z", + "start_time": "2024-11-12T16:32:56.909707Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[428.95224215],\n", + " 
[483.35481095],\n", + " [514.90052977],\n", + " [629.00847763],\n", + " [389.10059218]])" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 32 }, { "cell_type": "markdown", @@ -90,17 +111,6 @@ }, { "cell_type": "code", - "execution_count": 4, - "outputs": [ - { - "data": { - "text/plain": "array([[ 0.],\n [-0.],\n [ 0.],\n [-0.],\n [-0.]])" - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "from aeon.transformations.collection import Normalizer\n", "\n", @@ -109,28 +119,59 @@ "np.round(np.mean(X2, axis=-1)[0:5], 6)" ], "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 5, + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-11-12T16:32:58.247416Z", + "start_time": "2024-11-12T16:32:58.240777Z" + } + }, "outputs": [ { "data": { - "text/plain": "array([[1.],\n [1.],\n [1.],\n [1.],\n [1.]])" + "text/plain": [ + "array([[ 0.],\n", + " [-0.],\n", + " [ 0.],\n", + " [-0.],\n", + " [-0.]])" + ] }, - "execution_count": 5, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], + "execution_count": 33 + }, + { + "cell_type": "code", "source": [ "np.round(np.std(X2, axis=-1)[0:5], 6)" ], "metadata": { - "collapsed": false - } + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-11-12T16:32:59.026173Z", + "start_time": "2024-11-12T16:32:59.006225Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1.],\n", + " [1.],\n", + " [1.],\n", + " [1.],\n", + " [1.]])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 34 }, { "cell_type": "markdown", @@ -143,17 +184,6 @@ }, { "cell_type": "code", - "execution_count": 6, - "outputs": [ - { - "data": { - "text/plain": "array([[ 0.],\n [-0.],\n [ 0.],\n [-0.],\n [ 0.]])" - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], 
"source": [ "from aeon.transformations.collection import Centerer\n", "\n", @@ -162,8 +192,29 @@ "np.round(np.mean(X3, axis=-1)[0:5], 6)" ], "metadata": { - "collapsed": false - } + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-11-12T16:33:00.283438Z", + "start_time": "2024-11-12T16:33:00.263489Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0.],\n", + " [-0.],\n", + " [ 0.],\n", + " [-0.],\n", + " [ 0.]])" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 35 }, { "cell_type": "markdown", @@ -176,17 +227,6 @@ }, { "cell_type": "code", - "execution_count": 7, - "outputs": [ - { - "data": { - "text/plain": "array([[0.],\n [0.],\n [0.],\n [0.],\n [0.]])" - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "from aeon.transformations.collection import MinMaxScaler\n", "\n", @@ -195,28 +235,59 @@ "np.round(np.min(X4, axis=-1)[0:5], 6)" ], "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 8, + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-11-12T16:33:01.726406Z", + "start_time": "2024-11-12T16:33:01.712241Z" + } + }, "outputs": [ { "data": { - "text/plain": "array([[1.],\n [1.],\n [1.],\n [1.],\n [1.]])" + "text/plain": [ + "array([[0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.]])" + ] }, - "execution_count": 8, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], + "execution_count": 36 + }, + { + "cell_type": "code", "source": [ "np.round(np.max(X4, axis=-1)[0:5], 6)" ], "metadata": { - "collapsed": false - } + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-11-12T16:33:02.556739Z", + "start_time": "2024-11-12T16:33:02.536132Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1.],\n", + " [1.],\n", + " [1.],\n", + " [1.],\n", + " [1.]])" + ] + }, + "execution_count": 37, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "execution_count": 37 }, { "cell_type": "markdown", @@ -232,7 +303,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Resizing time series\n", + "## Resizing time series\n", "\n", "Suppose we have a collections of time series with different lengths, i.e. different\n", "number of time points. Currently, most of aeon's collection estimators\n", @@ -243,7 +314,6 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2020-12-19T14:31:58.456171Z", @@ -253,13 +323,19 @@ }, "pycharm": { "is_executing": true + }, + "ExecuteTime": { + "end_time": "2024-11-12T16:33:06.040725Z", + "start_time": "2024-11-12T16:33:06.031583Z" } }, - "outputs": [], "source": [ "from aeon.classification.convolution_based import RocketClassifier\n", - "from aeon.datasets import load_basic_motions, load_plaid" - ] + "from aeon.datasets import load_basic_motions, load_japanese_vowels, load_plaid\n", + "from aeon.utils.validation import has_missing, is_equal_length, is_univariate" + ], + "outputs": [], + "execution_count": 38 }, { "cell_type": "markdown", @@ -277,7 +353,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Unequal or equal length collections time series\n", + "### Unequal or equal length collections time series\n", "\n", "If a collection contains all equal length series, it will store the data in a 3D\n", "numpy of shape `(n_cases, n_channels, n_timepoints)`. 
If it is unequal length, it is\n", @@ -286,7 +362,6 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2020-12-19T14:31:59.194445Z", @@ -296,66 +371,105 @@ }, "pycharm": { "is_executing": true + }, + "ExecuteTime": { + "end_time": "2024-11-12T16:33:11.372396Z", + "start_time": "2024-11-12T16:33:11.334983Z" } }, - "outputs": [], "source": [ "# Equal length multivariate data\n", "bm_X, bm_y = load_basic_motions()\n", - "print(type(bm_X), \"\\n\", bm_X.shape)" - ] + "X = bm_X\n", + "print(f\"{type(X)}, {X.shape}\")\n", + "print(\n", + " f\"univariate = {is_univariate(X)}, has missing ={has_missing(X)}, equal \"\n", + " f\"length = {is_equal_length(X)}\"\n", + ")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ", (80, 6, 100)\n", + "univariate = False, has missing =False, equal length = True\n" + ] + } + ], + "execution_count": 39 }, { "cell_type": "code", - "execution_count": null, - "outputs": [], "source": [ "# Unequal length univariate data\n", "plaid_X, plaid_y = load_plaid()\n", - "print(type(plaid_X), \"\\n\", plaid_X[0].shape, \"\\n\", plaid_X[10].shape)" + "X = plaid_X\n", + "print(type(plaid_X), \"\\n\", plaid_X[0].shape, \"\\n\", plaid_X[10].shape)\n", + "print(\n", + " f\"univariate = {is_univariate(X)}, has missing ={has_missing(X)}, equal \"\n", + " f\"length = {is_equal_length(X)}\"\n", + ")" ], "metadata": { "collapsed": false, "pycharm": { "is_executing": true + }, + "ExecuteTime": { + "end_time": "2024-11-12T16:33:15.995745Z", + "start_time": "2024-11-12T16:33:15.838016Z" } - } - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If time series are unequal length, collection estimators will raise an error if they\n", - "do not have the capability to handle this characteristic.\n" - ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + " (1, 500) \n", + " (1, 300)\n", + "univariate = True, has 
missing =False, equal length = False\n" + ] + } + ], + "execution_count": 40 }, { - "cell_type": "code", - "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2020-12-19T14:32:01.026183Z", - "iopub.status.busy": "2020-12-19T14:32:01.025650Z", - "iopub.status.idle": "2020-12-19T14:32:01.239714Z", - "shell.execute_reply": "2020-12-19T14:32:01.240542Z" - }, - "pycharm": { - "is_executing": true + "ExecuteTime": { + "end_time": "2024-11-12T16:33:49.713555Z", + "start_time": "2024-11-12T16:33:49.675458Z" } }, - "outputs": [], + "cell_type": "code", "source": [ - "rc = RocketClassifier()\n", - "try:\n", - " rc.fit(plaid_X, plaid_y)\n", - "except ValueError as e:\n", - " print(f\"ValueError: {e}\")" - ] + "\n", + "vowels_X, vowels_y = load_japanese_vowels(split=\"train\")\n", + "X = vowels_X\n", + "print(\n", + " f\"univariate = {is_univariate(X)}, has missing ={has_missing(X)}, equal \"\n", + " f\"length = {is_equal_length(X)}\"\n", + ")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "univariate = False, has missing =False, equal length = False\n" + ] + } + ], + "execution_count": 42 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "\n" }, { "cell_type": "code", - "execution_count": null, - "outputs": [], "source": [ "series_lengths = [array.shape[1] for array in plaid_X]\n", "\n", @@ -368,14 +482,142 @@ "collapsed": false, "pycharm": { "is_executing": true + }, + "ExecuteTime": { + "end_time": "2024-11-12T16:33:54.002965Z", + "start_time": "2024-11-12T16:33:53.996513Z" } - } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Min length = 100 max length = 1344\n" + ] + } + ], + "execution_count": 43 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "There are two basic strategies for unequal length problems:\n", + "1. Use an estimator that can internally handle unequal length series\n", + "2. 
Transform the data to be equal length by, for example, truncating or padding series\n", + "\n", + "Estimators with the tag `\"capability:unequal_length\": True` have the capability to\n", + "handle unequal length series. For classification, regression and\n", + "clustering, the\n", + "current list is:" + ] + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-12T16:34:00.658671Z", + "start_time": "2024-11-12T16:34:00.546786Z" + } + }, + "cell_type": "code", + "source": [ + "from aeon.utils.discovery import all_estimators\n", + "\n", + "all_estimators(\n", + " type_filter=[\"classifier\", \"regressor\", \"clusterer\"],\n", + " tag_filter={\"capability:unequal_length\": True},\n", + ")" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[('Catch22Classifier',\n", + " aeon.classification.feature_based._catch22.Catch22Classifier),\n", + " ('Catch22Clusterer', aeon.clustering.feature_based._catch22.Catch22Clusterer),\n", + " ('Catch22Regressor', aeon.regression.feature_based._catch22.Catch22Regressor),\n", + " ('DummyClassifier', aeon.classification.dummy.DummyClassifier),\n", + " ('DummyRegressor', aeon.regression._dummy.DummyRegressor),\n", + " ('ElasticEnsemble',\n", + " aeon.classification.distance_based._elastic_ensemble.ElasticEnsemble),\n", + " ('KNeighborsTimeSeriesClassifier',\n", + " aeon.classification.distance_based._time_series_neighbors.KNeighborsTimeSeriesClassifier),\n", + " ('KNeighborsTimeSeriesRegressor',\n", + " aeon.regression.distance_based._time_series_neighbors.KNeighborsTimeSeriesRegressor),\n", + " ('RDSTClassifier', aeon.classification.shapelet_based._rdst.RDSTClassifier),\n", + " ('RDSTRegressor', aeon.regression.shapelet_based._rdst.RDSTRegressor)]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 44 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "You can pass these estimators unequal length series and they will work as expected.\n" + }, + 
{ + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-12T16:34:03.070381Z", + "start_time": "2024-11-12T16:34:03.042207Z" + } + }, + "cell_type": "code", + "source": [ + "from aeon.classification.distance_based import KNeighborsTimeSeriesClassifier\n", + "\n", + "knn = KNeighborsTimeSeriesClassifier()\n", + "model = knn.fit(plaid_X, plaid_y)" + ], + "outputs": [], + "execution_count": 45 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "If time series are unequal length, collection estimators will raise an error if they\n", + "do not have the capability to handle this characteristic. If you want to use them, \n", + "you will need to preprocess the data to be equal length. " + ] + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-12T16:34:04.886239Z", + "start_time": "2024-11-12T16:34:04.856319Z" + } + }, + "cell_type": "code", + "source": [ + "rc = RocketClassifier()\n", + "try:\n", + " rc.fit(plaid_X, plaid_y)\n", + "except ValueError as e:\n", + " print(f\"ValueError: {e}\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ValueError: Data seen by instance of RocketClassifier has unequal length series, but RocketClassifier cannot handle unequal length series. \n" + ] + } + ], + "execution_count": 46 }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Padding, truncating or resizing.\n", + "### Padding, truncating or resizing.\n", "\n", "We can pad, truncate or resize. 
By default, pad adds zeros to make all series the\n", + "length of the longest, truncate removes all values beyond the length of the shortest\n", @@ -384,7 +626,6 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2020-12-19T14:32:01.245270Z", @@ -394,9 +635,12 @@ }, "pycharm": { "is_executing": true + }, + "ExecuteTime": { + "end_time": "2024-11-12T16:34:07.677683Z", + "start_time": "2024-11-12T16:34:07.554662Z" } }, - "outputs": [], "source": [ "from aeon.transformations.collection import Padder, Resizer, Truncator\n", "\n", @@ -407,7 +651,58 @@ "X3 = truncate.fit_transform(plaid_X)\n", "X4 = resize.fit_transform(plaid_X)\n", "print(X2.shape, \"\\n\", X3.shape, \"\\n\", X4.shape)" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1074, 1, 1344) \n", + " (1074, 1, 100) \n", + " (1074, 1, 600)\n" + ] + } + ], + "execution_count": 47 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-12T16:34:15.198556Z", + "start_time": "2024-11-12T16:34:14.994028Z" + } + }, + "cell_type": "code", + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.title(\"Before and after padding: PLAID first case (shifted up for unpadded)\")\n", + "plt.plot(plaid_X[0][0] + 10)\n", + "plt.plot(X2[0][0])" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 48 }, { "cell_type": "markdown", @@ -442,6 +737,56 @@ "is_executing": true } } + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Missing Values\n", + "\n", + "Missing values are indicated by `NaN` in numpy arrays. You can test whether any `aeon`\n", + " data structure contains missing values using the utility function" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "X = np.random.random(size=(10, 2, 200))\n", + "has_missing(X)" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "X[5][0][55] = np.NAN\n", + "has_missing(X)" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "There are a range of strategies for handling missing values. These include:\n", + "\n", + "1. Use an estimator that internally handles missing values. It is fairly easy for\n", + "some algorithms (such as decision trees) to internally deal with missing values,\n", + "usually by using it as a distinct series value after discretisation.\n", + "2. Removing series with missing: this is often desirable if the train set size is\n", + "large, the number of series with missing is small and the proportion of missing\n", + "values for these series is high.\n", + "3. Interpolating: estimating the missing values from the other series values. This is\n", + " often desirable if the train set size is small and the proportion of missing values\n", + " is low.\n", + "\n", + "Removing series with missing values and interpolation are currently best done by you: there\n", + "are no transformers to deal with this at the moment. 
It is on the wish list.\n" + ] } ], "metadata": { diff --git a/examples/transformations/rocket.ipynb b/examples/transformations/rocket.ipynb index eec06438ca..3e1ed7db1d 100644 --- a/examples/transformations/rocket.ipynb +++ b/examples/transformations/rocket.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Demo of ROCKET transform\n", + "# The ROCKET transform\n", "\n", "## Overview\n", "\n", diff --git a/examples/transformations/transformations.ipynb b/examples/transformations/transformations.ipynb index db78fec3a8..7d88735104 100644 --- a/examples/transformations/transformations.ipynb +++ b/examples/transformations/transformations.ipynb @@ -221,6 +221,7 @@ "A list of all the available transformers can be found in the [API](https://www.aeon-toolkit.org/en/latest/api_reference/transformations.html). We currently have\n", "specific notebooks for the following transformers:\n", "\n", + "- [preprocessing](preprocessing.ipynb)\n", "- [catch22](catch22.ipynb)\n", "- [channel selection](channel_selection.ipynb)\n", "- [mini rocket](mini_rocket.ipynb)\n", diff --git a/examples/utils/preprocessing.ipynb b/examples/utils/preprocessing.ipynb deleted file mode 100644 index 68a434a320..0000000000 --- a/examples/utils/preprocessing.ipynb +++ /dev/null @@ -1,353 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Preprocessing data prior to machine learning\n", - "\n", - "For machine learning, we assume that data is in shape `(n_cases, n_channels,\n", - "n_timepoints)` for equal length series or a python list with `len` of `[n_cases]` if\n", - "the series are unequal length. However, in reality, there are often many steps to get\n", - " your data into this format. 
We introduce some common uses cases that may be handled\n", - " with preprocessing, and give some suggestions about how to handle them.\n" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "from aeon.classification.distance_based import KNeighborsTimeSeriesClassifier\n", - "from aeon.datasets import load_japanese_vowels, load_plaid\n", - "from aeon.utils.discovery import all_estimators\n", - "from aeon.utils.validation import has_missing, is_equal_length, is_univariate" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "is_executing": true - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Missing values\n", - "\n", - "Missing values are indicated by `NaN` in numpy array. You can test whether any `aeon`\n", - " data structure contains missing values using the utility function\n", - "\n" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 26, - "outputs": [ - { - "data": { - "text/plain": "False" - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X = np.random.random(size=(10, 2, 200))\n", - "has_missing(X)" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 27, - "outputs": [ - { - "data": { - "text/plain": "True" - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X[5][0][55] = np.NAN\n", - "has_missing(X)" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "markdown", - "source": [ - "There are a range of strategies for handling missing values. These include:\n", - "\n", - "1. Use an estimator that internally handles missing values. 
It is fairly easy for\n", - "some algorithms (such as decision trees) to internally deal with missing values,\n", - "usually be using it as a distinct series value after discretisation.\n", - "2. Removing series with missing: this is often desirable if the train set size is\n", - "large, the number of series with missing is small and the proportion of missing\n", - "values for these series is high.\n", - "3. Interpolating: estimating the missing values from the other series values. This is\n", - " often desirable if the train set size is small and the proportion of missing values\n", - " is low.\n" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "all_estimators(\n", - " type_filter=[\"classifier\", \"regressor\", \"clusterer\"],\n", - " tag_filter={\"capability:missing_values\": True},\n", - ")" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "is_executing": true - } - } - }, - { - "cell_type": "markdown", - "source": [ - "Removing series with missing and interpolation is currently best done by you: there\n", - "are no transformers to deal with at at the moment. It is on the wish list." - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "markdown", - "source": [ - "## Unequal length series\n", - "\n", - "Learning from unequal length series is very common. 
aeon provided two baked in\n", - "unequal length collections: the univariate PLAID dataset, and the multivariate\n", - "JapaneseVowels dataset:\n" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "plaid_X, plaid_y = load_plaid(split=\"train\")\n", - "print(\n", - " f\"PLAID is univariate = {is_univariate(plaid_X)} has missing =\"\n", - " f\"{has_missing(plaid_X)} is equal length = {is_equal_length(plaid_X)}\"\n", - ")\n", - "vowels_X, vowels_y = load_japanese_vowels(split=\"train\")\n", - "print(\n", - " f\"JapaneseVowels is univariate = {is_univariate(vowels_X)} \"\n", - " f\"has missing = {has_missing(vowels_X)} is \"\n", - " f\"equal length = {is_equal_length(vowels_X)}\"\n", - ")" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "is_executing": true - } - } - }, - { - "cell_type": "markdown", - "source": [ - "### Handling unequal length\n", - "\n", - "There are two basic strategies for unequal length problems\n", - "1. Use an estimator that can internally handle missing values\n", - "2. Transform the data to be equal length by, for example, truncating or padding series\n", - "\n", - "At the time of writing, functionality for handling unequal length series is limited.\n", - "Estimators with the tag `\"capability:unequal_length\": True` have the capability to\n", - "handle unequal length series. For classification, regression and\n", - "clusterign, the\n", - "current list is" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 14, - "outputs": [ - { - "data": { - "text/plain": " name \\\n0 Catch22Classifier \n1 Catch22Regressor \n2 DummyClassifier \n3 ElasticEnsemble \n4 KNeighborsTimeSeriesClassifier \n5 KNeighborsTimeSeriesRegressor \n\n estimator \n0 \n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
nameestimator
0Catch22Classifier<class 'aeon.classification.feature_based._cat...
1Catch22Regressor<class 'aeon.regression.feature_based._catch22...
2DummyClassifier<class 'aeon.classification._dummy.DummyClassi...
3ElasticEnsemble<class 'aeon.classification.distance_based._el...
4KNeighborsTimeSeriesClassifier<class 'aeon.classification.distance_based._ti...
5KNeighborsTimeSeriesRegressor<class 'aeon.regression.distance_based._time_s...
\n" - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "all_estimators(\n", - " type_filter=[\"classifier\", \"regressor\", \"clusterer\"],\n", - " tag_filter={\"capability:unequal_length\": True},\n", - ")" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "markdown", - "source": [ - "You can pass these estimators unequal length series and they will work as expected." - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 15, - "outputs": [ - { - "data": { - "text/plain": "KNeighborsTimeSeriesClassifier()", - "text/html": "
KNeighborsTimeSeriesClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "knn = KNeighborsTimeSeriesClassifier()\n", - "knn.fit(plaid_X, plaid_y)" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "markdown", - "source": [ - "The alternative is to transform your data so that it becomes equal length, and can\n", - "then be used with any time series estimator. Two simple examples are tpo pad the\n", - "series to the longest series length, or to truncate to the shortest series length. By\n", - " default, padding pads with zeros. Be careful if your data is not normalised, because\n", - " this could then effect the classifier." - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 17, - "outputs": [ - { - "data": { - "text/plain": "(537, 1, 1344)" - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from aeon.transformations.collection import Padder\n", - "\n", - "pt = Padder()\n", - "plaid_equal = pt.fit_transform(plaid_X)\n", - "plaid_equal.shape" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 21, - "outputs": [ - { - "data": { - "text/plain": "[]" - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": "
", - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "plt.title(\"Before and after padding: PLAID first case (shifted up for unpadded)\")\n", - "plt.plot(plaid_X[0][0] + 10)\n", - "plt.plot(plaid_equal[0][0])" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "markdown", - "source": [ - "## Coming soon\n", - "\n", - "Unequally spaced samples\n", - "Streaming series: windowing and segmenting\n", - "Channel selection for multivariate series" - ], - "metadata": { - "collapsed": false - } - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -}