|
| 1 | +"""Local Outlier Factor (LOF) algorithm for anomaly detection.""" |
| 2 | + |
| 3 | +__maintainer__ = [] |
| 4 | +__all__ = ["LOF"] |
| 5 | + |
| 6 | +from typing import Optional, Union |
| 7 | + |
| 8 | +import numpy as np |
| 9 | + |
| 10 | +from aeon.anomaly_detection._pyodadapter import PyODAdapter |
| 11 | +from aeon.utils.validation._dependencies import _check_soft_dependencies |
| 12 | + |
| 13 | + |
class LOF(PyODAdapter):
    """Local Outlier Factor (LOF) algorithm for anomaly detection.

    This class implements a metric-based outlier detection algorithm using the
    Local Outlier Factor (LOF) implementation from PyOD.

    .. list-table:: Capabilities
       :stub-columns: 1

       * - Input data format
         - univariate or multivariate
       * - Output data format
         - anomaly scores
       * - missing_values
         - False
       * - Learning Type
         - unsupervised or semi-supervised
       * - python_dependencies
         - ["pyod"]

    The documentation for parameters has been adapted from the `PyOD documentation
    <https://pyod.readthedocs.io/en/latest/pyod.models.html#id586>`_.
    Here, `X` refers to the set of sliding windows extracted from the time series
    using :func:`aeon.utils.windowing.sliding_windows` with the parameters
    ``window_size`` and ``stride``. The internal `X` has the shape
    `(n_windows, window_size * n_channels)`.

    Parameters
    ----------
    n_neighbors : int, optional (default=20)
        Number of neighbors to use by default for `kneighbors` queries.
        If n_neighbors is larger than the number of samples provided,
        all samples will be used.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional (default='auto')
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use BallTree
        - 'kd_tree' will use KDTree
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.
    leaf_size : int, optional (default=30)
        Leaf size passed to `BallTree` or `KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.
    metric : string or callable, default 'minkowski'
        Metric used for the distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.
        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.
    p : integer, optional (default = 2)
        Parameter for the Minkowski metric.
    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.
    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.
        Affects only kneighbors and kneighbors_graph methods.
    window_size : int, optional (default=10)
        Size of the sliding windows extracted from the time series.
    stride : int, optional (default=1)
        Stride of the sliding windows extracted from the time series.

    Notes
    -----
    ``novelty`` is not a constructor parameter of this class. The wrapped PyOD
    model is created with ``novelty=False`` and the flag is then managed
    internally: ``fit`` sets it to ``True`` (novelty detection, i.e. the fitted
    model is meant to be applied to new unseen data and not to the training
    set), while ``fit_predict`` sets it to ``False`` (outlier detection on the
    data that was passed in).

    The PyOD ``contamination`` argument is not exposed either; the wrapped
    model is always created with ``contamination=0.1``.
    """

    _tags = {
        "capability:multivariate": True,
        "capability:univariate": True,
        "capability:missing_values": False,
        "fit_is_empty": False,
        "python_dependencies": ["pyod"],
    }

    def __init__(
        self,
        n_neighbors: int = 20,
        algorithm: Optional[str] = "auto",
        leaf_size: int = 30,
        metric: str = "minkowski",
        p: int = 2,
        metric_params: Optional[dict] = None,
        n_jobs: int = 1,
        window_size: int = 10,
        stride: int = 1,
    ):
        # pyod is a soft dependency: check for it and import it lazily so that
        # importing this module does not require pyod to be installed.
        _check_soft_dependencies(*self._tags["python_dependencies"])
        from pyod.models.lof import LOF as PyOD_LOF

        # Fixed internally and deliberately not exposed as a constructor
        # parameter; it is only forwarded to the PyOD model.
        contamination = 0.1

        model = PyOD_LOF(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            leaf_size=leaf_size,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
            contamination=contamination,  # Only for PyOD LOF
            novelty=False,  # Start unsupervised; _fit / _fit_predict toggle it
        )

        # Store the constructor arguments unchanged under their own names.
        self.n_neighbors = n_neighbors
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

        super().__init__(pyod_model=model, window_size=window_size, stride=stride)

    def _fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> None:
        """Fit the wrapped model in novelty-detection (semi-supervised) mode.

        Sets ``novelty=True`` on the wrapped PyOD model and then delegates to
        ``PyODAdapter._fit``.

        Parameters
        ----------
        X : np.ndarray
            Training time series, forwarded unchanged to the parent class.
        y : np.ndarray or None, default=None
            Forwarded unchanged to the parent class.
        """
        # Novelty mode is required to score new, unseen data after fitting.
        self.pyod_model.novelty = True
        super()._fit(X, y)

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """Delegate prediction to ``PyODAdapter._predict`` without changes.

        Parameters
        ----------
        X : np.ndarray
            Time series to score, forwarded unchanged to the parent class.

        Returns
        -------
        np.ndarray
            Whatever ``PyODAdapter._predict`` returns for ``X``.
        """
        return super()._predict(X)

    def _fit_predict(
        self, X: np.ndarray, y: Optional[np.ndarray] = None
    ) -> np.ndarray:
        """Fit and score the same data in outlier-detection (unsupervised) mode.

        Sets ``novelty=False`` on the wrapped PyOD model and then delegates to
        ``PyODAdapter._fit_predict``.

        Parameters
        ----------
        X : np.ndarray
            Time series to fit on and score, forwarded to the parent class.
        y : np.ndarray or None, default=None
            Forwarded unchanged to the parent class.

        Returns
        -------
        np.ndarray
            Whatever ``PyODAdapter._fit_predict`` returns for ``X``.
        """
        # Reset the flag explicitly: an earlier call to fit() may have left the
        # wrapped model in novelty mode.
        self.pyod_model.novelty = False
        return super()._fit_predict(X, y)

    @classmethod
    def _get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests.
            The same parameters are returned regardless of this value.

        Returns
        -------
        params : dict
            Parameters to create a testing instance of the class.
        """
        # Small neighbourhood and leaf size with a non-default stride.
        return {
            "n_neighbors": 5,
            "leaf_size": 10,
            "p": 2,
            "window_size": 10,
            "stride": 2,
        }
0 commit comments