From 032ae63d4f1b2609553515d6cecb3e3c569e64c5 Mon Sep 17 00:00:00 2001
From: mr-price <48429290+mr-price@users.noreply.github.com>
Date: Thu, 20 Jun 2024 11:24:55 +0530
Subject: [PATCH] ADD sklearn LOF support (#168)

Co-authored-by: Tian Lan <31748898+Emerald01@users.noreply.github.com>
---
 conf/benchmark_anomaly.json            |   9 ++
 docs/source/merlion.models.anomaly.rst |   7 +
 merlion/dashboard/models/anomaly.py    |   3 +-
 merlion/models/anomaly/lof.py          | 175 +++++++++++++++++++++++++
 merlion/models/factory.py              |   1 +
 tests/anomaly/test_lof.py              |  93 +++++++++++++
 6 files changed, 287 insertions(+), 1 deletion(-)
 create mode 100644 merlion/models/anomaly/lof.py
 create mode 100644 tests/anomaly/test_lof.py

diff --git a/conf/benchmark_anomaly.json b/conf/benchmark_anomaly.json
index e35b0ff77..b3220eb70 100644
--- a/conf/benchmark_anomaly.json
+++ b/conf/benchmark_anomaly.json
@@ -191,5 +191,14 @@
     "post_rule_train_config": {
       "default": {"unsup_quantile": 0.95}
     }
+  },
+  "LOF": {"alias": "LocalOutlierFactor"},
+  "LocalOutlierFactor": {
+    "config": {
+      "default": {}
+    },
+    "post_rule_train_config": {
+      "default": {"unsup_quantile": 0.95}
+    }
   }
 }
diff --git a/docs/source/merlion.models.anomaly.rst b/docs/source/merlion.models.anomaly.rst
index fcd17032a..133e1c722 100644
--- a/docs/source/merlion.models.anomaly.rst
+++ b/docs/source/merlion.models.anomaly.rst
@@ -133,6 +133,13 @@ anomaly.lstm_ed
    :undoc-members:
    :show-inheritance:
 
+anomaly.lof
+^^^^^^^^^^^^^^^
+.. automodule:: merlion.models.anomaly.lof
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 anomaly.deep\_point\_anomaly\_detector
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. automodule:: merlion.models.anomaly.deep_point_anomaly_detector
diff --git a/merlion/dashboard/models/anomaly.py b/merlion/dashboard/models/anomaly.py
index 5e7045719..66b1ff392 100644
--- a/merlion/dashboard/models/anomaly.py
+++ b/merlion/dashboard/models/anomaly.py
@@ -24,6 +24,7 @@ class AnomalyModel(ModelMixin, DataMixin):
         "ArimaDetector",
         "DynamicBaseline",
         "IsolationForest",
+        "LocalOutlierFactor",
         "ETSDetector",
         "MSESDetector",
         "ProphetDetector",
@@ -34,7 +35,7 @@ class AnomalyModel(ModelMixin, DataMixin):
         "ZMS",
         "DeepPointAnomalyDetector",
     ]
-    multivariate_algorithms = ["IsolationForest", "AutoEncoder", "VAE", "DAGMM", "LSTMED"]
+    multivariate_algorithms = ["IsolationForest", "AutoEncoder", "VAE", "DAGMM", "LSTMED", "LocalOutlierFactor"]
     thresholds = ["Threshold", "AggregateAlarms"]
 
     def __init__(self):
diff --git a/merlion/models/anomaly/lof.py b/merlion/models/anomaly/lof.py
new file mode 100644
index 000000000..c8b6ecc3f
--- /dev/null
+++ b/merlion/models/anomaly/lof.py
@@ -0,0 +1,175 @@
+#
+# Copyright (c) 2024 salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+#
+"""
+The classic LocalOutlierFactor model for anomaly detection.
+"""
+import logging
+
+import numpy as np
+import pandas as pd
+from sklearn.neighbors import LocalOutlierFactor
+
+from merlion.models.anomaly.base import DetectorConfig, DetectorBase
+from merlion.transform.moving_average import DifferenceTransform
+from merlion.transform.sequence import TransformSequence
+from merlion.transform.resample import Shingle
+
+logger = logging.getLogger(__name__)
+
+
+class LOFConfig(DetectorConfig):
+    """
+    Configuration class for `LocalOutlierFactor`.
+    """
+
+    _default_transform = TransformSequence([DifferenceTransform(), Shingle(size=2, stride=1)])
+
+    def __init__(
+        self,
+        n_neighbors=20,
+        algorithm="auto",
+        leaf_size=30,
+        metric="minkowski",
+        p=2,
+        metric_params=None,
+        contamination=0.1,
+        n_jobs=1,
+        novelty=True,
+        **kwargs
+    ):
+        """
+        n_neighbors : int, optional (default=20)
+            Number of neighbors to use by default for `kneighbors` queries.
+            If n_neighbors is larger than the number of samples provided,
+            all samples will be used.
+
+        algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
+            Algorithm used to compute the nearest neighbors:
+
+            - 'ball_tree' will use BallTree
+            - 'kd_tree' will use KDTree
+            - 'brute' will use a brute-force search.
+            - 'auto' will attempt to decide the most appropriate algorithm
+              based on the values passed to :meth:`fit` method.
+
+            Note: fitting on sparse input will override the setting of
+            this parameter, using brute force.
+
+        leaf_size : int, optional (default=30)
+            Leaf size passed to `BallTree` or `KDTree`. This can
+            affect the speed of the construction and query, as well as the memory
+            required to store the tree. The optimal value depends on the
+            nature of the problem.
+
+        metric : string or callable, default 'minkowski'
+            metric used for the distance computation. Any metric from scikit-learn
+            or scipy.spatial.distance can be used.
+
+            If 'precomputed', the training input X is expected to be a distance
+            matrix.
+
+            If metric is a callable function, it is called on each
+            pair of instances (rows) and the resulting value recorded. The callable
+            should take two arrays as input and return one value indicating the
+            distance between them. This works for Scipy's metrics, but is less
+            efficient than passing the metric name as a string.
+
+            Valid values for metric are:
+
+            - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
+              'manhattan']
+
+            - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
+              'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
+              'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
+              'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
+              'sqeuclidean', 'yule']
+
+            See the documentation for scipy.spatial.distance for details on these
+            metrics:
+            http://docs.scipy.org/doc/scipy/reference/spatial.distance.html
+
+        p : integer, optional (default = 2)
+            Parameter for the Minkowski metric from
+            sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
+            equivalent to using manhattan_distance (l1), and euclidean_distance
+            (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+            See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances
+
+        metric_params : dict, optional (default = None)
+            Additional keyword arguments for the metric function.
+
+        contamination : float in (0., 0.5), optional (default=0.1)
+            The amount of contamination of the data set, i.e. the proportion
+            of outliers in the data set. When fitting this is used to define the
+            threshold on the decision function.
+
+        n_jobs : int, optional (default = 1)
+            The number of parallel jobs to run for neighbors search.
+            If ``-1``, then the number of jobs is set to the number of CPU cores.
+            Affects only kneighbors and kneighbors_graph methods.
+
+        novelty : bool, optional (default=True)
+            Whether to fit sklearn's LocalOutlierFactor for novelty detection.
+            Note that sklearn itself defaults to novelty=False (outlier detection
+            only); this config defaults to True because the detector calls
+            ``score_samples``, which sklearn only provides when novelty=True and
+            which is meant for new unseen data rather than the training set.
+        """
+        self.contamination = contamination
+        self.n_neighbors = n_neighbors
+        self.algorithm = algorithm
+        self.leaf_size = leaf_size
+        self.metric = metric
+        self.p = p
+        self.metric_params = metric_params
+        self.n_jobs = n_jobs
+        self.novelty = novelty
+        # Expect the max_score to be overridden in the calibrator function
+        kwargs["max_score"] = 1.0
+        super().__init__(**kwargs)
+
+
+class LOF(DetectorBase):
+    """
+    The classic LocalOutlierFactor sklearn implementation.
+    """
+
+    config_class = LOFConfig
+
+    def __init__(self, config: LOFConfig):
+        super().__init__(config)
+        self.model = LocalOutlierFactor(
+            n_neighbors=config.n_neighbors,
+            algorithm=config.algorithm,
+            leaf_size=config.leaf_size,
+            metric=config.metric,
+            p=config.p,
+            metric_params=config.metric_params,
+            contamination=config.contamination,
+            n_jobs=config.n_jobs,
+            novelty=config.novelty,
+        )
+
+    @property
+    def require_even_sampling(self) -> bool:
+        return False
+
+    @property
+    def require_univariate(self) -> bool:
+        return False
+
+    def _train(self, train_data: pd.DataFrame, train_config=None) -> pd.DataFrame:
+        self.model.fit(train_data.values)
+        # score_samples is only meant for unseen data; the fitted attribute holds the training samples' scores
+        train_scores = -self.model.negative_outlier_factor_
+        return pd.DataFrame(train_scores, index=train_data.index, columns=["anom_score"])
+
+    def _get_anomaly_score(self, time_series: pd.DataFrame, time_series_prev: pd.DataFrame = None) -> pd.DataFrame:
+        # Negate sklearn's score (opposite of the LOF: about -1 for inliers), since more negative = more anomalous
+        scores = -self.model.score_samples(np.array(time_series.values))
+        return pd.DataFrame(scores, index=time_series.index)
diff --git a/merlion/models/factory.py b/merlion/models/factory.py
index fcca84f8e..b320d5f9a 100644
--- a/merlion/models/factory.py
+++ b/merlion/models/factory.py
@@ -26,6 +26,7 @@
     ArimaDetector="merlion.models.anomaly.forecast_based.arima:ArimaDetector",
     DynamicBaseline="merlion.models.anomaly.dbl:DynamicBaseline",
IsolationForest="merlion.models.anomaly.isolation_forest:IsolationForest", + LocalOutlierFactor="merlion.models.anomaly.lof:LOF", # Forecast-based anomaly detection models ETSDetector="merlion.models.anomaly.forecast_based.ets:ETSDetector", MSESDetector="merlion.models.anomaly.forecast_based.mses:MSESDetector", diff --git a/tests/anomaly/test_lof.py b/tests/anomaly/test_lof.py new file mode 100644 index 000000000..6cb61a7f5 --- /dev/null +++ b/tests/anomaly/test_lof.py @@ -0,0 +1,93 @@ +# +# Copyright (c) 2023 salesforce.com, inc. +# All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +# +import logging +from os.path import abspath, dirname, join +import sys +import unittest + +import numpy as np + +from merlion.models.anomaly.lof import LOF, LOFConfig +from merlion.transform.moving_average import MovingAverage, ExponentialMovingAverage +from merlion.transform.resample import Shingle +from merlion.transform.sequence import TransformSequence +from merlion.post_process.threshold import AggregateAlarms +from merlion.utils.data_io import csv_to_time_series + +rootdir = dirname(dirname(dirname(abspath(__file__)))) +logger = logging.getLogger(__name__) + + +class TestLOF(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.csv_name = join(rootdir, "data", "example.csv") + self.test_len = 32768 + self.data = csv_to_time_series(self.csv_name, timestamp_unit="ms", data_cols=["kpi"]) + logger.info(f"Data looks like:\n{self.data[:5]}") + self.vals_train = self.data[: -self.test_len] + self.vals_test = self.data[-self.test_len :] + + # You probably wouldn't use this transform in practice, but we use it + # here to test ExponentialMovingAverage and MovingAverage on + # multi-variate time series + self.model = LOF( + LOFConfig( + threshold=AggregateAlarms(alm_threshold=3.5), + transform=TransformSequence( + [ + 
Shingle(size=5, stride=1), + ExponentialMovingAverage(alpha=0.9, normalize=True), + MovingAverage(weights=[0.1, 0.2, 0.3, 0.4]), + ] + ) + ) + ) + print() + logger.info("Training model...\n") + self.model.train(self.vals_train, post_rule_train_config={"unsup_quantile": 0.999}) + + def test_score(self): + # score function returns the raw anomaly scores + print("-" * 80) + logger.info("test_score\n" + "-" * 80 + "\n") + scores = self.model.get_anomaly_score(self.vals_test) + logger.info(f"Scores look like:\n{scores[:5]}") + scores = scores.to_pd().values.flatten() + logger.info("max score = " + str(max(scores))) + logger.info("min score = " + str(min(scores)) + "\n") + + def test_alarm(self): + # alarm function returns the post-rule processed anomaly scores + print("-" * 80) + logger.info("test_alarm\n" + "-" * 80 + "\n") + alarms = self.model.get_anomaly_label(self.vals_test) + n_alarms = np.sum(alarms.to_pd().values != 0) + logger.info(f"Alarms look like:\n{alarms[:5]}") + logger.info(f"Number of alarms: {n_alarms}\n") + self.assertLess(n_alarms, 17) + + def test_save_load(self): + print("-" * 80) + logger.info("test_save_load\n" + "-" * 80 + "\n") + self.model.save(dirname=join(rootdir, "tmp", "lof")) + loaded_model = LOF.load(dirname=join(rootdir, "tmp", "lof")) + + scores = self.model.get_anomaly_score(self.vals_test) + loaded_model_scores = loaded_model.get_anomaly_score(self.vals_test) + self.assertSequenceEqual(list(scores), list(loaded_model_scores)) + + alarms = self.model.get_anomaly_label(self.vals_test) + loaded_model_alarms = loaded_model.get_anomaly_label(self.vals_test) + self.assertSequenceEqual(list(alarms), list(loaded_model_alarms)) + + +if __name__ == "__main__": + logging.basicConfig( + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG + ) + unittest.main()