From 0b5f45d9e69587c0c7207cd4429fc56b740f41dd Mon Sep 17 00:00:00 2001
From: hackeT <40039738+Tatsuya-hasegawa@users.noreply.github.com>
Date: Wed, 6 Nov 2024 16:18:00 +0900
Subject: [PATCH 1/3] Add multi-dimension subplots and scatter of two chosen
 features from many features

---
 msticpy/analysis/outliers.py | 151 ++++++++++++++++++++++++-----------
 1 file changed, 106 insertions(+), 45 deletions(-)

diff --git a/msticpy/analysis/outliers.py b/msticpy/analysis/outliers.py
index 069c5658..d4b4d835 100644
--- a/msticpy/analysis/outliers.py
+++ b/msticpy/analysis/outliers.py
@@ -14,7 +14,7 @@
 """
 import math
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 
 import numpy as np
 import pandas as pd
@@ -38,7 +38,7 @@
 
 # pylint: disable=invalid-name
 def identify_outliers(
-    x: np.ndarray, x_predict: np.ndarray, contamination: float = 0.05
+    x: np.ndarray, x_predict: np.ndarray, contamination: float = 0.05, max_features: Optional[int] = None,
 ) -> Tuple[IsolationForest, np.ndarray, np.ndarray]:
     """
     Identify outlier items using SkLearn IsolationForest.
@@ -51,6 +51,8 @@
         Model
     contamination : float
         Percentage contamination (default: {0.05})
+    max_features : int, optional
+        The maximum number of features to be used for Isolation Forest (default: None)
 
     Returns
     -------
@@ -64,8 +66,10 @@
     # fit the model
     rows, cols = x.shape
-    max_samples = min(100, cols)
-    max_features = math.floor(math.sqrt(rows))
+    max_samples = min(100, rows)
+    if not max_features:
+        max_features = math.floor(math.sqrt(cols))
+
     clf = IsolationForest(
         max_samples=max_samples,
         max_features=max_features,
@@ -111,50 +115,107 @@
         Plot title
 
     """
-    # plot the line, the samples, and the nearest vectors to the plane
-    x_max_x = x[:, 0].max() + (x[:, 0].max() / 10)
-    x_min_x = -x[:, 0].max() / 10
-    x_max_y = x[:, 1].max() + (x[:, 1].max() / 10)
-    x_min_y = -x[:, 1].max() / 10
-    xx, yy = np.meshgrid(
-        np.linspace(x_min_x, x_max_x, 100), np.linspace(x_min_y, x_max_y, 100)
-    )
-    z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
-    z = z.reshape(xx.shape)
-
-    plt.rcParams["figure.figsize"] = (20, 10)
-
-    plt.title(plt_title)
-    # pylint: disable=no-member
-    plt.contourf(xx, yy, z, cmap=plt.cm.Blues_r)  # type: ignore
+    if len(feature_columns) < 2:
+        raise ValueError("plot_outlier_results function needs at least two features for the graph visualization.")
+
+    elif len(feature_columns) == 2:  # two-dimension plot: retains the original code from msticpy 2.14.0
+        # plot the line, the samples, and the nearest vectors to the plane
+        x_max_x = x[:, 0].max() + (x[:, 0].max() / 10)
+        x_min_x = -x[:, 0].max() / 10
+        x_max_y = x[:, 1].max() + (x[:, 1].max() / 10)
+        x_min_y = -x[:, 1].max() / 10
+        xx, yy = np.meshgrid(
+            np.linspace(x_min_x, x_max_x, 100), np.linspace(x_min_y, x_max_y, 100)
+        )
+        z = clf.decision_function(np.c_[xx.ravel(), yy.ravel(), np.zeros((xx.ravel().shape[0], clf.n_features_in_ - len(feature_columns)))])
+        z = z.reshape(xx.shape)
+
+        plt.rcParams["figure.figsize"] = (20, 10)
+
+        plt.title(plt_title)
+        # pylint: disable=no-member
+        plt.contourf(xx, yy, z, cmap=plt.cm.Blues_r)  # type: ignore
+
+        b1 = plt.scatter(x[:, 0], x[:, 1], c="white", s=20, edgecolor="k")
+        b2 = plt.scatter(x_predict[:, 0], x_predict[:, 1], c="green", s=40, edgecolor="k")
+        c = plt.scatter(
+            x_outliers[:, 0], x_outliers[:, 1], c="red", marker="x", s=200
+        )
+        plt.axis("tight")
+
+        xp_max_x = x_predict[:, 0].max() + (x_predict[:, 0].max() / 10)
+        xp_min_x = -x_predict[:, 0].max() / 10
+        xp_max_y = x_predict[:, 1].max() + (x_predict[:, 1].max() / 10)
+        xp_min_y = -x_predict[:, 1].max() / 10
+
+        plt.xlim((xp_min_x, xp_max_x))
+        plt.ylim((xp_min_y, xp_max_y))
+        plt.xlabel(feature_columns[0])  # type: ignore
+        plt.ylabel(feature_columns[1])  # type: ignore
+
+        plt.legend(
+            [b1, b2, c],
+            [
+                "training observations",
+                "new regular observations",
+                "new abnormal observations",
+            ],
+            loc="upper right",
+        )
+        plt.show()
+
+    elif len(feature_columns) > 2: # multi dimension subplots
+        dimension_num = x.shape[1]
+        fig, axes = plt.subplots(dimension_num, dimension_num, figsize=(20, 20),constrained_layout=True)
+        for i in range(dimension_num):
+            for j in range(dimension_num):
+                if i != j:
+                    # plot the line, the samples, and the nearest vectors to the plane
+                    x_max_x = x[:, j].max() + (x[:, j].max() / 10)
+                    x_min_x = -x[:, j].max() / 10
+                    x_max_y = x[:, i].max() + (x[:, i].max() / 10)
+                    x_min_y = -x[:, i].max() / 10
+                    xx, yy = np.meshgrid(
+                        np.linspace(x_min_x, x_max_x, 100), np.linspace(x_min_y, x_max_y, 100)
+                    )
+                    z = clf.decision_function(np.c_[xx.ravel(), yy.ravel(), np.zeros((xx.ravel().shape[0], len(feature_columns)-2))])
+                    z = z.reshape(xx.shape)
+
+                    # pylint: disable=no-member
+                    axes[i, j].contourf(xx, yy, z, cmap=plt.cm.Blues_r)  # type: ignore
+
+                    b1 = axes[i, j].scatter(x[:, j], x[:, i], c="white", edgecolor="k")
+                    b2 = axes[i, j].scatter(x_predict[:, j], x_predict[:, i], c="green", edgecolor="k")
+                    c = axes[i, j].scatter(x_outliers[:, j], x_outliers[:, i], c="red", marker="x")
+
+                    xp_max_x = x_predict[:, j].max() + (x_predict[:, j].max() / 10)
+                    xp_min_x = -x_predict[:, j].max() / 10
+                    xp_max_y = x_predict[:, i].max() + (x_predict[:, i].max() / 10)
+                    xp_min_y = -x_predict[:, i].max() / 10
+
+                    axes[i, j].axis(xmin=xp_min_x, xmax=xp_max_x)
+                    axes[i, j].axis(ymin=xp_min_y, ymax=xp_max_y)
+                    axes[i, j].set_xlabel(f'{feature_columns[j]}')
+                    axes[i, j].set_ylabel(f'{feature_columns[i]}')
+
+                else:
+                    axes[i, j].axis('off') # skip the diagonal: a feature is not plotted against itself
+
+        plt.suptitle(plt_title)
+        plt.legend(
+            [b1, b2, c],
+            [
+                "training observations",
+                "new regular observations",
+                "new abnormal observations",
+            ],
+            loc="best",
+            facecolor="#0072BD",
+            framealpha=0.3,
+        )
+        plt.show()
 
-    b1 = plt.scatter(x[:, 0], x[:, 1], c="white", s=20, edgecolor="k")
-    b2 = plt.scatter(x_predict[:, 0], x_predict[:, 1], c="green", s=40, edgecolor="k")
-    c = plt.scatter(
-        x_outliers[:, 0], x_outliers[:, 1], c="red", marker="x", s=200, edgecolor="k"
-    )
-    plt.axis("tight")
-
-    xp_max_x = x_predict[:, 0].max() + (x_predict[:, 0].max() / 10)
-    xp_min_x = -x_predict[:, 0].max() / 10
-    xp_max_y = x_predict[:, 1].max() + (x_predict[:, 1].max() / 10)
-    xp_min_y = -x_predict[:, 1].max() / 10
-
-    plt.xlim((xp_min_x, xp_max_x))
-    plt.ylim((xp_min_y, xp_max_y))
-    plt.xlabel(feature_columns[0])  # type: ignore
-    plt.ylabel(feature_columns[1])  # type: ignore
-
-    plt.legend(
-        [b1, b2, c],
-        [
-            "training observations",
-            "new regular observations",
-            "new abnormal observations",
-        ],
-        loc="upper right",
-    )
-    plt.show()
 
 
 def remove_common_items(data: pd.DataFrame, columns: List[str]) -> pd.DataFrame:

From ea6873ab6e3961f683315409f98acbf74915bc88 Mon Sep 17 00:00:00 2001
From: hackeT <40039738+Tatsuya-hasegawa@users.noreply.github.com>
Date: Wed, 6 Nov 2024 17:00:16 +0900
Subject: [PATCH 2/3] Add multi-dimension subplots and scatter of two chosen
 features from many features

---
 .pylintrc                    |  2 +
 msticpy/analysis/outliers.py | 79 +++++++++++++++++++++++-------------
 2 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index ffd2524c..94f08182 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -69,6 +69,8 @@ disable=raw-checker-failed,
         deprecated-pragma,
         use-symbolic-message-instead,
         too-many-positional-arguments,
+        too-many-arguments,
+        too-many-statements,
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option

diff --git a/msticpy/analysis/outliers.py b/msticpy/analysis/outliers.py
index d4b4d835..4da36aed 100644
--- a/msticpy/analysis/outliers.py
+++ b/msticpy/analysis/outliers.py
@@ -14,7 +14,7 @@
 """
 import math
-from typing import List, Tuple, Optional
+from typing import List, Optional, Tuple
 
 import numpy as np
 import pandas as pd
@@ -33,12 +33,15 @@
 ) from imp_err
 
 __version__ = VERSION
-__author__ = "Ian Hellen"
+__author__ = "Ian Hellen, Tatsuya Hasegawa"
 
 
 # pylint: disable=invalid-name
 def identify_outliers(
-    x: np.ndarray, x_predict: np.ndarray, contamination: float = 0.05, max_features: Optional[int] = None,
+    x: np.ndarray,
+    x_predict: np.ndarray,
+    contamination: float = 0.05,
+    max_features: Optional[int] = None,
 ) -> Tuple[IsolationForest, np.ndarray, np.ndarray]:
     """
     Identify outlier items using SkLearn IsolationForest.
@@ -69,7 +72,7 @@ def identify_outliers(
     max_samples = min(100, rows)
     if not max_features:
         max_features = math.floor(math.sqrt(cols))
-    
+
     clf = IsolationForest(
         max_samples=max_samples,
         max_features=max_features,
@@ -87,7 +90,7 @@ def identify_outliers(
     return clf, x_outliers, y_pred_outliers
 
 
-# pylint: disable=too-many-arguments, too-many-locals
+# pylint: disable=too-many-arguments, too-many-statements, too-many-locals
 def plot_outlier_results(
     clf: IsolationForest,
     x: np.ndarray,
@@ -113,13 +116,9 @@ def plot_outlier_results(
         list of feature columns to display
     plt_title : str
         Plot title
     """
-
-    if len(feature_columns) < 2:
-        raise ValueError("plot_outlier_results function needs at least two features for the graph visualization.")
-
-    elif len(feature_columns) == 2:  # two-dimension plot: retains the original code from msticpy 2.14.0
+    if len(feature_columns) == 2:
+        # two-dimension plot: mostly retains the original code from msticpy 2.14.0
         # plot the line, the samples, and the nearest vectors to the plane
         x_max_x = x[:, 0].max() + (x[:, 0].max() / 10)
         x_min_x = -x[:, 0].max() / 10
@@ -128,7 +127,15 @@ def plot_outlier_results(
         xx, yy = np.meshgrid(
             np.linspace(x_min_x, x_max_x, 100), np.linspace(x_min_y, x_max_y, 100)
         )
-        z = clf.decision_function(np.c_[xx.ravel(), yy.ravel(), np.zeros((xx.ravel().shape[0], clf.n_features_in_ - len(feature_columns)))])
+        z = clf.decision_function(
+            np.c_[
+                xx.ravel(),
+                yy.ravel(),
+                np.zeros(
+                    (xx.ravel().shape[0], clf.n_features_in_ - len(feature_columns))
+                ),
+            ]
+        )
         z = z.reshape(xx.shape)
 
         plt.rcParams["figure.figsize"] = (20, 10)
@@ -138,10 +145,10 @@ def plot_outlier_results(
         plt.contourf(xx, yy, z, cmap=plt.cm.Blues_r)  # type: ignore
 
         b1 = plt.scatter(x[:, 0], x[:, 1], c="white", s=20, edgecolor="k")
-        b2 = plt.scatter(x_predict[:, 0], x_predict[:, 1], c="green", s=40, edgecolor="k")
-        c = plt.scatter(
-            x_outliers[:, 0], x_outliers[:, 1], c="red", marker="x", s=200
+        b2 = plt.scatter(
+            x_predict[:, 0], x_predict[:, 1], c="green", s=40, edgecolor="k"
         )
+        c = plt.scatter(x_outliers[:, 0], x_outliers[:, 1], c="red", marker="x", s=200)
         plt.axis("tight")
 
         xp_max_x = x_predict[:, 0].max() + (x_predict[:, 0].max() / 10)
@@ -164,10 +171,12 @@ def plot_outlier_results(
             loc="upper right",
         )
         plt.show()
-
-    elif len(feature_columns) > 2: # multi dimension subplots
+
+    elif len(feature_columns) > 2:  # multi dimension subplots
         dimension_num = x.shape[1]
-        fig, axes = plt.subplots(dimension_num, dimension_num, figsize=(20, 20),constrained_layout=True)
+        fig, axes = plt.subplots(
+            dimension_num, dimension_num, figsize=(20, 20), constrained_layout=True
+        )
         for i in range(dimension_num):
             for j in range(dimension_num):
                 if i != j:
@@ -177,17 +186,28 @@ def plot_outlier_results(
                     x_max_y = x[:, i].max() + (x[:, i].max() / 10)
                     x_min_y = -x[:, i].max() / 10
                     xx, yy = np.meshgrid(
-                        np.linspace(x_min_x, x_max_x, 100), np.linspace(x_min_y, x_max_y, 100)
+                        np.linspace(x_min_x, x_max_x, 100),
+                        np.linspace(x_min_y, x_max_y, 100),
+                    )
+                    z = clf.decision_function(
+                        np.c_[
+                            xx.ravel(),
+                            yy.ravel(),
+                            np.zeros((xx.ravel().shape[0], len(feature_columns) - 2)),
+                        ]
                     )
-                    z = clf.decision_function(np.c_[xx.ravel(), yy.ravel(), np.zeros((xx.ravel().shape[0], len(feature_columns)-2))])
                     z = z.reshape(xx.shape)
 
                     # pylint: disable=no-member
                     axes[i, j].contourf(xx, yy, z, cmap=plt.cm.Blues_r)  # type: ignore
 
                     b1 = axes[i, j].scatter(x[:, j], x[:, i], c="white", edgecolor="k")
-                    b2 = axes[i, j].scatter(x_predict[:, j], x_predict[:, i], c="green", edgecolor="k")
-                    c = axes[i, j].scatter(x_outliers[:, j], x_outliers[:, i], c="red", marker="x")
+                    b2 = axes[i, j].scatter(
+                        x_predict[:, j], x_predict[:, i], c="green", edgecolor="k"
+                    )
+                    c = axes[i, j].scatter(
+                        x_outliers[:, j], x_outliers[:, i], c="red", marker="x"
+                    )
 
                     xp_max_x = x_predict[:, j].max() + (x_predict[:, j].max() / 10)
                     xp_min_x = -x_predict[:, j].max() / 10
@@ -196,13 +216,14 @@ def plot_outlier_results(
 
                     axes[i, j].axis(xmin=xp_min_x, xmax=xp_max_x)
                     axes[i, j].axis(ymin=xp_min_y, ymax=xp_max_y)
-                    axes[i, j].set_xlabel(f'{feature_columns[j]}')
-                    axes[i, j].set_ylabel(f'{feature_columns[i]}')
+                    axes[i, j].set_xlabel(f"{feature_columns[j]}")
+                    axes[i, j].set_ylabel(f"{feature_columns[i]}")
 
                 else:
-                    axes[i, j].axis('off') # skip the diagonal: a feature is not plotted against itself
-
-        plt.suptitle(plt_title)
+                    # skip the diagonal: a feature is not plotted against itself
+                    axes[i, j].axis("off")
+
+        fig.suptitle(plt_title)
         plt.legend(
             [b1, b2, c],
             [
@@ -211,11 +232,13 @@ def plot_outlier_results(
                 "new abnormal observations",
             ],
             loc="best",
-            facecolor="#0072BD",
+            facecolor="#0072BD",
             framealpha=0.3,
         )
         plt.show()
 
+    else:
+        raise ValueError("plot_outlier_results function needs at least two features.")
 
 def remove_common_items(data: pd.DataFrame, columns: List[str]) -> pd.DataFrame:

From 56788e541abff60de79ce5d894ef59f2ba20cece Mon Sep 17 00:00:00 2001
From: hackeT <40039738+Tatsuya-hasegawa@users.noreply.github.com>
Date: Wed, 6 Nov 2024 17:41:51 +0900
Subject: [PATCH 3/3] Add multi-dimension subplots and scatter of two chosen
 features from many features

---
 msticpy/analysis/outliers.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/msticpy/analysis/outliers.py b/msticpy/analysis/outliers.py
index 4da36aed..26fff8c4 100644
--- a/msticpy/analysis/outliers.py
+++ b/msticpy/analysis/outliers.py
@@ -14,7 +14,7 @@
 """
 import math
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -41,7 +41,7 @@ def identify_outliers(
     x: np.ndarray,
     x_predict: np.ndarray,
     contamination: float = 0.05,
-    max_features: Optional[int] = None,
+    max_features: Optional[Union[int, float]] = None,
 ) -> Tuple[IsolationForest, np.ndarray, np.ndarray]:
     """
     Identify outlier items using SkLearn IsolationForest.
@@ -54,8 +54,10 @@ def identify_outliers(
         Model
     contamination : float
         Percentage contamination (default: {0.05})
-    max_features : int, optional
-        The maximum number of features to be used for Isolation Forest (default: None)
+    max_features : int or float, optional
+        Maximum number (int) or proportion (float) of features
+        to be randomly selected when building each tree.
+        default: None => {math.floor(math.sqrt(cols))}
 
     Returns
     -------
@@ -90,7 +92,7 @@ def identify_outliers(
     return clf, x_outliers, y_pred_outliers
 
 
-# pylint: disable=too-many-arguments, too-many-statements, too-many-locals
+# pylint: disable=too-many-arguments, too-many-locals
 def plot_outlier_results(
     clf: IsolationForest,
     x: np.ndarray,
@@ -231,7 +233,6 @@ def plot_outlier_results(
                 "new regular observations",
                 "new abnormal observations",
             ],
-            loc="best",
             facecolor="#0072BD",
             framealpha=0.3,
         )
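
Usage sketch (review note, not part of the patches): a minimal, hypothetical example of the API after all three commits are applied. The synthetic data, seed, and feature-column names are invented for illustration; only identify_outliers and plot_outlier_results come from msticpy.analysis.outliers, and the keyword names follow the signatures shown in the diffs above.

    # Hypothetical end-to-end run of the patched API; the data and column
    # names below are illustrative assumptions, not msticpy fixtures.
    import numpy as np

    from msticpy.analysis.outliers import identify_outliers, plot_outlier_results

    rng = np.random.default_rng(42)
    feature_cols = ["logon_count", "failed_logons", "proc_count"]  # assumed names

    # Training data: three clustered features; new data adds one clear outlier row.
    x_train = rng.normal(loc=10.0, scale=2.0, size=(500, 3))
    x_new = np.vstack(
        [rng.normal(loc=10.0, scale=2.0, size=(50, 3)), [[40.0, 40.0, 40.0]]]
    )

    clf, x_outliers, _ = identify_outliers(
        x=x_train,
        x_predict=x_new,
        contamination=0.05,
        max_features=2,  # new parameter; None falls back to math.floor(math.sqrt(cols))
    )

    # More than two feature columns triggers the new pair-wise subplot grid.
    plot_outlier_results(
        clf=clf,
        x=x_train,
        x_predict=x_new,
        x_outliers=x_outliers,
        feature_columns=feature_cols,
        plt_title="Outlier detection",
    )

Each off-diagonal panel draws its contour by evaluating clf.decision_function on a 2-D grid whose remaining feature columns are zero-filled, i.e. a slice of the full decision surface with the unplotted features held at zero.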