Fraunhofer-IESE · siebert-julien · Oct 29, 2025 · Jul 18, 2025 · Jul 18, 2025 · Oct 4, 2025
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
@@ -21,5 +21,5 @@ jobs:
           path: .cache
           restore-keys: |
             mkdocs-material-
-      - run: pip install mkdocs mkdocstrings[python] mkdocs-gen-files mkdocs-material mkdocs-literate-nav mkdocs-jupyter
+      - run: pip install -U mkdocs mkdocstrings[python] mkdocs-gen-files mkdocs-material mkdocs-literate-nav mkdocs-jupyter
       - run: mkdocs gh-deploy --force
diff --git a/.gitignore b/.gitignore
@@ -165,3 +165,6 @@ cython_debug/
 /badgers/uncertainty-main-uncertainty-generate-augmentation/
 /experiments/
 /.continue/
+/uncertainty-main-uncertainty-generate-augmentation/
+/mcp/
+/profiling_tests/
diff --git a/badgers/generators/tabular_data/outliers/__init__.py b/badgers/generators/tabular_data/outliers/__init__.py
@@ -0,0 +1,83 @@
+import abc
+
+import numpy as np
+import sklearn.base
+from numpy.random import default_rng
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+
+from badgers.core.base import GeneratorMixin
+from badgers.core.decorators.tabular_data import preprocess_inputs
+
+
+class OutliersGenerator(GeneratorMixin):
+    """
+    Base class for transformers that add outliers to tabular data.
+
+    It only stores the random number generator; the outlier logic lives in `generate`,
+    which subclasses must implement.
+    """
+
+    def __init__(self, random_generator: np.random.Generator = None):
+        """
+        Initialize the OutliersGenerator with a random number generator.
+
+        :param random_generator: An instance of numpy's random number generator.
+            If None (the default), a new generator with seed 0 is created for this instance.
+        """
+        # Build the default here rather than in the signature: a `default_rng(seed=0)`
+        # default argument is evaluated once, at definition time, so every instance
+        # created without an explicit generator would share (and advance) the same state.
+        if random_generator is None:
+            random_generator = default_rng(seed=0)
+        self.random_generator = random_generator
+
+    @abc.abstractmethod
+    def generate(self, X, y=None, **params):
+        """
+        Abstract method to generate outliers. Must be implemented by subclasses.
+
+        :param X: Input features (pandas DataFrame or numpy array).
+        :param y: Target variable (pandas Series or numpy array), or None.
+        :param params: Additional parameters required for outlier generation.
+        """
+        pass
+
+
+class DecompositionAndOutlierGenerator(OutliersGenerator):
+    """
+    Generates outliers in a reduced feature space and maps them back to the original space.
+
+    The input is standardized, passed through a decomposition transformer, handed to a
+    wrapped outlier generator, and the result is inverse-transformed.
+    """
+
+    def __init__(self, decomposition_transformer: sklearn.base.TransformerMixin, outlier_generator: OutliersGenerator):
+        """
+        Initialize the DecompositionAndOutlierGenerator with a decomposition transformer and an outlier generator.
+
+        :param decomposition_transformer: The dimensionality reduction transformer to be applied to the data before generating outliers.
+            It must provide an `inverse_transform` method.
+        :param outlier_generator: The outlier generator to be used after the data has been transformed.
+            Its `random_generator` is reused as this object's random generator.
+        :raises AssertionError: if `decomposition_transformer` has no `inverse_transform` attribute.
+        """
+        # Raise explicitly instead of using `assert`, which is stripped when Python runs
+        # with -O. The exception type and message are the same as before.
+        if not hasattr(decomposition_transformer, 'inverse_transform'):
+            raise AssertionError(
+                'the decomposition transformer class must implement the inverse_transform function.'
+                f'\nUnfortunately the class {decomposition_transformer} does not'
+            )
+        super().__init__(random_generator=outlier_generator.random_generator)
+
+        self.decomposition_transformer = decomposition_transformer
+        self.outlier_generator = outlier_generator
+
+    @preprocess_inputs
+    def generate(self, X, y=None, **params):
+        """
+        Randomly generate outliers by first applying a dimensionality reduction technique (sklearn.decomposition)
+        and then an outlier generator.
+
+        1. Standardize the input data (mean = 0, variance = 1)
+        2. Apply the dimensionality reduction transformer
+        3. Generate outliers by calling the wrapped outlier generator on the reduced data
+        4. Invert the dimensionality reduction and the standardization transformations
+
+        :param X: the input features
+        :param y: the regression target, class labels, or None
+        :param params: keyword arguments forwarded unchanged to `outlier_generator.generate`
+        :return: a tuple `(Xt, yt)`: the wrapped generator's output mapped back through the
+                 inverse pipeline, and the targets returned by the wrapped generator
+        """
+
+        # standardize the data and apply the dimensionality reduction transformer
+        # NOTE(review): make_pipeline receives self.decomposition_transformer itself, so it
+        # is presumably re-fitted in place on every call -- confirm this is intended
+        pipeline = make_pipeline(
+            StandardScaler(),
+            self.decomposition_transformer,
+        )
+        reduced = pipeline.fit_transform(X)
+        # generate outliers in the reduced space using the wrapped outlier generator
+        reduced_outliers, yt = self.outlier_generator.generate(reduced, y, **params)
+        # invert the decomposition and standardization transformations
+        return pipeline.inverse_transform(reduced_outliers), yt
diff --git a/badgers/generators/tabular_data/outliers/distribution_sampling.py b/badgers/generators/tabular_data/outliers/distribution_sampling.py
@@ -0,0 +1,206 @@
+import numpy as np
+from numpy.random import default_rng
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+
+from badgers.core.decorators.tabular_data import preprocess_inputs
+from badgers.core.utils import random_sign, random_spherical_coordinate
+from badgers.generators.tabular_data.outliers import OutliersGenerator
+
+
+class HyperCubeSampling(OutliersGenerator):
+    """
+    Sampling uniformly at random within a hypercube encapsulating all the instances
+
+
+    See section 6.1.1 in [1]
+
+    [1] Georg Steinbuss and Klemens Böhm. 2021.
+        Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
+        ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
+        https://doi.org/10.1145/3447822
+    """
+
+    def __init__(self, random_generator=None):
+        """
+        Initialize the HyperCubeSampling with a random number generator.
+
+        :param random_generator: An instance of numpy's random number generator.
+            If None (the default), a new generator with seed 0 is created for this instance.
+        """
+        # Create the default per instance: a `default_rng(seed=0)` default argument is
+        # evaluated once and would be shared by every instance built without a generator.
+        if random_generator is None:
+            random_generator = default_rng(seed=0)
+        super().__init__(random_generator)
+
+    @preprocess_inputs
+    def generate(self, X, y=None, n_outliers: int = 10, expansion: float = 0.0):
+        """
+        Sample outliers uniformly at random from a hypercube spanning the (min, max) range of each feature.
+
+        How to set the values for expansion.
+        Per default expansion = 0, this means the hypercube will cover all the instances using min and max as boundaries
+        It is possible to make the hypercube bigger, as proposed in [1] section 6.1.1
+
+            Instances from Data usually determine the bounds a, b ∈ ℝ^d. For this reason, this approach
+            needs them as input. Tax and Duin [51] and Fan et al. [21] state only that these bounds should be
+            chosen so that the hyper-rectangle encapsulates all genuine instances. [48] uses the minimum and
+            maximum for each attribute obtained from Data. Theiler and Michael Cai [52] mention that the
+            boundary does not need to be far beyond these boundaries. Abe et al. [1] propose the rule that the
+            boundary should expand the minimum and maximum by 10%. Désir et al. [17] propose to expand
+            the boundary by 20%.
+
+        For expanding the hypercube by 10% use expansion = 0.1, for 20% use 0.2, etc.
+
+        :param X: the input features (pandas DataFrame or numpy array).
+        :param y: the class labels, target values, or None. Not used by this method.
+        :param n_outliers: The number of outliers to generate.
+        :param expansion: how much the hypercube shall be expanded beyond (min,max) range, in percent (0.1 == 10%).
+            Must be non-negative.
+        :return: A tuple `(outliers, yt)`. `outliers` holds only the generated points (the rows of `X` are
+                 not included), mapped back to the original feature scale. `yt` is an array containing
+                 the string "outliers" once per generated point.
+        :raises AssertionError: if `expansion` is negative.
+        """
+        # Raise explicitly instead of using `assert`, which is stripped when Python runs
+        # with -O. `not (... >= 0)` also rejects NaN, as the original assert did.
+        if not (expansion >= 0):
+            raise AssertionError(f'expansion must be >= 0, got {expansion}')
+
+        # bounds in the min-max scaled space, where the data spans [0, 1] on every feature
+        low = 0 - expansion
+        high = 1 + expansion
+
+        scaler = MinMaxScaler()
+        scaler.fit(X)
+
+        outliers = self.random_generator.uniform(low=low, high=high, size=(n_outliers, X.shape[1]))
+
+        # add "outliers" as labels for outliers
+        yt = np.array(["outliers"] * len(outliers))
+
+        # map the samples from the scaled space back to the original feature ranges
+        return scaler.inverse_transform(outliers), yt
+
+
+class ZScoreSamplingGenerator(OutliersGenerator):
+    """
+    Randomly generates outliers as data points with a z-score > 3.
+
+    Very similar to "GaussTail" in section 6.1.5 in [1]
+
+    [1] Georg Steinbuss and Klemens Böhm. 2021.
+        Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
+        ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
+        https://doi.org/10.1145/3447822
+    """
+
+    def __init__(self, random_generator=None):
+        """
+        Initialize the ZScoreSamplingGenerator with a random number generator.
+
+        :param random_generator: An instance of numpy's random number generator.
+            If None (the default), a new generator with seed 0 is created for this instance.
+        """
+        # Create the default per instance: a `default_rng(seed=0)` default argument is
+        # evaluated once and would be shared by every instance built without a generator.
+        if random_generator is None:
+            random_generator = default_rng(seed=0)
+        super().__init__(random_generator)
+
+    @preprocess_inputs
+    def generate(self, X, y=None, n_outliers: int = 10, scale: float = 1.0):
+        """
+        Randomly generates outliers as data points with a z-score > 3.
+
+        The process involves the following steps:
+        1. Fit a standardization (mean of 0, variance of 1) on the input data.
+        2. Generate outliers in the standardized space by:
+           - choosing random signs for each outlier (via `random_sign`).
+           - for each dimension of the data, set the magnitude to be 3 plus a random number drawn from an exponential distribution
+            (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html).
+        3. Apply the inverse of the standardization transformation to convert the generated outliers back to the original scale.
+
+        :param X: the input features (pandas DataFrame or numpy array).
+        :param y: the class labels, target values, or None. Not used by this method.
+        :param n_outliers: The number of outliers to generate.
+        :param scale: float or array_like of floats (the scale parameter from https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
+                    The scale parameter, beta = 1 / lambda. Must be
+                    non-negative.
+        :return: A tuple `(outliers, yt)`. `outliers` holds only the generated points (the rows of `X` are
+                 not included), mapped back to the original feature scale. `yt` is an array containing
+                 the string "outliers" once per generated point.
+        """
+
+        # only the fitted mean/variance are needed, so X itself is never transformed
+        scaler = StandardScaler()
+        scaler.fit(X)
+        n_features = X.shape[1]
+
+        # generate outliers in the standardized space; for each outlier the signs are
+        # drawn first and the exponential magnitudes second
+        outliers = np.array([
+            random_sign(self.random_generator, size=n_features) * (
+                3. + self.random_generator.exponential(size=n_features, scale=scale))
+            for _ in range(n_outliers)
+        ])
+
+        # add "outliers" as labels for outliers
+        yt = np.array(["outliers"] * len(outliers))
+
+        return scaler.inverse_transform(outliers), yt
+
+
+class HypersphereSamplingGenerator(OutliersGenerator):
+    """
+    Generates outliers by sampling points from a hypersphere with radius at least 3 sigma
+
+    Very similar to "GaussTail" in section 6.1.5 in [1]
+
+    [1] Georg Steinbuss and Klemens Böhm. 2021.
+        Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
+        ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
+        https://doi.org/10.1145/3447822
+    """
+
+    def __init__(self, random_generator=None):
+        """
+        Initialize the HypersphereSamplingGenerator with a random number generator.
+
+        :param random_generator: An instance of numpy's random number generator.
+            If None (the default), a new generator with seed 0 is created for this instance.
+        """
+        # Create the default per instance: a `default_rng(seed=0)` default argument is
+        # evaluated once and would be shared by every instance built without a generator.
+        if random_generator is None:
+            random_generator = default_rng(seed=0)
+        super().__init__(random_generator)
+
+    @preprocess_inputs
+    def generate(self, X, y=None, n_outliers: int = 10, scale: float = 1.0):
+        """
+        Randomly generates outliers by sampling points from a hypersphere.
+
+        The process involves the following steps:
+        1. Fit a standardization (mean of 0, variance of 1) on the input data.
+        2. Generate outliers in the standardized space by calling `random_spherical_coordinate` with
+           a radius of 3 plus a random number drawn from an exponential distribution
+           (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html).
+        3. Apply the inverse of the standardization transformation to convert the generated outliers back to the original scale.
+
+        :param X: the input features (pandas DataFrame or numpy array).
+        :param y: the class labels, target values, or None. Not used by this method.
+        :param n_outliers: The number of outliers to generate.
+        :param scale: float (the scale parameter from https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
+                    The scale parameter, beta = 1 / lambda. Must be
+                    non-negative.
+        :return: A tuple `(outliers, yt)`. `outliers` holds only the generated points (the rows of `X` are
+                 not included), mapped back to the original feature scale. `yt` is an array containing
+                 the string "outliers" once per generated point.
+        """
+
+        # only the fitted mean/variance are needed, so X itself is never transformed
+        scaler = StandardScaler()
+        scaler.fit(X)
+        n_features = X.shape[1]
+
+        # computing outliers; the radius is drawn before the helper is called
+        outliers = np.array([
+            random_spherical_coordinate(
+                random_generator=self.random_generator,
+                size=n_features,
+                radius=3. + self.random_generator.exponential(scale=scale)
+            )
+            for _ in range(n_outliers)
+        ])
+
+        # in case we only have 1 outlier, reshape the array to match sklearn convention
+        if outliers.shape[0] == 1:
+            outliers = outliers.reshape(1, -1)
+
+        # add "outliers" as labels for outliers
+        yt = np.array(["outliers"] * len(outliers))
+
+        return scaler.inverse_transform(outliers), yt
diff --git a/badgers/generators/tabular_data/outliers/instance_sampling.py b/badgers/generators/tabular_data/outliers/instance_sampling.py
@@ -0,0 +1,42 @@
+import numpy as np
+import pandas as pd
+from numpy.random import default_rng
+
+from badgers.core.decorators.tabular_data import preprocess_inputs
+from badgers.generators.tabular_data.outliers import OutliersGenerator
+
+
+class UniformInstanceAttributeSampling(OutliersGenerator):
+    """
+    Randomly generates outliers by sampling from existing instances attributes uniformly at random
+    """
+
+    def __init__(self, random_generator=None):
+        """
+        Initialize the UniformInstanceAttributeSampling with a random number generator.
+
+        :param random_generator: An instance of numpy's random number generator.
+            If None (the default), a new generator with seed 0 is created for this instance.
+        """
+        # Create the default per instance: a `default_rng(seed=0)` default argument is
+        # evaluated once and would be shared by every instance built without a generator.
+        if random_generator is None:
+            random_generator = default_rng(seed=0)
+        super().__init__(random_generator)
+
+    @preprocess_inputs
+    def generate(self, X, y=None, n_outliers: int = 10):
+        """
+        Build outliers by drawing, independently for each column, `n_outliers` values from the
+        values observed in that column. Each generated row therefore combines attribute values
+        that may come from different instances.
+
+        :param X: the input features (pandas DataFrame or numpy array).
+            The body uses `X.iloc` and `X.columns`, so it presumably relies on the
+            `preprocess_inputs` decorator to provide a DataFrame -- confirm.
+        :param y: the class labels, target values, or None. Not used by this method.
+        :param n_outliers: The number of outliers to generate.
+        :return: A tuple `(outliers, yt)`. `outliers` is a DataFrame with the columns of `X` holding only
+                 the generated points (the rows of `X` are not included). `yt` is an array containing
+                 the string "outliers" once per generated point.
+        """
+
+        # one independent draw of n_outliers values per column, addressed by position
+        sampled_columns = [
+            self.random_generator.choice(X.iloc[:, i], size=n_outliers)
+            for i in range(X.shape[1])
+        ]
+
+        # stacking gives one row per column; transpose to get one row per outlier
+        outliers = pd.DataFrame(
+            data=np.stack(sampled_columns).T,
+            columns=X.columns
+        )
+
+        # add "outliers" as labels for outliers
+        yt = np.array(["outliers"] * len(outliers))
+
+        return outliers, yt