Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/documentation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,5 @@ jobs:
path: .cache
restore-keys: |
mkdocs-material-
- run: pip install mkdocs mkdocstrings[python] mkdocs-gen-files mkdocs-material mkdocs-literate-nav mkdocs-jupyter
- run: pip install -U mkdocs mkdocstrings[python] mkdocs-gen-files mkdocs-material mkdocs-literate-nav mkdocs-jupyter
- run: mkdocs gh-deploy --force
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,6 @@ cython_debug/
/badgers/uncertainty-main-uncertainty-generate-augmentation/
/experiments/
/.continue/
/uncertainty-main-uncertainty-generate-augmentation/
/mcp/
/profiling_tests/
83 changes: 83 additions & 0 deletions badgers/generators/tabular_data/outliers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import abc

import numpy as np
import sklearn.base
from numpy.random import default_rng
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from badgers.core.base import GeneratorMixin
from badgers.core.decorators.tabular_data import preprocess_inputs


class OutliersGenerator(GeneratorMixin):
    """
    Base class for transformers that add outliers to tabular data
    """

    def __init__(self, random_generator: "np.random.Generator | None" = None):
        """
        Initialize the OutliersGenerator with a random number generator.

        :param random_generator: An instance of numpy's random number generator.
            If None (the default), a new generator seeded with 0 is created for this instance.
        """
        # The default generator is built here rather than in the signature: a
        # `default_rng(seed=0)` default argument is evaluated only once, when the
        # function is defined, and would then be shared (state included) by every
        # instance created without an explicit generator.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        self.random_generator = random_generator

    @abc.abstractmethod
    def generate(self, X, y=None, **params):
        """
        Abstract method to generate outliers. Must be implemented by subclasses.

        :param X: Input features (pandas DataFrame or numpy array).
        :param y: Target variable (pandas Series or numpy array), or None.
        :param params: Additional parameters required for outlier generation.
        """
        pass


class DecompositionAndOutlierGenerator(OutliersGenerator):
    """
    Generate outliers in a reduced feature space and map them back to the original space.

    The random generator of the wrapped outlier generator is reused as this object's own.
    """

    def __init__(self, decomposition_transformer: sklearn.base.TransformerMixin, outlier_generator: OutliersGenerator):
        """
        Store the decomposition transformer and the outlier generator to chain.

        :param decomposition_transformer: The dimensionality reduction transformer applied to the
            (standardized) data before outliers are generated. It must expose `inverse_transform`.
        :param outlier_generator: The outlier generator applied to the transformed data.
        """
        # The generated points have to be mapped back to the input space, hence the requirement.
        supports_inverse = hasattr(decomposition_transformer, 'inverse_transform')
        assert supports_inverse, (
            f'the decomposition transformer class must implement the inverse_transform function.'
            f'\nUnfortunately the class {decomposition_transformer} does not'
        )
        super().__init__(random_generator=outlier_generator.random_generator)

        self.decomposition_transformer = decomposition_transformer
        self.outlier_generator = outlier_generator

    @preprocess_inputs
    def generate(self, X, y=None, **params):
        """
        Generate outliers by chaining a dimensionality reduction and an outlier generator.

        Steps:

        1. Standardize the input data with `StandardScaler`
        2. Fit and apply the dimensionality reduction transformer
        3. Call the wrapped outlier generator on the reduced data
        4. Invert the dimensionality reduction and the standardization

        :param X: the input features
        :param y: the regression target, class labels, or None
        :param params: keyword arguments forwarded unchanged to `outlier_generator.generate`
        :return: a tuple (outliers mapped back through the inverse pipeline, labels returned by the
            wrapped outlier generator)
        """
        # one pipeline object so that the very same fitted steps are inverted afterwards
        standardize_then_reduce = make_pipeline(StandardScaler(), self.decomposition_transformer)
        reduced = standardize_then_reduce.fit_transform(X)

        # delegate the actual outlier creation to the wrapped generator
        reduced_outliers, yt = self.outlier_generator.generate(reduced, y, **params)

        # undo the decomposition first, then the standardization
        restored = standardize_then_reduce.inverse_transform(reduced_outliers)
        return restored, yt
206 changes: 206 additions & 0 deletions badgers/generators/tabular_data/outliers/distribution_sampling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
import numpy as np
from numpy.random import default_rng
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from badgers.core.decorators.tabular_data import preprocess_inputs
from badgers.core.utils import random_sign, random_spherical_coordinate
from badgers.generators.tabular_data.outliers import OutliersGenerator


class HyperCubeSampling(OutliersGenerator):
    """
    Sampling uniformly at random within a hypercube encapsulating all the instances


    See section 6.1.1 in [1]

    [1] Georg Steinbuss and Klemens Böhm. 2021.
        Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
        ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
        https://doi.org/10.1145/3447822
    """

    def __init__(self, random_generator=None):
        """
        Initialize the HyperCubeSampling with a random number generator.

        :param random_generator: An instance of numpy's random number generator.
            If None (the default), a new generator seeded with 0 is created for this instance.
        """
        # Create the default per instance: a `default_rng(seed=0)` default argument is
        # evaluated once and its state would be shared by all default-constructed instances.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10, expansion: float = 0.0):
        """
        Sample outliers uniformly at random inside the (optionally expanded) bounding hypercube of X.

        How to set the values for expansion.
        Per default expansion = 0, this means the hypercube will cover all the instances using min and max as boundaries
        It is possible to make the hypercube bigger, as proposed in [1] section 6.1.1
        (the bracketed numbers in the quote below are the survey's own references):

            Instances from Data usually determine the bounds a, b ∈ ℝ^d. For this reason, this approach
            needs them as input. Tax and Duin [51] and Fan et al. [21] state only that these bounds should be
            chosen so that the hyper-rectangle encapsulates all genuine instances. [48] uses the minimum and
            maximum for each attribute obtained from Data. Theiler and Michael Cai [52] mention that the
            boundary does not need to be far beyond these boundaries. Abe et al. [1] propose the rule that the
            boundary should expand the minimum and maximum by 10%. Désir et al. [17] propose to expand
            the boundary by 20%.

        For expanding the hypercube by 10% use expansion = 0.1, for 20% use 0.2, etc.
        The expansion is applied on each side, as a fraction of each feature's (max - min) range.

        :param X: the input features (pandas DataFrame or numpy array).
        :param y: the class labels, target values, or None. It is not used to build the outliers.
        :param n_outliers: The number of outliers to generate.
        :param expansion: how much the hypercube shall be expanded beyond (min,max) range, in percent (0.1 == 10%)
        :return: A tuple (outliers, yt). `outliers` holds only the generated points (not the original
            instances), mapped back to the original feature ranges; `yt` is an array containing the
            label "outliers" once per generated point, regardless of `y`.
        :raises AssertionError: if `expansion` is negative.
        """
        assert expansion >= 0

        # Work in min-max scaled space, where the data's bounding box is [0, 1] per feature,
        # so the expanded hypercube is simply [-expansion, 1 + expansion].
        scaler = MinMaxScaler()
        scaler.fit(X)

        outliers = self.random_generator.uniform(
            low=0 - expansion,
            high=1 + expansion,
            size=(n_outliers, X.shape[1]),
        )

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        # map the samples back from the unit hypercube to the original feature ranges
        return scaler.inverse_transform(outliers), yt


class ZScoreSamplingGenerator(OutliersGenerator):
    """
    Randomly generates outliers as data points whose z-score has an absolute value of at least 3.

    Very similar to "GaussTail" in section 6.1.5 in [1]

    [1] Georg Steinbuss and Klemens Böhm. 2021.
        Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
        ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
        https://doi.org/10.1145/3447822
    """

    def __init__(self, random_generator=None):
        """
        Initialize the ZScoreSamplingGenerator with a random number generator.

        :param random_generator: An instance of numpy's random number generator.
            If None (the default), a new generator seeded with 0 is created for this instance.
        """
        # Create the default per instance: a `default_rng(seed=0)` default argument is
        # evaluated once and its state would be shared by all default-constructed instances.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10, scale: float = 1.0):
        r"""
        Randomly generates outliers as data points whose z-score has an absolute value of at least 3.

        The process involves the following steps:
        1. Fit a standardization (mean of 0 and variance of 1) on the input data.
        2. Generate each outlier in standardized space by, for each dimension of the data:
            - drawing a value with `random_sign` (presumably -1 or +1; see badgers.core.utils),
            - multiplying it by 3 plus a random number drawn from an exponential distribution
            (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html).
        3. Apply the inverse of the standardization transformation to convert the generated outliers back to the original scale.

        :param X: the input features (pandas DataFrame or numpy array).
        :param y: the class labels, target values, or None. It is not used to build the outliers.
        :param n_outliers: The number of outliers to generate.
        :param scale: float or array_like of floats (the scale parameter from https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
            The scale parameter, :math:`\beta = 1/\lambda`. Must be
            non-negative.
        :return: A tuple (outliers, yt). `outliers` holds only the generated points (not the original
            instances), expressed in the original feature scale; `yt` is an array containing the
            label "outliers" once per generated point, regardless of `y`.
        """

        # the scaler is only needed for its fitted mean/std, used by inverse_transform below
        scaler = StandardScaler()
        scaler.fit(X)
        n_features = X.shape[1]

        # One row per outlier. The draws stay interleaved (sign, then magnitude, per outlier)
        # so that the sequence consumed from the random generator is unchanged.
        outliers = np.array([
            random_sign(self.random_generator, size=n_features) * (
                3. + self.random_generator.exponential(size=n_features, scale=scale))
            for _ in range(n_outliers)
        ])

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt


class HypersphereSamplingGenerator(OutliersGenerator):
    """
    Generates outliers by sampling points from a hypersphere with radius at least 3 sigma

    Very similar to "GaussTail" in section 6.1.5 in [1]

    [1] Georg Steinbuss and Klemens Böhm. 2021.
        Generating Artificial Outliers in the Absence of Genuine Ones — A Survey.
        ACM Trans. Knowl. Discov. Data 15, 2, Article 30 (April 2021), 37 pages.
        https://doi.org/10.1145/3447822
    """

    def __init__(self, random_generator=None):
        """
        Initialize the HypersphereSamplingGenerator with a random number generator.

        :param random_generator: An instance of numpy's random number generator.
            If None (the default), a new generator seeded with 0 is created for this instance.
        """
        # Create the default per instance: a `default_rng(seed=0)` default argument is
        # evaluated once and its state would be shared by all default-constructed instances.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10, scale: float = 1.0):
        r"""
        Randomly generates outliers by sampling points from a hypersphere.

        The process involves the following steps:
        1. Fit a standardization (mean of 0 and variance of 1) on the input data.
        2. Generate each outlier in standardized space by:
            - setting the radius to be 3 plus a random number drawn from an exponential distribution
            (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html),
            - letting `random_spherical_coordinate` (see badgers.core.utils) pick a point for that radius,
            with one coordinate per dimension of the data.
        3. Apply the inverse of the standardization transformation to convert the generated outliers back to the original scale.

        :param X: the input features (pandas DataFrame or numpy array).
        :param y: the class labels, target values, or None. It is not used to build the outliers.
        :param n_outliers: The number of outliers to generate.
        :param scale: float (the scale parameter from https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
            The scale parameter, :math:`\beta = 1/\lambda`. Must be
            non-negative.
        :return: A tuple (outliers, yt). `outliers` holds only the generated points (not the original
            instances), expressed in the original feature scale; `yt` is an array containing the
            label "outliers" once per generated point, regardless of `y`.
        """

        # the scaler is only needed for its fitted mean/std, used by inverse_transform below
        scaler = StandardScaler()
        scaler.fit(X)
        n_features = X.shape[1]

        # One row per outlier. The radius is drawn before the helper is called, which keeps
        # the sequence consumed from the random generator unchanged.
        outliers = np.array([
            random_spherical_coordinate(
                random_generator=self.random_generator,
                size=n_features,
                radius=3. + self.random_generator.exponential(scale=scale)
            )
            for _ in range(n_outliers)
        ])

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt
42 changes: 42 additions & 0 deletions badgers/generators/tabular_data/outliers/instance_sampling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd
from numpy.random import default_rng

from badgers.core.decorators.tabular_data import preprocess_inputs
from badgers.generators.tabular_data.outliers import OutliersGenerator


class UniformInstanceAttributeSampling(OutliersGenerator):
    """
    Randomly generates outliers by sampling from existing instances attributes uniformly at random
    """

    def __init__(self, random_generator=None):
        """
        Initialize the UniformInstanceAttributeSampling with a random number generator.

        :param random_generator: An instance of numpy's random number generator.
            If None (the default), a new generator seeded with 0 is created for this instance.
        """
        # Create the default per instance: a `default_rng(seed=0)` default argument is
        # evaluated once and its state would be shared by all default-constructed instances.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10):
        """
        Build outliers by drawing, independently for every attribute (column), values that
        already occur in that column of X. Each generated row therefore mixes attribute
        values coming from different existing instances.

        :param X: the input features (pandas DataFrame or numpy array).
            NOTE(review): the body uses `X.iloc` and `X.columns`, so X must be a DataFrame by the
            time it gets here — presumably guaranteed by `@preprocess_inputs`; confirm.
        :param y: the class labels, target values, or None. It is not used to build the outliers.
        :param n_outliers: The number of outliers to generate.
        :return: A tuple (outliers, yt). `outliers` is a DataFrame with the same columns as X holding
            only the generated points (not the original instances); `yt` is an array containing the
            label "outliers" once per generated point, regardless of `y`.
        """

        # one independent draw (with replacement, uniform over the rows) per column
        sampled_columns = [
            self.random_generator.choice(X.iloc[:, i], size=n_outliers)
            for i in range(X.shape[1])
        ]

        # stack the per-column samples side by side: shape (n_outliers, n_columns)
        outliers = pd.DataFrame(
            data=np.stack(sampled_columns).T,
            columns=X.columns,
        )

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return outliers, yt
Loading
Loading