Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,4 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Run tests with tox
# Run tox using the version of Python in `PATH`
run: tox -e py
run: tox -e py
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,6 @@ cython_debug/
.idea/

/notebooks/
/badgers/uncertainty-main-uncertainty-generate-augmentation/
/experiments/
/.continue/
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ from badgers.generators.tabular_data.noise import GaussianNoiseGenerator

X, y = make_blobs()
trf = GaussianNoiseGenerator()
Xt, yt = trf.generate(X,y,noise_std=0.5)

Xt, yt = trf.generate(X, y, noise_std=0.5)
```

More examples are available in the [tutorials](https://fraunhofer-iese.github.io/badgers/tutorials/Imbalance-Tabular-Data/) section.
Expand Down
2 changes: 1 addition & 1 deletion badgers/generators/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""
Module containing all transformers
Module containing all generators
"""
2 changes: 1 addition & 1 deletion badgers/generators/graph/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""
Module containing all the transformers that accept graph data as input
This module contains all the generators (classes exposing a `generate` method) that accept graph data as input.
"""
50 changes: 40 additions & 10 deletions badgers/generators/graph/missingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ def __init__(self, random_generator: numpy.random.Generator = default_rng(seed=0

@abc.abstractmethod
def generate(self, X, y=None, **params) -> Tuple:
"""
This method should be overridden by subclasses.
"""
pass


Expand All @@ -33,15 +36,28 @@ class NodesMissingCompletelyAtRandom(MissingGenerator):
"""

def __init__(self, random_generator: numpy.random.Generator = default_rng(seed=0)):
"""
Initialize the missingness generator.

:param random_generator: A NumPy random number generator.
Defaults to a default random number generator seeded with 0.
:type random_generator: numpy.random.Generator
"""
super().__init__(random_generator=random_generator)

def generate(self, X, y=None, percentage_missing: float = 0.1) -> Tuple:
"""

:param X:
:param y:
:param percentage_missing: The percentage of missing nodes (float value between 0 and 1 excluded)
:return:
Generate a graph with a specified percentage of missing nodes.

:param X: The input graph from which nodes will be removed.
:type X: nx.Graph
:param y: Optional target array associated with the nodes in the graph.
If provided, the corresponding elements will also be removed.
:type y: np.ndarray, optional
:param percentage_missing: The fraction of nodes to be removed (float value strictly between 0 and 1, both bounds excluded).
:type percentage_missing: float
:return: A tuple containing the modified graph with missing nodes and the modified target array (if provided).
:rtype: Tuple[nx.Graph, Optional[np.ndarray]]
"""
assert 0 < percentage_missing < 1
if not isinstance(X, nx.Graph):
Expand Down Expand Up @@ -70,15 +86,29 @@ class EdgesMissingCompletelyAtRandom(MissingGenerator):
"""

def __init__(self, random_generator: numpy.random.Generator = default_rng(seed=0)):
"""
Initialize the missingness generator.

:param random_generator: A NumPy random number generator.
Defaults to a default random number generator seeded with 0.
:type random_generator: numpy.random.Generator
"""
super().__init__(random_generator=random_generator)

def generate(self, X, y=None, percentage_missing: float = 0.1) -> Tuple:
"""

:param X:
:param y:
:param percentage_missing: The percentage of missing nodes (float value between 0 and 1 excluded)
:return:
Generate a graph with a specified percentage of missing edges.

:param X: The input graph from which edges will be removed.
:type X: nx.Graph
:param y: Optional target data associated with the edges in the graph.
If provided, the corresponding elements will also be removed.
Can be a dictionary where keys are edge tuples and values are target values.
:type y: dict, optional
:param percentage_missing: The fraction of edges to be removed (float value strictly between 0 and 1, both bounds excluded).
:type percentage_missing: float
:return: A tuple containing the modified graph with missing edges and the modified target data (if provided).
:rtype: Tuple[nx.Graph, Optional[dict]]
"""
assert 0 < percentage_missing < 1
if not isinstance(X, nx.Graph):
Expand Down
2 changes: 1 addition & 1 deletion badgers/generators/tabular_data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""
Module containing all the transformers that accept tabular data as input
This module contains all the generators (classes exposing a `generate` method) that accept tabular data as input.
"""
51 changes: 30 additions & 21 deletions badgers/generators/tabular_data/drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ class DriftGenerator(GeneratorMixin):

def __init__(self, random_generator=default_rng(seed=0)):
"""
:param random_generator: numpy.random.Generator, default default_rng(seed=0)
A random generator
Initialize the drift generator.
:param random_generator: A NumPy random number generator used to generate random numbers.
Defaults to a default random number generator seeded with 0.
:type random_generator: numpy.random.Generator
"""
self.random_generator = random_generator

Expand All @@ -27,7 +29,6 @@ def generate(self, X, y, **params):
pass



class RandomShiftGenerator(DriftGenerator):
"""
Randomly shift (geometrical translation) values of each column independently of one another.
Expand All @@ -37,24 +38,27 @@ class RandomShiftGenerator(DriftGenerator):

def __init__(self, random_generator=default_rng(seed=0)):
"""
Initialize the RandomShiftGenerator.

:param random_generator: A random generator
:param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution)
:param random_generator: A NumPy random number generator used to generate random numbers.
Defaults to a default random number generator seeded with 0.
:type random_generator: numpy.random.Generator
"""
super().__init__(random_generator=random_generator)

@preprocess_inputs
def generate(self, X, y=None, shift_std: Union[float,np.array] = 0.1):
def generate(self, X, y=None, shift_std: Union[float, np.array] = 0.1):
"""
Randomly shift (geometrical translation) values of each column independently of one another.
Data are first standardized (mean = 0, var = 1) and a random number is added to each column.
The ith columns is simply translated: `$x_i \left arrow x_i + \epsilon_i$`


:param X:
:param y:
:param shift_std:
:return:
Data are first standardized (mean = 0, var = 1), and a random number drawn from a normal distribution
with mean 0 and standard deviation `shift_std` is added to each column.
The ith column is simply translated: `$x_i \leftarrow x_i + \epsilon_i$`, where each $\epsilon_i$ is drawn independently from a normal distribution with mean 0 and standard deviation `shift_std`.

:param X: Input features, a 2D array-like object (e.g., a Pandas DataFrame or a NumPy array).
:param y: Target variable, a 1D array-like object (optional). Not used in this implementation.
:param shift_std: Standard deviation of the normal distribution from which the random shifts are drawn.
Can be a single float (applied to all columns) or an array of floats (one per column).
:return: A tuple containing the modified feature matrix `X'` and the original target `y`.
"""
# normalize X
scaler = StandardScaler()
Expand All @@ -77,20 +81,25 @@ class RandomShiftClassesGenerator(DriftGenerator):

def __init__(self, random_generator=default_rng(seed=0)):
"""
:param random_generator: A random generator
Initialize the RandomShiftClassesGenerator.

:param random_generator: A NumPy random number generator used to generate random numbers.
Defaults to a default random number generator seeded with 0.
:type random_generator: numpy.random.Generator
"""
super().__init__(random_generator=random_generator)

@preprocess_inputs
def generate(self, X, y, shift_std: Union[float,np.array] = 0.1):
def generate(self, X, y, shift_std: Union[float, np.array] = 0.1):
"""
Randomly shift (geometrical translation) values of each class independently of one another.
Data are first standardized (mean = 0, var = 1) and
for each class a random number is added to all instances.
Data are first standardized (mean = 0, var = 1) and for each class a random number is added to all instances.

:param X:
:param y:
:param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution)
:param X: Input features, a 2D array-like object (e.g., a Pandas DataFrame or a NumPy array).
:param y: Target variable, a 1D array-like object representing the class labels.
:param shift_std: Standard deviation of the normal distribution from which the random shifts are drawn.
Can be a single float (applied to all classes) or an array of floats (one per class).
:return: A tuple containing the modified feature matrix `X'` and the original target `y`.
"""
# extract unique labels
classes = np.unique(y)
Expand Down
96 changes: 72 additions & 24 deletions badgers/generators/tabular_data/imbalance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,61 @@ class ImbalanceGenerator(GeneratorMixin):

def __init__(self, random_generator=default_rng(seed=0)):
"""
:param random_generator: A random generator
Initialize the ImbalanceGenerator with a specified random number generator.

:param random_generator: A NumPy random number generator used to generate random numbers.
Defaults to a default random number generator seeded with 0.
:type random_generator: numpy.random.Generator
"""
self.random_generator = random_generator

@abc.abstractmethod
def generate(self, X, y=None, **params):
"""
Abstract method to generate imbalanced data from the input data.
This method should be overridden by subclasses.

:param X: Input features, can be a pandas DataFrame or a numpy array.
:type X: Union[pandas.DataFrame, numpy.ndarray]
:param y: Target variable, can be a pandas Series or a numpy array.
If None, it is assumed that the target is not provided.
:type y: Union[pandas.Series, numpy.ndarray, None], optional
:param params: Additional keyword arguments that might be required for specific implementations.
:type params: dict
"""
pass


class RandomSamplingFeaturesGenerator(ImbalanceGenerator):

def __init__(self, random_generator=default_rng(seed=0), ):
"""
:param random_generator: A random generator
Initialize the RandomSamplingFeaturesGenerator with a specified random number generator.
:param random_generator: A NumPy random number generator used to generate random numbers.
Defaults to a default random number generator seeded with 0.
:type random_generator: numpy.random.Generator
"""
super().__init__(random_generator=random_generator)

@preprocess_inputs
def generate(self, X, y=None, sampling_proba_func=lambda X: normalize_proba(X.iloc[:, 0])):
"""
Randomly samples instances based on the features values in X

:param X:
:param y:
:param sampling_proba_func: A function that takes as input data and returns a sampling probability
:return: Xt, yt
Randomly samples instances based on the feature values in X using a specified sampling probability function.

The sampling probability function is applied to the input features X to determine the probability of each instance being sampled.
By default, the first column of X is used to compute the normalized sampling probabilities.

:param X: Input features, can be a pandas DataFrame or a numpy array.
:type X: Union[pandas.DataFrame, numpy.ndarray]
:param y: Target variable, can be a pandas Series or a numpy array.
If None, it is assumed that the target is not provided.
:type y: Union[pandas.Series, numpy.ndarray, None], optional
:param sampling_proba_func: A function that takes as input data (X) and returns a series of sampling probabilities.
The function should ensure that the probabilities are normalized.
:type sampling_proba_func: callable
:return: A tuple containing the sampled features (Xt) and the corresponding target values (yt).
If y is None, only the sampled features (Xt) are returned.
:rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray, None]]
"""
# total number of instances that will be missing
# sampling
Expand All @@ -59,23 +88,31 @@ class RandomSamplingClassesGenerator(ImbalanceGenerator):

def __init__(self, random_generator=default_rng(seed=0), ):
"""
Initialize the RandomSamplingClassesGenerator with a specified random number generator.

:param random_generator: A random generator

:param random_generator: A NumPy random number generator used to generate random numbers.
Defaults to a default random number generator seeded with 0.
:type random_generator: numpy.random.Generator
"""
super().__init__(random_generator=random_generator)
self.transformed_labels_ = None

@preprocess_inputs
def generate(self, X, y, proportion_classes: dict = None):
"""
Randomly samples instances for each classes

:param X:
:param y:
:param proportion_classes: Example for having in total 50% of class 'A', 30% of class 'B', and 20% of class 'C'
proportion_classes={'A':0.5, 'B':0.3, 'C':0.2}
:return:
Randomly samples instances for each class based on the specified proportions.

:param X: Input features, can be a pandas DataFrame or a numpy array.
:type X: Union[pandas.DataFrame, numpy.ndarray]
:param y: Target variable, must be a pandas Series or a numpy array.
:type y: Union[pandas.Series, numpy.ndarray]
:param proportion_classes: A dictionary specifying the desired proportion of each class.
The keys are class labels and the values are the desired proportions.
For example, to have 50% of class 'A', 30% of class 'B', and 20% of class 'C',
use `proportion_classes={'A': 0.5, 'B': 0.3, 'C': 0.2}`.
:type proportion_classes: dict, optional
:return: A tuple containing the sampled features (Xt) and the corresponding target values (yt).
:rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray]]
"""
# local variables
Xt = []
Expand Down Expand Up @@ -103,21 +140,32 @@ class RandomSamplingTargetsGenerator(ImbalanceGenerator):

def __init__(self, random_generator=default_rng(seed=0)):
"""
Initialize the RandomSamplingTargetsGenerator with a specified random number generator.

:param random_generator: A random generator
:param sampling_proba_func: A function that takes y as input and returns a sampling probability
:param random_generator: A NumPy random number generator used to generate random numbers.
Defaults to a default random number generator seeded with 0.
:type random_generator: numpy.random.Generator
"""
super().__init__(random_generator=random_generator)
self.transformed_labels_ = None

@preprocess_inputs
def generate(self, X, y, sampling_proba_func=lambda y: normalize_proba(y)):
"""
Randomly samples instances for each classes

:param X:
:param y:
:return:
Randomly samples instances based on the target values in y using a specified sampling probability function.

The sampling probability function is applied to the target values y to determine the probability of each instance being sampled.
By default, the target values are used to compute the normalized sampling probabilities.

:param X: Input features, can be a pandas DataFrame or a numpy array.
:type X: Union[pandas.DataFrame, numpy.ndarray]
:param y: Target variable, must be a pandas Series or a numpy array.
:type y: Union[pandas.Series, numpy.ndarray]
:param sampling_proba_func: A function that takes as input target values (y) and returns a series of sampling probabilities.
The function should ensure that the probabilities are normalized.
:type sampling_proba_func: callable
:return: A tuple containing the sampled features (Xt) and the corresponding target values (yt).
:rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray]]
"""
sampling_probabilities_ = sampling_proba_func(y)
sampling_mask = self.random_generator.choice(X.shape[0], p=sampling_probabilities_, size=X.shape[0],
Expand Down
Loading
Loading