diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 50198c9..cf7c52d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -37,4 +37,4 @@ jobs: flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Run tests with tox # Run tox using the version of Python in `PATH` - run: tox -e py + run: tox -e py \ No newline at end of file diff --git a/.gitignore b/.gitignore index 8b33796..f21eefe 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,6 @@ cython_debug/ .idea/ /notebooks/ +/badgers/uncertainty-main-uncertainty-generate-augmentation/ +/experiments/ +/.continue/ diff --git a/README.md b/README.md index 77aa2b5..bdd7722 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,8 @@ from badgers.generators.tabular_data.noise import GaussianNoiseGenerator X, y = make_blobs() trf = GaussianNoiseGenerator() -Xt, yt = trf.generate(X,y,noise_std=0.5) + +Xt, yt = trf.generate(X, y, noise_std=0.5) ``` More examples are available in the [tutorials](https://fraunhofer-iese.github.io/badgers/tutorials/Imbalance-Tabular-Data/) section. diff --git a/badgers/generators/__init__.py b/badgers/generators/__init__.py index 09f64f5..852d3f9 100644 --- a/badgers/generators/__init__.py +++ b/badgers/generators/__init__.py @@ -1,3 +1,3 @@ """ -Module containing all transformers +Module containing all generators """ \ No newline at end of file diff --git a/badgers/generators/graph/__init__.py b/badgers/generators/graph/__init__.py index bac955e..e773b3e 100644 --- a/badgers/generators/graph/__init__.py +++ b/badgers/generators/graph/__init__.py @@ -1,3 +1,3 @@ """ -Module containing all the transformers that accept graph data as input +This module contains all the generator functions designed to process and yield data from graph inputs. 
""" \ No newline at end of file diff --git a/badgers/generators/graph/missingness.py b/badgers/generators/graph/missingness.py index 2285eb2..4beae10 100644 --- a/badgers/generators/graph/missingness.py +++ b/badgers/generators/graph/missingness.py @@ -24,6 +24,9 @@ def __init__(self, random_generator: numpy.random.Generator = default_rng(seed=0 @abc.abstractmethod def generate(self, X, y=None, **params) -> Tuple: + """ + This method should be overridden by subclasses. + """ pass @@ -33,15 +36,28 @@ class NodesMissingCompletelyAtRandom(MissingGenerator): """ def __init__(self, random_generator: numpy.random.Generator = default_rng(seed=0)): + """ + Initialize the missingness generator. + + :param random_generator: A NumPy random number generator. + Defaults to a default random number generator seeded with 0. + :type random_generator: numpy.random.Generator + """ super().__init__(random_generator=random_generator) def generate(self, X, y=None, percentage_missing: float = 0.1) -> Tuple: """ - - :param X: - :param y: - :param percentage_missing: The percentage of missing nodes (float value between 0 and 1 excluded) - :return: + Generate a graph with a specified percentage of missing nodes. + + :param X: The input graph from which nodes will be removed. + :type X: nx.Graph + :param y: Optional target array associated with the nodes in the graph. + If provided, the corresponding elements will also be removed. + :type y: np.ndarray, optional + :param percentage_missing: The percentage of nodes to be removed (float value between 0 and 1). + :type percentage_missing: float + :return: A tuple containing the modified graph with missing nodes and the modified target array (if provided). 
+ :rtype: Tuple[nx.Graph, Optional[np.ndarray]] """ assert 0 < percentage_missing < 1 if not isinstance(X, nx.Graph): @@ -70,15 +86,29 @@ class EdgesMissingCompletelyAtRandom(MissingGenerator): """ def __init__(self, random_generator: numpy.random.Generator = default_rng(seed=0)): + """ + Initialize the missingness generator. + + :param random_generator: A NumPy random number generator. + Defaults to a default random number generator seeded with 0. + :type random_generator: numpy.random.Generator + """ super().__init__(random_generator=random_generator) def generate(self, X, y=None, percentage_missing: float = 0.1) -> Tuple: """ - - :param X: - :param y: - :param percentage_missing: The percentage of missing nodes (float value between 0 and 1 excluded) - :return: + Generate a graph with a specified percentage of missing edges. + + :param X: The input graph from which edges will be removed. + :type X: nx.Graph + :param y: Optional target data associated with the edges in the graph. + If provided, the corresponding elements will also be removed. + Can be a dictionary where keys are edge tuples and values are target values. + :type y: dict, optional + :param percentage_missing: The percentage of edges to be removed (float value strictly between 0 and 1, both bounds excluded). + :type percentage_missing: float + :return: A tuple containing the modified graph with missing edges and the modified target data (if provided). + :rtype: Tuple[nx.Graph, Optional[dict]] """ assert 0 < percentage_missing < 1 if not isinstance(X, nx.Graph): diff --git a/badgers/generators/tabular_data/__init__.py b/badgers/generators/tabular_data/__init__.py index b9193e0..75ebc4f 100644 --- a/badgers/generators/tabular_data/__init__.py +++ b/badgers/generators/tabular_data/__init__.py @@ -1,3 +1,3 @@ """ -Module containing all the transformers that accept tabular data as input +This module contains all the generator functions designed to process and yield data from tabular inputs.
""" diff --git a/badgers/generators/tabular_data/drift.py b/badgers/generators/tabular_data/drift.py index 7d7e893..5c8c097 100644 --- a/badgers/generators/tabular_data/drift.py +++ b/badgers/generators/tabular_data/drift.py @@ -17,8 +17,10 @@ class DriftGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: numpy.random.Generator, default default_rng(seed=0) - A random generator + Initialize the drift generator. + :param random_generator: A NumPy random number generator used to generate random numbers. + Defaults to a default random number generator seeded with 0. + :type random_generator: numpy.random.Generator """ self.random_generator = random_generator @@ -27,7 +29,6 @@ def generate(self, X, y, **params): pass - class RandomShiftGenerator(DriftGenerator): """ Randomly shift (geometrical translation) values of each column independently of one another. @@ -37,24 +38,27 @@ class RandomShiftGenerator(DriftGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the RandomShiftGenerator. - :param random_generator: A random generator - :param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution) + :param random_generator: A NumPy random number generator used to generate random numbers. + Defaults to a default random number generator seeded with 0. + :type random_generator: numpy.random.Generator """ super().__init__(random_generator=random_generator) @preprocess_inputs - def generate(self, X, y=None, shift_std: Union[float,np.array] = 0.1): + def generate(self, X, y=None, shift_std: Union[float, np.array] = 0.1): """ Randomly shift (geometrical translation) values of each column independently of one another. - Data are first standardized (mean = 0, var = 1) and a random number is added to each column. 
- The ith columns is simply translated: `$x_i \left arrow x_i + \epsilon_i$` - - - :param X: - :param y: - :param shift_std: - :return: + Data are first standardized (mean = 0, var = 1), and a random number drawn from a normal distribution + with mean 0 and standard deviation `shift_std` is added to each column. + The ith column is simply translated: `$x_i \leftarrow x_i + \epsilon_i$`, where $\epsilon_i \sim \mathcal{N}(0, \text{shift\_std})$. + + :param X: Input features, a 2D array-like object (e.g., a Pandas DataFrame or a NumPy array). + :param y: Target variable, a 1D array-like object (optional). Not used in this implementation. + :param shift_std: Standard deviation of the normal distribution from which the random shifts are drawn. + Can be a single float (applied to all columns) or an array of floats (one per column). + :return: A tuple containing the modified feature matrix `X'` and the original target `y`. """ # normalize X scaler = StandardScaler() @@ -77,20 +81,25 @@ class RandomShiftClassesGenerator(DriftGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initialize the RandomShiftClassesGenerator. + + :param random_generator: A NumPy random number generator used to generate random numbers. + Defaults to a default random number generator seeded with 0. + :type random_generator: numpy.random.Generator """ super().__init__(random_generator=random_generator) @preprocess_inputs - def generate(self, X, y, shift_std: Union[float,np.array] = 0.1): + def generate(self, X, y, shift_std: Union[float, np.array] = 0.1): """ Randomly shift (geometrical translation) values of each class independently of one another. - Data are first standardized (mean = 0, var = 1) and - for each class a random number is added to all instances. + Data are first standardized (mean = 0, var = 1) and for each class a random number is added to all instances. 
- :param X: - :param y: - :param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution) + :param X: Input features, a 2D array-like object (e.g., a Pandas DataFrame or a NumPy array). + :param y: Target variable, a 1D array-like object representing the class labels. + :param shift_std: Standard deviation of the normal distribution from which the random shifts are drawn. + Can be a single float (applied to all classes) or an array of floats (one per class). + :return: A tuple containing the modified feature matrix `X'` and the original target `y`. """ # extract unique labels classes = np.unique(y) diff --git a/badgers/generators/tabular_data/imbalance.py b/badgers/generators/tabular_data/imbalance.py index a86666d..c673c33 100644 --- a/badgers/generators/tabular_data/imbalance.py +++ b/badgers/generators/tabular_data/imbalance.py @@ -16,12 +16,28 @@ class ImbalanceGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initialize the ImbalanceGenerator with a specified random number generator. + + :param random_generator: A NumPy random number generator used to generate random numbers. + Defaults to a default random number generator seeded with 0. + :type random_generator: numpy.random.Generator """ self.random_generator = random_generator @abc.abstractmethod def generate(self, X, y=None, **params): + """ + Abstract method to generate imbalanced data from the input data. + This should be overridden by subclasses. + + :param X: Input features, can be a pandas DataFrame or a numpy array. + :type X: Union[pandas.DataFrame, numpy.ndarray] + :param y: Target variable, can be a pandas Series or a numpy array. + If None, it is assumed that the target is not provided. + :type y: Union[pandas.Series, numpy.ndarray, None], optional + :param params: Additional keyword arguments that might be required for specific implementations.
+ :type params: dict + """ pass @@ -29,19 +45,32 @@ class RandomSamplingFeaturesGenerator(ImbalanceGenerator): def __init__(self, random_generator=default_rng(seed=0), ): """ - :param random_generator: A random generator + Initialize the RandomSamplingFeaturesGenerator with a specified random number generator. + :param random_generator: A NumPy random number generator used to generate random numbers. + Defaults to a default random number generator seeded with 0. + :type random_generator: numpy.random.Generator """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y=None, sampling_proba_func=lambda X: normalize_proba(X.iloc[:, 0])): """ - Randomly samples instances based on the features values in X - - :param X: - :param y: - :param sampling_proba_func: A function that takes as input data and returns a sampling probability - :return: Xt, yt + Randomly samples instances based on the feature values in X using a specified sampling probability function. + + The sampling probability function is applied to the input features X to determine the probability of each instance being sampled. + By default, the first column of X is used to compute the normalized sampling probabilities. + + :param X: Input features, can be a pandas DataFrame or a numpy array. + :type X: Union[pandas.DataFrame, numpy.ndarray] + :param y: Target variable, can be a pandas Series or a numpy array. + If None, it is assumed that the target is not provided. + :type y: Union[pandas.Series, numpy.ndarray, None], optional + :param sampling_proba_func: A function that takes as input data (X) and returns a series of sampling probabilities. + The function should ensure that the probabilities are normalized. + :type sampling_proba_func: callable + :return: A tuple containing the sampled features (Xt) and the corresponding target values (yt). + If y is None, only the sampled features (Xt) are returned. 
+ :rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray, None]] """ # total number of instances that will be missing # sampling @@ -59,9 +88,11 @@ class RandomSamplingClassesGenerator(ImbalanceGenerator): def __init__(self, random_generator=default_rng(seed=0), ): """ + Initialize the RandomSamplingClassesGenerator with a specified random number generator. - :param random_generator: A random generator - + :param random_generator: A NumPy random number generator used to generate random numbers. + Defaults to a default random number generator seeded with 0. + :type random_generator: numpy.random.Generator """ super().__init__(random_generator=random_generator) self.transformed_labels_ = None @@ -69,13 +100,19 @@ def __init__(self, random_generator=default_rng(seed=0), ): @preprocess_inputs def generate(self, X, y, proportion_classes: dict = None): """ - Randomly samples instances for each classes - - :param X: - :param y: - :param proportion_classes: Example for having in total 50% of class 'A', 30% of class 'B', and 20% of class 'C' - proportion_classes={'A':0.5, 'B':0.3, 'C':0.2} - :return: + Randomly samples instances for each class based on the specified proportions. + + :param X: Input features, can be a pandas DataFrame or a numpy array. + :type X: Union[pandas.DataFrame, numpy.ndarray] + :param y: Target variable, must be a pandas Series or a numpy array. + :type y: Union[pandas.Series, numpy.ndarray] + :param proportion_classes: A dictionary specifying the desired proportion of each class. + The keys are class labels and the values are the desired proportions. + For example, to have 50% of class 'A', 30% of class 'B', and 20% of class 'C', + use `proportion_classes={'A': 0.5, 'B': 0.3, 'C': 0.2}`. + :type proportion_classes: dict, optional + :return: A tuple containing the sampled features (Xt) and the corresponding target values (yt). 
+ :rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray]] """ # local variables Xt = [] @@ -103,9 +140,11 @@ class RandomSamplingTargetsGenerator(ImbalanceGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the RandomSamplingTargetsGenerator with a specified random number generator. - :param random_generator: A random generator - :param sampling_proba_func: A function that takes y as input and returns a sampling probability + :param random_generator: A NumPy random number generator used to generate random numbers. + Defaults to a default random number generator seeded with 0. + :type random_generator: numpy.random.Generator """ super().__init__(random_generator=random_generator) self.transformed_labels_ = None @@ -113,11 +152,20 @@ def __init__(self, random_generator=default_rng(seed=0)): @preprocess_inputs def generate(self, X, y, sampling_proba_func=lambda y: normalize_proba(y)): """ - Randomly samples instances for each classes - - :param X: - :param y: - :return: + Randomly samples instances based on the target values in y using a specified sampling probability function. + + The sampling probability function is applied to the target values y to determine the probability of each instance being sampled. + By default, the target values are used to compute the normalized sampling probabilities. + + :param X: Input features, can be a pandas DataFrame or a numpy array. + :type X: Union[pandas.DataFrame, numpy.ndarray] + :param y: Target variable, must be a pandas Series or a numpy array. + :type y: Union[pandas.Series, numpy.ndarray] + :param sampling_proba_func: A function that takes as input target values (y) and returns a series of sampling probabilities. + The function should ensure that the probabilities are normalized. + :type sampling_proba_func: callable + :return: A tuple containing the sampled features (Xt) and the corresponding target values (yt). 
+ :rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray]] """ sampling_probabilities_ = sampling_proba_func(y) sampling_mask = self.random_generator.choice(X.shape[0], p=sampling_probabilities_, size=X.shape[0], diff --git a/badgers/generators/tabular_data/missingness.py b/badgers/generators/tabular_data/missingness.py index 1ef5b7f..05176d7 100644 --- a/badgers/generators/tabular_data/missingness.py +++ b/badgers/generators/tabular_data/missingness.py @@ -23,6 +23,18 @@ def __init__(self, random_generator: numpy.random.Generator = default_rng(seed=0 @abc.abstractmethod def generate(self, X, y, **params): + """ + Abstract method to generate missing values in the input data. + This should be overridden by subclasses. + + :param X: Input features, can be a pandas DataFrame or a numpy array. + :type X: Union[pandas.DataFrame, numpy.ndarray] + :param y: Target variable, can be a pandas Series or a numpy array. + If None, it is assumed that the target is not provided and will be ignored. + :type y: Union[pandas.Series, numpy.ndarray, None], optional + :param params: Additional keyword arguments that might be required for specific implementations. + :type params: dict + """ pass @@ -35,19 +47,27 @@ class MissingCompletelyAtRandom(MissingValueGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initializes the MissingCompletelyAtRandom class with a specified random generator. + + :param random_generator: A NumPy random number generator instance. Defaults to a new instance of `default_rng` with seed 0. + :type random_generator: numpy.random.Generator """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, percentage_missing: float = 0.1): """ - Computes indices of missing values using a uniform distribution. 
- - :param X: the input features - :param y: the target - :param percentage_missing: The percentage of missing values (float value between 0 and 1 included) - :return: Xt, yt + Introduces missing values into the input features `X` completely at random according to a specified percentage. + + :param X: The input features, which can be a pandas DataFrame or a numpy array. + :type X: Union[pandas.DataFrame, numpy.ndarray] + :param y: The target variable, which can be a pandas Series or a numpy array. + If not provided, it is assumed that the target is not needed and will be ignored. + :type y: Union[pandas.Series, numpy.ndarray, None], optional + :param percentage_missing: The proportion of values to be replaced with missing values, expressed as a float between 0 and 1. + :type percentage_missing: float + :return: A tuple containing the modified input features `Xt` with introduced missing values and the original target `y`. + :rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray, None]] """ assert 0 <= percentage_missing <= 1 # compute number of missing values per column @@ -74,18 +94,29 @@ class DummyMissingAtRandom(MissingValueGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initializes the DummyMissingAtRandom class with a specified random generator. + + :param random_generator: A NumPy random number generator instance. Defaults to a new instance of `default_rng` with seed 0. 
+ :type random_generator: numpy.random.Generator """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, percentage_missing: float = 0.1): """ - - :param X: the input features - :param y: the target - :param percentage_missing: The percentage of missing values (float value between 0 and 1 included) - :return: Xt, yt + Introduces missing values into the input features `X` at random based on another feature, + where the probability of a data instance X[_,i] missing depends upon another feature X[_,j], + and j is randomly chosen. + + :param X: The input features, which can be a pandas DataFrame or a numpy array. + :type X: Union[pandas.DataFrame, numpy.ndarray] + :param y: The target variable, which can be a pandas Series or a numpy array. + If not provided, it is assumed that the target is not needed and will be ignored. + :type y: Union[pandas.Series, numpy.ndarray, None], optional + :param percentage_missing: The proportion of values to be replaced with missing values, expressed as a float between 0 and 1. + :type percentage_missing: float + :return: A tuple containing the modified input features `Xt` with introduced missing values and the original target `y`. + :rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray, None]] """ assert 0 <= percentage_missing <= 1 # initialize probability with zeros @@ -124,18 +155,29 @@ class DummyMissingNotAtRandom(MissingValueGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initializes the DummyMissingNotAtRandom class with a specified random generator. + + :param random_generator: A NumPy random number generator instance. Defaults to a new instance of `default_rng` with seed 0. 
+ :type random_generator: numpy.random.Generator """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, percentage_missing): """ - - :param X: the input features - :param y: the target - :param percentage_missing: The percentage of missing values (float value between 0 and 1 included) - :return: Xt, yt + Introduces missing values into the input features `X` not at random, where the probability of a data instance X[i,j] missing + depends linearly upon its own value. Specifically, a data point X[i,j] = max(X[:,j]) has a missing probability of 1, and a + data point X[i,j] = min(X[:,j]) has a missing probability of 0. + + :param X: The input features, which can be a pandas DataFrame or a numpy array. + :type X: Union[pandas.DataFrame, numpy.ndarray] + :param y: The target variable, which can be a pandas Series or a numpy array. + If not provided, it is assumed that the target is not needed and will be ignored. + :type y: Union[pandas.Series, numpy.ndarray, None], optional + :param percentage_missing: The proportion of values to be replaced with missing values, expressed as a float between 0 and 1. + :type percentage_missing: float + :return: A tuple containing the modified input features `Xt` with introduced missing values and the original target `y`. + :rtype: Tuple[Union[pandas.DataFrame, numpy.ndarray], Union[pandas.Series, numpy.ndarray, None]] """ assert 0 <= percentage_missing <= 1 diff --git a/badgers/generators/tabular_data/noise.py b/badgers/generators/tabular_data/noise.py index feea246..04b9186 100644 --- a/badgers/generators/tabular_data/noise.py +++ b/badgers/generators/tabular_data/noise.py @@ -11,23 +11,33 @@ class NoiseGenerator(GeneratorMixin): """ - Base class for generators that add noise to tabular data + Base class for generators that add noise to tabular data. 
""" def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initialize the NoiseGenerator with a specified random number generator. + + :param random_generator: A random number generator instance from numpy.random, + used to introduce randomness in the noise generation process. """ self.random_generator = random_generator @abc.abstractmethod def generate(self, X, y, **params): + """ + Abstract method to generate noisy data. Must be implemented by subclasses. + + :param X: Input features (pandas DataFrame or numpy array). + :param y: Target variable (pandas Series or numpy array). + :param params: Additional parameters required for noise generation. + """ pass class GaussianNoiseGenerator(NoiseGenerator): """ - A generator that adds Gaussian white noise to the tabular data + A generator that adds Gaussian white noise to the tabular data. """ def __init__(self, random_generator=default_rng(seed=0)): @@ -41,15 +51,15 @@ def __init__(self, random_generator=default_rng(seed=0)): @preprocess_inputs def generate(self, X, y, noise_std): """ - Adds Gaussian white noise to the data. - The data is first standardized (each column has a mean = 0 and variance = 1). - The noise is generated from a normal distribution with standard deviation = `noise_std`. - The noise is added to the data. - - :param X: the input - :param y: the target - :param noise_std: The standard deviation of the noise to be added - :return: Xt, yt + Adds Gaussian white noise to the input data. + The data is first standardized such that each feature (column) has a mean of 0 and a variance of 1. + Gaussian noise is then generated from a normal distribution with a standard deviation equal to `noise_std`. + This noise is added to the standardized data. + + :param X: Input features (pandas DataFrame or numpy array). + :param y: Target variable (pandas Series or numpy array), which remains unchanged. 
+ :param noise_std: Standard deviation of the Gaussian noise to be added. + :return: Xt, yt where Xt is the noisy input features and yt is the unchanged target variable y. """ # standardize X scaler = StandardScaler() @@ -69,24 +79,26 @@ class GaussianNoiseClassesGenerator(NoiseGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the GaussianNoiseClassesGenerator with a specified random number generator. - :param random_generator: A random generator + :param random_generator: A random number generator instance from numpy.random, + used to introduce randomness in the noise generation process. """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, noise_std_per_class=dict()): """ - Add Gaussian white noise to the data. - the data is first standardized (each column has a mean = 0 and variance = 1). - The noise is generated from a normal distribution with standard deviation = `noise_std`. - The noise is added to the data. - - :param X: the input - :param y: the target - :param noise_std_per_class: A dictionary giving the standard deviation of the noise to be added for each class - key = class labels, values = noise std for this given class - :return: Xt, yt + Add Gaussian white noise to the data separately for each class. + The data is first standardized such that each feature (column) has a mean of 0 and a variance of 1. + Gaussian noise is then generated from a normal distribution with a standard deviation specified in `noise_std_per_class` for each class. + This noise is added to the standardized data for each class separately. + + :param X: Input features (pandas DataFrame or numpy array). + :param y: Target variable (pandas Series or numpy array). + :param noise_std_per_class: A dictionary specifying the standard deviation of the noise to be added for each class. + Keys are class labels, and values are the noise standard deviations for the corresponding classes. 
+ :return: Xt, yt where Xt is the noisy input features and yt is the unchanged target variable y. """ # standardize X scaler = StandardScaler() diff --git a/badgers/generators/tabular_data/outliers.py b/badgers/generators/tabular_data/outliers.py index 830fb0b..5876b94 100644 --- a/badgers/generators/tabular_data/outliers.py +++ b/badgers/generators/tabular_data/outliers.py @@ -21,12 +21,21 @@ class OutliersGenerator(GeneratorMixin): def __init__(self, random_generator: np.random.Generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initialize the OutliersGenerator with a random number generator. + + :param random_generator: An instance of numpy's random number generator (default is a new generator with seed 0). """ self.random_generator = random_generator @abc.abstractmethod def generate(self, X, y=None, **params): + """ + Abstract method to generate outlier data. Must be implemented by subclasses. + + :param X: Input features (pandas DataFrame or numpy array). + :param y: Target variable (pandas Series or numpy array). + :param params: Additional parameters required for outlier generation. + """ pass @@ -37,8 +46,9 @@ class ZScoreSamplingGenerator(OutliersGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the ZScoreSamplingGenerator with a random number generator. - :param random_generator: A random generator + :param random_generator: An instance of numpy's random number generator (default is a new generator with seed 0). """ super().__init__(random_generator) @@ -47,17 +57,19 @@ def generate(self, X, y, n_outliers: int = 10): """ Randomly generates outliers as data points with a z-score > 3. - 1. Standardize the input data (mean = 0, variance = 1) - 3.
Generate outliers as follows: - - the sign is randomly chosen - - for each dimension: the value is equal to 3 + a random number following an exponential distribution function - with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html) - 4. Inverse the standardization transformation - - :param X: the input features - :param y: the class labels, target values or None (if none yt - :param n_outliers: The number of outliers to generate - :return: + The process involves the following steps: + 1. Standardize the input data so that it has a mean of 0 and a variance of 1. + 2. Generate outliers by: + - choosing a random sign for each outlier. + - for each dimension of the data, set the value to be 3 plus a random number drawn from an exponential distribution + with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html). + 3. Apply the inverse of the standardization transformation to convert the generated outliers back to the original scale. + + :param X: the input features (pandas DataFrame or numpy array). + :param y: the class labels, target values, or None (if not provided). + :param n_outliers: The number of outliers to generate. + :return: A tuple containing the augmented feature matrix with added outliers and the corresponding target values. + If `y` is None, the returned target values will also be None. """ # standardize X @@ -91,27 +103,31 @@ class HypersphereSamplingGenerator(OutliersGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the HypersphereSamplingGenerator with a random number generator. - :param random_generator: A random generator - + :param random_generator: An instance of numpy's random number generator (default is a new generator with seed 0). 
""" super().__init__(random_generator) @preprocess_inputs def generate(self, X, y=None, n_outliers: int = 10): """ - Randomly generates outliers as data points with a z-score > 3. - - 1. Standardize the input data (mean = 0, variance = 1) - 3. Generate outliers on a hypersphere (see https://en.wikipedia.org/wiki/N-sphere#Spherical_coordinates): - - angles are chosen uniformly at random - - radius is = 3 + a random number following an exponential distribution function with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html) - 4. Inverse the standardization transformation - - :param X: the input features - :param y: not used - :param n_outliers: The number of outliers to generate - :return: + Randomly generates outliers by sampling points from a hypersphere. + + The process involves the following steps: + 1. Standardize the input data so that it has a mean of 0 and a variance of 1. + 2. Generate outliers by: + - choosing angles uniformly at random for each dimension of the data. + - setting the radius to be 3 plus a random number drawn from an exponential distribution with default parameters + (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html). + 3. Convert the spherical coordinates to Cartesian coordinates. + 4. Apply the inverse of the standardization transformation to convert the generated outliers back to the original scale. + + :param X: the input features (pandas DataFrame or numpy array). + :param y: the class labels, target values, or None (if not provided). + :param n_outliers: The number of outliers to generate. + :return: A tuple containing the augmented feature matrix with added outliers and the corresponding target values. + If `y` is None, the returned target values will also be None. 
""" # standardize X @@ -166,10 +182,12 @@ def generate(self, X, y=None, n_outliers: int = 10, bins: int = 10): All values generated for each feature are simply concatenated (independence hypothesis!). - :param X: the input features - :param y: not used - :param params: - :return: + :param X: the input features (pandas DataFrame or numpy array). + :param y: the class labels, target values, or None (not used). + :param n_outliers: The number of outliers to generate. + :param bins: The number of bins to use when creating histograms for each feature. + :return: A tuple containing the augmented feature matrix with added outliers and the corresponding target values. + If `y` is None, the returned target values will also be None. """ outliers = [] @@ -214,9 +232,9 @@ class HistogramSamplingGenerator(OutliersGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the HistogramSamplingGenerator with a random number generator. - :param random_generator: A random generator - + :param random_generator: An instance of numpy's random number generator (default is a new generator with seed 0). """ super().__init__(random_generator) @@ -289,8 +307,9 @@ class LowDensitySamplingGenerator(OutliersGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the LowDensitySamplingGenerator with a random number generator. - :param random_generator: A random generator + :param random_generator: An instance of numpy's random number generator (default is a new generator with seed 0). 
""" super().__init__(random_generator=random_generator) self.density_estimator = KernelDensity(bandwidth="scott") @@ -363,9 +382,10 @@ class DecompositionAndOutlierGenerator(OutliersGenerator): def __init__(self, decomposition_transformer: sklearn.base.TransformerMixin = PCA(n_components=2), outlier_generator: OutliersGenerator = ZScoreSamplingGenerator(default_rng(0))): """ + Initialize the DecompositionAndOutlierGenerator with a decomposition transformer and an outlier generator. - :param decomposition_transformer: The dimensionality reduction transformer to be used before the outlier transformer - :param outlier_generator: The outlier transformer to be used after the dimensionality has been reduced + :param decomposition_transformer: The dimensionality reduction transformer to be applied to the data before generating outliers. + :param outlier_generator: The outlier generator to be used after the data has been transformed. """ assert hasattr( decomposition_transformer, diff --git a/badgers/generators/text/__init__.py b/badgers/generators/text/__init__.py index bada14c..0b16d3a 100644 --- a/badgers/generators/text/__init__.py +++ b/badgers/generators/text/__init__.py @@ -1,3 +1,3 @@ """ -Module containing all the transformers that accept text data as input +This module contains all the generator functions designed to process and yield data from text inputs. """ \ No newline at end of file diff --git a/badgers/generators/text/typos.py b/badgers/generators/text/typos.py index 971c24c..7c83574 100644 --- a/badgers/generators/text/typos.py +++ b/badgers/generators/text/typos.py @@ -13,9 +13,10 @@ class TyposGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the TyposGenerator with a given random number generator. - :param random_generator: numpy.random.Generator, default default_rng(seed=0) - A random generator + :param random_generator: A random number generator used to introduce randomness in typo generation. 
+ :type random_generator: numpy.random.Generator, default=default_rng(seed=0) """ self.random_generator = random_generator @@ -32,20 +33,23 @@ class SwapLettersGenerator(TyposGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initialize the SwapLettersGenerator with a given random number generator. + + :param random_generator: A random number generator used to introduce randomness in letter swapping. + :type random_generator: numpy.random.Generator, default=default_rng(seed=0) """ super().__init__(random_generator) def generate(self, X, y, swap_proba:float=0.1) -> Tuple: """ - For each word with a length greater than 3, apply a single swap with probability `self.swap_proba` - Where the swap happens is determined randomly - - - :param X: A list of words where we apply typos - :param y: not used - :param swap_proba: Each word with a length greater than 3 will have this probability to contain a switch (max one per word) - :return: the transformed list of words + For each word with a length greater than 3, apply a single swap with probability `swap_proba`. + The position of the swap is chosen randomly among possible adjacent pairs of letters, + excluding the first and last letters of the word. + :param X: A list of words where typos are introduced. + :param y: Not used in this method. + :param swap_proba: Probability that a word with more than 3 characters will have one adjacent pair of letters swapped. + This probability applies to each eligible word independently. + :return: A tuple containing the transformed list of words and the original labels `y` (unchanged). """ for i in range(len(X)): if len(X[i]) > 3 and self.random_generator.random() <= swap_proba: @@ -64,8 +68,10 @@ class LeetSpeakGenerator(TyposGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the LeetSpeakGenerator with a given random number generator. 
- :param random_generator: a random number generator + :param random_generator: A random number generator used to introduce randomness in leetspeak transformation. + :type random_generator: numpy.random.Generator, default=default_rng(seed=0) """ super().__init__(random_generator=random_generator) self.leet_speak_mapping = { @@ -102,10 +108,14 @@ def __init__(self, random_generator=default_rng(seed=0)): def randomly_replace_letter(self, letter, replacement_proba): """ - Randomly replace a letter with its leet counterpart - :param letter: - :param replacement_proba: the probability of replacing a letter with its leet counterpart - :return: + Randomly replace a letter with its leet counterpart based on the provided probability. + + :param letter: The letter to potentially replace. + :type letter: str + :param replacement_proba: The probability of replacing the letter with its leet counterpart. + :type replacement_proba: float + :return: The replaced letter if a random draw is strictly less than `replacement_proba`, otherwise the original letter. + :rtype: str """ if letter.upper() in self.leet_speak_mapping: if self.random_generator.random() < replacement_proba: @@ -115,12 +125,22 @@ def randomly_replace_letter(self, letter, replacement_proba): def generate(self, X, y, replacement_proba: float = 0.1) -> Tuple: """ + Apply leet speak transformation to a list of words. - :param X: A list of words where we apply leet replacement - :param y: - :param replacement_proba: the probability of replacing a letter with its leet counterpart - :return: + :param X: A list of words where leet speak transformation is applied. + :param y: The labels associated with the words, which remain unchanged. + :param replacement_proba: The probability of replacing a letter with its leet counterpart. + This probability applies to each letter in each word independently. + :return: A tuple containing the transformed list of words and the original labels `y` (unchanged).
""" + transformed_X = [] + for word in X: + transformed_word = ''.join( + self.randomly_replace_letter(letter, replacement_proba) for letter in word + ) + transformed_X.append(transformed_word) + + return transformed_X, y assert 0 <= replacement_proba <= 1 Xt = [ ''.join([self.randomly_replace_letter(l, replacement_proba=replacement_proba) for l in word]) @@ -133,16 +153,23 @@ class SwapCaseGenerator(TyposGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initialize the SwapCaseGenerator with a given random number generator. + + :param random_generator: A random number generator used to introduce randomness in case swapping. + :type random_generator: numpy.random.Generator, default=default_rng(seed=0) """ super().__init__(random_generator) def randomly_swapcase_letter(self, letter, swapcase_proba): """ - Randomly swap case a letter - :param letter: - :param swapcase_proba: the probability of swapping case - :return: + Randomly swap the case of a letter based on the provided probability. + + :param letter: The letter whose case may be swapped. + :type letter: str + :param swapcase_proba: The probability of swapping the case of the letter. + :type swapcase_proba: float + :return: The letter with swapped case if a random draw is strictly less than `swapcase_proba`, otherwise the original letter. + :rtype: str """ if self.random_generator.random() < swapcase_proba: letter = letter.swapcase() @@ -150,6 +177,15 @@ def randomly_swapcase_letter(self, letter, swapcase_proba): return letter def generate(self, X, y, swapcase_proba: float = 0.1) -> Tuple: + """ + Apply random case swapping to each letter in a list of words. + + :param X: A list of words where random case swapping is applied. + :param y: The labels associated with the words, which remain unchanged. + :param swapcase_proba: The probability of swapping the case of each letter.
+ This probability applies to each letter in each word independently. + :return: A tuple containing the transformed list of words and the original labels `y` (unchanged). + """ assert 0 <= swapcase_proba <= 1 Xt = [ ''.join([self.randomly_swapcase_letter(l, swapcase_proba=swapcase_proba) for l in word]) diff --git a/badgers/generators/time_series/changepoints.py b/badgers/generators/time_series/changepoints.py index eb0ff32..cbb9ed1 100644 --- a/badgers/generators/time_series/changepoints.py +++ b/badgers/generators/time_series/changepoints.py @@ -14,13 +14,25 @@ class ChangePointsGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0), ): """ - :param random_generator: a random number generator + Initialize the ChangePointsGenerator with a given random number generator. + + :param random_generator: A random number generator instance (default is numpy's default_rng with seed 0). """ self.random_generator = random_generator self.changepoints = None @abc.abstractmethod def generate(self, X, y, **params) -> Tuple: + """ + Abstract method that generates changepoints in the given time-series data. + + This method must be overridden by subclasses. + + :param X: Input features of the time-series data. + :param y: Target values of the time-series data. + :param params: Additional parameters required for changepoint generation. + :return: A tuple containing the modified time-series data and the generated changepoints. + """ pass @@ -30,19 +42,25 @@ class RandomChangeInMeanGenerator(ChangePointsGenerator): """ def __init__(self, random_generator=default_rng(seed=0)): + """ + Initialize the RandomChangeInMeanGenerator with a given random number generator. + + :param random_generator: A random number generator instance (default is numpy's default_rng with seed 0). 
+ """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, n_changepoints: int = 10, min_change: float = -5, max_change: float = 5) -> Tuple: """ + Generate random changepoints in the time-series data where the mean changes at each changepoint. - :param X: - :param y: - :param max_change: - :param min_change: - :param n_changepoints: - :return: + :param X: Input features of the time-series data. + :param y: Target values of the time-series data. + :param n_changepoints: Number of changepoints to generate. + :param min_change: Minimum value of the change in mean. + :param max_change: Maximum value of the change in mean. + :return: A tuple containing the modified time-series data and the generated changepoints. """ # Generate change points self.changepoints = list( diff --git a/badgers/generators/time_series/missingness.py b/badgers/generators/time_series/missingness.py index 723c75d..678b0a3 100644 --- a/badgers/generators/time_series/missingness.py +++ b/badgers/generators/time_series/missingness.py @@ -15,16 +15,15 @@ class MissingValuesGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: a random number generator - :param n_outliers: the number of outliers to generate + Initialize the MissingValuesGenerator with a given random number generator. + + :param random_generator: An instance of a random number generator from NumPy, + used to introduce randomness in the generation process. + Defaults to a default_rng seeded with 0. """ self.random_generator = random_generator self.missing_indices_ = [] - @abc.abstractmethod - def generate(self, X, y, **params) -> Tuple: - pass - class MissingAtRandomGenerator(MissingValuesGenerator): """ @@ -33,20 +32,23 @@ class MissingAtRandomGenerator(MissingValuesGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the MissingAtRandomGenerator with a given random number generator. 
- :param random_generator: a random number generator - + :param random_generator: An instance of a random number generator from NumPy, + used to introduce randomness in the generation process. + Defaults to a default_rng seeded with 0. """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, n_missing: int = 10) -> Tuple: """ - Randomly set values to np.nan (missing) - :param X: - :param y: - :param n_missing: the number of outliers to generate - :return: + Randomly sets a specified number of values in the input array X to np.nan, representing missing values. + + :param X: A numpy array of shape (n_samples, n_features) containing the input time-series data. + :param y: A numpy array of shape (n_samples,) containing the target values. This parameter is not modified by this method. + :param n_missing: The number of missing values to randomly introduce into the data. Defaults to 10. + :return: A tuple (X_out, y_out) where X_out is the modified array with missing values and y_out is the original target array. """ # generate missing values indices and values rows = self.random_generator.choice(X.shape[0], size=n_missing, replace=False, p=None) diff --git a/badgers/generators/time_series/noise.py b/badgers/generators/time_series/noise.py index 51d778e..40a8dc2 100644 --- a/badgers/generators/time_series/noise.py +++ b/badgers/generators/time_series/noise.py @@ -17,24 +17,51 @@ class NoiseGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initializes the NoiseGenerator with a specified random generator. + :param random_generator: An instance of a random number generator from `numpy.random`. + Default is `default_rng(seed=0)`. 
""" self.random_generator = random_generator @abc.abstractmethod - def generate(self, X, y, **params) -> Tuple: + def generate(self, X, y, **params) -> Tuple[pd.DataFrame, pd.Series]: + """ + Abstract method to be implemented by subclasses. Adds noise to the input data. + + :param X: Input features DataFrame. + :param y: Target Series. + :param params: Additional parameters that might be required for noise generation. + :return: A tuple containing the modified features DataFrame and the target Series. + """ pass class LocalGaussianNoiseGenerator(NoiseGenerator): - def __init__(self, random_generator=default_rng(seed=0), ): + def __init__(self, random_generator=default_rng(seed=0)): + """ + Initializes the LocalGaussianNoiseGenerator with a specified random generator. + + :param random_generator: An instance of a random number generator from `numpy.random`. + Default is `default_rng(seed=0)`. + """ super().__init__(random_generator=random_generator) @preprocess_inputs - def generate(self, X, y, n_patterns: int = 10, min_width_pattern: int = 5, max_width_patterns: int=10, - noise_std: float = 0.1) -> Tuple: - # generate extreme values indices and values + def generate(self, X, y, n_patterns: int = 10, min_width_pattern: int = 5, max_width_patterns: int = 10, + noise_std: float = 0.1) -> Tuple[pd.DataFrame, pd.Series]: + """ + Adds Gaussian noise to randomly selected local patterns within the input data. + + :param X: Input features DataFrame. + :param y: Target Series. + :param n_patterns: Number of local patterns to add noise to. + :param min_width_pattern: Minimum width of each pattern. + :param max_width_patterns: Maximum width of each pattern. + :param noise_std: Standard deviation of the Gaussian noise. + :return: A tuple containing the modified features DataFrame and the original target Series. 
+ """ + # Generate indices for random patterns self.patterns_indices_ = generate_random_patterns_indices( random_generator=self.random_generator, n_patterns=n_patterns, @@ -43,35 +70,39 @@ def generate(self, X, y, n_patterns: int = 10, min_width_pattern: int = 5, max_w max_width_patterns=max_width_patterns) scaler = StandardScaler() - # fit, transform + # Fit and transform the data scaler.fit(X) Xt = scaler.transform(X) + # Add Gaussian noise to each pattern for (start, end) in self.patterns_indices_: Xt[start:end, :] += self.random_generator.normal(loc=0, scale=noise_std, size=(end-start, Xt.shape[1])) - # inverse standardization + # Inverse standardize the data return pd.DataFrame(data=scaler.inverse_transform(Xt), columns=X.columns, index=X.index), y class GlobalGaussianNoiseGenerator(NoiseGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initializes the GlobalGaussianNoiseGenerator with a specified random generator. + :param random_generator: An instance of a random number generator from `numpy.random`. + Default is `default_rng(seed=0)`. """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, noise_std: float = 0.1): """ - Add Gaussian white noise to the data. - the data is first standardized (each column has a mean = 0 and variance = 1). + Adds Gaussian white noise to the entire dataset. + The data is first standardized (each feature has a mean = 0 and variance = 1). The noise is generated from a normal distribution with standard deviation = `noise_std`. - The noise is added to the data. + The noise is then added to the standardized data, and the result is inverse-standardized to restore the original scale. - :param noise_std: The standard deviation of the noise to be added - :param X: - :return: + :param X: Input features DataFrame. + :param y: Target Series. + :param noise_std: The standard deviation of the noise to be added. 
+ :return: A tuple containing the modified features DataFrame and the original target Series. """ scaler = StandardScaler() # fit, transform diff --git a/badgers/generators/time_series/outliers.py b/badgers/generators/time_series/outliers.py index 889ebe3..c36fb21 100644 --- a/badgers/generators/time_series/outliers.py +++ b/badgers/generators/time_series/outliers.py @@ -16,14 +16,26 @@ class OutliersGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: a random number generator - :param n_outliers: the number of outliers to generate + Initialize the OutliersGenerator with a specified random number generator. + + :param random_generator: An instance of a random number generator from NumPy's random module. + Default is a default_rng seeded with 0. """ self.random_generator = random_generator self.outliers_indices_ = [] @abc.abstractmethod def generate(self, X, y, **params) -> Tuple: + """ + Generate point outliers in the given time-series data. + + This method should be overridden by subclasses to implement specific outlier generation strategies. + + :param X: Input features (time-series data). + :param y: Target values (optional, may be None). + :param params: Additional parameters that may be required for outlier generation. + :return: A tuple containing the modified data with outliers and any additional information. + """ pass @@ -34,20 +46,23 @@ class RandomZerosGenerator(OutliersGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the RandomZerosGenerator with a specified random number generator. - :param random_generator: a random number generator - + :param random_generator: An instance of a random number generator from NumPy's random module. + Default is a default_rng seeded with 0. 
""" super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, n_outliers: int = 10) -> Tuple: """ - Randomly set values to zero - :param X: - :param y: - :param n_outliers: the number of outliers to generate - :return: + Randomly set a specified number of values in the input data to zero. + + :param X: The input features (time-series data), expected to be a 2D numpy array. + :param y: The target values, optional and can be None. + :param n_outliers: The number of outliers to generate by setting to zero. + Defaults to 10. + :return: A tuple containing the modified data with outliers and the indices of the generated outliers. """ # generate extreme values indices and values rows = self.random_generator.choice(X.shape[0], size=n_outliers, replace=False, p=None) @@ -69,8 +84,10 @@ class LocalZScoreGenerator(OutliersGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initialize the LocalZScoreGenerator with a specified random number generator. - :param random_generator: a random number generator + :param random_generator: An instance of a random number generator from NumPy's random module. + Default is a default_rng seeded with 0. """ super().__init__(random_generator=random_generator) @@ -78,15 +95,17 @@ def __init__(self, random_generator=default_rng(seed=0)): def generate(self, X, y, n_outliers: int = 10, local_window_size: int = 10): """ - Computes indices of extreme values using a uniform distribution. - Computes the values at random outside the range [mean(X) - 3 sigma(X), mean(X) + 3 sigma(X)]. - The sign of the extreme value is the same as the value being replaced. - - :param X: - :param y: - :param n_outliers: the number of outliers to generate - :param local_window_size: the width (number of data points) of the local window to compute local Z-Score - :return: the transformed array + Generates outliers based on local Z-scores. 
+ + For each outlier, a local window of size `local_window_size` is selected, and the local mean and standard deviation are computed. + An outlier is then generated by setting the value at a randomly chosen index within this window to a value outside the range + [local_mean - 3 * local_std, local_mean + 3 * local_std]. The sign of the outlier value is the same as the sign of the original value. + + :param X: The input features (time-series data), expected to be a 2D numpy array. + :param y: The target values, optional and can be None. + :param n_outliers: The number of outliers to generate. Defaults to 10. + :param local_window_size: The width (number of data points) of the local window used to compute the local Z-score. Defaults to 10. + :return: A tuple containing the modified data with outliers and the indices of the generated outliers. """ # generate extreme values indices and values delta = int(local_window_size / 2) diff --git a/badgers/generators/time_series/patterns.py b/badgers/generators/time_series/patterns.py index 8d996c7..5165855 100644 --- a/badgers/generators/time_series/patterns.py +++ b/badgers/generators/time_series/patterns.py @@ -12,14 +12,36 @@ def add_offset(values: np.array, offset: float = 0.) -> np.array: + """ + Adds an offset to the given array of values. + + :param values: The input array of values to which the offset will be added. + :param offset: The offset value to be added to each element in the array. + :return: A new array with the offset added to each element. + """ return values + offset def add_linear_trend(values: np.array, start_value: float = 0., end_value: float = 1.) -> np.array: + """ + Adds a linear trend to the given array of values. + + :param values: The input array of values to which the linear trend will be added. + :param start_value: The starting value of the linear trend. + :param end_value: The ending value of the linear trend. + :return: A new array with the linear trend added to each element. 
+ """ return values + np.linspace(start_value - values[0], end_value - values[-1], len(values)) def scale(values: np.array, scaling_factor: float = 1.) -> np.array: + """ + Scales the given array of values by a specified factor. + + :param values: The input array of values to be scaled. + :param scaling_factor: The factor by which to scale the values. + :return: A new array with each element scaled by the specified factor. + """ return values * scaling_factor @@ -27,8 +49,10 @@ class Pattern: def __init__(self, values: np.array): """ - Pattern constructor - :param values: a 1D or 2D numpy array (rows = Time axis, columns = Features), if a single feature is used (1D), the values is automatically reshaped using reshape(-1,1) + Initialize a Pattern object. + + :param values: A 1D or 2D numpy array where rows represent the time axis and columns represent features. + If only a single feature is provided (1D array), it is automatically reshaped into a 2D array with shape (-1, 1). """ if values.ndim == 1: values = values.reshape(-1, 1) @@ -50,26 +74,37 @@ class PatternsGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: a random number generator - :param n_patterns: the number of patterns to generate + Initialize the PatternsGenerator with a random number generator. + + :param random_generator: An instance of a random number generator from `numpy.random`, used for generating random patterns. """ self.random_generator = random_generator self.patterns_indices_ = [] @abc.abstractmethod def generate(self, X, y, **params) -> Tuple: + """ + Abstract method to inject patterns into the time-series data. + This method should be overridden by subclasses to implement specific pattern generation logic. + + :param X: Input time-series data as a 2D numpy array or pandas DataFrame. + :param y: Target values as a 1D numpy array or pandas Series. 
+ :param params: Additional parameters that might be required for pattern generation. + :return: A tuple containing the modified time-series data and target values. + """ pass def _inject_pattern(self, X: pd.DataFrame, p: Pattern, start_index: int, end_index: int, - scaling_factor: Union[float,str] = 'auto'): + scaling_factor: Union[float, str, None] = 'auto'): """ - Utility function to inject a predefined pattern `p` into a signal `X` - :param X: the signal to inject the pattern - :param p: the pattern to be injected - :param start_index: - :param end_index: - :param scaling_factor: float | None | "auto" (default "auto") - :return: the transformed signal where the pattern has been injected + Utility function to inject a predefined pattern `p` into a signal `X`. + + :param X: The signal (time-series data) to inject the pattern into, as a pandas DataFrame. + :param p: The pattern to be injected, represented as a `Pattern` object. + :param start_index: The starting index in `X` where the pattern injection begins. + :param end_index: The ending index in `X` where the pattern injection ends. + :param scaling_factor: The factor by which to scale the pattern before injection. Can be a float, 'auto' to scale based on the signal's range, or None to apply no scaling. + :return: The transformed signal (time-series data) as a pandas DataFrame, where the pattern has been injected. """ # start and end values @@ -101,12 +136,30 @@ class RandomlySpacedPatterns(PatternsGenerator): """ def __init__(self, random_generator=default_rng(seed=0)): + """ + Initialize the RandomlySpacedPatterns with a random number generator. + + :param random_generator: An instance of a random number generator from `numpy.random`, used for generating random patterns. 
+ """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, n_patterns: int = 10, min_width_pattern: int = 5, max_width_patterns: int = 10, - pattern: Pattern = Pattern(values=np.array([0, 0, 0, 0, 0]))) -> Tuple: + pattern: Pattern = Pattern(values=np.array([0, 0, 0, 0, 0])), + scaling_factor: Union[float, str, None] = 'auto') -> Tuple: + """ + Inject patterns with random width and indices in the time-series data. + + :param X: Input time-series data as a 2D numpy array or pandas DataFrame. + :param y: Target values as a 1D numpy array or pandas Series (not used in this method). + :param n_patterns: The number of patterns to inject into the time-series data. + :param min_width_pattern: The minimum width of the pattern to inject. + :param max_width_patterns: The maximum width of the pattern to inject. + :param pattern: The pattern to inject, represented as a `Pattern` object. + :param scaling_factor: The factor by which to scale the pattern before injection. Can be a float, 'auto' to scale based on the signal's range, or None to apply no scaling. + :return: A tuple containing the transformed time-series data and the unchanged target values. + """ # generate patterns indices and values self.patterns_indices_ = generate_random_patterns_indices( random_generator=self.random_generator, @@ -116,7 +169,7 @@ def generate(self, X, y, n_patterns: int = 10, min_width_pattern: int = 5, max_width_patterns=max_width_patterns) for (start, end) in self.patterns_indices_: - X = self._inject_pattern(X, p=pattern, start_index=start, end_index=end, scaling_factor='auto') + X = self._inject_pattern(X, p=pattern, start_index=start, end_index=end, scaling_factor=scaling_factor) return X, y @@ -127,6 +180,11 @@ class RandomlySpacedConstantPatterns(PatternsGenerator): """ def __init__(self, random_generator=default_rng(seed=0)): + """ + Initialize the RandomlySpacedConstantPatterns with a random number generator. 
+ + :param random_generator: An instance of a random number generator from `numpy.random`, used for generating random patterns. + """ super().__init__(random_generator=random_generator) @preprocess_inputs @@ -134,14 +192,15 @@ def generate(self, X, y, n_patterns: int = 10, min_width_pattern: int = 5, max_width_patterns: int = 10, constant_value: float = 0) -> Tuple: """ - - :param X: - :param y: - :param n_patterns: - :param min_width_pattern: - :param max_width_patterns: - :param constant_value: - :return: + Generate constant patterns with random width and indices in the time-series data. + + :param X: Input time-series data as a 2D numpy array or pandas DataFrame. + :param y: Target values as a 1D numpy array or pandas Series (not used in this method). + :param n_patterns: The number of constant patterns to inject into the time-series data. + :param min_width_pattern: The minimum width of each constant pattern to inject. + :param max_width_patterns: The maximum width of each constant pattern to inject. + :param constant_value: The constant value of the patterns to inject. + :return: A tuple containing the transformed time-series data and the unchanged target values. """ # generate patterns indices and values self.patterns_indices_ = generate_random_patterns_indices( @@ -163,19 +222,25 @@ class RandomlySpacedLinearPatterns(PatternsGenerator): """ def __init__(self, random_generator=default_rng(seed=0)): + """ + Initialize the RandomlySpacedLinearPatterns with a random number generator. + + :param random_generator: An instance of a random number generator from `numpy.random`, used for generating random patterns. 
+ """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, n_patterns: int = 10, min_width_pattern: int = 5, max_width_patterns: int = 10) -> Tuple: """ - - :param X: - :param y: - :param n_patterns: - :param min_width_pattern: - :param max_width_patterns: - :return: + Generate linear patterns with random width and indices in the time-series data. + + :param X: Input time-series data as a 2D numpy array or pandas DataFrame. + :param y: Target values as a 1D numpy array or pandas Series (not used in this method). + :param n_patterns: The number of linear patterns to inject into the time-series data. + :param min_width_pattern: The minimum width of each linear pattern to inject. + :param max_width_patterns: The maximum width of each linear pattern to inject. + :return: A tuple containing the transformed time-series data and the unchanged target values. """ # generate patterns indices and values self.patterns_indices_ = generate_random_patterns_indices( diff --git a/badgers/generators/time_series/seasons.py b/badgers/generators/time_series/seasons.py index 6f9cb57..3be1da9 100644 --- a/badgers/generators/time_series/seasons.py +++ b/badgers/generators/time_series/seasons.py @@ -14,12 +14,20 @@ class SeasonsGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: a random number generator + :param random_generator: A random number generator instance used for generating random numbers. """ self.random_generator = random_generator @abc.abstractmethod def generate(self, X, y, **params) -> Tuple: + """ + Generates seasonal patterns in the input time-series data. + + :param X: Input features (time-series data). + :param y: Target variable (can be None if not applicable). + :param params: Additional parameters that may be required for generating seasons. + :return: A tuple containing the modified input features and target variable. 
+ """ pass @@ -34,11 +42,12 @@ def __init__(self, random_generator=default_rng(seed=0)): @preprocess_inputs def generate(self, X, y, period: int = 10) -> Tuple: """ + Adds a global sinusoidal seasonal pattern to the input time-series data. - :param X: - :param y: - :param period: the period for the season - :return: + :param X: Input features (time-series data). Expected to be a 2D numpy array where each row represents a time step. + :param y: Target variable (can be None if not applicable). Expected to be a 1D numpy array. + :param period: The period of the sinusoidal season. Determines the length of one complete cycle of the sinusoidal wave. + :return: A tuple containing the modified input features with the added sinusoidal season and the unchanged target variable. """ t = np.arange(len(X)) season = np.sin(t[:,np.newaxis]*2*np.pi/period) diff --git a/badgers/generators/time_series/transmission_errors.py b/badgers/generators/time_series/transmission_errors.py index bf612ae..4cf61f3 100644 --- a/badgers/generators/time_series/transmission_errors.py +++ b/badgers/generators/time_series/transmission_errors.py @@ -19,12 +19,22 @@ class TransmissionErrorGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: A random generator + Initializes the TransmissionErrorGenerator with a specified random number generator. + + :param random_generator: A random number generator instance (default is a NumPy random generator seeded with 0). """ self.random_generator = random_generator @abc.abstractmethod - def generate(self, X, y, **params) -> Tuple: + def generate(self, X, y, **params) -> Tuple[pd.DataFrame, pd.Series]: + """ + Abstract method to generate transmission errors on the input data. + + :param X: Input features, expected to be a pandas DataFrame. + :param y: Target variable, expected to be a pandas Series. + :param params: Additional parameters that might be needed for generating errors. 
+ :return: A tuple containing the modified input features and target variable with transmission errors applied. + """ pass @@ -37,9 +47,9 @@ class RandomTimeSwitchGenerator(TransmissionErrorGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initializes the RandomTimeSwitchGenerator with a specified random number generator. - :param random_generator: - + :param random_generator: A random number generator instance (default is a NumPy random generator seeded with 0). """ super().__init__(random_generator=random_generator) self.switch_indices_ = None @@ -47,14 +57,16 @@ def __init__(self, random_generator=default_rng(seed=0)): @preprocess_inputs def generate(self, X, y, n_switches: int = 10) -> Tuple: """ - Switch `n_switches` values between X[i] and X[i+1] where i is chosen uniformly at random in [0,len(X)-1] + Introduces `n_switches` random switches in the input time series data X. - Nothing happens to y + This method randomly selects `n_switches` pairs of consecutive indices (i, i+1) and swaps their values in X. + The target variable y remains unchanged. - :param X: - :param y: - :param n_switches: number of switches - :return: Xt, y the transformed time series data and y (the same as input) + :param X: A pandas DataFrame representing the input time series data. + :param y: A pandas Series representing the target variable (remains unchanged). + :param n_switches: An integer specifying the number of random switches to introduce in X. + :return: A tuple (Xt, y) where Xt is the modified input time series data with random switches applied, + and y is the original target variable. """ assert n_switches > 0, 'n_switches should be strictly greater than 0' @@ -78,9 +90,9 @@ class RandomRepeatGenerator(TransmissionErrorGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initializes the RandomRepeatGenerator with a specified random number generator. 
- :param random_generator: - + :param random_generator: A random number generator instance (default is a NumPy random generator seeded with 0). """ super().__init__(random_generator=random_generator) self.repeats_ = None # to store the indices of the repeats (from the original X) and the length of the repeat @@ -89,13 +101,19 @@ def __init__(self, random_generator=default_rng(seed=0)): def generate(self, X, y, n_repeats: int = 10, min_nb_repeats: int = 1, max_nb_repeats: int = 10) -> Tuple: """ - - :param X: - :param y: - :param n_repeats: number of values that will be repeated - :param min_nb_repeats: the minimum number of repeats - :param max_nb_repeats: the maximum number of repeats - :return: Xt, y the transformed time series data and y (the same as input) + Introduces `n_repeats` random repetitions in the input time series data X. + + This method randomly selects `n_repeats` indices from X and repeats each selected value a random number of times between + `min_nb_repeats` and `max_nb_repeats`. The repeated values are inserted immediately after the selected index in X. + The target variable y remains unchanged. + + :param X: A pandas DataFrame representing the input time series data. + :param y: A pandas Series representing the target variable (remains unchanged). + :param n_repeats: An integer specifying the number of random repetitions to introduce in X. + :param min_nb_repeats: An integer specifying the minimum number of times a value can be repeated. + :param max_nb_repeats: An integer specifying the maximum number of times a value can be repeated. + :return: A tuple (Xt, y) where Xt is the modified input time series data with random repetitions applied, + and y is the original target variable. 
""" assert n_repeats > 0, 'n_repeats should be strictly greater than 0' @@ -133,9 +151,9 @@ class RandomDropGenerator(TransmissionErrorGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initializes the RandomDropGenerator with a specified random number generator. - :param random_generator: - + :param random_generator: A random number generator instance (default is a NumPy random generator seeded with 0). """ super().__init__(random_generator=random_generator) self.drops_indices_ = None # to store the indices of the drops @@ -143,11 +161,16 @@ def __init__(self, random_generator=default_rng(seed=0)): @preprocess_inputs def generate(self, X, y, n_drops: int = 10) -> Tuple: """ + Introduces `n_drops` random drops in the input time series data X. + + This method randomly selects `n_drops` indices from X and removes the corresponding rows. + The target variable y remains unchanged. - :param X: time series data - :param y: not used - :param n_drops: number of values to drop from the time series - :return: Xt, y the transformed time series data and y (the same as input) + :param X: A pandas DataFrame representing the input time series data. + :param y: A pandas Series representing the target variable (remains unchanged). + :param n_drops: An integer specifying the number of random drops to introduce in X. + :return: A tuple (Xt, y) where Xt is the modified input time series data with random drops applied, + and y is the original target variable. """ assert n_drops > 0, 'n_drops should be strictly greater than 0' @@ -168,9 +191,9 @@ class LocalRegionsRandomDropGenerator(TransmissionErrorGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initializes the LocalRegionsRandomDropGenerator with a specified random number generator. - :param random_generator: - + :param random_generator: A random number generator instance (default is a NumPy random generator seeded with 0). 
""" super().__init__(random_generator=random_generator) self.drops_indices_ = None # to store the indices of the drops @@ -180,14 +203,20 @@ def __init__(self, random_generator=default_rng(seed=0)): def generate(self, X, y, n_drops: int = 10, n_regions: int = 5, min_width_regions: int = 5, max_width_regions: int = 10) -> Tuple: """ - - :param X: time series data - :param y: not used - :param n_drops: number of values to drop from the time series - :param n_regions: number of time regions (or time intervals) where values will be dropped - :param min_width_regions: minimum width of the time regions (intervals) - :param max_width_regions: maximum width of the time regions (intervals) - :return: Xt, y the transformed time series data and y (the same as input) + Introduces `n_drops` random drops in the input time series data X within `n_regions` specific time regions. + + This method randomly defines `n_regions` time regions within the time series, each having a width between + `min_width_regions` and `max_width_regions`. Within each region, values are randomly dropped until the total number of + dropped values reaches `n_drops`. The target variable y remains unchanged. + + :param X: A pandas DataFrame representing the input time series data. + :param y: A pandas Series representing the target variable (remains unchanged). + :param n_drops: An integer specifying the total number of random drops to introduce in X. + :param n_regions: An integer specifying the number of time regions where values will be dropped. + :param min_width_regions: An integer specifying the minimum width of the time regions (intervals). + :param max_width_regions: An integer specifying the maximum width of the time regions (intervals). + :return: A tuple (Xt, y) where Xt is the modified input time series data with random drops applied within specific + time regions, and y is the original target variable. 
""" assert n_drops > 0, 'n_drops should be strictly greater than 0' @@ -217,9 +246,9 @@ class LocalRegionsRandomRepeatGenerator(TransmissionErrorGenerator): def __init__(self, random_generator=default_rng(seed=0)): """ + Initializes the LocalRegionsRandomRepeatGenerator with a specified random number generator. - :param random_generator: - + :param random_generator: A random number generator instance (default is a NumPy random generator seeded with 0). """ super().__init__(random_generator=random_generator) self.repeats_ = None # to store the indices of the repeats (from the original X) and the length of the repeat @@ -230,16 +259,23 @@ def generate(self, X, y, n_repeats: int = 10, min_nb_repeats: int = 1, max_nb_repeats: int = 10, n_regions: int = 5, min_width_regions: int = 5, max_width_regions: int = 10) -> Tuple: """ - - :param X: - :param y: - :param n_repeats: number of values that will be repeated - :param min_nb_repeats: the minimum number of repeats - :param max_nb_repeats: the maximum number of repeats - :param n_regions: number of time regions (or time intervals) where values will be dropped - :param min_width_regions: minimum width of the time regions (intervals) - :param max_width_regions: maximum width of the time regions (intervals) - :return: Xt, y the transformed time series data and y (the same as input) + Introduces `n_repeats` random repetitions in the input time series data X within `n_regions` specific time regions. + + This method randomly defines `n_regions` time regions within the time series, each having a width between + `min_width_regions` and `max_width_regions`. Within each region, values are randomly selected and repeated a random number of times + between `min_nb_repeats` and `max_nb_repeats`. The repeated values are inserted immediately after the selected index in X. + The target variable y remains unchanged. + + :param X: A pandas DataFrame representing the input time series data. 
+ :param y: A pandas Series representing the target variable (remains unchanged). + :param n_repeats: An integer specifying the total number of random repetitions to introduce in X. + :param min_nb_repeats: An integer specifying the minimum number of times a value can be repeated. + :param max_nb_repeats: An integer specifying the maximum number of times a value can be repeated. + :param n_regions: An integer specifying the number of time regions where values will be repeated. + :param min_width_regions: An integer specifying the minimum width of the time regions (intervals). + :param max_width_regions: An integer specifying the maximum width of the time regions (intervals). + :return: A tuple (Xt, y) where Xt is the modified input time series data with random repetitions applied within specific + time regions, and y is the original target variable. """ assert n_repeats > 0, 'n_repeats should be strictly greater than 0' diff --git a/badgers/generators/time_series/trends.py b/badgers/generators/time_series/trends.py index 0ce390e..0af434d 100644 --- a/badgers/generators/time_series/trends.py +++ b/badgers/generators/time_series/trends.py @@ -16,12 +16,23 @@ class TrendsGenerator(GeneratorMixin): def __init__(self, random_generator=default_rng(seed=0)): """ - :param random_generator: a random number generator + Initialize the TrendsGenerator with a random number generator. + + :param random_generator: An instance of a random number generator, + default is `numpy.random.default_rng(seed=0)`. """ self.random_generator = random_generator @abc.abstractmethod def generate(self, X, y, **params) -> Tuple: + """ + Abstract method to generate trends in time-series data. + + :param X: The input features, typically a 2D array where each row represents a time step. + :param y: The target variable, typically a 1D array. + :param params: Additional parameters that can be used by the generating method. 
+ :return: A tuple containing the modified features and target variable with generated trends. + """ pass @@ -31,17 +42,25 @@ class GlobalAdditiveLinearTrendGenerator(TrendsGenerator): """ def __init__(self, random_generator=default_rng(seed=0)): + """ + Initialize the GlobalAdditiveLinearTrendGenerator with a random number generator. + + :param random_generator: An instance of a random number generator, + default is `numpy.random.default_rng(seed=0)`. + """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, slope) -> Tuple: """ - - :param X: the input signal to be transformed - :param y: not changed (here for API compatibility) - :param slope: the slope of the trend (increase per time unit) - :type slope: Union[float | list] - :return: the transformed signal Xt (X + linear trend), and y (not changed) + Add a global linear trend to the input time-series data. + + :param X: The input signal to be transformed, expected to be a 2D array where each row represents a time step. + :param y: The target variable, which remains unchanged in this transformation. + :param slope: The slope of the trend (increase per time unit). Can be a single float value or a list of slopes + for each feature in X. + :type slope: Union[float, list] + :return: A tuple containing the transformed signal Xt (X + linear trend) and the unchanged target variable y. """ offset = np.linspace(0, slope * len(X), len(X)) @@ -55,20 +74,28 @@ class AdditiveLinearTrendGenerator(TrendsGenerator): """ def __init__(self, random_generator=default_rng(seed=0)): + """ + Initialize the AdditiveLinearTrendGenerator with a random number generator. + + :param random_generator: An instance of a random number generator, + default is `numpy.random.default_rng(seed=0)`. 
+ """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, slope, start: int, end: int) -> Tuple: """ - - - :param X: the input signal to be transformed - :param y: not changed (here for API compatibility) - :param slope: (increase per time unit) - :type slope: Union[float | list] - :param end: - :param start: - :return: the transformed signal Xt (X + linear trend), and y (not changed) + Add a linear trend to a specified segment of the input time-series data. + + :param X: The input signal to be transformed, expected to be a 2D array where each row represents a time step. + :param y: The target variable, which remains unchanged in this transformation. + :param slope: The slope of the trend (increase per time unit). Can be a single float value or a list of slopes + for each feature in X. + :type slope: Union[float, list] + :param start: The starting index of the segment to which the trend is applied. If None, it is chosen randomly in the first half of the signal. + :param end: The ending index of the segment to which the trend is applied. + :return: A tuple containing the transformed signal Xt (X with the linear trend applied to the specified segment) + and the unchanged target variable y. """ if start is None: # when start is not given, it is chosen randomly in the first half of the signal @@ -97,24 +124,30 @@ class RandomlySpacedLinearTrends(TrendsGenerator): """ def __init__(self, random_generator=default_rng(seed=0)): + """ + Initialize the RandomlySpacedLinearTrends with a random number generator. + + :param random_generator: An instance of a random number generator, + default is `numpy.random.default_rng(seed=0)`. + """ super().__init__(random_generator=random_generator) @preprocess_inputs def generate(self, X, y, n_patterns: int = 10, min_width_pattern: int = 5, max_width_patterns: int = 10, slope_min: float = -0.05, slope_max: float = 0.05) -> Tuple: """ - Generates randomly time intervals where a linear trend is added to the signal - Slopes, Tme intervals locations and widths are chosen randomly.
- - :param X: - :param y: - :param n_patterns: the total number of time intervals where a linear trend is add - :param min_width_pattern: the minimum with of the time intervals - :param max_width_patterns: the maximum with of the time intervals - :param slope_min: the minimum value of the slope (slope is chosen uniformly at random between min_slope and max_slope for each time interval and each column of X) - :param slope_max: the maximum value of the slope (slope is chosen uniformly at random between min_slope and max_slope for each time interval and each column of X) - - :return: + Generates randomly spaced time intervals where a linear trend is added to the signal. + Slopes, time interval locations, and widths are chosen randomly. + + :param X: The input signal to be transformed, expected to be a 2D array where each row represents a time step. + :param y: The target variable, which remains unchanged in this transformation. + :param n_patterns: The total number of time intervals where a linear trend is added. + :param min_width_pattern: The minimum width of the time intervals. + :param max_width_patterns: The maximum width of the time intervals. + :param slope_min: The minimum value of the slope. The slope is chosen uniformly at random between `slope_min` and `slope_max` for each time interval and each column of X. + :param slope_max: The maximum value of the slope. The slope is chosen uniformly at random between `slope_min` and `slope_max` for each time interval and each column of X. + + :return: A tuple containing the transformed signal Xt (X with randomly spaced linear trends added) and the unchanged target variable y. 
""" # generate patterns indices and values diff --git a/badgers/mcp/__init__.py b/badgers/mcp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dist/badgers-0.0.1-py3-none-any.whl b/dist/badgers-0.0.1-py3-none-any.whl deleted file mode 100644 index 795d2e7..0000000 Binary files a/dist/badgers-0.0.1-py3-none-any.whl and /dev/null differ diff --git a/dist/badgers-0.0.1.tar.gz b/dist/badgers-0.0.1.tar.gz deleted file mode 100644 index 8cf3bc4..0000000 Binary files a/dist/badgers-0.0.1.tar.gz and /dev/null differ diff --git a/docs/index.md b/docs/index.md index a782b8b..d14511d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -13,7 +13,5 @@ You might think of using [badgers](https://github.com/Fraunhofer-IESE/badgers) f [Badgers](https://github.com/Fraunhofer-IESE/badgers) provides a set of predefined generators for different modalities (tabular data, time series, text, etc.) and different data quality problems (outliers, noise, drift, etc.). Of course many data quality problems are use case dependent and it is not possible to implement all generators. Therefore, the idea is that badgers can serve as a structure for developing novel generators (see how to develop novel generators in the [dev-tutorials](../tutorials/Create-New-Tabular-Generators/) section). - Want to try [badgers](https://github.com/Fraunhofer-IESE/badgers)? Then go to the [getting started](getting-started.md) section or dive into the [tutorials](../tutorials/Imbalance-Tabular-Data/) section. - - + Want to try [badgers](https://github.com/Fraunhofer-IESE/badgers)? Then go to the [getting started](getting-started.md) section or dive into the [tutorials](../tutorials/tabular-data/Imbalance-Tabular-Data/) section. diff --git a/profile/__init__.py b/profile/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tox.ini b/tox.ini index 0509ca6..21f344e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,6 @@ [tox] env_list = + py313 py312 py311 py310