diff --git a/skpro/regression/bayesian.py b/skpro/regression/bayesian.py deleted file mode 100644 index d74dc92b..00000000 --- a/skpro/regression/bayesian.py +++ /dev/null @@ -1,454 +0,0 @@ -""" -Simple Bayesian Linear Regressor. - -Bayesian Linear Regression defined with normal priors for slopes and intercept -and half-normal prior for noise; coded on pymc backend. -""" - -# copyright: skpro developers -__author__ = ["meraldoantonio"] - -import pandas as pd - -from skpro.regression.base import BaseProbaRegressor - - -class BayesianLinearRegressor(BaseProbaRegressor): - """Bayesian Linear Regression class. - Defined with normal priors for slopes and intercept and - inverse gamma prior for noise variance; coded on pymc backend. - - Parameters - ---------- - intercept_mu : float, optional (default=0) - Mean of the normal prior for the intercept. - intercept_sigma : float, optional (default=10) - Standard deviation of the normal prior for the intercept. - slopes_mu : float, optional (default=0) - Mean of the normal prior for the slopes. - slopes_sigma : float, optional (default=10) - Standard deviation of the normal prior for the slopes. - noise_alpha : float, optional (default=1) - Alpha parameter for the inverse gamma prior for the noise variance. - noise_beta : float, optional (default=1) - Beta parameter for the inverse gamma prior for the noise variance. - chains : int, optional (default=2) - Number of MCMC chains to run. - draws : int, optional (default=2000) - Number of draws sampled from each chain; applied to all sampling steps. - - Example - ------- - >>> from skpro.regression.bayesian import BayesianLinearRegressor - >>> from sklearn.datasets import load_diabetes # doctest: +SKIP - >>> from sklearn.model_selection import train_test_split # doctest: +SKIP - >>> X, y = load_diabetes(return_X_y=True, as_frame=True) # doctest: +SKIP - >>> X_train, X_test, y_train, y_test = train_test_split(X, y) # doctest: +SKIP - - >>> bayes_model = BayesianLinearRegressor() # doctest: +SKIP - >>> bayes_model.fit(X_train, y_train) # doctest: +SKIP - >>> y_test_pred_proba = bayes_model.predict_proba(X_test) # doctest: +SKIP - >>> y_test_pred = bayes_model.predict(X_test) # doctest: +SKIP - """ - - _tags = { - # packaging info - # -------------- - "authors": ["meraldoantonio"], # authors, GitHub handles - "python_version": None, - "python_dependencies": ["pymc"], - # estimator tags - # -------------- - "capability:multioutput": False, # can the estimator handle multi-output data? - "capability:missing": True, # can the estimator handle missing data? - "X_inner_mtype": "pd_DataFrame_Table", # type seen in internal _fit, _predict - "y_inner_mtype": "pd_DataFrame_Table", # type seen in internal _fit - } - - def __init__( - self, - intercept_mu=0, - intercept_sigma=10, - slopes_mu=0, - slopes_sigma=10, - noise_alpha=1, - noise_beta=1, - chains=2, - draws=2000, - ): - # hyperparameters for priors - self.intercept_sigma = intercept_sigma - self.intercept_mu = intercept_mu - self.slopes_sigma = slopes_sigma - self.slopes_mu = slopes_mu - self.noise_alpha = noise_alpha - self.noise_beta = noise_beta - self.chains = chains - self.draws = draws - - super().__init__() - - # Assertions to check validity of input parameters - assert self.intercept_sigma > 0, "intercept_sigma must be positive" - assert self.slopes_sigma > 0, "slopes_sigma must be positive" - assert self.noise_alpha > 0, "noise_alpha must be positive" - assert self.noise_beta > 0, "noise_beta must be positive" - assert ( - isinstance(self.chains, int) and self.chains > 0 - ), "chains must be a positive integer" - assert ( - isinstance(self.draws, int) and self.draws > 0 - ), "draws must be a positive integer" - - def _fit(self, X, y): - """Fit regressor to training data. - - Writes to self: - Sets fitted model attributes ending in "_". - - Parameters - ---------- - X : pandas DataFrame - feature instances to fit regressor to - y : pandas DataFrame, must be same length as X - labels to fit regressor to - - Returns - ------- - self : reference to self - """ - import pymc as pm - - assert len(y.columns) == 1, "y must have only one column!" - self._X = X - self._y = y - self._y_vals = y.values[ - :, 0 - ] # we need a 1-dimensional array for compatibility with pymc - self._X_cols = X.columns - self._y_cols = y.columns - self._predict_done = False - - with pm.Model(coords={"obs_id": X.index, "pred_id": X.columns}) as self.model: - # Mutable data containers - X_data = pm.Data("X", self._X, dims=("obs_id", "pred_id")) - y_data = pm.Data("y", self._y_vals, dims=("obs_id")) - - # Priors for unknown model parameters - self.intercept = pm.Normal( - "intercept", mu=self.intercept_mu, sigma=self.intercept_sigma - ) - self.slopes = pm.Normal( - "slopes", - mu=self.slopes_mu, - sigma=self.slopes_sigma, - shape=self._X.shape[1], - dims=("pred_id"), - ) - self.noise_var = pm.InverseGamma( - "noise_var", alpha=self.noise_alpha, beta=self.noise_beta - ) - self.noise = pm.Deterministic("noise", self.noise_var**0.5) - - # Expected value of outcome - self.mu = pm.Deterministic( - "mu", self.intercept + pm.math.dot(X_data, self.slopes) - ) - - # Likelihood (sampling distribution) of observations - y_obs = pm.Normal( - "y_obs", mu=self.mu, sigma=self.noise, observed=y_data, dims=("obs_id") - ) - - # Constructing the posterior - self.trace = pm.sample(chains=self.chains, draws=self.draws) - - # Constructing the in-sample posterior predictive - self.trace.extend(pm.sample_posterior_predictive(self.trace)) - - return self - - def get_prior(self, return_type="xarray"): - """Extract the prior distribution.""" - import pymc as pm - - assert return_type in [ - "xarray", - "numpy", - "dataframe", - ], "return_type must be one of 'xarray', 'numpy' or 'dataframe" - - with self.model: - # if we've previously used the model for prediction, - # we need to reset the reference of 'X' to the training data - if self._predict_done: - pm.set_data( - {"X": self._X}, - coords={"obs_id": self._X.index, "pred_id": self._X.columns}, - ) - # todo: uniformize the number of chains during sampling - self.trace.extend(pm.sample_prior_predictive(samples=self.draws)) - prior = self.trace.prior - - if return_type == "xarray": - return prior - elif return_type == "numpy": - intercept = prior["intercept"].values.squeeze() - slopes = prior["slopes"].values.squeeze() - noise_var = prior["noise_var"].values.squeeze() - noise = prior["noise"].values.squeeze() - return { - "intercept": intercept, - "slopes": slopes, - "noise_var": noise_var, - "noise": noise, - } - else: - intercept = prior["intercept"].values.squeeze() - slopes = prior["slopes"].values.squeeze() - noise_var = prior["noise_var"].values.squeeze() - noise = prior["noise"].values.squeeze() - return pd.DataFrame( - { - "intercept": intercept, - "slopes": slopes, - "noise_var": noise_var, - "noise": noise, - } - ) - - def get_prior_summary(self): - """Get the summary statistics of the prior.""" - import arviz as az - import pymc as pm - - # if we've previously used the model for prediction, - # we need to reset the reference of 'X' to the training data - if self._predict_done: - pm.set_data( - {"X": self._X}, - coords={"obs_id": self._X.index, "pred_id": self._X.columns}, - ) - return az.summary( - self.trace.prior, var_names=["intercept", "slopes", "noise_var", "noise"] - ) - - def plot_ppc(self, **kwargs): - """Plot the prior predictive check.""" - import arviz as az - - return az.plot_ppc(self.trace, **kwargs) - - def get_posterior(self, return_type="xarray"): - """Extract the prior distribution.""" - assert ( - self._is_fitted - ), "The model must be fitted before posterior can be returned." - assert return_type in [ - "xarray", - "numpy", - "dataframe", - ], "return_type must be one of 'xarray', 'numpy' or 'dataframe" - - posterior = self.trace.posterior - - if return_type == "xarray": - return posterior - elif return_type == "numpy": - intercept = ( - posterior["intercept"].stack({"sample": ("chain", "draw")}).values - ) - slopes = posterior["slopes"].stack({"sample": ("chain", "draw")}).values - noise_var = ( - posterior["noise_var"].stack({"sample": ("chain", "draw")}).values - ) - noise = posterior["noise"].stack({"sample": ("chain", "draw")}).values - return { - "intercept": intercept, - "slopes": slopes, - "noise_var": noise_var, - "noise": noise, - } - else: - intercept = ( - posterior["intercept"].stack({"sample": ("chain", "draw")}).values - ) - slopes = posterior["slopes"].stack({"sample": ("chain", "draw")}).values - noise_var = ( - posterior["noise_var"].stack({"sample": ("chain", "draw")}).values - ) - noise = posterior["noise"].stack({"sample": ("chain", "draw")}).values - return pd.DataFrame( - { - "intercept": intercept, - "slopes": slopes, - "noise_var": noise_var, - "noise": noise, - } - ) - - def get_posterior_summary(self): - """Get the summary statistics of the posterior.""" - import arviz as az - - # if we've previously used the model for prediction, - # we need to reset the reference of 'X' to the training data - assert ( - self._is_fitted - ), "The model must be fitted before posterior summary can be returned." - return az.summary( - self.trace.posterior, - var_names=["intercept", "slopes", "noise_var", "noise"], - ) - - def _predict(self, X): - """Predict labels for data from features. - - State required: - Requires state to be "fitted" = self.is_fitted=True - - Accesses in self: - Fitted model attributes ending in "_" - - Parameters - ---------- - X : pandas DataFrame, must have same columns as X in `fit` - data to predict labels for - - Returns - ------- - y : pandas DataFrame, same length as `X`, same columns as `y` in `fit` - labels predicted for `X` - """ - assert X.columns.equals( - self._X_cols - ), f"The columns of X must be the same as the columns of the training data: {self._X_cols}" - - y_pred = self._predict_proba(X).mean() - return y_pred - - def _predict_proba(self, X): - """Predict distribution over labels for data from features. - - State required: - Requires state to be "fitted". - - Accesses in self: - Fitted model attributes ending in "_" - - Parameters - ---------- - X : pandas DataFrame, must have same columns as X in `fit` - data to predict labels for - - Returns - ------- - pred_proba_dist : skpro BaseDistribution, same length as `X` - labels predicted for `X` - """ - import pymc as pm - - from skpro.distributions import Empirical - - y_cols = self._y_cols # columns from y in fit, not automatically stored - index = X.index - with self.model: - # Set the X to be the new 'X' variable and then sample posterior predictive - pm.set_data({"X": X}, coords={"obs_id": X.index, "pred_id": X.columns}) - self.trace.extend( - pm.sample_posterior_predictive( - self.trace, random_seed=42, predictions=True - ) - ) - self._predict_done = True # a flag - - # Extract posterior predictive distributions as an xarray DataArray - pred_proba_xarray = self.trace.predictions["y_obs"] - - # Convert data to pd.DataFrame and format it appropriately for subsequent conversion into a skpro Empirical distribution - pred_proba_df = pred_proba_xarray.to_dataframe() - pred_proba_df = pred_proba_df.reset_index() - - # Create a new 'sample_id' column by combining the 'chain' and 'draw' columns - pred_proba_df["sample_id"] = ( - pred_proba_df["chain"] * self.draws + pred_proba_df["draw"] - ) - pred_proba_df = pred_proba_df[["obs_id", "sample_id", "y_obs"]] - pred_proba_df = pred_proba_df.rename(columns={"y_obs": y_cols[0]}) - pred_proba_df = pred_proba_df.set_index(["sample_id", "obs_id"]) - - # Convert data to skpro Empirical distribution - pred_proba_dist = Empirical(spl=pred_proba_df, index=index, columns=y_cols) - return pred_proba_dist - - # todo: return default parameters, so that a test instance can be created - # required for automated unit and integration testing of estimator - @classmethod - def get_test_params(cls, parameter_set="default"): - """Return testing parameter settings for the estimator. - - Parameters - ---------- - parameter_set : str, default="default" - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict, default = {} - Parameters to create testing instances of the class - Each dict are parameters to construct an "interesting" test instance, i.e., - `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. - `create_test_instance` uses the first (or only) dictionary in `params` - """ - - # todo: set the testing parameters for the estimators - # Testing parameters can be dictionary or list of dictionaries - # - # this can, if required, use: - # class properties (e.g., inherited); parent class test case - # imported objects such as estimators from skpro or sklearn - # important: all such imports should be *inside get_test_params*, not at the top - # since imports are used only at testing time - # - # The parameter_set argument is not used for most automated, module level tests. - # It can be used in custom, estimator specific tests, for "special" settings. - # A parameter dictionary must be returned *for all values* of parameter_set, - # i.e., "parameter_set not available" errors should never be raised. - # - # A good parameter set should primarily satisfy two criteria, - # 1. Chosen set of parameters should have a low testing time, - # ideally in the magnitude of few seconds for the entire test suite. - # This is vital for the cases where default values result in - # "big" models which not only increases test time but also - # run into the risk of test workers crashing. - # 2. There should be a minimum two such parameter sets with different - # sets of values to ensure a wide range of code coverage is provided. - # - # example 1: specify params as dictionary - # any number of params can be specified - # params = {"est": value0, "parama": value1, "paramb": value2} - # - # example 2: specify params as list of dictionary - # note: Only first dictionary will be used by create_test_instance - # params = [{"est": value1, "parama": value2}, - # {"est": value3, "parama": value4}] - # - # example 3: parameter set depending on param_set value - # note: only needed if a separate parameter set is needed in tests - # if parameter_set == "special_param_set": - # params = {"est": value1, "parama": value2} - # return params - # - # # "default" params - # params = {"est": value3, "parama": value4} - # return params - - def visualize_model(self): - """Use graphviz to visualize the composition of the model.""" - import pymc as pm - - assert ( - self._is_fitted - ), "The model must be fitted before visualization can be done." - return pm.model_to_graphviz(self.model)