-
Notifications
You must be signed in to change notification settings - Fork 2
Fix 13 : Create SimpleImputer #14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 9 commits
8b0f402
98b4472
7d47859
cf9ef84
f52affe
58858b8
ac164ea
2d8e37e
029d773
d980805
5fd1aec
0c6f6be
9dbce79
86820b9
49ff179
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,55 @@ | ||
| import numpy as np | ||
| from sklearn.base import BaseEstimator, TransformerMixin | ||
|
|
||
class DifferentiallyPrivateImputer(BaseEstimator, TransformerMixin):
    """Fill missing entries with a per-column statistic plus Laplace noise.

    Parameters
    ----------
    missing_values : scalar, default=np.nan
        Marker identifying a missing entry. NaN is detected with
        ``np.isnan``; any other marker is matched with ``==``.
    strategy : {'mean', 'median', 'most_frequent', 'constant'}, default='mean'
        Statistic used as the centre of the imputed values.
    fill_value : scalar, default=None
        Replacement used when ``strategy='constant'``. ``None`` means ``0``.
    epsilon : float, default=1.0
        Privacy budget. The Laplace noise scale is ``sensitivity / epsilon``,
        so it must be strictly positive.

    Attributes
    ----------
    statistics_ : list
        One entry per column, set by ``fit``: the column's statistic for the
        chosen strategy (NaN when the column has no observed values or the
        data is not numeric), or the fill value for ``'constant'``.

    Notes
    -----
    NOTE(review): the statistics and the sensitivity are computed with plain
    NumPy reductions (``np.nanmean``, ``np.nanmedian``, ``np.nanmax``) on the
    data itself, so the noise scale is data-dependent. Reviewers of this
    change asked whether the differentially private versions from this
    library should be used instead -- confirm before relying on this class
    for a formal epsilon-DP guarantee.

    NOTE(review): for 'mean' and 'median' the centre and sensitivity are
    recomputed from the array passed to ``transform``; only 'most_frequent'
    reads ``statistics_``. Confirm whether that is intended.
    """

    _STRATEGIES = ('mean', 'median', 'most_frequent', 'constant')

    def __init__(self, missing_values=np.nan, strategy='mean', fill_value=None, epsilon=1.0):
        # No validation here: estimators are expected to store constructor
        # arguments untouched and validate them when they are used.
        self.missing_values = missing_values
        self.strategy = strategy
        self.fill_value = fill_value
        self.epsilon = epsilon  # Privacy budget

    def _check_hyperparameters(self):
        """Raise ``ValueError`` for an unknown strategy or non-positive epsilon."""
        if self.strategy not in self._STRATEGIES:
            raise ValueError(
                f"strategy must be one of {self._STRATEGIES}, got {self.strategy!r}"
            )
        # `not x > 0` (rather than `x <= 0`) also rejects NaN.
        if not self.epsilon > 0:
            raise ValueError(f"epsilon must be strictly positive, got {self.epsilon!r}")

    def _missing_mask(self, X):
        """Return a boolean array marking entries equal to ``missing_values``."""
        marker = self.missing_values
        # NaN never compares equal to itself, so it needs np.isnan.
        if isinstance(marker, (float, np.floating)) and np.isnan(marker):
            return np.isnan(X)
        return X == marker

    def _constant_fill(self):
        """Return the replacement for the 'constant' strategy (0 if unset)."""
        return 0 if self.fill_value is None else self.fill_value

    def _fitted_statistic(self, observed):
        """Return the statistic of ``observed`` for the configured strategy.

        ``observed`` must already be free of missing markers and NaNs. An
        empty input yields NaN.
        """
        if observed.size == 0:
            return np.nan
        if self.strategy == 'mean':
            return np.mean(observed)
        if self.strategy == 'median':
            return np.median(observed)
        # most_frequent: np.unique sorts, so ties resolve to the smallest value.
        values, counts = np.unique(observed, return_counts=True)
        return values[np.argmax(counts)]

    def fit(self, X, y=None):
        """Compute ``statistics_`` for every column of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``strategy`` is unknown or ``epsilon`` is not positive.
        """
        self._check_hyperparameters()
        X = np.asarray(X)
        n_features = X.shape[1]

        if self.strategy == 'constant':
            self.statistics_ = [self._constant_fill()] * n_features
        elif not np.issubdtype(X.dtype, np.number):
            # Non-numeric data has no usable statistic; keep NaN placeholders.
            self.statistics_ = [np.nan] * n_features
        else:
            missing = self._missing_mask(X)
            statistics = []
            for col_idx in range(n_features):
                observed = X[:, col_idx][~missing[:, col_idx]]
                # With a non-NaN marker the column may still hold NaNs.
                observed = observed[~np.isnan(observed)]
                statistics.append(self._fitted_statistic(observed))
            self.statistics_ = statistics
        return self

    def _center_and_sensitivity(self, col, missing_col, reducer, label):
        """Apply ``reducer`` to the observed part of ``col``.

        Returns ``(center, sensitivity)`` where ``sensitivity`` is the largest
        absolute deviation of an observed value from ``center``.
        """
        non_missing_values = col[~missing_col]
        if non_missing_values.size == 0:
            raise ValueError(
                f"Cannot impute the {label} of a column with no observed values."
            )
        center = reducer(non_missing_values)
        sensitivity = np.nanmax(np.abs(non_missing_values - center))
        return center, sensitivity

    def _impute_mean(self, col, missing_col):
        """Return ``(mean, sensitivity)`` of the observed values in ``col``.

        Raises ``ValueError`` if every entry of ``col`` is missing.
        """
        # NOTE(review): reviewer asked whether the library's DP mean should
        # replace np.nanmean here.
        return self._center_and_sensitivity(col, missing_col, np.nanmean, 'mean')

    def _impute_median(self, col, missing_col):
        """Return ``(median, sensitivity)`` of the observed values in ``col``.

        Raises ``ValueError`` if every entry of ``col`` is missing.
        """
        # NOTE(review): reviewer asked whether the library's DP median and DP
        # max should replace np.nanmedian / np.nanmax here.
        return self._center_and_sensitivity(col, missing_col, np.nanmedian, 'median')

    def transform(self, X):
        """Return a copy of ``X`` with the missing entries imputed.

        For 'mean', 'median' and 'most_frequent' each missing entry receives
        the column statistic plus an independent Laplace draw of scale
        ``sensitivity / epsilon``. For 'constant' the fill value is written
        without noise.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            Copy of ``X``. Non-float input is converted to float when at
            least one entry is imputed.

        Raises
        ------
        ValueError
            If ``strategy`` is unknown, ``epsilon`` is not positive, or a
            column to be imputed by mean/median has no observed values.
        """
        X = np.asarray(X)
        missing_indices = self._missing_mask(X)
        noisy_X = np.copy(X)
        if not missing_indices.any():
            return noisy_X

        self._check_hyperparameters()
        if not np.issubdtype(noisy_X.dtype, np.floating):
            # Writing noisy floats into an integer array would truncate them.
            noisy_X = noisy_X.astype(float)

        for col_idx in range(X.shape[1]):
            missing_col = missing_indices[:, col_idx]
            n_missing = int(np.sum(missing_col))
            if n_missing == 0:
                continue

            if self.strategy == 'constant':
                noisy_X[missing_col, col_idx] = self._constant_fill()
                continue

            col = X[:, col_idx]
            if self.strategy == 'mean':
                center, sensitivity = self._impute_mean(col, missing_col)
            elif self.strategy == 'median':
                center, sensitivity = self._impute_median(col, missing_col)
            else:  # 'most_frequent'
                center = self.statistics_[col_idx]  # Use most frequent value
                sensitivity = 1  # Sensitivity for most_frequent is 1

            scale = sensitivity / self.epsilon
            laplace_noise = np.random.laplace(loc=0, scale=scale, size=n_missing)

            # Impute missing values with noisy values
            noisy_X[missing_col, col_idx] = center + laplace_noise

        return noisy_X
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are using `nanmean` from NumPy... shouldn't you use the DP version from this library? ;)