Skip to content

Commit

Permalink
Merge pull request #66 from MindSetLib/asupdates
Browse files Browse the repository at this point in the history
Random Forest, WoE & Gains curve
  • Loading branch information
alexmindset authored Apr 15, 2021
2 parents 4e1cad3 + 1dc0abf commit f9db93b
Show file tree
Hide file tree
Showing 4 changed files with 198 additions and 2 deletions.
82 changes: 81 additions & 1 deletion insolver/model_tools/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
from urllib.request import urlopen
from zipfile import ZipFile

from numpy import log, sum
from numpy import log, sum, maximum, unique, true_divide, linspace, ndarray

import matplotlib.pyplot as plt

from pandas import DataFrame, Series, concat, qcut
from sklearn.model_selection import train_test_split


Expand Down Expand Up @@ -129,3 +133,79 @@ def deviance_gamma(y_hat, y, weight=None):
return sum(2 * weight * (-log(y/y_hat) + (y-y_hat)/y_hat))
else:
return sum(2 * (-log(y/y_hat) + (y-y_hat)/y_hat))


def inforamtion_value_woe(data, target, bins=10, cat_thresh=10, detail=False):
    """Function for Information Value (IV) and Weight of Evidence (WoE) computation.

    NOTE: the function name contains a typo ("inforamtion") that is kept for
    backward compatibility with existing callers.

    Args:
        data (pd.DataFrame): DataFrame with data to compute IV and WoE.
        target (:obj:`str` or :obj:`pd.Series`): Target variable to compute IV and WoE.
        bins (:obj:`int`, optional): Number of bins for WoE calculation for continuous variables.
        cat_thresh (:obj:`int`, optional): Maximum number of categories for non-binned WoE calculation.
        detail (:obj:`bool`, optional): Whether to return detailed results DataFrame or not. Short by default.

    Returns:
        pd.DataFrame: Per-bin WoE/IV table if `detail` is True, otherwise a
        per-variable IV summary with columns ['Variable', 'IV'].
    """
    summary_result, detailed_result = DataFrame(), DataFrame()
    target = target.name if isinstance(target, Series) else target
    cols = data.columns
    for ivars in cols[~cols.isin([target])]:
        # Quantile-bin numeric variables with many distinct values; otherwise
        # treat the raw values as categories.
        if (data[ivars].dtype.kind in 'bifc') and (len(unique(data[ivars])) > cat_thresh):
            binned_x = qcut(data[ivars], bins, duplicates='drop')
            d0 = DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = DataFrame({'x': data[ivars], 'y': data[target]})
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        # Clip zero counts to 0.5 so that WoE's log and the ratio never hit 0.
        d['% of Events'] = maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
        d['WoE'] = log(d['% of Events'] / d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)
        temp = DataFrame({"Variable": [ivars], "IV": [d['IV'].sum()]}, columns=["Variable", "IV"])
        summary_result = concat([summary_result, temp], axis=0)
        detailed_result = concat([detailed_result, d], axis=0)
    return detailed_result if detail else summary_result


def gain_curve(predict, exposure):
    """Plot gains curves with exposure-weighted Gini coefficients.

    Observations are sorted by predicted value in descending order, and the
    cumulative share of exposure is plotted against the cumulative share of
    prediction; each curve's label carries its weighted Gini coefficient.
    A diagonal reference line (random model) is added, then the plot is shown.

    Args:
        predict (:obj:`pd.Series`, :obj:`np.ndarray` or :obj:`pd.DataFrame`): Predicted values.
         A DataFrame produces one curve per column.
        exposure (pd.Series): Exposure (weight) of each observation.

    Raises:
        TypeError: If the combination of `predict`/`exposure` types is not supported.
    """
    def _plot_one_curve(frame, pred_col, weight_col):
        # Sort by prediction (descending) so the curve accumulates the
        # highest-predicted observations first.
        ordered = frame[[pred_col, weight_col]].sort_values(pred_col, ascending=False).reset_index(drop=True)
        normalized = ordered.cumsum() / ordered.sum()
        e = ordered[weight_col]
        w = sum(e)
        m = true_divide(sum(e * ordered[pred_col]), w)
        # Exposure-weighted mid-rank, closed form of the recurrence
        #   rank[0] = 1 + 0.5*(e[0] - 1)
        #   rank[i] = rank[i-1] + 0.5*(e[i-1] + 1) + 0.5*(e[i] - 1)
        # which telescopes to rank[i] = 0.5 + cumsum(e)[i] - 0.5*e[i].
        rank = 0.5 + e.cumsum() - 0.5 * e
        gini = 1 + 1/w - 2/(w**2 * m) * sum(e * ordered[pred_col] * rank)
        plt.plot(normalized[weight_col], normalized[pred_col],
                 label=f'{pred_col} (Gini: {round(gini, 3)})')

    if isinstance(predict, (Series, ndarray)) and isinstance(exposure, Series):
        temp_df = concat([Series(predict, name='Predict').reset_index(drop=True),
                          exposure.reset_index(drop=True)], axis=1)
        _plot_one_curve(temp_df, 'Predict', exposure.name)
    elif isinstance(predict, DataFrame) and isinstance(exposure, Series):
        temp_df = concat([predict.reset_index(drop=True), exposure.reset_index(drop=True)], axis=1)
        for pred_col in temp_df.columns[:-1]:
            _plot_one_curve(temp_df, pred_col, exposure.name)
    else:
        raise TypeError('predict must be a pd.Series, np.ndarray or pd.DataFrame '
                        'and exposure must be a pd.Series.')
    plt.legend()
    plt.plot(linspace(0, 1, 2), linspace(0, 1, 2), c='red', linestyle='--', linewidth=0.7)
    plt.title('Gains curve')
    plt.xlabel('Cumulative exposure')
    plt.ylabel('Cumulative response')
    plt.show()
1 change: 1 addition & 0 deletions insolver/wrappers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .glm import InsolverGLMWrapper
from .gbm import InsolverGBMWrapper
from .general import InsolverRFWrapper
from .base import InsolverTrivialWrapper
115 changes: 115 additions & 0 deletions insolver/wrappers/general.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
from numpy import cumsum, diff, exp, true_divide, add, append, nan, concatenate, array
from pandas import DataFrame, Series

from sklearn.metrics import mean_squared_error, SCORERS
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from .base import InsolverBaseWrapper
from .extensions import InsolverCVHPExtension, InsolverPDPExtension


class InsolverRFWrapper(InsolverBaseWrapper, InsolverCVHPExtension, InsolverPDPExtension):
    """Insolver wrapper for Random Forest.

    Attributes:
        backend (str): Framework for building RF, 'sklearn' is supported.
        task (str): Task that RF should solve: Classification or Regression. Values 'reg' and 'class' are supported.
        n_estimators (:obj:`int`, optional): Number of trees in the forest. Equals 100 by default.
        load_path (:obj:`str`, optional): Path to RF model to load from disk.
        **kwargs: Parameters for RF estimators except `n_estimators`. Will not be changed in hyperopt.
    """
    def __init__(self, backend, task=None, n_estimators=100, load_path=None, **kwargs):
        super(InsolverRFWrapper, self).__init__(backend)
        self.algo, self._backends = 'Random Forest', ['sklearn']
        self._tasks = ['class', 'reg']
        # Persistence for the sklearn backend goes through pickle in both directions.
        self._back_load_dict = {'sklearn': self._pickle_load}
        self._back_save_dict = {'sklearn': self._pickle_save}
        self.n_estimators, self.params = n_estimators, None

        if backend not in self._backends:
            raise NotImplementedError(f'Error with the backend choice. Supported backends: {self._backends}')

        if load_path is not None:
            # Loading a saved model skips estimator construction entirely.
            self.load_model(load_path)
        else:
            if task in self._tasks:
                # Map task -> backend -> concrete sklearn estimator class.
                rf_init = {
                    'class': {'sklearn': RandomForestClassifier},
                    'reg': {'sklearn': RandomForestRegressor}
                }

                kwargs.update({'n_estimators': self.n_estimators})
                self.model, self.params = rf_init[task][self.backend](**(kwargs if kwargs is not None else {})), kwargs

                def __params_rf(**params):
                    # Estimator factory for hyperparameter search: the fixed
                    # self.params override any tuned values passed in, so the
                    # constructor kwargs stay untouched by hyperopt.
                    params.update(self.params)
                    return rf_init[task][self.backend](**params)

                self.object = __params_rf
            else:
                raise NotImplementedError(f'Task parameter supports values in {self._tasks}.')
        self._update_meta()

    def fit(self, X, y, report=None, **kwargs):
        """Fit a Random Forest.

        Args:
            X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training data.
            y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training target values.
            report (:obj:`list`, :obj:`tuple`, optional): A list of metric callables ``metric(y, y_pred)``
             evaluated in-sample and printed after model fitting, optional.
            **kwargs: Other parameters passed to Scikit-learn API .fit().
        """
        self.model.fit(X, y, **kwargs)
        # Remember training feature names so predict() can subset/reorder columns.
        if not hasattr(self.model, 'feature_name_'):
            self.model.feature_name_ = X.columns if isinstance(X, DataFrame) else [X.name]
        self._update_meta()
        if report is not None:
            if isinstance(report, (list, tuple)):
                # In-sample report: each entry is called as metric(y, prediction).
                prediction = self.model.predict(X)
                print(DataFrame([[x.__name__, x(y, prediction)] for x
                                 in report]).rename({0: 'Metrics', 1: 'Value'}, axis=1).set_index('Metrics'))

    def predict(self, X, **kwargs):
        """Predict using RF with feature matrix X.

        Args:
            X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Samples.
            **kwargs: Other parameters passed to Scikit-learn API .predict().

        Returns:
            array: Returns predicted values.
        """
        # Use the feature names captured in fit() (when present) so column
        # order/extra columns in X do not break prediction.
        return self.model.predict(X if not hasattr(self.model, 'feature_name_')
                                  else X[self.model.feature_name_], **kwargs)

    def cross_val(self, X, y, scoring=None, cv=None, **kwargs):
        """Method for performing cross-validation given the hyperparameters of initialized or fitted model.

        Args:
            X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training data.
            y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training target values.
            scoring (:obj:`callable`): Metrics passed to sklearn.model_selection.cross_validate calculation.
            cv (:obj:`int, cross-validation generator or an iterable`, optional): Cross-validation strategy from
             sklearn. Performs 5-fold cv by default.
            **kwargs: Other parameters passed to sklearn.model_selection.cross_validate.

        Returns:
            pd.DataFrame: Metric values with one row per scorer; the first
            column ('Overall') is computed on the full data, the remaining
            columns ('Fold i') come from cross-validation.
        """
        scoring = mean_squared_error if scoring is None else scoring
        # `models` (the per-fold estimators from the base-class helper) is
        # currently unused here.
        models, metrics = self._cross_val(X, y, scoring=scoring, cv=cv, **kwargs)
        # Compute the 'Overall' score(s) on the full dataset with self.model.
        if callable(scoring):
            scorers = {scoring.__name__.replace('_', ' '): array([scoring(y, self.model.predict(X))])}
        elif isinstance(scoring, (tuple, list)):
            scorers = {scorer.__name__.replace('_', ' '): array([scorer(y, self.model.predict(X))]) for
                       scorer in scoring}
        elif isinstance(scoring, str):
            # NOTE(review): sklearn.metrics.SCORERS was removed in scikit-learn 1.3
            # (superseded by get_scorer/get_scorer_names) — verify the pinned version.
            if scoring in SCORERS:
                scorers = {scoring.replace('_', ' '): array([SCORERS[scoring](self.model, X=X, y=y)])}
            else:
                raise ValueError(f'Scorer {scoring} is not supported.')
        else:
            raise NotImplementedError(f'Scoring of type {scoring} is not supported')
        # Prepend the overall score to the per-fold scores for each scorer.
        metrics = DataFrame({key: concatenate((scorers[key], metrics[key])) for key in scorers.keys()}).T
        metrics.columns = [f'Fold {i}' if i != 0 else 'Overall' for i in range(metrics.shape[1])]
        return metrics
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@


setup(name='insolver',
version='0.4.7',
version='0.4.8',
description='Insolver is low-code machine learning library, initially created for the insurance industry, '
'but can be used in any other.\n You can find a detailed overview at '
'https://insolver.readthedocs.io/en/latest/source/overview.html.',
Expand Down

0 comments on commit f9db93b

Please sign in to comment.