Merge pull request #67 from MindSetLib/asupdates

Asupdates
MindSetLib · Apr 27, 2021 · 707acd0 · 707acd0
2 parents f9db93b + 1d79c15
commit 707acd0
Show file tree

Hide file tree

Showing 2 changed files with 90 additions and 37 deletions.
diff --git a/insolver/model_tools/model_comparison.py b/insolver/model_tools/model_comparison.py
@@ -5,7 +5,7 @@
 from numpy import min, max, mean, var, std, quantile, median
 from pandas import DataFrame
 
-from insolver.wrappers import InsolverGLMWrapper, InsolverGBMWrapper, InsolverTrivialWrapper
+from insolver.wrappers import InsolverGLMWrapper, InsolverGBMWrapper, InsolverRFWrapper, InsolverTrivialWrapper
 
 
 class ModelMetricsCompare:
@@ -19,9 +19,13 @@ class ModelMetricsCompare:
         stats (:obj:`list`, :obj:`tuple`, :obj:`callable`, optional): Statistics or list of statistics to compute.
         folder with models. If `None`, taking current working directory as source.
         h2o_init_params (:obj:`dict`, optional): Parameters passed to `h2o.init()`, when `backend` == 'h2o'.
+        predict_params (:obj:`list`, optional): List of dictionaries containing parameters passed to predict methods
+         for each model.
+        features (:obj:`list`, optional): List of lists containing features for predict method for each model.
     """
-    def __init__(self, X, y, source=None, metrics=None, stats=None, h2o_init_params=None):
-        wrappers = {'glm': InsolverGLMWrapper, 'gbm': InsolverGBMWrapper}
+    def __init__(self, X, y, source=None, metrics=None, stats=None, h2o_init_params=None, predict_params=None,
+                 features=None):
+        wrappers = {'glm': InsolverGLMWrapper, 'gbm': InsolverGBMWrapper, 'rf': InsolverRFWrapper}
         self.stats, self.metrics = None, None
         if (source is None) or isinstance(source, str):
             source = os.getcwd() if source is None else source
@@ -41,7 +45,7 @@ def __init__(self, X, y, source=None, metrics=None, stats=None, h2o_init_params=
         else:
             raise TypeError(f'Source of type {type(source)} is not supported.')
 
-        self._calc_metrics(X=X, y=y, metrics=metrics, stats=stats)
+        self._calc_metrics(X=X, y=y, metrics=metrics, stats=stats, predict_params=predict_params, features=features)
 
     def __repr__(self):
         stk = traceback.extract_stack()
@@ -58,14 +62,17 @@ def __repr__(self):
             print(self.metrics)
         return ''
 
-    def _calc_metrics(self, X, y, metrics=None, stats=None):
+    def _calc_metrics(self, X, y, metrics=None, stats=None, predict_params=None, features=None):
         """Computing metrics and statistics for models.
 
         Args:
             X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Data for making predictions.
             y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Actual target values for X.
             metrics (:obj:`list`, :obj:`tuple`, :obj:`callable`, optional): Metrics or list of metrics to compute.
             stats (:obj:`list`, :obj:`tuple`, :obj:`callable`, optional): Statistics or list of statistics to compute.
+            predict_params (:obj:`list`, optional): List of dictionaries containing parameters passed to predict methods
+         for each model.
+            features (:obj:`list`, optional): List of lists containing features for predict method for each model.
 
         Returns:
             Returns `None`, but results available in `self.stats`, `self.metrics`.
@@ -74,8 +81,12 @@ def _calc_metrics(self, X, y, metrics=None, stats=None):
         trivial = InsolverTrivialWrapper(agg=lambda x: x)
         trivial.fit(X, y)
         models = [trivial] + self.models
+        features = [None] + features
         for model in models:
-            p = model.predict(X)
+            p = model.predict(X if (features is None) or (features[models.index(model)] is None)
+                              else X[features[models.index(model)]],
+                              **({} if (predict_params is None) or (predict_params[models.index(model)] is None)
+                                 else predict_params[models.index(model)]))
             stats_val = [mean(p), var(p), std(p), min(p), quantile(p, 0.25), median(p), quantile(p, 0.75), max(p)]
             name_stats = ['Mean', 'Variance', 'St. Dev.', 'Min', 'Q1', 'Median', 'Q3', 'Max']
             if stats is not None:

diff --git a/insolver/model_tools/model_utils.py b/insolver/model_tools/model_utils.py
@@ -3,8 +3,7 @@
 from urllib.request import urlopen
 from zipfile import ZipFile
 
-from numpy import log, sum, maximum, unique, true_divide, linspace, ndarray
-
+import numpy as np
 import matplotlib.pyplot as plt
 
 from pandas import DataFrame, Series, concat, qcut
@@ -113,9 +112,9 @@ def deviance_poisson(y_hat, y, weight=None):
     """
     t_hat, t = y_hat + 1, y + 1
     if weight:
-        return sum(2 * weight * (t * log(t / t_hat) - (t - t_hat)))
+        return sum(2 * weight * (t * np.log(t / t_hat) - (t - t_hat)))
     else:
-        return sum(2 * (t * log(t / t_hat) - (t - t_hat)))
+        return sum(2 * (t * np.log(t / t_hat) - (t - t_hat)))
 
 
 def deviance_gamma(y_hat, y, weight=None):
@@ -130,9 +129,9 @@ def deviance_gamma(y_hat, y, weight=None):
         float, value of the Gamma deviance.
     """
     if weight:
-        return sum(2 * weight * (-log(y/y_hat) + (y-y_hat)/y_hat))
+        return sum(2 * weight * (-np.log(y/y_hat) + (y-y_hat)/y_hat))
     else:
-        return sum(2 * (-log(y/y_hat) + (y-y_hat)/y_hat))
+        return sum(2 * (-np.log(y/y_hat) + (y-y_hat)/y_hat))
 
 
 def inforamtion_value_woe(data, target, bins=10, cat_thresh=10, detail=False):
@@ -152,17 +151,17 @@ def inforamtion_value_woe(data, target, bins=10, cat_thresh=10, detail=False):
     target = target.name if isinstance(target, Series) else target
     cols = data.columns
     for ivars in cols[~cols.isin([target])]:
-        if (data[ivars].dtype.kind in 'bifc') and (len(unique(data[ivars])) > cat_thresh):
+        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars])) > cat_thresh):
             binned_x = qcut(data[ivars], bins,  duplicates='drop')
             d0 = DataFrame({'x': binned_x, 'y': data[target]})
         else:
             d0 = DataFrame({'x': data[ivars], 'y': data[target]})
         d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
         d.columns = ['Cutoff', 'N', 'Events']
-        d['% of Events'] = maximum(d['Events'], 0.5) / d['Events'].sum()
+        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
         d['Non-Events'] = d['N'] - d['Events']
-        d['% of Non-Events'] = maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
-        d['WoE'] = log(d['% of Events'] / d['% of Non-Events'])
+        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
+        d['WoE'] = np.log(d['% of Events'] / d['% of Non-Events'])
         d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
         d.insert(loc=0, column='Variable', value=ivars)
         temp = DataFrame({"Variable": [ivars], "IV": [d['IV'].sum()]}, columns=["Variable", "IV"])
@@ -171,41 +170,84 @@ def inforamtion_value_woe(data, target, bins=10, cat_thresh=10, detail=False):
     return short_result if detail else detailed_result
 
 
-def gain_curve(predict, exposure):
-    if isinstance(predict, (Series, ndarray)) and isinstance(exposure, Series):
+def gain_curve(predict, exposure, step=1, figsize=(10, 6), gini_exact=False, output=False):
+    """Plot gains curve(s) with Gini (rank formula if `gini_exact`, else 2*AUC-1); return Gini table if `output`."""
+    gini_df = DataFrame()
+    plt.figure(figsize=figsize)
+    if isinstance(predict, (Series, np.ndarray)) and isinstance(exposure, Series):
         temp_df = concat([Series(predict, name='Predict').reset_index(drop=True),
                           exposure.reset_index(drop=True)], axis=1)
         temp_df = temp_df.sort_values('Predict', ascending=False).reset_index(drop=True)
         normalized_df = temp_df.cumsum()/temp_df.sum()
-        w = sum(temp_df[exposure.name])
-        m = true_divide(sum(temp_df[exposure.name] * temp_df['Predict']), sum(temp_df[exposure.name]))
-        temp_df['Rank'] = 0
-        temp_df.loc[0, 'Rank'] = 1 + 0.5 * (temp_df.loc[0, exposure.name] - 1)
-        for x in range(1, len(temp_df)):
-            temp_df.loc[x, 'Rank'] = (temp_df.loc[x-1, 'Rank'] + 0.5 * (temp_df.loc[x-1, exposure.name] + 1)
-                                      + 0.5 * (temp_df.loc[x, exposure.name] - 1))
-        gini = 1 + 1/w - 2/(w**2 * m) * sum(temp_df[exposure.name] * temp_df['Predict'] * temp_df['Rank'])
-        plt.plot(normalized_df[exposure.name], normalized_df['Predict'], label=f'Predict (Gini: {round(gini, 3)})')
+        if gini_exact:
+            w = sum(temp_df[exposure.name])
+            m = np.true_divide(sum(temp_df[exposure.name] * temp_df['Predict']), sum(temp_df[exposure.name]))
+            temp_df['Rank'] = 0
+            temp_df.loc[0, 'Rank'] = 1 + 0.5 * (temp_df.loc[0, exposure.name] - 1)
+            for x in range(1, len(temp_df)):
+                temp_df.loc[x, 'Rank'] = (temp_df.loc[x-1, 'Rank'] + 0.5 * (temp_df.loc[x-1, exposure.name] + 1)
+                                          + 0.5 * (temp_df.loc[x, exposure.name] - 1))
+            gini = 1 + 1/w - 2/(w**2 * m) * sum(temp_df[exposure.name] * temp_df['Predict'] * temp_df['Rank'])
+        else:
+            auc = (np.sum(normalized_df['Predict'] * np.append(np.diff(normalized_df[exposure.name]), 0))
+                   + np.sum(np.append(np.diff(normalized_df['Predict']), 0) *
+                            np.append(np.diff(normalized_df[exposure.name]), 0))/2)
+            gini = 2 * auc - 1
+        gini_df = concat([gini_df, DataFrame.from_dict({'Gini': {'Predict': gini}})])
+        plt.plot(normalized_df[exposure.name].values[::step], normalized_df['Predict'].values[::step],
+                 label=f'Predict (Gini: {round(gini, 3)})')
     elif isinstance(predict, DataFrame) and isinstance(exposure, Series):
         temp_df = concat([predict.reset_index(drop=True), exposure.reset_index(drop=True)], axis=1)
         for pred_col in temp_df.columns[:-1]:
             temp_df2 = temp_df[[pred_col, exposure.name]].sort_values(pred_col, ascending=False).reset_index(drop=True)
             normalized_df = temp_df2.cumsum()/temp_df2.sum()
-            w = sum(temp_df2[exposure.name])
-            m = true_divide(sum(temp_df2[exposure.name] * temp_df2[pred_col]), sum(temp_df2[exposure.name]))
-            temp_df2['Rank'] = 0
-            temp_df2.loc[0, 'Rank'] = 1 + 0.5 * (temp_df2.loc[0, exposure.name] - 1)
-            for x in range(1, len(temp_df2)):
-                temp_df2.loc[x, 'Rank'] = (temp_df2.loc[x-1, 'Rank'] + 0.5 * (temp_df2.loc[x-1, exposure.name] + 1)
-                                           + 0.5 * (temp_df2.loc[x, exposure.name] - 1))
-            gini = 1 + 1/w - 2/(w**2 * m) * sum(temp_df2[exposure.name] * temp_df2[pred_col] * temp_df2['Rank'])
-            plt.plot(normalized_df[exposure.name], normalized_df[pred_col],
+            if gini_exact:
+                w = sum(temp_df2[exposure.name])
+                m = np.true_divide(sum(temp_df2[exposure.name] * temp_df2[pred_col]), sum(temp_df2[exposure.name]))
+                temp_df2['Rank'] = 0
+                temp_df2.loc[0, 'Rank'] = 1 + 0.5 * (temp_df2.loc[0, exposure.name] - 1)
+                for x in range(1, len(temp_df2)):
+                    temp_df2.loc[x, 'Rank'] = (temp_df2.loc[x-1, 'Rank'] + 0.5 * (temp_df2.loc[x-1, exposure.name] + 1)
+                                               + 0.5 * (temp_df2.loc[x, exposure.name] - 1))
+                gini = 1 + 1/w - 2/(w**2 * m) * sum(temp_df2[exposure.name] * temp_df2[pred_col] * temp_df2['Rank'])
+            else:
+                auc = (np.sum(normalized_df[pred_col] * np.append(np.diff(normalized_df[exposure.name]), 0))
+                       + np.sum(np.append(np.diff(normalized_df[pred_col]), 0) *
+                                np.append(np.diff(normalized_df[exposure.name]), 0))/2)
+                gini = 2 * auc - 1
+            gini_df = concat([gini_df, DataFrame.from_dict({'Gini': {pred_col: gini}})])
+            plt.plot(normalized_df[exposure.name].values[::step], normalized_df[pred_col].values[::step],
                      label=f'{pred_col} (Gini: {round(gini, 3)})')
     else:
         raise Exception
     plt.legend()
-    plt.plot(linspace(0, 1, 2), linspace(0, 1, 2), c='red', linestyle='--', linewidth=0.7)
+    plt.plot(np.linspace(0, 1, 2), np.linspace(0, 1, 2), c='red', linestyle='--', linewidth=0.7)
     plt.title('Gains curve')
     plt.xlabel('Cumulative exposure')
     plt.ylabel('Cumulative response')
     plt.show()
+    return gini_df if output else None
+
+
+def lift(predict, column, lift_type='groupby', q=10, output=False, reference='mean'):
+    """Bar-plot group means of `predict` by `column` (or its `q` quantile bins), scaled by the chosen `reference`.
+    Returns the resulting table when `output` is True, otherwise None."""
+    # Drop the index of `predict` so it pairs row-by-row with the re-indexed `column` instead of aligning on labels.
+    df = concat([column.reset_index(drop=True), Series(np.asarray(predict), name='Predict')], axis=1)
+    if lift_type == 'quantile':
+        df[column.name] = qcut(column, q=q).reset_index(drop=True)
+    elif lift_type != 'groupby':
+        raise ValueError(f"lift_type must be 'groupby' or 'quantile', got {lift_type!r}.")
+    if reference == 'mean':
+        df = df.groupby(column.name).mean() / np.mean(predict)
+    elif reference == 'min':
+        df = df.groupby(column.name).mean() / df.groupby(column.name).min()
+    else:
+        raise ValueError(f"reference must be 'mean' or 'min', got {reference!r}.")
+    plt.bar(df.index.astype(str), height=df['Predict'])
+    plt.title('Lift Metrics')
+    plt.xlabel(column.name)
+    plt.ylabel('Lift Score')
+    plt.xticks(rotation=90)
+    plt.show()
+    return df if output else None