Skip to content

Commit

Permalink
Merge pull request #67 from MindSetLib/asupdates
Browse files Browse the repository at this point in the history
Asupdates
  • Loading branch information
alexmindset authored Apr 27, 2021
2 parents f9db93b + 1d79c15 commit 707acd0
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 37 deletions.
23 changes: 17 additions & 6 deletions insolver/model_tools/model_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from numpy import min, max, mean, var, std, quantile, median
from pandas import DataFrame

from insolver.wrappers import InsolverGLMWrapper, InsolverGBMWrapper, InsolverTrivialWrapper
from insolver.wrappers import InsolverGLMWrapper, InsolverGBMWrapper, InsolverRFWrapper, InsolverTrivialWrapper


class ModelMetricsCompare:
Expand All @@ -19,9 +19,13 @@ class ModelMetricsCompare:
stats (:obj:`list`, :obj:`tuple`, :obj:`callable`, optional): Statistics or list of statistics to compute.
folder with models. If `None`, taking current working directory as source.
h2o_init_params (:obj:`dict`, optional): Parameters passed to `h2o.init()`, when `backend` == 'h2o'.
predict_params (:obj:`list`, optional): List of dictionaries containing parameters passed to predict methods
for each model.
features (:obj:`list`, optional): List of lists containing features for predict method for each model.
"""
def __init__(self, X, y, source=None, metrics=None, stats=None, h2o_init_params=None):
wrappers = {'glm': InsolverGLMWrapper, 'gbm': InsolverGBMWrapper}
def __init__(self, X, y, source=None, metrics=None, stats=None, h2o_init_params=None, predict_params=None,
features=None):
wrappers = {'glm': InsolverGLMWrapper, 'gbm': InsolverGBMWrapper, 'rf': InsolverRFWrapper}
self.stats, self.metrics = None, None
if (source is None) or isinstance(source, str):
source = os.getcwd() if source is None else source
Expand All @@ -41,7 +45,7 @@ def __init__(self, X, y, source=None, metrics=None, stats=None, h2o_init_params=
else:
raise TypeError(f'Source of type {type(source)} is not supported.')

self._calc_metrics(X=X, y=y, metrics=metrics, stats=stats)
self._calc_metrics(X=X, y=y, metrics=metrics, stats=stats, predict_params=predict_params, features=features)

def __repr__(self):
stk = traceback.extract_stack()
Expand All @@ -58,14 +62,17 @@ def __repr__(self):
print(self.metrics)
return ''

def _calc_metrics(self, X, y, metrics=None, stats=None):
def _calc_metrics(self, X, y, metrics=None, stats=None, predict_params=None, features=None):
"""Computing metrics and statistics for models.
Args:
X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Data for making predictions.
y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Actual target values for X.
metrics (:obj:`list`, :obj:`tuple`, :obj:`callable`, optional): Metrics or list of metrics to compute.
stats (:obj:`list`, :obj:`tuple`, :obj:`callable`, optional): Statistics or list of statistics to compute.
predict_params (:obj:`list`, optional): List of dictionaries containing parameters passed to predict methods
for each model.
features (:obj:`list`, optional): List of lists containing features for predict method for each model.
Returns:
Returns `None`, but results available in `self.stats`, `self.metrics`.
Expand All @@ -74,8 +81,12 @@ def _calc_metrics(self, X, y, metrics=None, stats=None):
trivial = InsolverTrivialWrapper(agg=lambda x: x)
trivial.fit(X, y)
models = [trivial] + self.models
features = [None] + features
for model in models:
p = model.predict(X)
p = model.predict(X if (features is None) or (features[models.index(model)] is None)
else X[features[models.index(model)]],
**({} if (predict_params is None) or (predict_params[models.index(model)] is None)
else predict_params[models.index(model)]))
stats_val = [mean(p), var(p), std(p), min(p), quantile(p, 0.25), median(p), quantile(p, 0.75), max(p)]
name_stats = ['Mean', 'Variance', 'St. Dev.', 'Min', 'Q1', 'Median', 'Q3', 'Max']
if stats is not None:
Expand Down
104 changes: 73 additions & 31 deletions insolver/model_tools/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
from urllib.request import urlopen
from zipfile import ZipFile

from numpy import log, sum, maximum, unique, true_divide, linspace, ndarray

import numpy as np
import matplotlib.pyplot as plt

from pandas import DataFrame, Series, concat, qcut
Expand Down Expand Up @@ -113,9 +112,9 @@ def deviance_poisson(y_hat, y, weight=None):
"""
t_hat, t = y_hat + 1, y + 1
if weight:
return sum(2 * weight * (t * log(t / t_hat) - (t - t_hat)))
return sum(2 * weight * (t * np.log(t / t_hat) - (t - t_hat)))
else:
return sum(2 * (t * log(t / t_hat) - (t - t_hat)))
return sum(2 * (t * np.log(t / t_hat) - (t - t_hat)))


def deviance_gamma(y_hat, y, weight=None):
Expand All @@ -130,9 +129,9 @@ def deviance_gamma(y_hat, y, weight=None):
float, value of the Gamma deviance.
"""
if weight:
return sum(2 * weight * (-log(y/y_hat) + (y-y_hat)/y_hat))
return sum(2 * weight * (-np.log(y/y_hat) + (y-y_hat)/y_hat))
else:
return sum(2 * (-log(y/y_hat) + (y-y_hat)/y_hat))
return sum(2 * (-np.log(y/y_hat) + (y-y_hat)/y_hat))


def inforamtion_value_woe(data, target, bins=10, cat_thresh=10, detail=False):
Expand All @@ -152,17 +151,17 @@ def inforamtion_value_woe(data, target, bins=10, cat_thresh=10, detail=False):
target = target.name if isinstance(target, Series) else target
cols = data.columns
for ivars in cols[~cols.isin([target])]:
if (data[ivars].dtype.kind in 'bifc') and (len(unique(data[ivars])) > cat_thresh):
if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars])) > cat_thresh):
binned_x = qcut(data[ivars], bins, duplicates='drop')
d0 = DataFrame({'x': binned_x, 'y': data[target]})
else:
d0 = DataFrame({'x': data[ivars], 'y': data[target]})
d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
d.columns = ['Cutoff', 'N', 'Events']
d['% of Events'] = maximum(d['Events'], 0.5) / d['Events'].sum()
d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
d['Non-Events'] = d['N'] - d['Events']
d['% of Non-Events'] = maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
d['WoE'] = log(d['% of Events'] / d['% of Non-Events'])
d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
d['WoE'] = np.log(d['% of Events'] / d['% of Non-Events'])
d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
d.insert(loc=0, column='Variable', value=ivars)
temp = DataFrame({"Variable": [ivars], "IV": [d['IV'].sum()]}, columns=["Variable", "IV"])
Expand All @@ -171,41 +170,84 @@ def inforamtion_value_woe(data, target, bins=10, cat_thresh=10, detail=False):
return short_result if detail else detailed_result


def _gain_curve_points(predict, exposure, gini_exact):
    """Compute the gains-curve coordinates and the Gini coefficient for one prediction vector.

    Args:
        predict: One-dimensional predictions (array-like).
        exposure: One-dimensional exposure values of the same length (array-like).
        gini_exact (bool): If `True`, use the rank-based Gini formula, otherwise the trapezoid approximation.

    Returns:
        tuple: (cumulative exposure share, cumulative prediction share, Gini value), the first two as numpy arrays
        ordered by decreasing prediction.
    """
    # Fixed internal column names, so the result does not depend on how the caller named the exposure Series.
    frame = DataFrame({'Predict': np.asarray(predict), 'Exposure': np.asarray(exposure)})
    # A stable sort keeps the ordering of tied predictions deterministic.
    frame = frame.sort_values('Predict', ascending=False, kind='mergesort').reset_index(drop=True)
    shares = frame.cumsum() / frame.sum()
    cum_expo, cum_pred = shares['Exposure'].to_numpy(), shares['Predict'].to_numpy()

    if gini_exact:
        expo, pred = frame['Exposure'].to_numpy(dtype=float), frame['Predict'].to_numpy(dtype=float)
        total = np.sum(expo)
        weighted_mean = np.true_divide(np.sum(expo * pred), total)
        # Closed form of the recurrence
        #   rank[0] = 1 + 0.5 * (e[0] - 1)
        #   rank[x] = rank[x-1] + 0.5 * (e[x-1] + 1) + 0.5 * (e[x] - 1)
        # which simplifies to 0.5 + cumsum(e)[x] - 0.5 * e[x].
        rank = 0.5 + np.cumsum(expo) - 0.5 * expo
        gini = 1 + 1 / total - 2 / (total ** 2 * weighted_mean) * np.sum(expo * pred * rank)
    else:
        # Trapezoid rule over consecutive curve points: sum(P_i * dE_i) + sum(dP_i * dE_i) / 2.
        # The segment from the origin to the first point is not included, as in the previous formula.
        auc = np.sum((cum_pred[:-1] + cum_pred[1:]) / 2 * np.diff(cum_expo))
        gini = 2 * auc - 1
    return cum_expo, cum_pred, gini


def gain_curve(predict, exposure, step=1, figsize=(10, 6), gini_exact=False, output=False):
    """Plot gains curves for one or several prediction vectors and compute their Gini coefficients.

    Observations are sorted by decreasing prediction; the curve shows the cumulative share of predictions against
    the cumulative share of exposure, together with a dashed diagonal reference line.

    Args:
        predict (:obj:`pd.Series`, :obj:`np.ndarray`, :obj:`pd.DataFrame`): Predictions. A DataFrame is treated as
         one prediction vector per column, each drawn as a separate curve labelled with the column name.
        exposure (:obj:`pd.Series`): Exposure values, positionally matched with `predict`.
        step (:obj:`int`, optional): Plot only every `step`-th point of each curve.
        figsize (:obj:`tuple`, optional): Figure size passed to `plt.figure`.
        gini_exact (:obj:`bool`, optional): If `True`, compute Gini with the rank-based formula, otherwise with a
         trapezoid approximation of the area under the curve.
        output (:obj:`bool`, optional): If `True`, return the computed Gini values.

    Returns:
        pd.DataFrame with a single `Gini` column indexed by curve label if `output` is `True`, otherwise `None`.

    Raises:
        TypeError: If `predict` or `exposure` has an unsupported type.
        ValueError: If `predict` and `exposure` differ in length.
    """
    # Validate before touching matplotlib so that bad input does not leave an empty figure behind.
    if not isinstance(exposure, Series):
        raise TypeError(f'Exposure of type {type(exposure)} is not supported, pandas.Series is expected.')
    if isinstance(predict, DataFrame):
        # Positional access keeps this working even if column labels are duplicated.
        candidates = [(label, predict.iloc[:, pos]) for pos, label in enumerate(predict.columns)]
    elif isinstance(predict, (Series, np.ndarray)):
        candidates = [('Predict', predict)]
    else:
        raise TypeError(f'Predict of type {type(predict)} is not supported.')
    if len(predict) != len(exposure):
        raise ValueError(f'Length mismatch: predict has {len(predict)} rows, exposure has {len(exposure)}.')

    curves = [(label, *_gain_curve_points(values, exposure, gini_exact)) for label, values in candidates]

    plt.figure(figsize=figsize)
    for label, cum_expo, cum_pred, gini in curves:
        plt.plot(cum_expo[::step], cum_pred[::step], label=f'{label} (Gini: {round(gini, 3)})')
    plt.legend()
    # Drawn after the legend call, so the diagonal reference does not get a legend entry.
    plt.plot(np.linspace(0, 1, 2), np.linspace(0, 1, 2), c='red', linestyle='--', linewidth=0.7)
    plt.title('Gains curve')
    plt.xlabel('Cumulative exposure')
    plt.ylabel('Cumulative response')
    plt.show()

    if output:
        # Built in one go; DataFrame.append is not available in pandas >= 2.0.
        return DataFrame({'Gini': [curve[3] for curve in curves]}, index=[curve[0] for curve in curves])
    return None


def lift(predict, column, lift_type='groupby', q=10, output=False, reference='mean'):
    """Plot lift scores of predictions grouped by the values (or quantile bins) of a column.

    Args:
        predict (:obj:`pd.Series`, :obj:`np.ndarray`, :obj:`list`): Predictions, positionally matched with `column`.
        column (:obj:`pd.Series`): Named Series used to group the predictions.
        lift_type (:obj:`str`, optional): 'groupby' groups by the raw values of `column`, 'quantile' groups by
         `q` quantile bins of `column` obtained with `pd.qcut`.
        q (:obj:`int`, optional): Number of quantiles passed to `pd.qcut` when `lift_type` == 'quantile'.
        output (:obj:`bool`, optional): If `True`, return the computed lift table.
        reference (:obj:`str`, optional): 'mean' divides group means by the overall mean of `predict`,
         'min' divides group means by the minimum prediction within the same group.

    Returns:
        pd.DataFrame indexed by group with a `Predict` column of lift scores if `output` is `True`,
        otherwise `None`.

    Raises:
        ValueError: If `lift_type` or `reference` is not supported, or if `predict` and `column` differ in length.
    """
    # Validate options up front so nothing is computed or drawn for a bad call.
    if lift_type not in ('groupby', 'quantile'):
        raise ValueError(f"lift_type must be 'groupby' or 'quantile', got {lift_type!r}.")
    if reference not in ('mean', 'min'):
        raise ValueError(f"reference must be 'mean' or 'min', got {reference!r}.")

    groups = column.reset_index(drop=True)
    # Reset the index of the predictions as well: concat aligns on index labels, so a Series with a
    # non-default index would otherwise be matched to the wrong rows.
    scores = Series(predict, name='Predict').reset_index(drop=True)
    if len(scores) != len(groups):
        raise ValueError(f'Length mismatch: predict has {len(scores)} rows, column has {len(groups)}.')
    if lift_type == 'quantile':
        groups = qcut(groups, q=q)

    df = concat([groups, scores], axis=1)
    # observed=False spelled out: keeps every bin of a categorical key in the result.
    grouped = df.groupby(column.name, observed=False)
    group_means = grouped.mean()
    if reference == 'mean':
        df = group_means / np.mean(predict)
    else:
        df = group_means / grouped.min()

    plt.bar(df.index.astype(str), height=df['Predict'])
    plt.title('Lift Metrics')
    plt.xlabel(column.name)
    plt.ylabel('Lift Score')
    plt.xticks(rotation=90)
    plt.show()

    if output:
        return df
    return None

0 comments on commit 707acd0

Please sign in to comment.