Skip to content

Commit

Permalink
Merge pull request #66 from MindSetLib/asupdates
Browse files Browse the repository at this point in the history
Random Forest, WoE & Gains curve
  • Loading branch information
alexmindset authored Apr 15, 2021
2 parents 4e1cad3 + 1dc0abf commit f9db93b
Show file tree
Hide file tree
Showing 4 changed files with 198 additions and 2 deletions.
82 changes: 81 additions & 1 deletion insolver/model_tools/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
from urllib.request import urlopen
from zipfile import ZipFile

from numpy import log, sum
from numpy import log, sum, maximum, unique, true_divide, linspace, ndarray

import matplotlib.pyplot as plt

from pandas import DataFrame, Series, concat, qcut
from sklearn.model_selection import train_test_split


Expand Down Expand Up @@ -129,3 +133,79 @@ def deviance_gamma(y_hat, y, weight=None):
return sum(2 * weight * (-log(y/y_hat) + (y-y_hat)/y_hat))
else:
return sum(2 * (-log(y/y_hat) + (y-y_hat)/y_hat))


def inforamtion_value_woe(data, target, bins=10, cat_thresh=10, detail=False):
    """Function for Information Value (IV) and Weight of Evidence (WoE) computation.

    NOTE: the function name contains a typo ("inforamtion") that is kept for
    backward compatibility with existing callers.

    Args:
        data (pd.DataFrame): DataFrame with data to compute IV and WoE.
        target (:obj:`str` or :obj:`pd.Series`): Target variable to compute IV and WoE.
        bins (:obj:`int`, optional): Number of bins for WoE calculation for continuous variables.
        cat_thresh (:obj:`int`, optional): Maximum number of categories for non-binned WoE calculation.
        detail (:obj:`bool`, optional): Whether to return detailed results DataFrame or not. Short by default.

    Returns:
        pd.DataFrame: Per-bin WoE/IV table if `detail` is True, otherwise a
        per-variable IV summary with columns ['Variable', 'IV'].
    """
    summary_result, detailed_result = DataFrame(), DataFrame()
    target = target.name if isinstance(target, Series) else target
    cols = data.columns
    for ivars in cols[~cols.isin([target])]:
        # Quantile-bin numeric variables with many distinct values; otherwise
        # treat the raw values as categories.
        if (data[ivars].dtype.kind in 'bifc') and (len(unique(data[ivars])) > cat_thresh):
            binned_x = qcut(data[ivars], bins, duplicates='drop')
            d0 = DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = DataFrame({'x': data[ivars], 'y': data[target]})
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        # Clip zero counts to 0.5 so that WoE's log and the ratio never hit 0.
        d['% of Events'] = maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
        d['WoE'] = log(d['% of Events'] / d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)
        temp = DataFrame({"Variable": [ivars], "IV": [d['IV'].sum()]}, columns=["Variable", "IV"])
        summary_result = concat([summary_result, temp], axis=0)
        detailed_result = concat([detailed_result, d], axis=0)
    return detailed_result if detail else summary_result


def gain_curve(predict, exposure):
    """Plot gains curves with exposure-weighted Gini coefficients.

    Observations are sorted by predicted value in descending order, and the
    cumulative share of exposure is plotted against the cumulative share of
    prediction; each curve's label carries its weighted Gini coefficient.
    A diagonal reference line (random model) is added, then the plot is shown.

    Args:
        predict (:obj:`pd.Series`, :obj:`np.ndarray` or :obj:`pd.DataFrame`): Predicted values.
         A DataFrame produces one curve per column.
        exposure (pd.Series): Exposure (weight) of each observation.

    Raises:
        TypeError: If the combination of `predict`/`exposure` types is not supported.
    """
    def _plot_one_curve(frame, pred_col, weight_col):
        # Sort by prediction (descending) so the curve accumulates the
        # highest-predicted observations first.
        ordered = frame[[pred_col, weight_col]].sort_values(pred_col, ascending=False).reset_index(drop=True)
        normalized = ordered.cumsum() / ordered.sum()
        e = ordered[weight_col]
        w = sum(e)
        m = true_divide(sum(e * ordered[pred_col]), w)
        # Exposure-weighted mid-rank, closed form of the recurrence
        #   rank[0] = 1 + 0.5*(e[0] - 1)
        #   rank[i] = rank[i-1] + 0.5*(e[i-1] + 1) + 0.5*(e[i] - 1)
        # which telescopes to rank[i] = 0.5 + cumsum(e)[i] - 0.5*e[i].
        rank = 0.5 + e.cumsum() - 0.5 * e
        gini = 1 + 1/w - 2/(w**2 * m) * sum(e * ordered[pred_col] * rank)
        plt.plot(normalized[weight_col], normalized[pred_col],
                 label=f'{pred_col} (Gini: {round(gini, 3)})')

    if isinstance(predict, (Series, ndarray)) and isinstance(exposure, Series):
        temp_df = concat([Series(predict, name='Predict').reset_index(drop=True),
                          exposure.reset_index(drop=True)], axis=1)
        _plot_one_curve(temp_df, 'Predict', exposure.name)
    elif isinstance(predict, DataFrame) and isinstance(exposure, Series):
        temp_df = concat([predict.reset_index(drop=True), exposure.reset_index(drop=True)], axis=1)
        for pred_col in temp_df.columns[:-1]:
            _plot_one_curve(temp_df, pred_col, exposure.name)
    else:
        raise TypeError('predict must be a pd.Series, np.ndarray or pd.DataFrame '
                        'and exposure must be a pd.Series.')
    plt.legend()
    plt.plot(linspace(0, 1, 2), linspace(0, 1, 2), c='red', linestyle='--', linewidth=0.7)
    plt.title('Gains curve')
    plt.xlabel('Cumulative exposure')
    plt.ylabel('Cumulative response')
    plt.show()
1 change: 1 addition & 0 deletions insolver/wrappers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .glm import InsolverGLMWrapper
from .gbm import InsolverGBMWrapper
from .general import InsolverRFWrapper
from .base import InsolverTrivialWrapper
115 changes: 115 additions & 0 deletions insolver/wrappers/general.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
from numpy import cumsum, diff, exp, true_divide, add, append, nan, concatenate, array
from pandas import DataFrame, Series

from sklearn.metrics import mean_squared_error, SCORERS
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from .base import InsolverBaseWrapper
from .extensions import InsolverCVHPExtension, InsolverPDPExtension


class InsolverRFWrapper(InsolverBaseWrapper, InsolverCVHPExtension, InsolverPDPExtension):
    """Insolver wrapper for Random Forest.

    Attributes:
        backend (str): Framework for building RF, 'sklearn' is supported.
        task (str): Task that RF should solve: Classification or Regression. Values 'reg' and 'class' are supported.
        n_estimators (:obj:`int`, optional): Number of trees in the forest. Equals 100 by default.
        load_path (:obj:`str`, optional): Path to RF model to load from disk.
        **kwargs: Parameters for RF estimators except `n_estimators`. Will not be changed in hyperopt.
    """
    def __init__(self, backend, task=None, n_estimators=100, load_path=None, **kwargs):
        super(InsolverRFWrapper, self).__init__(backend)
        self.algo, self._backends = 'Random Forest', ['sklearn']
        self._tasks = ['class', 'reg']
        # Persistence for the sklearn backend goes through pickle in both directions.
        self._back_load_dict = {'sklearn': self._pickle_load}
        self._back_save_dict = {'sklearn': self._pickle_save}
        self.n_estimators, self.params = n_estimators, None

        if backend not in self._backends:
            raise NotImplementedError(f'Error with the backend choice. Supported backends: {self._backends}')

        if load_path is not None:
            # Loading a saved model skips estimator construction entirely.
            self.load_model(load_path)
        else:
            if task in self._tasks:
                # Map task -> backend -> concrete sklearn estimator class.
                rf_init = {
                    'class': {'sklearn': RandomForestClassifier},
                    'reg': {'sklearn': RandomForestRegressor}
                }

                kwargs.update({'n_estimators': self.n_estimators})
                self.model, self.params = rf_init[task][self.backend](**(kwargs if kwargs is not None else {})), kwargs

                def __params_rf(**params):
                    # Estimator factory for hyperparameter search: the fixed
                    # self.params override any tuned values passed in, so the
                    # constructor kwargs stay untouched by hyperopt.
                    params.update(self.params)
                    return rf_init[task][self.backend](**params)

                self.object = __params_rf
            else:
                raise NotImplementedError(f'Task parameter supports values in {self._tasks}.')
        self._update_meta()

    def fit(self, X, y, report=None, **kwargs):
        """Fit a Random Forest.

        Args:
            X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training data.
            y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training target values.
            report (:obj:`list`, :obj:`tuple`, optional): A list of metric callables ``metric(y, y_pred)``
             evaluated in-sample and printed after model fitting, optional.
            **kwargs: Other parameters passed to Scikit-learn API .fit().
        """
        self.model.fit(X, y, **kwargs)
        # Remember training feature names so predict() can subset/reorder columns.
        if not hasattr(self.model, 'feature_name_'):
            self.model.feature_name_ = X.columns if isinstance(X, DataFrame) else [X.name]
        self._update_meta()
        if report is not None:
            if isinstance(report, (list, tuple)):
                # In-sample report: each entry is called as metric(y, prediction).
                prediction = self.model.predict(X)
                print(DataFrame([[x.__name__, x(y, prediction)] for x
                                 in report]).rename({0: 'Metrics', 1: 'Value'}, axis=1).set_index('Metrics'))

    def predict(self, X, **kwargs):
        """Predict using RF with feature matrix X.

        Args:
            X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Samples.
            **kwargs: Other parameters passed to Scikit-learn API .predict().

        Returns:
            array: Returns predicted values.
        """
        # Use the feature names captured in fit() (when present) so column
        # order/extra columns in X do not break prediction.
        return self.model.predict(X if not hasattr(self.model, 'feature_name_')
                                  else X[self.model.feature_name_], **kwargs)

    def cross_val(self, X, y, scoring=None, cv=None, **kwargs):
        """Method for performing cross-validation given the hyperparameters of initialized or fitted model.

        Args:
            X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training data.
            y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training target values.
            scoring (:obj:`callable`): Metrics passed to sklearn.model_selection.cross_validate calculation.
            cv (:obj:`int, cross-validation generator or an iterable`, optional): Cross-validation strategy from
             sklearn. Performs 5-fold cv by default.
            **kwargs: Other parameters passed to sklearn.model_selection.cross_validate.

        Returns:
            pd.DataFrame: Metric values with one row per scorer; the first
            column ('Overall') is computed on the full data, the remaining
            columns ('Fold i') come from cross-validation.
        """
        scoring = mean_squared_error if scoring is None else scoring
        # `models` (the per-fold estimators from the base-class helper) is
        # currently unused here.
        models, metrics = self._cross_val(X, y, scoring=scoring, cv=cv, **kwargs)
        # Compute the 'Overall' score(s) on the full dataset with self.model.
        if callable(scoring):
            scorers = {scoring.__name__.replace('_', ' '): array([scoring(y, self.model.predict(X))])}
        elif isinstance(scoring, (tuple, list)):
            scorers = {scorer.__name__.replace('_', ' '): array([scorer(y, self.model.predict(X))]) for
                       scorer in scoring}
        elif isinstance(scoring, str):
            # NOTE(review): sklearn.metrics.SCORERS was removed in scikit-learn 1.3
            # (superseded by get_scorer/get_scorer_names) — verify the pinned version.
            if scoring in SCORERS:
                scorers = {scoring.replace('_', ' '): array([SCORERS[scoring](self.model, X=X, y=y)])}
            else:
                raise ValueError(f'Scorer {scoring} is not supported.')
        else:
            raise NotImplementedError(f'Scoring of type {scoring} is not supported')
        # Prepend the overall score to the per-fold scores for each scorer.
        metrics = DataFrame({key: concatenate((scorers[key], metrics[key])) for key in scorers.keys()}).T
        metrics.columns = [f'Fold {i}' if i != 0 else 'Overall' for i in range(metrics.shape[1])]
        return metrics
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@


setup(name='insolver',
version='0.4.7',
version='0.4.8',
description='Insolver is low-code machine learning library, initially created for the insurance industry, '
'but can be used in any other.\n You can find a detailed overview at '
'https://insolver.readthedocs.io/en/latest/source/overview.html.',
Expand Down

0 comments on commit f9db93b

Please sign in to comment.