Merge pull request #220 from CamDavidsonPilon/0.10.0
0.10.0
CamDavidsonPilon authored Nov 22, 2018
2 parents 3bace68 + 9ac00c8 commit 9d6b1c5
Showing 14 changed files with 291 additions and 150 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

### 0.10.0
- `BetaGeoBetaBinomFitter.fit` has replaced `n_custs` with the more appropriately named `weights` (to align with other statistical libraries). If unspecified, `weights` defaults to an array of ones.
- The `conditional_` methods on `BetaGeoBetaBinomFitter` have been updated to handle exogenously provided recency, frequency, and periods.
- Performance improvements in `BetaGeoBetaBinomFitter`: `fit` takes about 50% less time than before.
- `BetaGeoFitter`, `ParetoNBDFitter`, and `ModifiedBetaGeoFitter` all accept a new `weights` argument in their `fit` methods. This can be used to reduce the size of the data by collapsing subjects with the same (recency, frequency, T) into single weighted rows; see the sketch after this file's diff.

### 0.9.1
- Added a data generation method, `generate_new_data`, to `BetaGeoBetaBinomFitter`. @zscore
- Fixed a bug in `summary_data_from_transaction_data` that was casting values to `int` prematurely. This was solved by including a new param `freq_multiplier` to be used to scale the resulting durations. See #100 for the original issue. @aprotopopov
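A minimal sketch of the collapsed-data workflow the `weights` entries above describe. The frame and its values are hypothetical; `BetaGeoFitter` is the package's top-level export:

```python
import pandas as pd
from lifetimes import BetaGeoFitter

# Hypothetical per-customer RFM summary, one row per customer.
summary = pd.DataFrame({
    "frequency": [0, 0, 1, 1, 1, 2],
    "recency":   [0.0, 0.0, 3.0, 3.0, 3.0, 5.0],
    "T":         [10.0, 10.0, 10.0, 10.0, 10.0, 10.0],
})

# Collapse customers sharing a (frequency, recency, T) pattern into one
# weighted row -- the data reduction the changelog refers to.
grouped = (summary.groupby(["frequency", "recency", "T"])
                  .size().reset_index(name="weights"))

bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(grouped["frequency"], grouped["recency"], grouped["T"],
        weights=grouped["weights"])
```

Fitting the collapsed frame should recover the same parameters as fitting one row per customer, since the weighted log-likelihood only rescales the objective (a toy check of that scaling appears after the `beta_geo_fitter.py` diff below).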
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -77,9 +77,9 @@
# built documents.
#
# The short X.Y version.
version = '0.8.0.0'
version = '0.10.0.0'
# The full version, including alpha/beta/rc tags.
release = '0.8.0.0'
release = '0.10.0.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
2 changes: 1 addition & 1 deletion lifetimes/datasets/donations.csv
@@ -1,4 +1,4 @@
frequency,recency,n,n_custs
frequency,recency,periods,weights
0,0,6,3464
1,1,6,1091
1,2,6,277
118 changes: 59 additions & 59 deletions lifetimes/fitters/beta_geo_beta_binom_fitter.py
@@ -7,7 +7,7 @@
import pandas as pd
from numpy import log, exp, logaddexp, asarray, c_ as vconcat
from pandas import DataFrame
from scipy.special import gammaln, betaln, binom
from scipy.special import gammaln, betaln, binom, beta as betaf

from ..utils import _fit, _check_inputs
from . import BaseFitter
@@ -56,39 +56,37 @@ def _loglikelihood(params, x, tx, T):
"""Loglikelihood for optimizer."""
alpha, beta, gamma, delta = params

beta_ab = betaln(alpha, beta)
beta_gd = betaln(gamma, delta)

indiv_loglike = (betaln(alpha + x, beta + T - x) - beta_ab +
betaln(gamma, delta + T) - beta_gd)

betaln_ab = betaln(alpha, beta)
betaln_gd = betaln(gamma, delta)
recency_T = T - tx - 1

A = (betaln(alpha + x, beta + T - x) - betaln_ab +
betaln(gamma, delta + T) - betaln_gd)

J = np.arange(recency_T.max() + 1)

@np.vectorize
def _sum(x, tx, recency_T):
def _sum_(x, tx, recency_T):
if recency_T <= -1:
return -np.inf
return 10e-10
elif recency_T == 0:
return betaf(alpha + x, beta + tx - x) * betaf(gamma + 1, delta + tx)
else:
j = J[:recency_T + 1]
return (betaf(alpha + x, beta + tx - x + j) * betaf(gamma + 1, delta + tx + j)).sum()

j = J[:int(recency_T) + 1]
return log(
np.sum(exp(betaln(alpha + x, beta + tx - x + j) - beta_ab +
betaln(gamma + 1, delta + tx + j) - beta_gd)))
sum_ = np.vectorize(_sum_, [np.float])

s = _sum(x, tx, recency_T)
indiv_loglike = logaddexp(indiv_loglike, s)

return indiv_loglike
B = log(sum_(x, tx, recency_T)) - betaln_gd - betaln_ab
return logaddexp(A, B)
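The rewrite above replaces per-term `exp(betaln(...) - beta_ab - beta_gd)` round-trips with direct products of `scipy.special.beta` values, pulling the constant normalizers `betaln_ab` and `betaln_gd` outside the sum (part of the speedup noted in the changelog). A quick check of the identity behind that refactor, with arbitrary test values:

```python
import numpy as np
from scipy.special import betaln, beta as betaf

alpha, b, gamma, delta = 1.2, 0.75, 0.66, 2.78  # arbitrary positive params
x, tx = 3, 4
j = np.arange(3)

beta_ab = betaln(alpha, b)
beta_gd = betaln(gamma, delta)

# Old formulation: exponentiate normalized log-Beta terms inside the sum.
old = np.log(np.sum(np.exp(betaln(alpha + x, b + tx - x + j) - beta_ab +
                           betaln(gamma + 1, delta + tx + j) - beta_gd)))

# New formulation: multiply Beta values, subtract the normalizers once.
new = np.log((betaf(alpha + x, b + tx - x + j) *
              betaf(gamma + 1, delta + tx + j)).sum()) - beta_ab - beta_gd

assert np.isclose(old, new)
```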

@staticmethod
def _negative_log_likelihood(params, frequency, recency, n, n_custs,
def _negative_log_likelihood(params, frequency, recency, n_periods, weights,
penalizer_coef=0):
penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
return -np.mean(BetaGeoBetaBinomFitter._loglikelihood(
params, frequency, recency, n) * n_custs) + penalizer_term
params, frequency, recency, n_periods) * weights) + penalizer_term

def fit(self, frequency, recency, n, n_custs, verbose=False,
def fit(self, frequency, recency, n_periods, weights=None, verbose=False,
tol=1e-4, iterative_fitting=1, index=None,
fit_method='Nelder-Mead', maxiter=2000, initial_params=None,
**kwargs):
@@ -101,17 +99,18 @@ def fit(self, frequency, recency, n, n_custs, verbose=False,
Total periods with observed transactions
recency: array_like
Period of most recent transaction
n: array_like
Number of transaction opportunities.
n_custs: array_like
Number of customers with given frequency/recency/T. Fader
and Hardie condense the individual RFM matrix into all
n_periods: array_like
Number of transaction opportunities. Previously called `n`.
weights: None or array_like
Number of customers with given frequency/recency/T,
defaults to 1 if not specified. Fader and
Hardie condense the individual RFM matrix into all
observed combinations of frequency/recency/T. This
parameter represents the count of customers with a given
purchase pattern. Instead of calculating individual
loglikelihood, the loglikelihood is calculated for each
pattern and multiplied by the number of customers with
that pattern.
that pattern. Previously called `n_custs`.
verbose: boolean, optional
Set to true to print out convergence diagnostics.
tol: float, optional
@@ -137,15 +136,20 @@ def fit(self, frequency, recency, n, n_custs, verbose=False,
fitted and with parameters estimated
"""
frequency = asarray(frequency)
recency = asarray(recency)
n = asarray(n)
n_custs = asarray(n_custs)
_check_inputs(frequency, recency, n)
frequency = asarray(frequency).astype(int)
recency = asarray(recency).astype(int)
n_periods = asarray(n_periods).astype(int)

if weights is None:
weights = np.ones_like(recency, dtype=np.int64)
else:
weights = asarray(weights)

_check_inputs(frequency, recency, n_periods)

params, self._negative_log_likelihood_ = _fit(
self._negative_log_likelihood,
[frequency, recency, n, n_custs, self.penalizer_coef],
[frequency, recency, n_periods, weights, self.penalizer_coef],
iterative_fitting,
initial_params,
4,
@@ -156,44 +160,43 @@
**kwargs)
self.params_ = OrderedDict(zip(['alpha', 'beta', 'gamma', 'delta'],
params))
self.data = DataFrame(vconcat[frequency, recency, n, n_custs],
columns=['frequency', 'recency', 'n', 'n_custs'])
self.data = DataFrame(vconcat[frequency, recency, n_periods, weights],
columns=['frequency', 'recency', 'n_periods', 'weights'])
if index is not None:
self.data.index = index
# Making a large array replicating n by n_custs having n.
n_exploded = []
for n_, n_cust in zip(n, n_custs):
n_exploded += [n_] * n_cust

self.generate_new_data = lambda size=1: beta_geometric_beta_binom_model(
np.array(n_exploded), *self._unload_params('alpha', 'beta', 'gamma', 'delta'), size=size)
# Replicate each n_periods entry `weights` times to rebuild the per-customer array.
np.array(sum([[n_] * n_cust for (n_, n_cust) in zip(n_periods, weights)], [])),
*self._unload_params('alpha', 'beta', 'gamma', 'delta'), size=size)
return self

def conditional_expected_number_of_purchases_up_to_time(self, t):
def conditional_expected_number_of_purchases_up_to_time(self, m_periods_in_future, frequency, recency, n_periods):
"""
Conditional expected purchases in future time period.
The expected number of future transactions across the next t
The expected number of future transactions across the next m_periods_in_future
transaction opportunities by a customer with purchase history
(x, tx, n).
.. math:: E(X(n, n+n*)|alpha, beta, gamma, delta, frequency, recency, n)
.. math:: E(X(n_periods, n_periods+m_periods_in_future)|alpha, beta, gamma, delta, frequency, recency, n_periods)
See (13) in Fader & Hardie 2010.
Parameters
----------
m_periods_in_future: array_like
number of future transaction opportunities to predict over
Returns
-------
array_like
predicted transactions
"""
x = self.data['frequency']
tx = self.data['recency']
n = self.data['n']
x = frequency
tx = recency
n = n_periods

params = self._unload_params('alpha', 'beta', 'gamma', 'delta')
alpha, beta, gamma, delta = params
@@ -203,18 +206,18 @@ def conditional_expected_number_of_purchases_up_to_time(self, t):
p3 = delta / (gamma - 1) * exp(gammaln(gamma + delta) -
gammaln(1 + delta))
p4 = exp(gammaln(1 + delta + n) - gammaln(gamma + delta + n))
p5 = exp(gammaln(1 + delta + n + t) - gammaln(gamma + delta + n + t))
p5 = exp(gammaln(1 + delta + n + m_periods_in_future) - gammaln(gamma + delta + n + m_periods_in_future))

return p1 * p2 * p3 * (p4 - p5)

def conditional_probability_alive(self, m):
def conditional_probability_alive(self, m_periods_in_future, frequency, recency, n_periods):
"""
Conditional probability alive.
Conditional probability customer is alive at transaction opportunity
n + m.
n_periods + m_periods_in_future.
.. math:: P(alive at n + m|alpha, beta, gamma, delta, frequency, recency, n)
.. math:: P(alive at n_periods + m_periods_in_future|alpha, beta, gamma, delta, frequency, recency, n_periods)
See (A10) in Fader and Hardie 2010.
@@ -232,19 +235,16 @@ def conditional_probability_alive(self, m):
params = self._unload_params('alpha', 'beta', 'gamma', 'delta')
alpha, beta, gamma, delta = params

x = self.data['frequency']
tx = self.data['recency']
n = self.data['n']

p1 = betaln(alpha + x, beta + n - x) - betaln(alpha, beta)
p2 = betaln(gamma, delta + n + m) - betaln(gamma, delta)
p3 = self._loglikelihood(params, x, tx, n)
p1 = betaln(alpha + frequency, beta + n_periods - frequency) - betaln(alpha, beta)
p2 = betaln(gamma, delta + n_periods + m_periods_in_future) - betaln(gamma, delta)
p3 = self._loglikelihood(params, frequency, recency, n_periods)

return exp(p1 + p2) / exp(p3)

def expected_number_of_transactions_in_first_n_periods(self, n):
"""
Return expected number of transactions in the first n periods.
Expected number of transactions occurring across first n transaction
opportunities.
@@ -268,7 +268,7 @@ def expected_number_of_transactions_in_first_n_periods(self, n):
params = self._unload_params('alpha', 'beta', 'gamma', 'delta')
alpha, beta, gamma, delta = params

x_counts = self.data.groupby('frequency')['n_custs'].sum()
x_counts = self.data.groupby('frequency')['weights'].sum()
x = asarray(x_counts.index)

p1 = binom(n, x) * exp(betaln(alpha + x, beta + n - x) -
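With this change, the `conditional_` methods take the customer history as explicit arguments instead of reading `self.data`. A usage sketch on the bundled donations data; it assumes a `load_donations` helper exposing the renamed CSV columns shown earlier:

```python
import numpy as np
from lifetimes import BetaGeoBetaBinomFitter
from lifetimes.datasets import load_donations

donations = load_donations()  # columns: frequency, recency, periods, weights

bbf = BetaGeoBetaBinomFitter()
bbf.fit(donations["frequency"], donations["recency"],
        donations["periods"], donations["weights"])

# History supplied exogenously: a donor with 2 gifts, the latest at
# opportunity 4, observed over 6 opportunities (arrays keep the
# vectorized likelihood happy).
freq, rec, n = np.array([2]), np.array([4]), np.array([6])

future = bbf.conditional_expected_number_of_purchases_up_to_time(
    5, frequency=freq, recency=rec, n_periods=n)
alive = bbf.conditional_probability_alive(
    0, frequency=freq, recency=rec, n_periods=n)
```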
28 changes: 22 additions & 6 deletions lifetimes/fitters/beta_geo_fitter.py
@@ -55,7 +55,7 @@ def __init__(self, penalizer_coef=0.0):
"""Initialization, set penalizer_coef."""
self.penalizer_coef = penalizer_coef

def fit(self, frequency, recency, T, iterative_fitting=1,
def fit(self, frequency, recency, T, weights=None, iterative_fitting=1,
initial_params=None, verbose=False, tol=1e-4, index=None,
fit_method='Nelder-Mead', maxiter=2000, **kwargs):
"""
@@ -71,6 +71,16 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
(denoted t_x in literature).
T: array_like
customers' age (time units since first purchase)
weights: None or array_like
Number of customers with given frequency/recency/T,
defaults to 1 if not specified. Fader and
Hardie condense the individual RFM matrix into all
observed combinations of frequency/recency/T. This
parameter represents the count of customers with a given
purchase pattern. Instead of calculating individual
loglikelihood, the loglikelihood is calculated for each
pattern and multiplied by the number of customers with
that pattern.
iterative_fitting: int, optional
perform iterative_fitting fits over random/warm-started initial params
initial_params: array_like, optional
@@ -97,18 +107,24 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
with additional properties like params_ and methods like predict
"""
frequency = asarray(frequency)
frequency = asarray(frequency).astype(int)
recency = asarray(recency)
T = asarray(T)
_check_inputs(frequency, recency, T)

if weights is None:
weights = np.ones_like(recency, dtype=np.int64)
else:
weights = asarray(weights)


self._scale = _scale_time(T)
scaled_recency = recency * self._scale
scaled_T = T * self._scale

params, self._negative_log_likelihood_ = _fit(
self._negative_log_likelihood,
[frequency, scaled_recency, scaled_T, self.penalizer_coef],
[frequency, scaled_recency, scaled_T, weights, self.penalizer_coef],
iterative_fitting,
initial_params,
4,
@@ -132,7 +148,7 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
return self

@staticmethod
def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
def _negative_log_likelihood(params, freq, rec, T, weights, penalizer_coef):
if npany(asarray(params) <= 0):
return np.inf

@@ -148,8 +164,8 @@ def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
(r + freq) * log(rec + alpha)
A_4[isnan(A_4) | isinf(A_4)] = 0
penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
return -(A_1 + A_2 + misc.logsumexp(
vconcat[A_3, A_4], axis=1, b=d)).mean() + penalizer_term
return - (weights * (A_1 + A_2 + misc.logsumexp(vconcat[A_3, A_4], axis=1, b=d))).mean() \
+ penalizer_term

def conditional_expected_number_of_purchases_up_to_time(self, t, frequency,
recency, T):
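Why multiplying the per-pattern log-likelihood by `weights` reproduces the un-collapsed fit: with `penalizer_coef == 0`, the collapsed objective is the per-customer objective scaled by a positive constant (customer count over pattern count), so both are minimized at the same parameters. A toy check with made-up numbers:

```python
import numpy as np

ll = np.array([-1.2, -0.7, -2.3])   # log-likelihood of three distinct patterns
weights = np.array([100, 40, 3])    # customers sharing each pattern

collapsed = -(weights * ll).mean()          # new weighted objective
exploded = -np.repeat(ll, weights).mean()   # one-row-per-customer objective

# collapsed = sum(w * ll) / n_patterns, exploded = sum(w * ll) / n_customers
assert np.isclose(collapsed * len(ll), exploded * weights.sum())
```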
17 changes: 14 additions & 3 deletions lifetimes/fitters/modified_beta_geo_fitter.py
@@ -37,7 +37,7 @@ def __init__(self, penalizer_coef=0.0):
"""Initialization, set penalizer_coef."""
super(self.__class__, self).__init__(penalizer_coef)

def fit(self, frequency, recency, T, iterative_fitting=1,
def fit(self, frequency, recency, T, weights=None, iterative_fitting=1,
initial_params=None, verbose=False, tol=1e-4, index=None,
fit_method='Nelder-Mead', maxiter=2000, **kwargs):
"""
@@ -53,6 +53,16 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
(denoted t_x in literature).
T: array_like
customers' age (time units since first purchase)
weights: None or array_like
Number of customers with given frequency/recency/T,
defaults to 1 if not specified. Fader and
Hardie condense the individual RFM matrix into all
observed combinations of frequency/recency/T. This
parameter represents the count of customers with a given
purchase pattern. Instead of calculating individual
loglikelihood, the loglikelihood is calculated for each
pattern and multiplied by the number of customers with
that pattern.
iterative_fitting: int, optional
perform iterative_fitting fits over random/warm-started initial params
initial_params: array_like, optional
@@ -83,6 +93,7 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
super(self.__class__, self).fit(frequency,
recency,
T,
weights,
iterative_fitting,
initial_params,
verbose,
@@ -99,7 +110,7 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
return self

@staticmethod
def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
def _negative_log_likelihood(params, freq, rec, T, weights, penalizer_coef):
if npany(asarray(params) <= 0):
return np.inf

@@ -113,7 +124,7 @@ def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
log(alpha + rec))

penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
return -(A_1 + A_2 + A_3 + logaddexp(A_4, 0)).mean() + penalizer_term
return -(weights * (A_1 + A_2 + A_3 + logaddexp(A_4, 0))).mean() + penalizer_term

def expected_number_of_purchases_up_to_time(self, t):
"""
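`ModifiedBetaGeoFitter.fit` simply forwards `weights` to the shared `BetaGeoFitter` machinery, so the collapsed-data call has the same shape (hypothetical toy rows, mirroring the CHANGELOG sketch, just to show the call):

```python
import pandas as pd
from lifetimes import ModifiedBetaGeoFitter

grouped = pd.DataFrame({
    "frequency": [0, 1, 2],
    "recency":   [0.0, 3.0, 5.0],
    "T":         [10.0, 10.0, 10.0],
    "weights":   [2, 3, 1],
})

mbgf = ModifiedBetaGeoFitter(penalizer_coef=0.0)
mbgf.fit(grouped["frequency"], grouped["recency"], grouped["T"],
         weights=grouped["weights"])
print(mbgf.params_)  # OrderedDict of r, alpha, a, b
```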