Merge pull request #220 from CamDavidsonPilon/0.10.0
0.10.0
CamDavidsonPilon authored Nov 22, 2018
2 parents 3bace68 + 9ac00c8 commit 9d6b1c5
Showing 14 changed files with 291 additions and 150 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

### 0.10.0
- `BetaGeoBetaBinomFitter.fit` has replaced `n_custs` with the more appropriately named `weights` (to align with other statistical libraries). If unspecified, `weights` defaults to an array of ones.
- The `conditional_` methods on `BetaGeoBetaBinomFitter` have been updated to handle exogenously provided recency, frequency, and periods.
- Performance improvements in `BetaGeoBetaBinomFitter`: `fit` takes about 50% less time than before.
- `BetaGeoFitter`, `ParetoNBDFitter`, and `ModifiedBetaGeoFitter` all accept a new `weights` argument in their `fit` methods. This can be used to reduce the size of the data by collapsing subjects with the same (recency, frequency, T) into single weighted rows; see the sketch after this file's diff.

### 0.9.1
- Added a data generation method, `generate_new_data`, to `BetaGeoBetaBinomFitter`. @zscore
- Fixed a bug in `summary_data_from_transaction_data` that was casting values to `int` prematurely. This was solved by including a new param `freq_multiplier` to be used to scale the resulting durations. See #100 for the original issue. @aprotopopov
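A minimal sketch of the collapsed-data workflow the `weights` entries above describe. The frame and its values are hypothetical; `BetaGeoFitter` is the package's top-level export:

```python
import pandas as pd
from lifetimes import BetaGeoFitter

# Hypothetical per-customer RFM summary, one row per customer.
summary = pd.DataFrame({
    "frequency": [0, 0, 1, 1, 1, 2],
    "recency":   [0.0, 0.0, 3.0, 3.0, 3.0, 5.0],
    "T":         [10.0, 10.0, 10.0, 10.0, 10.0, 10.0],
})

# Collapse customers sharing a (frequency, recency, T) pattern into one
# weighted row -- the data reduction the changelog refers to.
grouped = (summary.groupby(["frequency", "recency", "T"])
                  .size().reset_index(name="weights"))

bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(grouped["frequency"], grouped["recency"], grouped["T"],
        weights=grouped["weights"])
```

Fitting the collapsed frame should recover the same parameters as fitting one row per customer, since the weighted log-likelihood only rescales the objective (a toy check of that scaling appears after the `beta_geo_fitter.py` diff below).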
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -77,9 +77,9 @@
# built documents.
#
# The short X.Y version.
version = '0.8.0.0'
version = '0.10.0.0'
# The full version, including alpha/beta/rc tags.
release = '0.8.0.0'
release = '0.10.0.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
2 changes: 1 addition & 1 deletion lifetimes/datasets/donations.csv
@@ -1,4 +1,4 @@
frequency,recency,n,n_custs
frequency,recency,periods,weights
0,0,6,3464
1,1,6,1091
1,2,6,277
118 changes: 59 additions & 59 deletions lifetimes/fitters/beta_geo_beta_binom_fitter.py
@@ -7,7 +7,7 @@
import pandas as pd
from numpy import log, exp, logaddexp, asarray, c_ as vconcat
from pandas import DataFrame
from scipy.special import gammaln, betaln, binom
from scipy.special import gammaln, betaln, binom, beta as betaf

from ..utils import _fit, _check_inputs
from . import BaseFitter
@@ -56,39 +56,37 @@ def _loglikelihood(params, x, tx, T):
"""Loglikelihood for optimizer."""
alpha, beta, gamma, delta = params

beta_ab = betaln(alpha, beta)
beta_gd = betaln(gamma, delta)

indiv_loglike = (betaln(alpha + x, beta + T - x) - beta_ab +
betaln(gamma, delta + T) - beta_gd)

betaln_ab = betaln(alpha, beta)
betaln_gd = betaln(gamma, delta)
recency_T = T - tx - 1

A = (betaln(alpha + x, beta + T - x) - betaln_ab +
betaln(gamma, delta + T) - betaln_gd)

J = np.arange(recency_T.max() + 1)

@np.vectorize
def _sum(x, tx, recency_T):
def _sum_(x, tx, recency_T):
if recency_T <= -1:
return -np.inf
return 10e-10
elif recency_T == 0:
return betaf(alpha + x, beta + tx - x) * betaf(gamma + 1, delta + tx)
else:
j = J[:recency_T + 1]
return (betaf(alpha + x, beta + tx - x + j) * betaf(gamma + 1, delta + tx + j)).sum()

j = J[:int(recency_T) + 1]
return log(
np.sum(exp(betaln(alpha + x, beta + tx - x + j) - beta_ab +
betaln(gamma + 1, delta + tx + j) - beta_gd)))
sum_ = np.vectorize(_sum_, [np.float])

s = _sum(x, tx, recency_T)
indiv_loglike = logaddexp(indiv_loglike, s)

return indiv_loglike
B = log(sum_(x, tx, recency_T)) - betaln_gd - betaln_ab
return logaddexp(A, B)
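The rewrite above replaces per-term `exp(betaln(...) - beta_ab - beta_gd)` round-trips with direct products of `scipy.special.beta` values, pulling the constant normalizers `betaln_ab` and `betaln_gd` outside the sum (part of the speedup noted in the changelog). A quick check of the identity behind that refactor, with arbitrary test values:

```python
import numpy as np
from scipy.special import betaln, beta as betaf

alpha, b, gamma, delta = 1.2, 0.75, 0.66, 2.78  # arbitrary positive params
x, tx = 3, 4
j = np.arange(3)

beta_ab = betaln(alpha, b)
beta_gd = betaln(gamma, delta)

# Old formulation: exponentiate normalized log-Beta terms inside the sum.
old = np.log(np.sum(np.exp(betaln(alpha + x, b + tx - x + j) - beta_ab +
                           betaln(gamma + 1, delta + tx + j) - beta_gd)))

# New formulation: multiply Beta values, subtract the normalizers once.
new = np.log((betaf(alpha + x, b + tx - x + j) *
              betaf(gamma + 1, delta + tx + j)).sum()) - beta_ab - beta_gd

assert np.isclose(old, new)
```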

@staticmethod
def _negative_log_likelihood(params, frequency, recency, n, n_custs,
def _negative_log_likelihood(params, frequency, recency, n_periods, weights,
penalizer_coef=0):
penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
return -np.mean(BetaGeoBetaBinomFitter._loglikelihood(
params, frequency, recency, n) * n_custs) + penalizer_term
params, frequency, recency, n_periods) * weights) + penalizer_term

def fit(self, frequency, recency, n, n_custs, verbose=False,
def fit(self, frequency, recency, n_periods, weights=None, verbose=False,
tol=1e-4, iterative_fitting=1, index=None,
fit_method='Nelder-Mead', maxiter=2000, initial_params=None,
**kwargs):
@@ -101,17 +99,18 @@ def fit(self, frequency, recency, n, n_custs, verbose=False,
Total periods with observed transactions
recency: array_like
Period of most recent transaction
n: array_like
Number of transaction opportunities.
n_custs: array_like
Number of customers with given frequency/recency/T. Fader
and Hardie condense the individual RFM matrix into all
n_periods: array_like
Number of transaction opportunities. Previously called `n`.
weights: None or array_like
Number of customers with given frequency/recency/T,
defaults to 1 if not specified. Fader and
Hardie condense the individual RFM matrix into all
observed combinations of frequency/recency/T. This
parameter represents the count of customers with a given
purchase pattern. Instead of calculating individual
loglikelihood, the loglikelihood is calculated for each
pattern and multiplied by the number of customers with
that pattern.
that pattern. Previously called `n_custs`.
verbose: boolean, optional
Set to true to print out convergence diagnostics.
tol: float, optional
@@ -137,15 +136,20 @@ def fit(self, frequency, recency, n, n_custs, verbose=False,
fitted and with parameters estimated
"""
frequency = asarray(frequency)
recency = asarray(recency)
n = asarray(n)
n_custs = asarray(n_custs)
_check_inputs(frequency, recency, n)
frequency = asarray(frequency).astype(int)
recency = asarray(recency).astype(int)
n_periods = asarray(n_periods).astype(int)

if weights is None:
weights = np.ones_like(recency, dtype=np.int64)
else:
weights = asarray(weights)

_check_inputs(frequency, recency, n_periods)

params, self._negative_log_likelihood_ = _fit(
self._negative_log_likelihood,
[frequency, recency, n, n_custs, self.penalizer_coef],
[frequency, recency, n_periods, weights, self.penalizer_coef],
iterative_fitting,
initial_params,
4,
@@ -156,44 +160,43 @@
**kwargs)
self.params_ = OrderedDict(zip(['alpha', 'beta', 'gamma', 'delta'],
params))
self.data = DataFrame(vconcat[frequency, recency, n, n_custs],
columns=['frequency', 'recency', 'n', 'n_custs'])
self.data = DataFrame(vconcat[frequency, recency, n_periods, weights],
columns=['frequency', 'recency', 'n_periods', 'weights'])
if index is not None:
self.data.index = index
# Making a large array replicating n by n_custs having n.
n_exploded = []
for n_, n_cust in zip(n, n_custs):
n_exploded += [n_] * n_cust

self.generate_new_data = lambda size=1: beta_geometric_beta_binom_model(
np.array(n_exploded), *self._unload_params('alpha', 'beta', 'gamma', 'delta'), size=size)
# Replicate each n_periods entry `weights` times to rebuild the per-customer array.
np.array(sum([[n_] * n_cust for (n_, n_cust) in zip(n_periods, weights)], [])),
*self._unload_params('alpha', 'beta', 'gamma', 'delta'), size=size)
return self

def conditional_expected_number_of_purchases_up_to_time(self, t):
def conditional_expected_number_of_purchases_up_to_time(self, m_periods_in_future, frequency, recency, n_periods):
"""
Conditional expected purchases in future time period.
The expected number of future transactions across the next t
The expected number of future transactions across the next m_periods_in_future
transaction opportunities by a customer with purchase history
(x, tx, n).
.. math:: E(X(n, n+n*)|alpha, beta, gamma, delta, frequency, recency, n)
.. math:: E(X(n_periods, n_periods+m_periods_in_future)|alpha, beta, gamma, delta, frequency, recency, n_periods)
See (13) in Fader & Hardie 2010.
Parameters
----------
m_periods_in_future: array_like
number of future transaction opportunities to predict over
Returns
-------
array_like
predicted transactions
"""
x = self.data['frequency']
tx = self.data['recency']
n = self.data['n']
x = frequency
tx = recency
n = n_periods

params = self._unload_params('alpha', 'beta', 'gamma', 'delta')
alpha, beta, gamma, delta = params
@@ -203,18 +206,18 @@ def conditional_expected_number_of_purchases_up_to_time(self, t):
p3 = delta / (gamma - 1) * exp(gammaln(gamma + delta) -
gammaln(1 + delta))
p4 = exp(gammaln(1 + delta + n) - gammaln(gamma + delta + n))
p5 = exp(gammaln(1 + delta + n + t) - gammaln(gamma + delta + n + t))
p5 = exp(gammaln(1 + delta + n + m_periods_in_future) - gammaln(gamma + delta + n + m_periods_in_future))

return p1 * p2 * p3 * (p4 - p5)

def conditional_probability_alive(self, m):
def conditional_probability_alive(self, m_periods_in_future, frequency, recency, n_periods):
"""
Conditional probability alive.
Conditional probability customer is alive at transaction opportunity
n + m.
n_periods + m_periods_in_future.
.. math:: P(alive at n + m|alpha, beta, gamma, delta, frequency, recency, n)
.. math:: P(alive at n_periods + m_periods_in_future|alpha, beta, gamma, delta, frequency, recency, n_periods)
See (A10) in Fader and Hardie 2010.
@@ -232,19 +235,16 @@ def conditional_probability_alive(self, m):
params = self._unload_params('alpha', 'beta', 'gamma', 'delta')
alpha, beta, gamma, delta = params

x = self.data['frequency']
tx = self.data['recency']
n = self.data['n']

p1 = betaln(alpha + x, beta + n - x) - betaln(alpha, beta)
p2 = betaln(gamma, delta + n + m) - betaln(gamma, delta)
p3 = self._loglikelihood(params, x, tx, n)
p1 = betaln(alpha + frequency, beta + n_periods - frequency) - betaln(alpha, beta)
p2 = betaln(gamma, delta + n_periods + m_periods_in_future) - betaln(gamma, delta)
p3 = self._loglikelihood(params, frequency, recency, n_periods)

return exp(p1 + p2) / exp(p3)

def expected_number_of_transactions_in_first_n_periods(self, n):
"""
Return expected number of transactions in the first n periods.
Expected number of transactions occurring across first n transaction
opportunities.
@@ -268,7 +268,7 @@ def expected_number_of_transactions_in_first_n_periods(self, n):
params = self._unload_params('alpha', 'beta', 'gamma', 'delta')
alpha, beta, gamma, delta = params

x_counts = self.data.groupby('frequency')['n_custs'].sum()
x_counts = self.data.groupby('frequency')['weights'].sum()
x = asarray(x_counts.index)

p1 = binom(n, x) * exp(betaln(alpha + x, beta + n - x) -
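With this change, the `conditional_` methods take the customer history as explicit arguments instead of reading `self.data`. A usage sketch on the bundled donations data; it assumes a `load_donations` helper exposing the renamed CSV columns shown earlier:

```python
import numpy as np
from lifetimes import BetaGeoBetaBinomFitter
from lifetimes.datasets import load_donations

donations = load_donations()  # columns: frequency, recency, periods, weights

bbf = BetaGeoBetaBinomFitter()
bbf.fit(donations["frequency"], donations["recency"],
        donations["periods"], donations["weights"])

# History supplied exogenously: a donor with 2 gifts, the latest at
# opportunity 4, observed over 6 opportunities (arrays keep the
# vectorized likelihood happy).
freq, rec, n = np.array([2]), np.array([4]), np.array([6])

future = bbf.conditional_expected_number_of_purchases_up_to_time(
    5, frequency=freq, recency=rec, n_periods=n)
alive = bbf.conditional_probability_alive(
    0, frequency=freq, recency=rec, n_periods=n)
```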
28 changes: 22 additions & 6 deletions lifetimes/fitters/beta_geo_fitter.py
@@ -55,7 +55,7 @@ def __init__(self, penalizer_coef=0.0):
"""Initialization, set penalizer_coef."""
self.penalizer_coef = penalizer_coef

def fit(self, frequency, recency, T, iterative_fitting=1,
def fit(self, frequency, recency, T, weights=None, iterative_fitting=1,
initial_params=None, verbose=False, tol=1e-4, index=None,
fit_method='Nelder-Mead', maxiter=2000, **kwargs):
"""
@@ -71,6 +71,16 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
(denoted t_x in literature).
T: array_like
customers' age (time units since first purchase)
weights: None or array_like
Number of customers with given frequency/recency/T,
defaults to 1 if not specified. Fader and
Hardie condense the individual RFM matrix into all
observed combinations of frequency/recency/T. This
parameter represents the count of customers with a given
purchase pattern. Instead of calculating individual
loglikelihood, the loglikelihood is calculated for each
pattern and multiplied by the number of customers with
that pattern.
iterative_fitting: int, optional
perform iterative_fitting fits over random/warm-started initial params
initial_params: array_like, optional
@@ -97,18 +107,24 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
with additional properties like params_ and methods like predict
"""
frequency = asarray(frequency)
frequency = asarray(frequency).astype(int)
recency = asarray(recency)
T = asarray(T)
_check_inputs(frequency, recency, T)

if weights is None:
weights = np.ones_like(recency, dtype=np.int64)
else:
weights = asarray(weights)


self._scale = _scale_time(T)
scaled_recency = recency * self._scale
scaled_T = T * self._scale

params, self._negative_log_likelihood_ = _fit(
self._negative_log_likelihood,
[frequency, scaled_recency, scaled_T, self.penalizer_coef],
[frequency, scaled_recency, scaled_T, weights, self.penalizer_coef],
iterative_fitting,
initial_params,
4,
@@ -132,7 +148,7 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
return self

@staticmethod
def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
def _negative_log_likelihood(params, freq, rec, T, weights, penalizer_coef):
if npany(asarray(params) <= 0):
return np.inf

@@ -148,8 +164,8 @@ def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
(r + freq) * log(rec + alpha)
A_4[isnan(A_4) | isinf(A_4)] = 0
penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
return -(A_1 + A_2 + misc.logsumexp(
vconcat[A_3, A_4], axis=1, b=d)).mean() + penalizer_term
return - (weights * (A_1 + A_2 + misc.logsumexp(vconcat[A_3, A_4], axis=1, b=d))).mean() \
+ penalizer_term

def conditional_expected_number_of_purchases_up_to_time(self, t, frequency,
recency, T):
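Why multiplying the per-pattern log-likelihood by `weights` reproduces the un-collapsed fit: with `penalizer_coef == 0`, the collapsed objective is the per-customer objective scaled by a positive constant (customer count over pattern count), so both are minimized at the same parameters. A toy check with made-up numbers:

```python
import numpy as np

ll = np.array([-1.2, -0.7, -2.3])   # log-likelihood of three distinct patterns
weights = np.array([100, 40, 3])    # customers sharing each pattern

collapsed = -(weights * ll).mean()          # new weighted objective
exploded = -np.repeat(ll, weights).mean()   # one-row-per-customer objective

# collapsed = sum(w * ll) / n_patterns, exploded = sum(w * ll) / n_customers
assert np.isclose(collapsed * len(ll), exploded * weights.sum())
```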
17 changes: 14 additions & 3 deletions lifetimes/fitters/modified_beta_geo_fitter.py
@@ -37,7 +37,7 @@ def __init__(self, penalizer_coef=0.0):
"""Initialization, set penalizer_coef."""
super(self.__class__, self).__init__(penalizer_coef)

def fit(self, frequency, recency, T, iterative_fitting=1,
def fit(self, frequency, recency, T, weights=None, iterative_fitting=1,
initial_params=None, verbose=False, tol=1e-4, index=None,
fit_method='Nelder-Mead', maxiter=2000, **kwargs):
"""
@@ -53,6 +53,16 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
(denoted t_x in literature).
T: array_like
customers' age (time units since first purchase)
weights: None or array_like
Number of customers with given frequency/recency/T,
defaults to 1 if not specified. Fader and
Hardie condense the individual RFM matrix into all
observed combinations of frequency/recency/T. This
parameter represents the count of customers with a given
purchase pattern. Instead of calculating individual
loglikelihood, the loglikelihood is calculated for each
pattern and multiplied by the number of customers with
that pattern.
iterative_fitting: int, optional
perform iterative_fitting fits over random/warm-started initial params
initial_params: array_like, optional
@@ -83,6 +93,7 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
super(self.__class__, self).fit(frequency,
recency,
T,
weights,
iterative_fitting,
initial_params,
verbose,
@@ -99,7 +110,7 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
return self

@staticmethod
def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
def _negative_log_likelihood(params, freq, rec, T, weights, penalizer_coef):
if npany(asarray(params) <= 0):
return np.inf

@@ -113,7 +124,7 @@ def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
log(alpha + rec))

penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
return -(A_1 + A_2 + A_3 + logaddexp(A_4, 0)).mean() + penalizer_term
return -(weights * (A_1 + A_2 + A_3 + logaddexp(A_4, 0))).mean() + penalizer_term

def expected_number_of_purchases_up_to_time(self, t):
"""
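`ModifiedBetaGeoFitter.fit` simply forwards `weights` to the shared `BetaGeoFitter` machinery, so the collapsed-data call has the same shape (hypothetical toy rows, mirroring the CHANGELOG sketch, just to show the call):

```python
import pandas as pd
from lifetimes import ModifiedBetaGeoFitter

grouped = pd.DataFrame({
    "frequency": [0, 1, 2],
    "recency":   [0.0, 3.0, 5.0],
    "T":         [10.0, 10.0, 10.0],
    "weights":   [2, 3, 1],
})

mbgf = ModifiedBetaGeoFitter(penalizer_coef=0.0)
mbgf.fit(grouped["frequency"], grouped["recency"], grouped["T"],
         weights=grouped["weights"])
print(mbgf.params_)  # OrderedDict of r, alpha, a, b
```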