This repository has been archived by the owner on Jun 28, 2024. It is now read-only.

Commit

Merge pull request #377 from utkarshgupta137/master
utils.py: Add option to include first transaction
CamDavidsonPilon authored Jul 6, 2020
2 parents a432ee1 + f8ab14d commit c666fe5
Showing 1 changed file with 44 additions and 27 deletions.
71 changes: 44 additions & 27 deletions lifetimes/utils.py
@@ -34,6 +34,7 @@ def calibration_and_holdout_data(
    freq_multiplier=1,
    datetime_format=None,
    monetary_value_col=None,
    include_first_transaction=False,
):
    """
    Create a summary of each customer over a calibration and holdout period.
@@ -68,6 +69,11 @@ def calibration_and_holdout_data(
    monetary_value_col: string, optional
        the column in transactions that denotes the monetary value of the transaction.
        Optional, only needed for customer lifetime value estimation models.
    include_first_transaction: bool, optional
        Default: False
        By default the first transaction is not included while calculating frequency and
        monetary_value. Can be set to True to include it.
        Should be False if you are going to use this data with any of the fitters in the lifetimes package.

    Returns
    -------
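
A quick, hedged usage sketch of the new option (the column names "id", "date", "spend", the CSV path, and the dates are hypothetical; the keyword arguments follow the docstring above):

import pandas as pd
from lifetimes.utils import calibration_and_holdout_data

transactions = pd.read_csv("transactions.csv", parse_dates=["date"])  # hypothetical input file

# Default behaviour excludes each customer's first transaction, which is what
# the lifetimes fitters (BG/NBD, Pareto/NBD, ...) expect.
summary_cal_holdout = calibration_and_holdout_data(
    transactions,
    customer_id_col="id",
    datetime_col="date",
    calibration_period_end="2019-12-31",
    observation_period_end="2020-06-30",
    monetary_value_col="spend",
    include_first_transaction=False,  # set True only for reporting, not for fitting models
)
print(summary_cal_holdout.head())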
@@ -103,6 +109,7 @@ def to_period(d):
        freq=freq,
        freq_multiplier=freq_multiplier,
        monetary_value_col=monetary_value_col,
        include_first_transaction=include_first_transaction,
    )
    calibration_summary_data.columns = [c + "_cal" for c in calibration_summary_data.columns]

@@ -229,6 +236,7 @@ def summary_data_from_transaction_data(
    observation_period_end=None,
    freq="D",
    freq_multiplier=1,
    include_first_transaction=False,
):
    """
    Return summary data from transactions.
@@ -262,6 +270,11 @@ def summary_data_from_transaction_data(
        Default: 1. Useful for getting exact recency & T. Example:
        With freq='D' and freq_multiplier=1, we get recency=591 and T=632
        With freq='h' and freq_multiplier=24, we get recency=590.125 and T=631.375
    include_first_transaction: bool, optional
        Default: False
        By default the first transaction is not included while calculating frequency and
        monetary_value. Can be set to True to include it.
        Should be False if you are going to use this data with any of the fitters in the lifetimes package.

    Returns
    -------
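
To illustrate the freq/freq_multiplier note above, a minimal sketch (the column names and input file are hypothetical; the call shape follows this docstring):

import pandas as pd
from lifetimes.utils import summary_data_from_transaction_data

transactions = pd.read_csv("transactions.csv", parse_dates=["date"])  # hypothetical input file

# Daily granularity: recency and T come back as whole days (e.g. 591 and 632).
rfm_days = summary_data_from_transaction_data(
    transactions, customer_id_col="id", datetime_col="date", freq="D", freq_multiplier=1
)

# Hourly granularity rescaled back to days: recency and T keep their fractional
# part (e.g. 590.125 and 631.375, as in the docstring example).
rfm_exact = summary_data_from_transaction_data(
    transactions, customer_id_col="id", datetime_col="date", freq="h", freq_multiplier=24
)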
@@ -288,20 +301,24 @@ def summary_data_from_transaction_data(
    # count all orders by customer.
    customers = repeated_transactions.groupby(customer_id_col, sort=False)[datetime_col].agg(["min", "max", "count"])

    if not include_first_transaction:
        # subtract 1 from count, as we ignore their first order.
        customers["frequency"] = customers["count"] - 1
    else:
        customers["frequency"] = customers["count"]

    customers["T"] = (observation_period_end - customers["min"]) / np.timedelta64(1, freq) / freq_multiplier
    customers["recency"] = (customers["max"] - customers["min"]) / np.timedelta64(1, freq) / freq_multiplier

    summary_columns = ["frequency", "recency", "T"]

    if monetary_value_col:
        if not include_first_transaction:
            # create an index of all the first purchases
            first_purchases = repeated_transactions[repeated_transactions["first"]].index
            # by setting the monetary_value cells of all the first purchases to NaN,
            # those values will be excluded from the mean value calculation
            repeated_transactions.loc[first_purchases, monetary_value_col] = np.nan
        customers["monetary_value"] = (
            repeated_transactions.groupby(customer_id_col)[monetary_value_col].mean().fillna(0)
        )
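
To make the include_first_transaction branches above concrete, a small hedged example (the data is invented, and the values in the comments follow the logic shown here rather than a verified run):

import pandas as pd
from lifetimes.utils import summary_data_from_transaction_data

transactions = pd.DataFrame({
    "id": [1, 1, 1, 2],
    "date": pd.to_datetime(["2020-01-01", "2020-01-10", "2020-01-20", "2020-01-05"]),
    "spend": [10.0, 20.0, 30.0, 40.0],
})

# Default: first purchases are dropped, so customer 1 should get frequency 2 and
# monetary_value mean(20, 30) = 25, while customer 2 gets frequency 0 and monetary_value 0.
rfm = summary_data_from_transaction_data(
    transactions, "id", "date", monetary_value_col="spend",
    observation_period_end="2020-01-31",
)

# With include_first_transaction=True, customer 1 should get frequency 3 and
# monetary_value mean(10, 20, 30) = 20, and customer 2 frequency 1 and monetary_value 40.
# Do not feed this variant to the lifetimes fitters.
rfm_all = summary_data_from_transaction_data(
    transactions, "id", "date", monetary_value_col="spend",
    observation_period_end="2020-01-31",
    include_first_transaction=True,
)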
@@ -311,10 +328,10 @@ def summary_data_from_transaction_data(


def calculate_alive_path(
    model,
    transactions,
    datetime_col,
    t,
    freq="D"
):
    """
@@ -381,9 +398,9 @@ def _scale_time(


def _check_inputs(
    frequency,
    recency=None,
    T=None,
    monetary_value=None
):
    """
@@ -430,13 +447,13 @@ def _check_inputs(


def _customer_lifetime_value(
    transaction_prediction_model,
    frequency,
    recency,
    T,
    monetary_value,
    time=12,
    discount_rate=0.01,
    freq="D"
):
    """
@@ -506,7 +523,7 @@ def expected_cumulative_transactions(
    This function follows the formulation on page 8 of [1]_.
    In more detail, we take only the customers who have made their first
    transaction before the specific date and then multiply them by the distribution of the
    ``expected_number_of_purchases_up_to_time()`` for their whole future. Doing that for
    all dates and then summing the distributions will give us the *complete cumulative
    purchases*.
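
A minimal sketch of the idea in this paragraph (simplified, not the function's actual implementation; it assumes first_purchase_periods holds each customer's first-transaction period index and that the fitted model exposes expected_number_of_purchases_up_to_time(), as referenced above):

import numpy as np

def sketch_expected_cumulative(model, first_purchase_periods, n_periods):
    # For every period t, sum the model's expected purchases over the age
    # t - birth for all customers whose first transaction is on or before t.
    first_purchase_periods = np.asarray(first_purchase_periods)
    expected_cum = np.zeros(n_periods)
    for t in range(n_periods):
        ages = t - first_purchase_periods[first_purchase_periods <= t]
        if ages.size:
            expected_cum[t] = model.expected_number_of_purchases_up_to_time(ages).sum()
    return expected_cum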
@@ -548,7 +565,7 @@ def expected_cumulative_transactions(
       A Note on Implementing the Pareto/NBD Model in MATLAB.
       http://brucehardie.com/notes/008/
    """

    start_date = pd.to_datetime(transactions[datetime_col], format=datetime_format).min()
    start_period = start_date.to_period(freq)
    observation_period_end = start_period + t
@@ -621,9 +638,9 @@ def expected_cumulative_transactions(


def _save_obj_without_attr(
    obj,
    attr_list,
    path,
    values_to_save=None
):
    """
@@ -657,4 +674,4 @@ def _save_obj_without_attr(
        dill.dump(obj, out_file)

    for attr, item in saved_attr_dict.items():
        setattr(obj, attr, item)
