in marketing-analytics/predicting/future-customer-value-segments/common/__init__.py [0:0]
def frequency_model_validation(model_type, cbs, cal_start_date, cal_end_date,
                               time_divisor, time_label, hold_end_date,
                               repeat_tx, output_folder, num_customers_cohort,
                               perc_customers_cohort, num_txns_val,
                               perc_txns_val, penalizer_coef):
  """Validates frequency model.

  Fits the requested frequency model on the input CBS data, plots actual vs
  predicted repeat transactions over time, and calculates the Mean Absolute
  Percent Error (MAPE) over the holdout period.

  Args:
    model_type: String defining the type of model to be used.
    cbs: Customer-by-sufficient-statistic (CBS) DataFrame.
    cal_start_date: Calibration start date.
    cal_end_date: Calibration end date.
    time_divisor: Number of days depending on the selected granularity.
    time_label: Selected time granularity label string.
    hold_end_date: Holdout end date.
    repeat_tx: Sorted cumulative sum of transactions for each time unit.
    output_folder: Folder where the text file containing the result of the
      validation will be saved.
    num_customers_cohort: Number of customers in the cohort period of time.
    perc_customers_cohort: Percentage of customers in the cohort period of
      time.
    num_txns_val: Number of transactions in the cohort period of time.
    perc_txns_val: Percentage of transactions in the cohort period of time.
    penalizer_coef: The coefficient applied to an l2 norm on the parameters.

  Returns:
    A Dict containing the params for the model validation, including the
    validation MAPE ('N/A' for BG/BB), which callers use for a threshold
    check.

  Raises:
    ValueError: If model_type is not one of the supported model types.
  """
  # Dispatch table: model type constant -> (fitting function, display label).
  model_fitters = {
      _MODEL_TYPE_BGNBD: (fit_bgnbd_model, 'BG/NBD'),
      _MODEL_TYPE_MBGNBD: (fit_mbgnbd_model, 'MBG/NBD'),
      _MODEL_TYPE_BGBB: (fit_bgbb_model, 'BG/BB'),
      _MODEL_TYPE_PNBD: (fit_pnbd_model, 'Pareto/NBD'),
  }
  if model_type not in model_fitters:
    raise ValueError('Model type %s is not valid' % model_type)
  fit_model, model_label = model_fitters[model_type]
  frequency_model = fit_model(cbs, penalizer_coef)
  model_params = {'frequency_model': model_label}
  # Transactions by time unit predictions.
  intervals = calc_full_fit_period(cal_start_date, hold_end_date,
                                   time_divisor)
  predicted = predict_txs(model_type, frequency_model,
                          cbs['total_time_observed'].values, intervals)
  # Join predicted to actual transactions per time unit.
  txs = repeat_tx.merge(predicted, how='inner', on='time_unit_number')
  # Remove the last row as it often has partial data from an incomplete week.
  # iloc[:-1] is positional and safe even if the merge produced no rows
  # (the original index-label drop would raise IndexError on empty data).
  txs = txs.iloc[:-1]
  # Location of the median line for validation plots (end of calibration).
  median_line = calc_calibration_period(cal_start_date, cal_end_date,
                                        time_divisor)
  # Plot creation.
  plot_repeat_transaction_over_time(txs, median_line, output_folder,
                                    time_label)
  plot_cumulative_repeat_transaction_over_time(txs, median_line,
                                               output_folder, time_label)
  # Output customers in cohort and txns observed.
  model_params['num_customers_cohort'] = num_customers_cohort
  model_params['perc_customers_cohort'] = perc_customers_cohort
  model_params['num_transactions_validation'] = num_txns_val
  model_params['perc_transactions_validation'] = perc_txns_val
  # Calculate MAPE (Mean Absolute Percent Error) over the holdout period.
  # NB: Do not use this as a performance metric, as it is computed on
  # cumulative values and will most of the time underestimate the error.
  # We are using it only for validation of fit purposes.
  if model_type == _MODEL_TYPE_BGBB:
    # MAPE is not reported for BG/BB, so skip the (discarded) computation.
    model_params['validation_mape'] = 'N/A'
  else:
    holdout = txs.iloc[median_line:, :]
    error_by_time = (
        holdout['repeat_transactions_cumulative'] -
        holdout['predicted_cumulative_transactions']
    ) / holdout['repeat_transactions_cumulative'] * 100
    model_params['validation_mape'] = str(round(error_by_time.abs().mean(), 2))
  # Return the dict that includes the validation MAPE, which will be used
  # for a threshold check by the caller.
  return model_params