def frequency_model_validation()

in marketing-analytics/predicting/future-customer-value-segments/common/__init__.py [0:0]
45 lines of code
11 McCabe index (conditional complexity)

def frequency_model_validation(model_type, cbs, cal_start_date, cal_end_date,
                               time_divisor, time_label, hold_end_date,
                               repeat_tx, output_folder, num_customers_cohort,
                               perc_customers_cohort, num_txns_val,
                               perc_txns_val, penalizer_coef):
    """Fits the selected frequency model and validates it against actuals.

    The model chosen by `model_type` is fitted on `cbs`, its per-time-unit
    transaction predictions are joined to the observed ones, two validation
    plots are produced, and the Mean Absolute Percent Error (MAPE) of the
    cumulative series is computed over the rows from the calibration boundary
    onwards.

    Args:
        model_type: Identifier of the frequency model to fit; compared against
            the module's `_MODEL_TYPE_*` constants.
        cbs: Customer-by-sufficient-statistic (CBS) DataFrame. Its
            `total_time_observed` column is passed to the prediction helper.
        cal_start_date: Calibration start date.
        cal_end_date: Calibration end date.
        time_divisor: Number of days depending on the selected granularity.
        time_label: Selected time granularity label string (forwarded to the
            plotting helpers).
        hold_end_date: Holdout end date.
        repeat_tx: Actual transactions per time unit; merged with the
            predictions on `time_unit_number`.
        output_folder: Folder forwarded to the plotting helpers for their
            output.
        num_customers_cohort: Number of customers in the cohort period of time.
        perc_customers_cohort: Percentage of customers in the cohort period of
            time.
        num_txns_val: Number of transactions in the cohort period of time.
        perc_txns_val: Percentage of transactions in the cohort period of time.
        penalizer_coef: Penalizer coefficient handed to the model fitting
            helper (an l2-norm coefficient on the parameters).

    Returns:
        A dict with the keys `frequency_model`, `num_customers_cohort`,
        `perc_customers_cohort`, `num_transactions_validation`,
        `perc_transactions_validation` and `validation_mape`. The MAPE is
        stored as a string rounded to two decimals, or 'N/A' for the BG/BB
        model type.

    Raises:
        ValueError: If `model_type` matches none of the supported model types.
    """
    # (model type constant, fitting helper, display label), tried in order.
    supported_models = (
        (_MODEL_TYPE_BGNBD, fit_bgnbd_model, 'BG/NBD'),
        (_MODEL_TYPE_MBGNBD, fit_mbgnbd_model, 'MBG/NBD'),
        (_MODEL_TYPE_BGBB, fit_bgbb_model, 'BG/BB'),
        (_MODEL_TYPE_PNBD, fit_pnbd_model, 'Pareto/NBD'),
    )
    for known_type, fit_model, model_label in supported_models:
        if model_type == known_type:
            break
    else:
        raise ValueError('Model type %s is not valid' % model_type)

    frequency_model = fit_model(cbs, penalizer_coef)
    model_params = {'frequency_model': model_label}

    # Predicted transactions for every time unit of the full fit period.
    fit_intervals = calc_full_fit_period(cal_start_date, hold_end_date,
                                         time_divisor)
    observed_time = cbs['total_time_observed'].values
    predicted = predict_txs(model_type, frequency_model, observed_time,
                            fit_intervals)

    # Line up actual and predicted transactions per time unit.
    txs = repeat_tx.merge(predicted, how='inner', on='time_unit_number')

    # The final row often holds partial data from an incomplete week, so it is
    # excluded from plots and error computation.
    txs = txs.drop(txs.index[-1])

    # Row position separating calibration from holdout; also marks the
    # vertical line on the validation plots.
    median_line = calc_calibration_period(cal_start_date, cal_end_date,
                                          time_divisor)

    plot_repeat_transaction_over_time(txs, median_line, output_folder,
                                      time_label)
    plot_cumulative_repeat_transaction_over_time(txs, median_line,
                                                 output_folder, time_label)

    # Cohort size and transaction counts are passed through unchanged.
    model_params.update({
        'num_customers_cohort': num_customers_cohort,
        'perc_customers_cohort': perc_customers_cohort,
        'num_transactions_validation': num_txns_val,
        'perc_transactions_validation': perc_txns_val,
    })

    # MAPE (Mean Absolute Percent Error) over the rows from median_line on.
    # NB: This is computed on cumulative values and will most of the time
    #     underestimate the error, so it must not be used as a performance
    #     metric; it only serves as a validation-of-fit check.
    after_calibration = txs.iloc[median_line:, :]
    actual_cumulative = after_calibration['repeat_transactions_cumulative']
    predicted_cumulative = after_calibration[
        'predicted_cumulative_transactions']
    pct_error = (
        (actual_cumulative - predicted_cumulative) / actual_cumulative * 100)
    mape = pct_error.abs().mean()

    if model_type == _MODEL_TYPE_BGBB:
        model_params['validation_mape'] = 'N/A'
    else:
        model_params['validation_mape'] = str(round(mape, 2))

    # The returned dict carries the validation MAPE, which callers use for a
    # threshold check.
    return model_params