# Function: calculate_model_fit_validation()
#
# Defined in marketing-analytics/predicting/future-customer-value-segments/common/__init__.py [0:0]


def calculate_model_fit_validation(_, options, dates, calcbs, repeat_tx,
                                   num_customers, num_txns):
    """Runs the model-fit validation step and reports whether the
    transaction frequency error is acceptable.

    The calibration CBS rows and the repeat-transaction rows are loaded into
    DataFrames and handed to ``frequency_model_validation``; afterwards
    ``gamma_gamma_validation`` is run on the same CBS data. Unless the
    frequency model type is ``_MODEL_TYPE_BGBB``, the reported
    ``validation_mape`` is then compared against the configured transaction
    frequency threshold.

    Args:
        _: Unused. It only exists as the value that triggers this call,
            because the function runs inside a FlatMap operator.
        options: Pipeline options.
        dates: Dictionary containing the important border dates used in the
            calculation.
        calcbs: Customer-by-sufficient-statistic (CBS) matrix for the
            calibration period.
        repeat_tx: Sorted cumulative sum of transactions for each time unit.
        num_customers: Dictionary containing statistics regarding the number
            of customers.
        num_txns: Dictionary containing statistics regarding the number of
            transactions.

    Returns:
        A one-element list holding a ``(validation_params, error)`` tuple
        (wrapped in a list because the function is called from a FlatMap
        operator).
        ``validation_params`` is a dictionary with the border dates passed
        through ``date_to_str``, the capitalized model time granularity and,
        under ``'model'``, the value returned by
        ``frequency_model_validation``.
        ``error`` is ``None`` when the check passes or is skipped; otherwise
        it is a message describing the exceeded threshold, and
        ``'invalid_mape'`` is set to ``True`` on the model parameters.
    """
    cbs_columns = [
        'customer_id', 'number_of_transactions', 'average_order_value',
        'frequency', 'recency', 'total_time_observed',
    ]
    tx_columns = [
        'time_unit_number', 'repeat_transactions',
        'repeat_transactions_cumulative',
    ]
    cbs_frame = pd.DataFrame(calcbs, columns=cbs_columns)
    tx_frame = pd.DataFrame(repeat_tx, columns=tx_columns)

    granularity_params = TimeGranularityParams(
        options[_OPTION_MODEL_TIME_GRANULARITY])
    days_per_time_unit = granularity_params.get_days()

    # Keyword arguments are collected in the same order the values were
    # originally looked up, then forwarded in a single call.
    frequency_kwargs = {
        'model_type': options[_OPTION_FREQUENCY_MODEL_TYPE],
        'cbs': cbs_frame,
        'cal_start_date': dates[_OPTION_CALIBRATION_START_DATE],
        'cal_end_date': dates[_OPTION_CALIBRATION_END_DATE],
        'time_divisor': days_per_time_unit,
        'time_label': options[_OPTION_MODEL_TIME_GRANULARITY],
        'hold_end_date': dates[_OPTION_HOLDOUT_END_DATE],
        'repeat_tx': tx_frame,
        'output_folder': options[_OPTION_OUTPUT_FOLDER],
        'num_customers_cohort': num_customers['num_customers_cohort'],
        'perc_customers_cohort': num_customers['perc_customers_cohort'],
        'num_txns_val': num_txns['num_txns_val'],
        'perc_txns_val': num_txns['perc_txns_val'],
        'penalizer_coef': options[_OPTION_PENALIZER_COEF],
    }
    model_params = frequency_model_validation(**frequency_kwargs)

    # Validate the gamma-gamma (spend) model.
    gamma_gamma_validation(cbs_frame, options[_OPTION_PENALIZER_COEF])

    # Output key -> key into `dates`; order defines the output dict order.
    date_fields = (
        ('calibration_start_date', _OPTION_CALIBRATION_START_DATE),
        ('calibration_end_date', _OPTION_CALIBRATION_END_DATE),
        ('cohort_start_date', _OPTION_COHORT_START_DATE),
        ('cohort_end_date', _OPTION_COHORT_END_DATE),
        ('holdout_end_date', _OPTION_HOLDOUT_END_DATE),
    )
    validation_params = {}
    for output_key, date_key in date_fields:
        validation_params[output_key] = date_to_str(dates[date_key])
    validation_params['model_time_granularity'] = (
        options[_OPTION_MODEL_TIME_GRANULARITY].capitalize())
    validation_params['model'] = model_params

    # The MAPE threshold only applies to non-BG/BB frequency models. When it
    # is exceeded the model parameters are flagged and an error message is
    # returned so the caller can stop all further calculations.
    error = None
    if options[_OPTION_FREQUENCY_MODEL_TYPE] != _MODEL_TYPE_BGBB:
        mape_exceeded = float(model_params['validation_mape']) > float(
            options[_OPTION_TRANSACTION_FREQUENCY_THRESHOLD])
        if mape_exceeded:
            model_params['invalid_mape'] = True
            error = (
                'Validation Mean Absolute Percent Error (MAPE) '
                f"[{model_params['validation_mape']}%]"
                ' exceeded the allowable threshold of '
                f'{options[_OPTION_TRANSACTION_FREQUENCY_THRESHOLD]}'
            )

    return [(validation_params, error)]