def get_thresholds()

in src/mozanalysis/bayesian_stats/survival_func.py [0:0]


def get_thresholds(col, max_num_thresholds=101):
    """Pick threshold values that span the distribution of ``col``.

    The data are expected to be non-negative. Zero is handled separately
    from the positive values: it is always included as a candidate, and
    the quantiles are computed over the strictly positive values only.

    Args:
        col: a Series of individuals' data for a metric
        max_num_thresholds (int): Number of evenly spaced quantile levels
            (from 0 to 1 inclusive) to evaluate. Return at most this many
            threshold values.

    Returns:
        A sorted list made of zero plus the de-duplicated quantiles of
        the positive data, with the largest candidate removed.

    Raises:
        ValueError: If ``col`` contains null values or negative values.
    """
    has_nulls = col.isnull().any()
    if has_nulls:
        raise ValueError("'col' contains null values")

    smallest = col.min()
    if smallest < 0:
        raise ValueError("This function assumes non-negative data")

    # Zero is kept out of the quantile computation so that the thresholds
    # still resolve the positive tail when nearly every value is zero.
    positive_values = col[col > 0]
    quantile_levels = np.linspace(0, 1, max_num_thresholds)

    # Why 'nearest': the default 'linear' interpolation doesn't suit the
    # de-duplication step below (which exists to avoid repeated work), and
    # 'lower', 'higher' and 'midpoint' suffer from rounding issues that
    # pick poor values. 'nearest' is imperfect but the least-bad option.
    positive_quantiles = positive_values.quantile(
        quantile_levels, interpolation="nearest"
    )

    candidates = [np.float64(0)]
    candidates.extend(positive_quantiles.unique())
    candidates.sort()

    # Thresholds are applied with `>` rather than `>=`, so the maximum
    # value is dropped.
    candidates.pop()
    return candidates