in src/mozanalysis/bayesian_stats/survival_func.py [0:0]
def get_thresholds(col, max_num_thresholds=101):
    """Return a set of interesting thresholds for the dataset ``col``.

    Assumes that the values are non-negative, with zero as a special case:
    zero is always the first candidate threshold, and the remaining
    candidates are quantiles of the strictly positive values only.

    Args:
        col: a Series of individuals' data for a metric
        max_num_thresholds (int): Return at most this many threshold values.

    Returns:
        A sorted list of thresholds. By default these are zero followed by
        the de-duped percentiles of the nonzero data, with the largest
        candidate removed. If ``col`` has no strictly positive values the
        result is ``[0.0]`` (or ``[]`` when ``max_num_thresholds`` is 0).

    Raises:
        ValueError: if ``col`` contains null values or negative values.
    """
    if col.isnull().any():
        raise ValueError("'col' contains null values")

    if col.min() < 0:
        raise ValueError("This function assumes non-negative data")

    zero = np.float64(0)
    quantile_points = np.linspace(0, 1, max_num_thresholds)

    # When taking quantiles, treat "0" as a special case so that we
    # still have resolution if 99% of users are 0.
    positive = col[col > 0]

    if positive.empty:
        # Quantiles of an empty Series are NaN. Handle this case explicitly
        # instead of relying on how NaN happens to be ordered by `sorted`.
        return [zero] if quantile_points.size else []

    nonzero_quantiles = positive.quantile(
        quantile_points,
        # 'nearest' is not what we want, but is the least-bad option.
        # Can't use the default 'linear' because we want to call 'unique()'
        # to avoid duplicating work.
        # Can't use 'lower', 'higher', or 'midpoint' due to rounding issues
        # that lead to dumb choices. That leaves us with 'nearest'
        interpolation="nearest",
    )

    candidates = sorted([zero, *nonzero_quantiles.unique()])

    # The thresholds get used as `>` not `>=`, so exclude the max value
    return candidates[:-1]