in src/mozanalysis/bayesian_stats/survival_func.py [0:0]
def compare_branches(df, col_label, ref_branch_label="control", thresholds=None):
    """Return the survival functions and relative uplifts thereupon.

    This function generates data for a metric's survival function
    (1 - cumulative distribution function) for each branch, and
    calculates the relative uplift compared to the reference branch
    identified by ``ref_branch_label``.

    It converts the non-negative, real-valued per-user metric data in
    ``df[col_label]`` into ``n=len(thresholds)`` different binary
    metrics, and analyzes these ``n`` metrics with the Bayesian binary
    methods.

    The precise values of the thresholds usually don't matter unless
    certain thresholds have been standardized outside the context of
    this experiment.

    The results are related to those obtained by bootstrapping a range
    of quantiles over the data:

    * In the survival plot, we set a value for the metric and calculate
      the fraction of the data that was above this value, with
      uncertainty on the fraction.
    * When bootstrapping quantiles, we set a quantile (a fraction of
      the data) and find the value such that the given fraction of
      data is greater than this value, with uncertainty on the value.

    Reiterating: if we plot the survival function with metric values
    on the x axis and "fractions" on the y axis, then this function
    first chooses some sensible values for x then runs statistics to
    compute values for y, with uncertainty. If we were bootstrapping
    quantiles, then we would choose some sensible values for y then
    run statistics to compute values for x, with uncertainty.

    Args:
        df: a pandas DataFrame of queried experiment data in the
            standard format. Target metric should be non-negative.
        col_label (str): Label for the df column containing the metric
            to be analyzed.
        ref_branch_label (str, optional): String in ``df['branch']``
            that identifies the branch with respect to which we
            want to calculate uplifts - usually the control branch.
        thresholds (list/ndarray, optional): Thresholds that define the
            metric's quantization. If ``None`` or empty, thresholds are
            obtained by calling ``get_thresholds(df[col_label])``.

    Returns a dictionary:

        * 'individual': dictionary mapping branch names to a pandas
          DataFrame containing values from the survival function.
          The DataFrames' indexes are the list of thresholds; the
          columns are summary statistics on the survival function.
        * 'comparative': dictionary mapping branch names (every branch
          except the reference branch) to a pandas DataFrame of summary
          statistics for the possible uplifts of the conversion rate
          relative to the reference branch - see docs for
          :meth:`mozanalysis.stats.summarize_samples.summarize_joint_samples_batch`.
    """
    branch_list = df.branch.unique()

    # ``not thresholds`` raises ValueError for multi-element ndarrays
    # ("truth value of an array is ambiguous"), so test for None/empty
    # explicitly; this keeps the fallback for ``None`` and ``[]``.
    if thresholds is None or len(thresholds) == 0:
        thresholds = get_thresholds(df[col_label])

    # One binary-metric analysis per threshold, keyed by threshold value.
    data = {t: _one_thresh(t, df, col_label, ref_branch_label) for t in thresholds}

    return {
        "individual": {
            # Transpose so that thresholds form the index and the
            # summary statistics form the columns.
            b: pd.DataFrame(
                {t: d["individual"][b] for t, d in data.items()}, columns=thresholds
            ).T
            for b in branch_list
        },
        "comparative": {
            b: pd.DataFrame({t: d["comparative"][b] for t, d in data.items()}).T
            # The reference branch has no uplift relative to itself.
            for b in set(branch_list) - {ref_branch_label}
        },
    }