in analysis/analyze.py [0:0]
def rawAggregateExperimentResults(dat, by=['qid', 'query_idx', 'query', 'sketch_name']):
    """Aggregate per-run error measurements into one summary row per group.

    Args:
        dat: DataFrame holding one row per measurement. The columns read
            here are 'error', 'sketch_size', 'sketch_bytes' and 'data_seed',
            plus every column named in ``by``.
        by: Grouping column(s). The value is passed through ``cleanByClause``
            before use. The default list is never mutated by this function.

    Returns:
        DataFrame with one row per ``by`` group containing:
          * bias, std, rmse, q5, q25, q75, q95, count -- summaries of 'error'
          * size, size_bytes, observed_max_size -- summaries of the sketch
            size columns
          * var_expectation, expected_var, count_by_seed, var_bias -- the
            two-level (within / across 'data_seed') variance terms
          * bias_lower95, bias_upper95 -- bias -/+ 1.96 * sqrt(var_bias),
            i.e. a normal-approximation 95% interval
    """
    by = cleanByClause(by)

    # String aliases are used instead of np.mean / np.std / np.max: pandas
    # substitutes its own groupby methods for those NumPy callables anyway
    # and warns about it in recent releases.  Spelling the alias out keeps
    # the results stable across pandas versions -- in particular 'std' stays
    # the sample standard deviation (ddof=1).
    aggdat = dat.groupby(by, as_index=False).agg(
        bias=('error', 'mean'),
        std=('error', 'std'),
        # NOTE(review): np.linalg.norm is the root of the *sum* of squares
        # (no division by the group size), so this equals an RMSE only after
        # dividing by sqrt(count) -- confirm whether callers normalize it.
        rmse=('error', lambda x: np.linalg.norm(x)),
        q5=('error', lambda x: np.quantile(x, 0.05)),
        q25=('error', lambda x: np.quantile(x, 0.25)),
        q75=('error', lambda x: np.quantile(x, 0.75)),
        q95=('error', lambda x: np.quantile(x, 0.95)),
        size=('sketch_size', 'mean'),
        size_bytes=('sketch_bytes', 'mean'),
        observed_max_size=('sketch_size', 'max'),
        count=('error', 'count'),
    )

    # Two-level calculation for the variance of the bias estimate.
    # Level 1: statistics of the error within each (group, data_seed) cell.
    vardat_by_workload = dat.groupby(by + ['data_seed'], as_index=False).agg(
        var=('error', populationVar),
        bias=('error', 'mean'),
        count=('error', 'count'),
    )
    # The 1e-6 keeps the divisor non-zero when a cell has a single row.
    # NOTE(review): this divides by sqrt(count - 1) whereas the second level
    # below divides by (count_by_seed - 1) with no square root -- confirm the
    # asymmetry is intended.
    vardat_by_workload['var'] = (
        vardat_by_workload['var']
        / np.sqrt(vardat_by_workload['count'] - 1 + 1e-6)
    )

    # Level 2: combine the per-seed results across seeds within each group.
    vardat = vardat_by_workload.groupby(by, as_index=False).agg(
        var_expectation=('bias', populationVar),
        expected_var=('var', 'mean'),
        count_by_seed=('bias', 'count'),
    )
    # Same 1e-6 guard, here for groups that contain only one seed.
    vardat['var_bias'] = (
        (vardat['var_expectation'] + vardat['expected_var'])
        / (vardat['count_by_seed'] - 1 + 1e-6)
    )

    # Both frames carry the grouping columns as ordinary columns
    # (as_index=False); join on them explicitly.
    aggdat = aggdat.merge(vardat, on=by)

    half_width = 1.96 * np.sqrt(aggdat['var_bias'])
    aggdat['bias_lower95'] = aggdat['bias'] - half_width
    aggdat['bias_upper95'] = aggdat['bias'] + half_width
    return aggdat