in src/smallmatter/ds.py [0:0]
def save_reports(self, output_dir) -> None:
    """Write the statistics and CDF reports of this `CdfResult` to disk.

    Three kinds of files are produced under ``output_dir``:

    * one ``stats-by-<...>.xlsx`` holding ``self.stats_df``;
    * one ``cdf-<...>.xlsx`` per entry of ``self.cdf``;
    * one interactive ``cdf-<...>.html`` plot per entry of ``self.cdf``.

    Args:
        output_dir: Destination directory (anything accepted by ``Path``).
            It is created, including missing parents, when absent.
    """
    from .bkh import BokehPlotter

    report_dir = Path(output_dir)
    report_dir.mkdir(parents=True, exist_ok=True)

    # Naming cheat-sheet (these are easy to mix up):
    #   - The x-axis reads as "x_unit / y_basic_unit", e.g. "date_cnt / item".
    #       * x_unit is taken from each cdf dataframe (its index name).
    #       * y_basic_unit is taken from stats_df (its index names).
    #   - The y-axis is the probability; the underlying raw count is y_unit,
    #     e.g. "item_cnt", which belongs to the cdf dataframe.
    #
    # Full example: x_unit="date_cnt", y_basic_unit="item", y_unit="item_cnt".
    #
    # When unsure, consult the sample dataframes in the docstring of cdf().
    basic_unit = "#".join(self.stats_df.index.names)
    count_unit = self.cdf_count_name

    # Intermediate statistics: header row and index column stay frozen.
    self.stats_df.to_excel(
        report_dir / f"stats-by-{basic_unit}.xlsx",
        index=True,
        freeze_panes=(1, 1),
    )

    # One table plus one interactive plot per CDF.
    for col, frame in self.cdf.items():
        axis_unit = frame.index.name
        axis_label = f"{axis_unit} / {basic_unit}"

        # Tabular form; only the header row is frozen here.
        frame.to_excel(
            report_dir / f"cdf-{axis_unit}-per-{basic_unit}.xlsx",
            index=True,
            freeze_panes=(1, 0),
        )

        # The plotter is handed generic "x" / "y" column names.
        plot_frame = frame.reset_index().rename({col: "x", "cdf": "y"}, axis=1)
        tooltips = {
            "cumprob": "@y",
            f"{count_unit}": f"@{{{count_unit}}}",
            f"{axis_unit}": "@x",
            "Here to left (aka cum-sum)": "@cum_sum",
            "After here (aka right-hand side)": "@rhs",
        }
        plotter = BokehPlotter(
            plot_frame,
            plot_width=960,
            plot_height=480,
            title=f"CDF of {axis_label}",
            x_label=axis_label,
            y_label="Cumulative Probability",
            hover_tooltips=tooltips,
        )
        plotter.gen_plot()

        # NOTE(review): the .html name ends with the count name (y_unit) while
        # the .xlsx name above ends with y_basic_unit -- confirm whether this
        # asymmetry is intentional before relying on either file name.
        plotter.save_html(report_dir / f"cdf-{axis_unit}-per-{count_unit}.html")