in jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py [0:0]
def _summarize_legacy(self) -> pd.DataFrame:
    """
    Convert `self.summary_df` to the legacy wide format used in
    `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1`.

    `self.summary_df` is read in "long" format (one row per measure) and
    must provide the columns `forecast_end_date`, `submission_date`,
    `metric_alias`, `aggregation_period`, `forecast_parameters`,
    `forecast_predicted_at` (datetime-like), `measure` and `value`.
    `self.summary_df` itself is not modified; all work happens on a deep copy.

    Returns:
        A DataFrame with one row per combination of the legacy index columns
        (`asofdate`, `date`, `target`, `unit`, `forecast_parameters`,
        `forecast_date`, all cast to `str`), a `value` column, and one
        column per remaining measure (e.g. `yhat_p5` ... `yhat_p95`).
    """
    # TODO: This method should be removed once the forecasting data model is updated:
    # https://mozilla-hub.atlassian.net/browse/DS-2676
    legacy_column_names = {
        "forecast_end_date": "asofdate",
        "submission_date": "date",
        "metric_alias": "target",
        "aggregation_period": "unit",
    }
    legacy_values = {
        "measure": {
            "observed": "value",
            "p05": "yhat_p5",
            "p10": "yhat_p10",
            "p20": "yhat_p20",
            "p30": "yhat_p30",
            "p40": "yhat_p40",
            "p50": "yhat_p50",
            "p60": "yhat_p60",
            "p70": "yhat_p70",
            "p80": "yhat_p80",
            "p90": "yhat_p90",
            "p95": "yhat_p95",
        },
        "target": {
            "desktop_dau": "desktop",
            "mobile_dau": "mobile",
        },
    }
    # These columns identify one legacy row. They are also the non-numeric
    # columns that the legacy bq schema represents as strings.
    index_columns = [
        "asofdate",
        "date",
        "target",
        "unit",
        "forecast_parameters",
        "forecast_date",
    ]

    # rename columns and values to their legacy names
    df = self.summary_df.copy(deep=True).rename(columns=legacy_column_names)
    df["forecast_date"] = df["forecast_predicted_at"].dt.date
    df = df.replace(legacy_values)

    # pivot the df from "long" to "wide" format
    df = (
        df[index_columns + ["measure", "value"]]
        .pivot(
            index=index_columns,
            columns="measure",
            values="value",
        )
        .reset_index()
    )

    # pivot sets the "name" attribute of the columns for some reason. It's
    # None by default, so we just reset that here.
    df.columns.name = None

    # The pivot only creates a column for measures that are present in the
    # data. If the summary holds no observed rows (or no forecasted mean),
    # add the missing column as all-NULL so the merge below still works.
    for measure in ("mean", "value"):
        if measure not in df.columns:
            df[measure] = float("nan")

    # When there's an overlap in the observed and forecasted period -- for
    # example, when a monthly forecast is generated mid-month -- the legacy
    # format only records the forecasted value, not the observed value. To
    # account for this, we'll just find the max of the "mean" (forecasted) and
    # "value" (observed) data. In all non-overlapping observed periods, the
    # forecasted value will be NULL. In all non-overlapping forecasted periods,
    # the observed value will be NULL. In overlapping periods, the forecasted
    # value will always be larger because it is the sum of the observed and forecasted
    # values. Below is a query that demonstrates the legacy behavior:
    #
    # SELECT *
    #   FROM `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1`
    #  WHERE asofdate = "2023-12-31"
    #    AND target = "mobile"
    #    AND unit = "month"
    #    AND forecast_date = "2022-06-04"
    #    AND date BETWEEN "2022-05-01" AND "2022-06-01"
    #  ORDER BY date
    df["value"] = df[["mean", "value"]].max(axis=1)
    df = df.drop(columns=["mean"])

    # non-numeric columns are represented in the legacy bq schema as strings
    df[index_columns] = df[index_columns].astype(str)

    return df