in orbit/forecaster/forecaster.py [0:0]
def _set_prediction_meta(self, df: pd.DataFrame) -> None:
    """Gather and validate meta information from the prediction input dataframe.

    The collected values (date array, number of rows, first/last date, start
    and end positions relative to the training data, and the number of
    forecast steps) are stored on ``self._prediction_meta``. Nothing is
    returned.

    Parameters
    ----------
    df : pd.DataFrame
        Prediction input; the column named by ``self.date_col`` is read.

    Raises
    ------
    ForecasterException
        If the date array fails the ``is_ordered_datetime`` check, or if the
        first prediction date is earlier than the training start.
    KeyError
        If the prediction does not start after the training end and its first
        date cannot be located among the training dates (raised by
        ``pd.Index.get_loc``).
    """
    # remove reference from original input
    df = df.copy()
    date_key = PredictionMetaKeys.DATE_ARRAY.value
    date_column = df[self.date_col]
    n_rows = df.shape[0]

    # get prediction df meta
    prediction_meta = {
        date_key: pd.to_datetime(date_column).reset_index(drop=True),
        PredictionMetaKeys.PREDICTION_DF_LEN.value: n_rows,
        PredictionMetaKeys.START.value: date_column.iloc[0],
        PredictionMetaKeys.END.value: date_column.iloc[-1],
    }

    # look the array up with the same (prediction) key it was stored under
    date_array = prediction_meta[date_key]
    if not is_ordered_datetime(date_array):
        raise ForecasterException("Datetime index must be ordered and not repeat")
    if not is_even_gap_datetime(date_array):
        warnings.warn("Datetime index is not evenly distributed")

    # TODO: validate that all regressor columns are present, if any

    prediction_start = prediction_meta[PredictionMetaKeys.START.value]
    if prediction_start < self._training_meta[TrainingMetaKeys.START.value]:
        raise ForecasterException("Prediction start must be after training start.")

    trained_len = self._training_meta[TrainingMetaKeys.NUM_OF_OBS.value]

    if prediction_start > self._training_meta[TrainingMetaKeys.END.value]:
        # prediction lies entirely after the training data: assume it starts
        # right after the training end, so every distinct date is a forecast step
        start_idx = trained_len
        end_idx = start_idx + n_rows
        n_forecast_steps = len(set(date_array))
    else:
        # prediction starts inside the training range: position it at the
        # matching training date (KeyError if that date is not present)
        start_idx = pd.Index(
            self._training_meta[TrainingMetaKeys.DATE_ARRAY.value]
        ).get_loc(prediction_start)
        end_idx = start_idx + n_rows
        # zero or negative when the prediction rows do not extend past the
        # last training observation
        n_forecast_steps = end_idx - trained_len

    prediction_meta.update(
        {
            PredictionMetaKeys.START_INDEX.value: start_idx,
            PredictionMetaKeys.END_INDEX.value: end_idx,
            PredictionMetaKeys.FUTURE_STEPS.value: n_forecast_steps,
        }
    )
    self._prediction_meta = prediction_meta