sdk/python/jobs/automl-standalone-jobs/automl-forecasting-forecast-function/helper.py (56 lines of code) (raw):

# Generate synthetic data import pandas as pd import numpy as np def get_timeseries( train_len: int, test_len: int, time_column_name: str, target_column_name: str, time_series_id_column_name: str, time_series_number: int = 1, freq: str = "H", ): """ Return the time series of designed length. :param train_len: The length of training data (one series). :type train_len: int :param test_len: The length of testing data (one series). :type test_len: int :param time_column_name: The desired name of a time column. :type time_column_name: str :param time_series_number: The number of time series in the data set. :type time_series_number: int :param freq: The frequency string representing pandas offset. see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html :type freq: str :returns: the tuple of train and test data sets. :rtype: tuple """ data_train = [] # type: List[pd.DataFrame] data_test = [] # type: List[pd.DataFrame] data_length = train_len + test_len for i in range(time_series_number): X = pd.DataFrame( { time_column_name: pd.date_range( start="2000-01-01", periods=data_length, freq=freq ), target_column_name: np.arange(data_length).astype(float) + np.random.rand(data_length) + i * 5, "ext_predictor": np.asarray(range(42, 42 + data_length)), time_series_id_column_name: np.repeat("ts{}".format(i), data_length), } ) data_train.append(X[:train_len]) data_test.append(X[train_len:]) X_train = pd.concat(data_train) y_train = X_train.pop(target_column_name).values X_test = pd.concat(data_test) y_test = X_test.pop(target_column_name).values return X_train, y_train, X_test, y_test def make_forecasting_query( fulldata, time_column_name, target_column_name, forecast_origin, horizon, lookback ): """ This function will take the full dataset, and create the query to predict all values of the time series from the `forecast_origin` forward for the next `horizon` horizons. Context from previous `lookback` periods will be included. fulldata: pandas.DataFrame a time series dataset. Needs to contain X and y. time_column_name: string which column (must be in fulldata) is the time axis target_column_name: string which column (must be in fulldata) is to be forecast forecast_origin: datetime type the last time we (pretend to) have target values horizon: timedelta how far forward, in time units (not periods) lookback: timedelta how far back does the model look Example: ``` forecast_origin = pd.to_datetime("2012-09-01") + pd.DateOffset(days=5) # forecast 5 days after end of training print(forecast_origin) X_query, y_query = make_forecasting_query(data, forecast_origin = forecast_origin, horizon = pd.DateOffset(days=7), # 7 days into the future lookback = pd.DateOffset(days=1), # model has lag 1 period (day) ) ``` """ X_past = fulldata[ (fulldata[time_column_name] > forecast_origin - lookback) & (fulldata[time_column_name] <= forecast_origin) ] X_future = fulldata[ (fulldata[time_column_name] > forecast_origin) & (fulldata[time_column_name] <= forecast_origin + horizon) ] y_past = X_past.pop(target_column_name).values.astype(float) y_future = X_future.pop(target_column_name).values.astype(float) # Now take y_future and turn it into question marks y_query = y_future.copy().astype(float) # because sometimes life hands you an int y_query.fill(np.nan) print("X_past is " + str(X_past.shape) + " - shaped") print("X_future is " + str(X_future.shape) + " - shaped") print("y_past is " + str(y_past.shape) + " - shaped") print("y_query is " + str(y_query.shape) + " - shaped") X_pred = pd.concat([X_past, X_future]) y_pred = np.concatenate([y_past, y_query]) return X_pred, y_pred