in afa/core.py [0:0]
def load_data(data, impute_freq=None):
    """Read a raw timeseries dataset from a CSV file or a dataframe.

    Note that the "timestamp" column will be removed as a bonafide column
    and set as the dataframe (timeseries) index.

    Parameters
    ----------
    data : str or pd.DataFrame
        Path to the data file if `str` (must end in ".csv" or ".csv.gz"; the
        path is handed to `pd.read_csv` as-is), otherwise a "raw" timeseries
        dataframe. The data must provide the columns "timestamp", "channel",
        "family", "item_id" and "demand".
    impute_freq : str or None; optional
        If `str`, impute the missing dates between the first and last
        dates, according to the sampling frequency `str` value.

    Returns
    -------
    pd.DataFrame
        A dataframe indexed by a (unnamed) `pd.DatetimeIndex` built from the
        "timestamp" column, with "channel", "family" and "item_id" cast to
        `str` and "demand" cast to `float`, clipped at a minimum of 0 and
        rounded to 0 decimal places.

    Raises
    ------
    NotImplementedError
        If `data` is a path with an unsupported extension, or is neither a
        `str` nor a `pd.DataFrame`.

    Notes
    -----
    Any error raised by `schema.validate` (missing columns, null values,
    negative demand, ...) propagates to the caller unchanged.

    """
    if isinstance(data, str):
        if data.endswith(".csv.gz"):
            _read_func = partial(pd.read_csv, compression="gzip")
        elif data.endswith(".csv"):
            _read_func = pd.read_csv
        else:
            raise NotImplementedError(
                "unsupported file type: {!r} (expected a '.csv' or "
                "'.csv.gz' file)".format(data))

        # Read the text columns as strings up-front. Letting pandas infer
        # their types first is lossy: zero-padded ids such as "00123" would
        # be parsed as the integer 123, and integer-looking timestamps such
        # as 20200101 would later be interpreted as epoch nanoseconds.
        df = _read_func(data, dtype={"timestamp": str, "channel": str,
                                     "family": str, "item_id": str})
    elif isinstance(data, pd.DataFrame):
        df = data
    else:
        raise NotImplementedError(
            "unsupported `data` type: {} (expected a file path or a "
            "pd.DataFrame)".format(type(data).__name__))

    schema = DataFrameSchema({
        "timestamp": Column(str, nullable=False, coerce=True),
        "channel": Column(str, nullable=False, coerce=True),
        "family": Column(str, nullable=False, coerce=True),
        "item_id": Column(str, nullable=False, coerce=True),
        "demand": Column(float, Check(lambda x: x >= 0), coerce=True)
    })

    # Used for validation only: the return value is discarded and the column
    # datatypes are enforced explicitly below. `lazy=True` is passed so the
    # validator can report on all checks -- TODO confirm against the schema
    # library's documentation.
    schema.validate(df, lazy=True)

    # enforce column datatypes (`astype` returns a new dataframe, so a
    # dataframe passed in by the caller is not modified by the steps below)
    df = df.astype({"channel": str, "family": str, "item_id": str,
                    "demand": float})

    # set timeseries dataframe index
    if "timestamp" in df:
        df.set_index(pd.DatetimeIndex(df.pop("timestamp")), inplace=True)
        df.index.name = None

    if impute_freq is not None:
        df = impute_dates(df, impute_freq)

    # demand is kept non-negative and rounded to whole units
    df["demand"] = df["demand"].clip(0).round(0)

    return df