# afa/core.py — load_data

def load_data(data, impute_freq=None):
    """Read a raw timeseries dataset from disk, S3, or a dataframe.

    Note that the "timestamp" column will be removed as a bonafide column
    and set as the dataframe (timeseries) index.

    Parameters
    ----------
    data : str or pd.DataFrame
        Path to the data file if `str`, otherwise a "raw" timeseries dataframe.
        A dataframe input is copied; the caller's object is never mutated.
    impute_freq : str or None; optional
        If `str`, impute the missing dates between the first and last
        dates, according to the sampling frequency `str` value.

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    NotImplementedError
        If `data` is a path with an unsupported extension, or is neither
        a `str` nor a `pd.DataFrame`.

    """

    if isinstance(data, str):
        # NOTE: ".csv.gz" must be tested before ".csv" — `endswith(".csv")`
        # would also match the gzip'd name.
        if data.endswith(".csv.gz"):
            _read_func = partial(pd.read_csv, compression="gzip")
        elif data.endswith(".csv"):
            _read_func = partial(pd.read_csv)
        else:
            raise NotImplementedError(
                f"unsupported file extension for path: {data!r}")

        # cast exp. columns to correct types
        df = _read_func(data)
    elif isinstance(data, pd.DataFrame):
        # Copy so the in-place operations below (`df.pop("timestamp")`,
        # `set_index(..., inplace=True)`) do not mutate the caller's frame.
        df = data.copy()
    else:
        raise NotImplementedError(
            f"unsupported input type: {type(data).__name__}")

    schema = DataFrameSchema({
        "timestamp": Column(str, nullable=False, coerce=True),
        "channel": Column(str, nullable=False, coerce=True),
        "family": Column(str, nullable=False, coerce=True),
        "item_id": Column(str, nullable=False, coerce=True),
        "demand": Column(float, Check(lambda x: x >= 0), coerce=True)
    })

    # Keep the validated frame: `validate` returns the coerced dataframe,
    # and discarding it (as before) silently dropped the `coerce=True` effect.
    df = schema.validate(df, lazy=True)

    # enforce column datatypes
    df = df.astype({"channel": str, "family": str, "item_id": str,
                    "demand": float})

    # set timeseries dataframe index
    if "timestamp" in df:
        df.set_index(pd.DatetimeIndex(df.pop("timestamp")), inplace=True)

    # anonymous index — downstream code expects an unnamed DatetimeIndex
    df.index.name = None

    if impute_freq is not None:
        df = impute_dates(df, impute_freq)

    # demand is non-negative and rounded to whole units (kept as float dtype)
    df["demand"] = df["demand"].clip(0).round(0)

    return df