def _csv_loader()

in bayesmark/data.py [0:0]


def _csv_loader(dataset_name, return_X_y, data_root, clip_x=100):  # pragma: io
    """Load custom csv files for use in the benchmark.

    This function assumes ``dataset_name + ".csv"`` is a csv file found in the `data_root` path.  It also assumes the
    last column of the csv file is the target and the other columns are features.

    The target column should be `int` for classification and `float` for regression. Column names ending in ``"_cat"``
    are assumed to be categorical and will be one-hot encoded.

    The features (and target for regression) are robust standardized. The features are also clipped to be in
    ``[-clip_x, clip_x]`` *after* standardization.

    Parameters
    ----------
    dataset_name : str
        Name of the csv file to load (without the ``".csv"`` extension).
    return_X_y : bool
        Must be ``True``; only the ``(X, y)`` tuple interface is supported right now.
    data_root : str
        Directory containing the csv file.
    clip_x : float
        Absolute bound applied to the standardized features. Must be non-negative.

    Returns
    -------
    data : :class:`numpy:numpy.ndarray` of shape (n, d)
        Robust-standardized (and clipped) feature matrix.
    target : :class:`numpy:numpy.ndarray` of shape (n,)
        Target values: integer labels for classification, standardized floats for regression.
    problem_type : ProblemType
        Problem type inferred from `dataset_name`.
    """
    assert return_X_y, "Only returning (X,y) tuple supported right now."
    assert clip_x >= 0

    # Quantile range for robust standardization. The 86% range is the most efficient for Gaussians. See:
    # https://github.com/scikit-learn/scikit-learn/issues/10139#issuecomment-344705040
    q_level = 0.86

    path = join_safe_r(data_root, dataset_name + ".csv")

    # For now, use convention that can get problem type based on data set name
    problem_type = get_problem_type(dataset_name)

    # Assuming no missing data in source csv files at the moment, these will
    # result in error.
    df = pd.read_csv(
        path, header=0, index_col=False, engine="c", na_filter=False, true_values=["true"], false_values=["false"]
    )

    label = df.columns[-1]  # Assume last col is target

    target = df.pop(label).values
    if problem_type == ProblemType.clf:
        assert target.dtype in (np.bool_, np.int_)
        target = target.astype(np.int_)  # convert to int for skl
    if problem_type == ProblemType.reg:
        # np.float64 rather than the np.float_ alias, which was removed in NumPy 2.0 (same dtype on NumPy 1.x).
        assert target.dtype == np.float64
        # 86% range is the most efficient (at least for Gaussians)
        target = robust_standardize(target, q_level=q_level)

    # Fill in any categorical variables (object dtype or col names ending in ..._cat)
    cat_cols = sorted(cc for cc in df.columns if cc.endswith("_cat") or df[cc].dtype.kind == "O")
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=np.float64)
    # Could also sort all columns to be sure it will be reprod

    # Everything should now be in float
    assert (df.dtypes == np.float64).all()

    data = df.values
    data = robust_standardize(data, q_level=q_level)
    # Debatable if we should include this, but there are a lot of outliers
    data = np.clip(data, -clip_x, clip_x)

    # We should probably do some logging or something to wrap up
    return data, target, problem_type