in bayesmark/data.py [0:0]
def _csv_loader(dataset_name, return_X_y, data_root, clip_x=100):  # pragma: io
    """Load custom csv files for use in the benchmark.

    This function assumes ``dataset_name + ".csv"`` is a csv file found in the `data_root` path. It also assumes the
    last column of the csv file is the target and the other columns are features.

    The target column should be `int` for classification and `float` for regression. Column names ending in ``"_cat"``
    are assumed to be categorical and will be one-hot encoded.

    The features (and target for regression) are robust standardized. The features are also clipped to be in
    ``[-clip_x, clip_x]`` *after* standardization.

    Parameters
    ----------
    dataset_name : str
        Name of the csv file (without the ``".csv"`` extension) inside `data_root`.
    return_X_y : bool
        Must be ``True``; only the ``(X, y)`` tuple return convention is supported right now.
    data_root : str
        Path to the folder containing the csv data files.
    clip_x : float
        Absolute bound applied to the standardized features. Must be non-negative.

    Returns
    -------
    data : ndarray of shape (n, d)
        Robust-standardized (and clipped) feature matrix.
    target : ndarray of shape (n,)
        Target vector: `int` labels for classification, robust-standardized `float` for regression.
    problem_type : ProblemType
        Whether this data set is treated as a classification or regression problem.
    """
    assert return_X_y, "Only returning (X,y) tuple supported right now."
    assert clip_x >= 0
    # Quantile range for robust standardization. The 86% range is the most efficient for Gaussians. See:
    # https://github.com/scikit-learn/scikit-learn/issues/10139#issuecomment-344705040
    q_level = 0.86

    path = join_safe_r(data_root, dataset_name + ".csv")
    # For now, use convention that can get problem type based on data set name
    problem_type = get_problem_type(dataset_name)

    # Assuming no missing data in source csv files at the moment, these will
    # result in error.
    df = pd.read_csv(
        path, header=0, index_col=False, engine="c", na_filter=False, true_values=["true"], false_values=["false"]
    )

    label = df.columns[-1]  # Assume last col is target
    target = df.pop(label).values
    if problem_type == ProblemType.clf:
        assert target.dtype in (np.bool_, np.int_)
        target = target.astype(np.int_)  # convert to int for skl
    if problem_type == ProblemType.reg:
        # np.float64 is the dtype ``np.float_`` aliased; the alias was removed in NumPy 2.0.
        assert target.dtype == np.float64
        # 86% range is the most efficient (at least for Gaussians)
        target = robust_standardize(target, q_level=q_level)

    # Fill in any categorical variables (object dtype or col names ending in ..._cat)
    cat_cols = sorted(cc for cc in df.columns if cc.endswith("_cat") or df[cc].dtype.kind == "O")
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=np.float64)
    # Could also sort all columns to be sure it will be reprod

    # Everything should now be in float
    assert (df.dtypes == np.float64).all()

    data = df.values
    data = robust_standardize(data, q_level=q_level)
    # Debatable if we should include this, but there are a lot of outliers
    data = np.clip(data, -clip_x, clip_x)

    # We should probably do some logging or something to wrap up
    return data, target, problem_type