in src/sagemaker_xgboost_container/data_utils.py [0:0]
def get_dmatrix(data_path, content_type, csv_weights=0, is_pipe=False):
    """Create Data Matrix from CSV, LIBSVM, Parquet or RecordIO-protobuf input.

    Assumes that sanity validation for content type has been done. As a
    safety net, a content type that matches none of the supported formats is
    rejected with a ``UserError`` instead of failing on an unassigned local.

    :param data_path: Either directory or file. A list is also accepted; how a
                      list is handled is defined by the path helper functions.
    :param content_type: Content type of the input data. Compared
                         case-insensitively against CSV, LIBSVM, PARQUET and
                         RECORDIO_PROTOBUF.
    :param csv_weights: Only used if content_type is CSV.
                        1 if the instance weights are in the second column of csv file; otherwise, 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None
    :raises exc.UserError: If the content type is not supported, or if the
                           loaded data contains no labels.
    """
    # To get best results from cross validation, we should merge train_dmatrix
    # and val_dmatrix for bigger data. However, DMatrix doesn't support a concat
    # operation and it cannot be exported to other formats (e.g. numpy).
    # It is possible to write it to a file in binary format with
    # matrix.save("data.buffer"), but xgb doesn't support reading multiple
    # buffer files.
    #
    # So the only way to combine the data is to read it in one shot.
    # Fortunately, milo can read multiple pipes together, so the data_path
    # parameter is extended to support a list. If data_path is a string as
    # usual, get_dmatrix works as before. When it is a list, it works as
    # explained in the respective helper functions.
    if is_pipe:
        files_path = _get_pipe_mode_files_path(data_path)
    else:
        files_path = _get_file_mode_files_path(data_path)
    logging.info(f"files path: {files_path}")

    # Normalize once so every comparison below sees the same value.
    normalized_content_type = content_type.lower()
    if normalized_content_type == CSV:
        dmatrix = get_csv_dmatrix(files_path, csv_weights, is_pipe)
    elif normalized_content_type == LIBSVM:
        dmatrix = get_libsvm_dmatrix(files_path, is_pipe)
    elif normalized_content_type == PARQUET:
        dmatrix = get_parquet_dmatrix(files_path, is_pipe)
    elif normalized_content_type == RECORDIO_PROTOBUF:
        dmatrix = get_recordio_protobuf_dmatrix(files_path, is_pipe)
    else:
        # Without this branch `dmatrix` would stay unassigned and the check
        # below would fail with an opaque UnboundLocalError.
        raise exc.UserError(
            f"Unsupported content type '{content_type}'. Supported content types are: "
            f"{CSV}, {LIBSVM}, {PARQUET}, {RECORDIO_PROTOBUF}."
        )

    # The docstring allows a None result, so guard before inspecting labels.
    if dmatrix and dmatrix.get_label().size == 0:
        raise exc.UserError(
            "Got input data without labels. Please check the input data set. "
            "If training job is running on multiple instances, please switch "
            "to using single instance if number of records in the data set "
            "is less than number of workers (16 * number of instance) in the cluster."
        )
    return dmatrix