def get_dmatrix()

in src/sagemaker_xgboost_container/data_utils.py [0:0]


def get_dmatrix(data_path, content_type, csv_weights=0, is_pipe=False):
    """Create Data Matrix from a CSV, LIBSVM, PARQUET or RECORDIO_PROTOBUF input.

    Assumes that sanity validation for content type has been done. As a
    safety net, a content type that matches none of the supported constants
    is reported as a ``UserError`` rather than surfacing later as an
    ``UnboundLocalError``.

    :param data_path: Either directory or file. May also be a list; see the
                      path-resolution helpers for how a list is handled.
    :param content_type: Content type of the input data. Compared
                         case-insensitively (via ``lower()``) against the
                         CSV, LIBSVM, PARQUET and RECORDIO_PROTOBUF constants.
    :param csv_weights: Only used if content type is CSV.
                        1 if the instance weights are in the second column of csv file; otherwise, 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None (whatever the format-specific loader returns)
    :raises exc.UserError: if the content type is not supported, or if the
                           loaded data matrix has no labels.
    """

    # To get best results from cross validation, we should merge train_dmatrix
    # and val_dmatrix for bigger data. However, DMatrix doesn't support a concat
    # operation and it cannot be exported to other formats (e.g. numpy).
    # It is possible to write it to a file in binary format matrix.save("data.buffer").
    # However, xgb doesn't support reading multiple buffer files.
    #
    # So the only way to combine the data is to read them in one shot.
    # Fortunately, milo can read multiple pipes together. So we extend
    # the parameter data_path to support list. If data_path is string as usual,
    # get_dmatrix will work as before. When it is a list, it works as explained in respective functions.

    if is_pipe:
        files_path = _get_pipe_mode_files_path(data_path)
    else:
        files_path = _get_file_mode_files_path(data_path)
    logging.info(f"files path: {files_path}")

    # Normalise once instead of lower-casing in every branch.
    normalized_content_type = content_type.lower()
    if normalized_content_type == CSV:
        dmatrix = get_csv_dmatrix(files_path, csv_weights, is_pipe)
    elif normalized_content_type == LIBSVM:
        dmatrix = get_libsvm_dmatrix(files_path, is_pipe)
    elif normalized_content_type == PARQUET:
        dmatrix = get_parquet_dmatrix(files_path, is_pipe)
    elif normalized_content_type == RECORDIO_PROTOBUF:
        dmatrix = get_recordio_protobuf_dmatrix(files_path, is_pipe)
    else:
        # Without this branch `dmatrix` would be unbound below and the caller
        # would see an opaque UnboundLocalError.
        raise exc.UserError(f"Unsupported content type for creating data matrix: {content_type}")

    # A loader may return a falsy value (e.g. None); only inspect labels when
    # a matrix was actually produced.
    if dmatrix and dmatrix.get_label().size == 0:
        raise exc.UserError(
            "Got input data without labels. Please check the input data set. "
            "If training job is running on multiple instances, please switch "
            "to using single instance if number of records in the data set "
            "is less than number of workers (16 * number of instance) in the cluster."
        )

    return dmatrix