def _get_csv_dmatrix_pipe_mode()

in src/sagemaker_xgboost_container/data_utils.py [0:0]


def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :return: xgb.DMatrix or None
    """
    try:
        pipes_path = pipe_path if isinstance(pipe_path, list) else [pipe_path]
        dataset = [mlio.SageMakerPipe(path) for path in pipes_path]
        reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
        csv_params = mlio.CsvParams(header_row_index=None)
        reader = mlio.CsvReader(reader_params, csv_params)

        # Check if data is present in reader
        if reader.peek_example() is not None:
            examples = []
            for example in reader:
                # Write each feature (column) of example into a single numpy array
                tmp = [as_numpy(feature).squeeze() for feature in example]
                tmp = np.array(tmp)
                if len(tmp.shape) > 1:
                    # Columns are written as rows, needs to be transposed
                    tmp = tmp.T
                else:
                    # If tmp is a 1-D array, it needs to be reshaped as a matrix
                    tmp = np.reshape(tmp, (1, tmp.shape[0]))
                examples.append(tmp)

            data = np.vstack(examples)
            del examples

            if csv_weights == 1:
                dmatrix = xgb.DMatrix(data[:, 2:], label=data[:, 0], weight=data[:, 1])
            else:
                dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])

            return dmatrix
        else:
            return None

    except Exception as e:
        raise exc.UserError("Failed to load csv data with exception:\n{}".format(e))