def recordio_protobuf_to_dmatrix()

in src/sagemaker_xgboost_container/encoder.py [0:0]


def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Build an XGBoost DMatrix from RecordIO-Protobuf encoded bytes.

    The payload is wrapped in an in-memory MLIO store and read in batches of
    100. Only the "values" entry of each batch is used; any other entry
    (e.g. labels) is ignored. The type of the first batch's "values" decides
    whether every batch is treated as dense (stacked with NumPy) or sparse
    (converted to COO, stacked, then converted to CSR).

    Args:
        string_like (bytes): RecordIO-Protobuf bytes.
    Returns:
        (xgb.DMatrix): XGBoost DataMatrix
    """
    store = mlio.InMemoryStore(bytes(string_like))
    params = mlio.DataReaderParams(dataset=[store], batch_size=100)
    reader = mlio.RecordIOProtobufReader(params)

    # Inspect the first batch up front so the dense/sparse decision is made
    # once for the whole payload rather than per batch.
    first_values = reader.peek_example()["values"]

    if type(first_values) is mlio.DenseTensor:
        batches = [as_numpy(batch["values"]) for batch in reader]
        features = np.vstack(batches)
    else:
        batches = [to_coo_matrix(batch["values"]) for batch in reader]
        features = scipy_vstack(batches).tocsr()

    return xgb.DMatrix(features)