in src/sagemaker_xgboost_container/encoder.py [0:0]
def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Decode a RecordIO-Protobuf payload into an XGBoost DMatrix.

    Only the "values" entry of each batch is used; anything else carried by
    the records (such as labels) is ignored.

    Args:
        string_like (bytes): RecordIO-Protobuf bytes.

    Returns:
        (xgb.DMatrix): XGBoost DataMatrix
    """
    store = mlio.InMemoryStore(bytes(string_like))
    params = mlio.DataReaderParams(dataset=[store], batch_size=100)
    reader = mlio.RecordIOProtobufReader(params)

    # The first batch (peeked, not consumed here) decides whether the whole
    # payload is handled as dense or sparse.
    dense = type(reader.peek_example()["values"]) is mlio.DenseTensor

    if dense:
        batches = [as_numpy(batch["values"]) for batch in reader]
        features = np.vstack(batches)
    else:
        batches = [to_coo_matrix(batch["values"]) for batch in reader]
        # Stack the per-batch COO matrices, then convert the result to CSR.
        features = scipy_vstack(batches).tocsr()

    return xgb.DMatrix(features)