in src/sagemaker_xgboost_container/data_utils.py [0:0]
def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
    """Get Data Matrix from CSV data in pipe mode.

    Each batch yielded by the reader is converted into a 2-D numpy array, the
    batches are stacked vertically, and the result is split into label,
    optional weight, and feature columns for the DMatrix.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped;
                      either a single path or a list of paths
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :return: xgb.DMatrix, or None when the reader has no example to peek at
    :raises exc.UserError: if any exception is raised while reading or converting the
                           data; the original exception is attached as the cause
    """
    try:
        # Accept both a single pipe path and a list of pipe paths.
        pipes_path = pipe_path if isinstance(pipe_path, list) else [pipe_path]
        dataset = [mlio.SageMakerPipe(path) for path in pipes_path]
        reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
        csv_params = mlio.CsvParams(header_row_index=None)
        reader = mlio.CsvReader(reader_params, csv_params)

        # Nothing to build a matrix from if the reader holds no data.
        if reader.peek_example() is None:
            return None

        examples = []
        for example in reader:
            # Write each feature (column) of example into a single numpy array
            tmp = np.array([as_numpy(feature).squeeze() for feature in example])
            if tmp.ndim > 1:
                # Columns are written as rows, needs to be transposed
                tmp = tmp.T
            else:
                # A 1-D result (presumably a single-row batch whose features
                # squeezed to scalars) is reshaped into a one-row matrix.
                tmp = np.reshape(tmp, (1, tmp.shape[0]))
            examples.append(tmp)

        data = np.vstack(examples)
        # Drop the per-batch arrays before the DMatrix is built.
        del examples

        if csv_weights == 1:
            # Column 0 is the label, column 1 the instance weight, the rest are features.
            return xgb.DMatrix(data[:, 2:], label=data[:, 0], weight=data[:, 1])
        # Column 0 is the label, the remaining columns are features.
        return xgb.DMatrix(data[:, 1:], label=data[:, 0])
    except Exception as e:
        # Chain explicitly so the root cause stays attached to the UserError.
        raise exc.UserError("Failed to load csv data with exception:\n{}".format(e)) from e