in src/sagemaker_xgboost_container/algorithm_mode/train.py [0:0]
def get_validated_dmatrices(train_path, validate_path, content_type, csv_weights=0, is_pipe=False,
                            combine_train_val=False):
    """Load the train and validation channels into data matrices for XGBoost training.

    The size of each channel is measured first; a channel whose path is falsy or whose
    measured size is not positive is skipped and reported as ``None``. In file mode
    (``is_pipe`` is False) the total size is logged and every non-empty channel is
    validated against ``content_type`` before it is parsed.

    :param train_path: Location of the training channel; a falsy value means no training data
    :param validate_path: Location of the validation channel; a falsy value means no validation data
    :param content_type: Content type of data. Supports 'libsvm' or 'csv'
    :param csv_weights: 1 if instance weights are in the second column of csv data files; otherwise, 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param combine_train_val: Boolean to indicate whether a matrix combining train and validation
        data should also be built
    :return: Tuple ``(train_dmatrix, val_dmatrix, train_val_dmatrix)``. The first two are the
        results of ``get_dmatrix`` for their channel, or None when the channel is empty. The third
        is the matrix read from both paths when ``combine_train_val`` is truthy and both channels
        were loaded; otherwise it is the same object as ``train_dmatrix``.
    """
    # A missing channel counts as zero bytes and is never passed to get_size.
    train_files_size = 0
    if train_path:
        train_files_size = get_size(train_path, is_pipe)

    val_files_size = 0
    if validate_path:
        val_files_size = get_size(validate_path, is_pipe)

    if not is_pipe:
        total_size_mb = round((train_files_size + val_files_size) / (1024 * 1024), 2)
        logging.debug("File size need to be processed in the node: {}mb.".format(total_size_mb))

        # File-path validation is only performed in file mode, train channel first.
        channels = ((train_path, train_files_size), (validate_path, val_files_size))
        for channel_path, channel_size in channels:
            if channel_size > 0:
                validate_data_file_path(channel_path, content_type)

    train_dmatrix = None
    if train_files_size > 0:
        train_dmatrix = get_dmatrix(train_path, content_type, csv_weights=csv_weights, is_pipe=is_pipe)

    val_dmatrix = None
    if val_files_size > 0:
        val_dmatrix = get_dmatrix(validate_path, content_type, csv_weights=csv_weights, is_pipe=is_pipe)

    both_loaded = train_dmatrix is not None and val_dmatrix is not None
    if not (combine_train_val and both_loaded):
        # Without a combined read, the third slot simply aliases the training matrix.
        return train_dmatrix, val_dmatrix, train_dmatrix

    logging.info("Read both train and validation data into one DMatrix")
    train_val_dmatrix = get_dmatrix([train_path, validate_path], content_type,
                                    csv_weights=csv_weights, is_pipe=is_pipe)
    return train_dmatrix, val_dmatrix, train_val_dmatrix