in src/sagemaker_xgboost_container/data_utils.py [0:0]
def _get_num_valid_libsvm_features(libsvm_line):
    """Get number of valid LIBSVM features.

    XGBoost expects the following LIBSVM format:
        <label>(:<instance weight>) <index>:<value> <index>:<value> <index>:<value> ...

    The line is split on single spaces. The first token is checked with
    ``_is_valid_libsvm_label``; every remaining token must contain exactly one
    ":" separator. Only this structure is verified here -- the index and value
    parts themselves are not parsed or range-checked.

    :param libsvm_line: a single line of text to validate
    :return: -1 if the line is not a valid LIBSVM line; otherwise, return number of correctly formatted features
             (0 when the line holds only a label)
    """
    tokens = libsvm_line.split(" ")
    label = tokens[0]
    features = tokens[1:]

    if not _is_valid_libsvm_label(label):
        logging.error("{} does not follow LIBSVM label format <label>(:<weight>).".format(label))
        return -1

    for feature in features:
        # Each feature must be exactly "<index>:<value>". Check the token
        # currently being visited (not always the first one) so a malformed
        # feature anywhere in the line is rejected. A token without any ":"
        # splits into one part and one with extra ":" splits into three or
        # more, so both cases are caught by the same test.
        if len(feature.split(":")) != 2:
            return -1

    return len(features)