in src/sagemaker_xgboost_container/data_utils.py [0:0]
def get_content_type(content_type_cfg_val):
"""Get content type from data config.
Assumes that training and validation data have the same content type.
['libsvm', 'text/libsvm ;charset=utf8', 'text/x-libsvm'] will return 'libsvm'
['csv', 'text/csv', 'text/csv; label_size=1'] will return 'csv'
:param content_type_cfg_val
:return: Parsed content type
"""
if content_type_cfg_val is None:
return LIBSVM
else:
# cgi.parse_header extracts all arguments after ';' as key-value pairs
# e.g. cgi.parse_header('text/csv;label_size=1;charset=utf8') returns
# the tuple ('text/csv', {'label_size': '1', 'charset': 'utf8'})
content_type, params = cgi.parse_header(content_type_cfg_val.lower())
if content_type in [CSV, _content_types.CSV]:
# CSV content type allows a label_size parameter
# that should be 1 for XGBoost
if params and "label_size" in params and params["label_size"] != "1":
msg = (
"{} is not an accepted csv ContentType. "
"Optional parameter label_size must be equal to 1".format(content_type_cfg_val)
)
raise exc.UserError(msg)
return CSV
elif content_type in [LIBSVM, xgb_content_types.LIBSVM, xgb_content_types.X_LIBSVM]:
return LIBSVM
elif content_type in [PARQUET, xgb_content_types.X_PARQUET]:
return PARQUET
elif content_type in [RECORDIO_PROTOBUF, xgb_content_types.X_RECORDIO_PROTOBUF]:
return RECORDIO_PROTOBUF
else:
raise exc.UserError(_get_invalid_content_type_error_msg(content_type_cfg_val))