in src/sagemaker_xgboost_container/data_utils.py [0:0]
def validate_data_file_path(data_path, content_type):
    """Validate data in data_path are formatted correctly based on content_type.

    Note: This is not a comprehensive validation. XGBoost has its own content validation.

    If data_path is a file, only that file is checked. If it is a directory, the
    tree is walked top-down and only the first directory found that has no
    subdirectories is inspected; entries in it are kept when ``_is_data_file``
    accepts them. CSV and LIBSVM files are handed to the matching format
    validator. PARQUET and RECORDIO_PROTOBUF are not checked here, and any other
    parsed content type falls through without validation.

    :param data_path: path to a single data file or to a directory of data files
    :param content_type: content type string; it is parsed with ``get_content_type``
    :raises exc.UserError: if data_path does not exist, or if it is a directory
        under which no directory without subdirectories could be found
    """
    parsed_content_type = get_content_type(content_type)

    if not os.path.exists(data_path):
        raise exc.UserError("{} is not a valid path!".format(data_path))

    if os.path.isfile(data_path):
        data_files = [data_path]
    else:
        # Pick the first directory (top-down) that has no subdirectories.
        dir_path = None
        for root, dirs, _ in os.walk(data_path):
            if not dirs:
                dir_path = root
                break

        # os.walk yields nothing when the top directory cannot be listed, and by
        # default it does not descend into symlinked directories, so a leaf
        # directory may never be found. Fail clearly instead of passing None on
        # to os.listdir / os.path.join.
        if dir_path is None:
            raise exc.UserError(
                "Could not find a directory without subdirectories under {}!".format(data_path)
            )

        data_files = [
            os.path.join(dir_path, file_name)
            for file_name in os.listdir(dir_path)
            if _is_data_file(dir_path, file_name)
        ]

    normalized_content_type = parsed_content_type.lower()
    if normalized_content_type == CSV:
        for data_file_path in data_files:
            _validate_csv_format(data_file_path)
    elif normalized_content_type == LIBSVM:
        for data_file_path in data_files:
            _validate_libsvm_format(data_file_path)
    elif normalized_content_type in (PARQUET, RECORDIO_PROTOBUF):
        # No op: these formats are not validated by this function.
        return