def _validate_libsvm_format()

in src/sagemaker_xgboost_container/data_utils.py [0:0]


def _validate_libsvm_format(file_path):
    """Validate that data file is LIBSVM format.

    XGBoost expects the following LIBSVM format:
        <label>(:<instance weight>) <index>:<value> <index>:<value> <index>:<value> ...

    Note: This only validates lines up to and including the first line that has a feature.
    This is not a comprehensive file check, as XGBoost will have its own data validation.

    Each line is passed to ``_get_num_valid_libsvm_features``; the result is interpreted as:
        * negative: the line is rejected and a ``UserError`` is raised,
        * zero: the line is accepted but has no features, so the next line is inspected,
        * positive: the line has at least one feature and validation stops successfully.

    If the end of the file is reached without finding a line with features (this includes
    an empty file), a warning is logged and the file is accepted.

    :param file_path: path of the data file to check; must be a string, since the file name
        used in messages is taken as the text after the last "/"
    :raises exc.UserError: if a line is reported as invalid before any line with features is
        found; the message includes the first 50 characters of the offending line
    :return: None
    """
    # errors="ignore" drops undecodable bytes instead of raising, so a stray non-UTF-8
    # byte does not abort this lightweight check.
    with open(file_path, "r", errors="ignore") as read_file:
        for line_to_validate in read_file:
            num_sparse_libsvm_features = _get_num_valid_libsvm_features(line_to_validate)

            if num_sparse_libsvm_features < 0:
                raise exc.UserError(
                    _get_invalid_libsvm_error_msg(
                        line_snippet=line_to_validate[:50], file_name=file_path.split("/")[-1]
                    )
                )

            if num_sparse_libsvm_features > 0:
                # Return after first valid LIBSVM line with features. A single feature is
                # enough: comparing against 1 here would make files whose lines carry exactly
                # one feature fall through to the "no features" warning below.
                return

    # Only reached when no line was invalid and no line had any feature.
    logging.warning(
        "File {} is not an invalid LIBSVM file but has no features. Accepting simple validation.".format(
            file_path.split("/")[-1]
        )
    )