def check_data_redundancy()

in src/sagemaker_xgboost_container/data_utils.py [0:0]


def check_data_redundancy(train_path, validate_path):
    """Log a warning if suspected duplicate files are found in the training and validation folders.

    The validation score of models would be invalid if the same data is used for both training and validation.
    Files are suspected of being duplicates when the file names are the same and their sizes are the same.
    (Contents are not compared, so this is a heuristic: same name + same size only.)

    :param train_path: path to training data directory
    :param validate_path: path to validation data directory
    :raises exc.UserError: if either directory does not exist
    """
    if not os.path.exists(train_path):
        raise exc.UserError("training data's path does not exist")
    if not os.path.exists(validate_path):
        raise exc.UserError("validation data's path does not exist")

    # Only regular files are considered; subdirectories are ignored.
    training_files_set = set(f for f in os.listdir(train_path) if os.path.isfile(os.path.join(train_path, f)))
    validation_files_set = set(f for f in os.listdir(validate_path) if os.path.isfile(os.path.join(validate_path, f)))
    same_name_files = training_files_set.intersection(validation_files_set)
    for f in same_name_files:
        f_train_path = os.path.join(train_path, f)
        f_validate_path = os.path.join(validate_path, f)
        f_train_size = os.path.getsize(f_train_path)
        f_validate_size = os.path.getsize(f_validate_path)
        if f_train_size == f_validate_size:
            # NOTE: the adjacent f-strings previously concatenated without a
            # separating space ("...csvwith same size..."); fixed below.
            logging.warning(
                f"Suspected identical files found. ({f_train_path} and {f_validate_path}"
                f" with same size {f_validate_size} bytes)."
                f" Note: Duplicate data in the training set and validation set is usually"
                f" not intentional and can impair the validity of the model evaluation by"
                f" the validation score."
            )