def prepare_data()

in horovod/spark/common/util.py


from contextlib import contextmanager


# The trailing `yield` makes this a generator; the decorator is required so it
# can be used as `with prepare_data(...) as dataset_idx:`.
@contextmanager
def prepare_data(num_processes, store, df, label_columns, feature_columns,
                 validation=None, sample_weight_col=None, compress_sparse=False,
                 partitions_per_process=10, verbose=0):
    # Validate the `validation` argument (e.g. a ratio or a column name) against df.
    check_validation(validation, df=df)
    if num_processes <= 0 or partitions_per_process <= 0:
        raise ValueError('num_processes={} and partitions_per_process={} must both be > 0'
                         .format(num_processes, partitions_per_process))

    if not label_columns:
        raise ValueError('Parameter label_columns cannot be None or empty')

    # Repartition the data so that each training process gets the same number
    # of partitions.
    num_partitions = num_processes * partitions_per_process
    if verbose:
        print('num_partitions={}'.format(num_partitions))

    # Every label column must exist in the DataFrame.
    for col in label_columns:
        if col not in df.columns:
            raise ValueError('Label column {} does not exist in the DataFrame'.format(col))

    # Default the feature columns to all non-label columns; otherwise verify
    # that each requested feature column exists.
    if feature_columns is None:
        feature_columns = [col for col in df.columns if col not in set(label_columns)]
    else:
        for col in feature_columns:
            if col not in df.columns:
                raise ValueError('Feature column {} does not exist in the DataFrame'.format(col))

    # Datasets are cached by (DataFrame, store, validation), so repeated calls
    # with the same inputs reuse the already-prepared data. The key is held
    # only for the lifetime of the `with` block in the caller.
    key = _training_cache.create_key(df, store, validation)
    with _training_cache.use_key(key):
        dataset_idx = _get_or_create_dataset(key, store, df, feature_columns, label_columns,
                                             validation, sample_weight_col, compress_sparse,
                                             num_partitions, num_processes, verbose)
        yield dataset_idx
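

A minimal usage sketch, not part of the listing above: the column names, the
store path, and the Spark settings are illustrative assumptions. Because the
function yields, it is entered as a context manager, and the training-cache
key is released when the block exits.

from pyspark.sql import SparkSession
from horovod.spark.common.store import LocalStore
from horovod.spark.common.util import prepare_data

spark = SparkSession.builder.master('local[2]').getOrCreate()

# Hypothetical toy DataFrame: two feature columns and one label column.
df = spark.createDataFrame(
    [(1.0, 2.0, 0), (3.0, 4.0, 1)],
    ['x1', 'x2', 'label'])

# Assumed scratch location for the materialized dataset.
store = LocalStore('/tmp/horovod-demo')

with prepare_data(num_processes=2,
                  store=store,
                  df=df,
                  label_columns=['label'],
                  feature_columns=['x1', 'x2'],
                  validation=0.2,
                  verbose=1) as dataset_idx:
    print('prepared dataset index:', dataset_idx)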