# horovod/spark/common/util.py
from contextlib import contextmanager


# prepare_data is a context manager: callers enter it with
# `with prepare_data(...) as dataset_idx` (see the usage sketch at the end).
@contextmanager
def prepare_data(num_processes, store, df, label_columns, feature_columns,
                 validation=None, sample_weight_col=None, compress_sparse=False,
                 partitions_per_process=10, verbose=0):
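    """Materialize and cache `df` (and optional validation data) in `store`
    for Horovod training, yielding the index of the cached dataset.

    Intended to be used as a context manager; the cached dataset is managed
    for the lifetime of the `with` block.
    """
    # Fail fast on invalid arguments before any Spark work is scheduled.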
    check_validation(validation, df=df)
    if num_processes <= 0 or partitions_per_process <= 0:
        raise ValueError('num_processes={} and partitions_per_process={} must both be > 0'
                         .format(num_processes, partitions_per_process))

    if not label_columns:
        raise ValueError('Parameter label_columns cannot be None or empty')
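
    # Each training process reads several partitions; sizing the repartition
    # up front keeps per-process reads balanced.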
    num_partitions = num_processes * partitions_per_process
    if verbose:
        print('num_partitions={}'.format(num_partitions))
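
    # Every requested label column must exist in the input DataFrame.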
    for col in label_columns:
        if col not in df.columns:
            raise ValueError('Label column {} does not exist in the DataFrame'.format(col))
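
    # If no feature columns were given, default to every non-label column;
    # otherwise verify that each named feature column exists.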
    if feature_columns is None:
        feature_columns = [col for col in df.columns if col not in set(label_columns)]
    else:
        for col in feature_columns:
            if col not in df.columns:
                raise ValueError('Feature column {} does not exist in the DataFrame'.format(col))
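
    # Key the prepared dataset on (df, store, validation) so repeated fits
    # with the same inputs reuse the materialized data instead of rewriting it.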
    key = _training_cache.create_key(df, store, validation)
    with _training_cache.use_key(key):
        dataset_idx = _get_or_create_dataset(key, store, df, feature_columns, label_columns,
                                             validation, sample_weight_col, compress_sparse,
                                             num_partitions, num_processes, verbose)
        yield dataset_idx
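

# Usage sketch (illustrative only; `train_df` and the store path are
# assumptions, not part of this module). Because prepare_data is a context
# manager, the cached dataset is only guaranteed to exist inside the
# `with` block.
#
#   from horovod.spark.common.store import Store
#
#   store = Store.create('/tmp/horovod_cache')      # local or DFS-backed store
#   with prepare_data(num_processes=4,
#                     store=store,
#                     df=train_df,                  # a pyspark.sql.DataFrame
#                     label_columns=['label'],
#                     feature_columns=['features'],
#                     validation=0.1,               # hold out 10% for validation
#                     verbose=1) as dataset_idx:
#       ...  # train while the materialized dataset is available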