in horovod/spark/common/util.py [0:0]
def _get_metadata(df):
"""
    Infer the type and shape of all the columns and determine what intermediate format they
    need to be converted to in case they are vectors.
    Example return value:
    {
        'col1': {
            'spark_data_type': <class 'pyspark.sql.types.DoubleType'>,
            'is_sparse_vector_only': False,
            'intermediate_format': 'nochange',
            'max_size': 1,
            'shape': 1
        },
        'col2': {
            'spark_data_type': <class 'pyspark.sql.types.DoubleType'>,
            'is_sparse_vector_only': False,
            'intermediate_format': 'nochange',
            'max_size': 1,
            'shape': 1
        },
        'col3': {
            'spark_data_type': <class 'pyspark.ml.linalg.SparseVector'>,
            'is_sparse_vector_only': True,
            'intermediate_format': 'custom_sparse_format',
            'max_size': 37,
            'shape': 56
        }
    }
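
    Illustrative usage (a minimal sketch, not from this module: it assumes an
    active SparkSession named `spark` and uses `DenseVector` from
    `pyspark.ml.linalg`):

        from pyspark.ml.linalg import DenseVector

        df = spark.createDataFrame(
            [(1.0, DenseVector([0.0, 2.0])),
             (3.0, DenseVector([4.0, 5.0]))],
            ['col1', 'col2'])
        metadata = _get_metadata(df)
        # Expect metadata['col2']['intermediate_format'] == constants.ARRAY
        # and metadata['col2']['is_sparse_vector_only'] == False.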
"""
all_col_types, col_shapes, col_max_sizes = _get_col_info(df)
metadata = dict()
for field in df.schema.fields:
col = field.name
col_types = all_col_types[col].copy()
if DenseVector in col_types:
            # If a col has DenseVector type (whether it is mixed sparse and dense vectors or
            # just DenseVector), convert all of the values to dense vectors
is_sparse_vector_only = False
spark_data_type = DenseVector
convert_to_target = constants.ARRAY
elif SparseVector in col_types:
            # If a col has only sparse vectors, convert all the data into a custom sparse
            # format (a sparse vector encoded as a fixed-length dense array)
is_sparse_vector_only = True
spark_data_type = SparseVector
convert_to_target = constants.CUSTOM_SPARSE
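            # Sketch of the assumed CUSTOM_SPARSE layout (an illustration only, not the
            # conversion code itself): each sparse vector is flattened into a fixed-length
            # dense array of the form [nnz, indices..., values...], padded with zeros up to
            # 2 * max_size + 1 entries. For example, with max_size=3:
            #     SparseVector(56, {1: 2.0, 3: 4.5}) -> [2, 1, 3, 2.0, 4.5, 0, 0]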
else:
is_sparse_vector_only = False
spark_data_type = type(field.dataType)
convert_to_target = constants.NOCHANGE
        # Explanation of the fields in metadata
        #
        # spark_data_type:
        #     The spark data type from the dataframe schema: type(field.dataType). If a
        #     column has mixed SparseVector and DenseVector values, we categorize it as
        #     DenseVector.
        #
        # is_sparse_vector_only:
        #     Whether all the rows in the column were sparse vectors.
        #
        # shape:
        #     Determines the shape of the data in the spark dataframe. It is useful for
        #     sparse vectors.
        #
        # intermediate_format:
        #     Specifies whether the column needs to be converted to a different format so
        #     that petastorm can read it. It can be one of ARRAY, CUSTOM_SPARSE, or
        #     NOCHANGE. It is required because petastorm cannot read DenseVector and
        #     SparseVector types, so we need to identify these types and convert them to a
        #     petastorm-compatible array format.
        #
        # max_size:
        #     The maximum number of elements (for dense vectors and arrays) or non-zero
        #     entries (for sparse vectors) found in the column across all rows.
metadata[col] = {'spark_data_type': spark_data_type,
'is_sparse_vector_only': is_sparse_vector_only,
'shape': col_shapes[col],
'intermediate_format': convert_to_target,
'max_size': col_max_sizes[col]}
return metadata
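

# A minimal sketch of decoding the CUSTOM_SPARSE layout sketched above, under the
# assumption that each row is stored as [nnz, indices..., values...] padded to a
# fixed length. `_custom_sparse_to_dense_sketch` is a hypothetical helper written
# for illustration; it is not part of this module's API.
def _custom_sparse_to_dense_sketch(custom_sparse_row, shape):
    import numpy as np

    # The first element holds the number of non-zero entries (assumed layout).
    nnz = int(custom_sparse_row[0])
    # The next nnz elements are the indices, followed by the nnz values.
    indices = np.asarray(custom_sparse_row[1:1 + nnz], dtype=int)
    values = np.asarray(custom_sparse_row[1 + nnz:1 + 2 * nnz])
    # Scatter the values back into a dense array of the original vector's shape.
    dense = np.zeros(shape, dtype=values.dtype)
    dense[indices] = values
    return dense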