def _get_metadata()

in horovod/spark/common/util.py [0:0]


def _get_metadata(df):
    """
    Infer the type and shape of all the columns and determines if what intermediate format they
    need to be converted to in case they are a vector.

    Example return value:
    {
    'col1': {
        'dtype': <type 'float'>,
        'intermediate_format': 'nochange',
        'max_size': 1,
        'shape': 1
        },
     'col2': {
        'dtype': <type 'float'>,
        'intermediate_format': 'nochange',
        'max_size': 1,
        'shape': 1
        },
     'col3': {
        'dtype': <class 'pyspark.ml.linalg.SparseVector'>,
        'intermediate_format': 'custom_sparse_format',
        'max_size': 37,
        'shape': 56
        }
    }
    """
    all_col_types, col_shapes, col_max_sizes = _get_col_info(df)

    metadata = dict()
    for field in df.schema.fields:
        col = field.name
        col_types = all_col_types[col].copy()

        if DenseVector in col_types:
            # If a col has DenseVector type (whether it is mixed sparse and dense vector or just
            # DenseVector), convert all of the values to dense vector
            is_sparse_vector_only = False
            spark_data_type = DenseVector
            convert_to_target = constants.ARRAY
        elif SparseVector in col_types:
            # If a col has only sparse vectors, convert all the data into custom dense vectors
            is_sparse_vector_only = True
            spark_data_type = SparseVector
            convert_to_target = constants.CUSTOM_SPARSE
        else:
            is_sparse_vector_only = False
            spark_data_type = type(field.dataType)
            convert_to_target = constants.NOCHANGE

        # Explanation of the fields in metadata
        #     dtype:
        #
        #     spark_data_type:
        #         The spark data type from dataframe schema: type(field.dataType). If column has
        #         mixed SparseVector and DenseVector we categorize it as DenseVector.
        #
        #     is_sparse_vector_only:
        #         If all the rows in the column were sparse vectors.
        #
        #     shape:
        #         Determines the shape of the data in the spark dataframe. It is useful for sparse
        #         vectors.
        #
        #     intermediate_format:
        #         Specifies if the column need to be converted to a different format so that
        #         petastorm can read it. It can be one of ARRAY, CUSTOM_SPARSE, or NOCHANGE. It is
        #         required because petastorm cannot read DenseVector and SparseVectors. We need to
        #         identify these types and convert them to petastorm compatible type of array.

        metadata[col] = {'spark_data_type': spark_data_type,
                         'is_sparse_vector_only': is_sparse_vector_only,
                         'shape': col_shapes[col],
                         'intermediate_format': convert_to_target,
                         'max_size': col_max_sizes[col]}

    return metadata