def get_schema_dataframe()

in tensorflow_data_validation/utils/display_util.py [0:0]


def _schema_presence_label(feature):
  """Returns 'required', 'optional' or '' describing a feature's presence."""
  if not feature.HasField('presence'):
    return ''
  # Only a min_fraction of exactly 1.0 is reported as required.
  return 'required' if feature.presence.min_fraction == 1.0 else 'optional'


def _schema_valency_label(feature):
  """Returns 'single', an interval string or '' describing value counts."""
  if not feature.HasField('value_count'):
    return ''
  value_count = feature.value_count
  if value_count.min == value_count.max and value_count.min == 1:
    return 'single'
  # An unset bound is rendered as the open end of the interval.
  lower = '[%d' % value_count.min if value_count.HasField('min') else '[0'
  upper = '%d]' % value_count.max if value_count.HasField('max') else 'inf)'
  return lower + ',' + upper


def _schema_type_label(feature):
  """Returns the feature type name, reporting BYTES with a domain as STRING."""
  type_name = schema_pb2.FeatureType.Name(feature.type)
  # If the feature has a string domain, treat it as a string feature.
  if type_name == 'BYTES' and (feature.HasField('domain') or
                               feature.HasField('string_domain')):
    return 'STRING'
  return type_name


def _schema_min_max_label(min_string, max_string):
  """Joins the bounds that are not None with '; ', or returns '-' if none."""
  bounds = [bound for bound in (min_string, max_string) if bound is not None]
  return '; '.join(bounds) if bounds else '-'


def _schema_int_domain_label(int_domain):
  """Returns the display string for the bounds of an int domain."""
  min_string = ('min: %d' % int_domain.min
                if int_domain.HasField('min') else None)
  max_string = ('max: %d' % int_domain.max
                if int_domain.HasField('max') else None)
  return _schema_min_max_label(min_string, max_string)


def _schema_float_domain_label(float_domain):
  """Returns the display string for the bounds of a float domain."""
  # An unset bound is shown as +/-inf unless the domain disallows infinities,
  # in which case that bound is omitted from the string.
  if float_domain.HasField('min'):
    min_string = 'min: %f' % float_domain.min
  elif float_domain.disallow_inf:
    min_string = None
  else:
    min_string = 'min: -inf'

  if float_domain.HasField('max'):
    max_string = 'max: %f' % float_domain.max
  elif float_domain.disallow_inf:
    max_string = None
  else:
    max_string = 'max: inf'

  return _schema_min_max_label(min_string, max_string)


def get_schema_dataframe(
    schema: schema_pb2.Schema) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """Returns a tuple of DataFrames containing the input schema information.

  The first DataFrame has one row per entry of `schema.feature`, indexed by
  'Feature name', with the columns 'Type', 'Presence', 'Valency' and 'Domain'.
  The second DataFrame is indexed by 'Domain' and has a single 'Values'
  column; it has one row per schema-level string domain followed by one row
  per string domain embedded directly in a feature. An embedded string domain
  without a name is listed as '<feature name>_domain'.

  Args:
    schema: A Schema protocol buffer.

  Returns:
    A tuple of DataFrames containing the features and domains of the schema.

  Raises:
    TypeError: If `schema` is not a Schema protocol buffer.
  """
  if not isinstance(schema, schema_pb2.Schema):
    raise TypeError('schema is of type %s, should be a Schema proto.' %
                    type(schema).__name__)

  # Extract all the string domains at the schema level.
  domain_rows = [
      [_add_quotes(string_domain.name),
       ', '.join(_add_quotes(v) for v in string_domain.value)]
      for string_domain in schema.string_domain
  ]

  feature_rows = []
  # Iterate over the features in the schema and extract the properties of each
  # feature.
  for feature in schema.feature:
    feature_presence = _schema_presence_label(feature)
    valency = _schema_valency_label(feature)
    feature_type = _schema_type_label(feature)

    # Extract the domain (if any) of the feature; '-' means no domain.
    domain = '-'
    if feature.HasField('domain'):
      domain = _add_quotes(feature.domain)
    elif feature.HasField('int_domain'):
      domain = _schema_int_domain_label(feature.int_domain)
    elif feature.HasField('float_domain'):
      domain = _schema_float_domain_label(feature.float_domain)
    elif feature.HasField('string_domain'):
      # An embedded string domain also gets its own row in the domains table.
      domain = _add_quotes(feature.string_domain.name or
                           (feature.name + '_domain'))
      domain_rows.append(
          [domain,
           ', '.join(_add_quotes(v) for v in feature.string_domain.value)])

    feature_rows.append(
        [_add_quotes(feature.name), feature_type, feature_presence, valency,
         domain])

  features = pd.DataFrame(
      feature_rows,
      columns=['Feature name', 'Type', 'Presence', 'Valency',
               'Domain']).set_index('Feature name')

  domains = pd.DataFrame(
      domain_rows, columns=['Domain', 'Values']).set_index('Domain')

  return features, domains