in tensorflow_data_validation/utils/display_util.py [0:0]
def get_schema_dataframe(
    schema: schema_pb2.Schema) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """Summarizes a schema as a pair of pandas DataFrames.

  Args:
    schema: A Schema protocol buffer.

  Returns:
    A (features, domains) tuple. `features` is indexed by 'Feature name' and
    has the columns 'Type', 'Presence', 'Valency' and 'Domain'. `domains` is
    indexed by 'Domain' and has a single 'Values' column; it lists the
    schema-level string domains first, followed by any string domains that are
    embedded directly in a feature.

  Raises:
    TypeError: If `schema` is not a Schema proto.
  """
  if not isinstance(schema, schema_pb2.Schema):
    raise TypeError('schema is of type %s, should be a Schema proto.' %
                    type(schema).__name__)

  def _quoted_values(string_domain):
    # Comma-separated, quoted rendering of the values of a string domain.
    return ', '.join(_add_quotes(value) for value in string_domain.value)

  def _bounds_to_string(lower, upper):
    # Joins whichever bound descriptions are present; '-' when there is none.
    present = [bound for bound in (lower, upper) if bound is not None]
    return '; '.join(present) if present else '-'

  # Start with the string domains declared at the schema level.
  domain_rows = [[_add_quotes(shared.name), _quoted_values(shared)]
                 for shared in schema.string_domain]

  feature_rows = []
  for feature in schema.feature:
    # Presence: blank when unspecified, 'required' only when the feature must
    # appear in every example (min_fraction of exactly 1.0).
    if not feature.HasField('presence'):
      presence = ''
    elif feature.presence.min_fraction == 1.0:
      presence = 'required'
    else:
      presence = 'optional'

    # Valency: blank when unspecified, 'single' for exactly one value,
    # otherwise an interval whose unset ends default to '[0' and 'inf)'.
    if not feature.HasField('value_count'):
      valency = ''
    else:
      count = feature.value_count
      if count.min == 1 and count.max == 1:
        valency = 'single'
      else:
        lower = '[%d' % count.min if count.HasField('min') else '[0'
        upper = '%d]' % count.max if count.HasField('max') else 'inf)'
        valency = '%s,%s' % (lower, upper)

    # Type: a BYTES feature that carries a string domain is shown as STRING.
    type_name = schema_pb2.FeatureType.Name(feature.type)
    if type_name == 'BYTES' and (feature.HasField('domain') or
                                 feature.HasField('string_domain')):
      type_name = 'STRING'

    # Domain: '-' unless the feature references or embeds a domain.
    if feature.HasField('domain'):
      domain = _add_quotes(feature.domain)
    elif feature.HasField('int_domain'):
      int_domain = feature.int_domain
      lower = ('min: %d' % int_domain.min
               if int_domain.HasField('min') else None)
      upper = ('max: %d' % int_domain.max
               if int_domain.HasField('max') else None)
      domain = _bounds_to_string(lower, upper)
    elif feature.HasField('float_domain'):
      float_domain = feature.float_domain
      # An unset bound is displayed as infinite, unless infinities are
      # disallowed, in which case the bound is omitted altogether.
      if float_domain.HasField('min'):
        lower = 'min: %f' % float_domain.min
      else:
        lower = None if float_domain.disallow_inf else 'min: -inf'
      if float_domain.HasField('max'):
        upper = 'max: %f' % float_domain.max
      else:
        upper = None if float_domain.disallow_inf else 'max: inf'
      domain = _bounds_to_string(lower, upper)
    elif feature.HasField('string_domain'):
      # An embedded string domain without a name is labelled after its feature
      # and is also reported in the domains table.
      domain = _add_quotes(
          feature.string_domain.name or feature.name + '_domain')
      domain_rows.append([domain, _quoted_values(feature.string_domain)])
    else:
      domain = '-'

    feature_rows.append(
        [_add_quotes(feature.name), type_name, presence, valency, domain])

  feature_columns = ['Feature name', 'Type', 'Presence', 'Valency', 'Domain']
  features = pd.DataFrame(feature_rows, columns=feature_columns)
  features = features.set_index('Feature name')

  domains = pd.DataFrame(domain_rows, columns=['Domain', 'Values'])
  domains = domains.set_index('Domain')

  return features, domains