in tensorflow_data_validation/statistics/generators/basic_stats_generator.py [0:0]
def _make_feature_stats_proto(
    feature_path: types.FeaturePath, basic_stats: _PartialBasicStats,
    parent_basic_stats: Optional[_PartialBasicStats],
    make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
    num_values_histogram_buckets: int, num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int, is_bytes: bool,
    categorical_numeric_types: Mapping[types.FeaturePath,
                                       'schema_pb2.FeatureType'],
    has_weights: bool, num_examples: int,
    weighted_num_examples: int) -> statistics_pb2.FeatureNameStatistics:
  """Builds the FeatureNameStatistics proto for a single feature.

  The function resolves the feature type, builds the common statistics once,
  and then copies them into the type-specific statistics message (bytes,
  string, struct or numeric) that matches the resolved type. Custom stats
  derived from the number of values are appended at the end.

  Args:
    feature_path: The path of the feature.
    basic_stats: The partial basic stats accumulated for the feature.
    parent_basic_stats: The partial basic stats of the feature's parent, or
      None if there is no parent.
    make_quantiles_sketch_fn: A callable that creates a quantiles sketch.
    num_values_histogram_buckets: Number of buckets in the quantiles histogram
      for the number of values per feature.
    num_histogram_buckets: Number of buckets in a standard
      NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: Number of buckets in a quantiles
      NumericStatistics.histogram.
    is_bytes: Whether the user declared the feature to be bytes.
    categorical_numeric_types: A mapping from feature path to the type derived
      from the schema.
    has_weights: Whether a weight feature is specified.
    num_examples: The global (across features) number of examples.
    weighted_num_examples: The global (across features) weighted number of
      examples.

  Returns:
    A statistics_pb2.FeatureNameStatistics proto.
  """
  feature_stats_cls = statistics_pb2.FeatureNameStatistics

  result = feature_stats_cls()
  result.path.CopyFrom(feature_path.to_proto())

  # Resolve the feature type. BYTES is never inferred from data, so it can only
  # come from the caller's `is_bytes` claim.
  common_stats = basic_stats.common_stats
  inferred_type = common_stats.type
  if inferred_type is None:
    # No value was observed for this feature, so there is nothing to check the
    # claim against: trust it. There is no "unknown" type, hence STRING is the
    # fallback when the feature is not claimed to be bytes.
    if is_bytes:
      result.type = feature_stats_cls.BYTES
    else:
      result.type = feature_stats_cls.STRING
  elif is_bytes and inferred_type == feature_stats_cls.STRING:
    # The BYTES claim is honoured only when the data really is strings/bytes.
    result.type = feature_stats_cls.BYTES
  else:
    result.type = inferred_type
  feature_type = result.type

  # Common statistics are built once and shared by whichever type-specific
  # message is populated below.
  parent_common_stats = None
  if parent_basic_stats is not None:
    parent_common_stats = parent_basic_stats.common_stats
  common_stats_proto = _make_common_stats_proto(
      common_stats, parent_common_stats, make_quantiles_sketch_fn,
      num_values_histogram_buckets, has_weights, num_examples,
      weighted_num_examples)

  # Total number of values at the leaf level (last presence/valency entry).
  valency_stats = common_stats.presence_and_valency_stats
  if valency_stats is None:
    leaf_num_values = 0
  else:
    leaf_num_values = valency_stats[-1].total_num_values

  if feature_type == feature_stats_cls.BYTES:
    # Bytes statistics, with the common stats embedded.
    bytes_proto = _make_bytes_stats_proto(
        basic_stats.bytes_stats, common_stats_proto.tot_num_values)
    bytes_proto.common_stats.CopyFrom(common_stats_proto)
    result.bytes_stats.CopyFrom(bytes_proto)

  # NOTE(review): the check below is a separate `if`, not an `elif` of the
  # BYTES branch above, so the categorical-numeric lookup also runs for BYTES
  # features -- confirm this is intended.
  # TODO(b/187054148): Update to allow FLOAT
  treat_as_string = (
      feature_type == feature_stats_cls.STRING or
      top_k_uniques_stats_util.output_categorical_numeric(
          categorical_numeric_types, feature_path, feature_type))
  if treat_as_string:
    # String statistics, with the common stats embedded.
    string_proto = _make_string_stats_proto(basic_stats.string_stats,
                                            leaf_num_values)
    string_proto.common_stats.CopyFrom(common_stats_proto)
    result.string_stats.CopyFrom(string_proto)
  elif feature_type == feature_stats_cls.STRUCT:
    # Struct features carry only the common stats.
    result.struct_stats.common_stats.CopyFrom(common_stats_proto)
  elif (feature_type == feature_stats_cls.INT or
        feature_type == feature_stats_cls.FLOAT):
    # Numeric statistics, with the common stats embedded.
    numeric_proto = _make_numeric_stats_proto(
        basic_stats.numeric_stats, leaf_num_values, num_histogram_buckets,
        num_quantiles_histogram_buckets, has_weights)
    numeric_proto.common_stats.CopyFrom(common_stats_proto)
    result.num_stats.CopyFrom(numeric_proto)

  num_values_custom_stats = _make_num_values_custom_stats_proto(
      common_stats, num_values_histogram_buckets)
  result.custom_stats.extend(num_values_custom_stats)
  return result