in tensorflow_data_validation/statistics/stats_options.py [0:0]
def __init__(
    self,
    generators: Optional[List[stats_generator.StatsGenerator]] = None,
    schema: Optional[schema_pb2.Schema] = None,
    label_feature: Optional[types.FeatureName] = None,
    weight_feature: Optional[types.FeatureName] = None,
    slice_functions: Optional[List[types.SliceFunction]] = None,
    sample_rate: Optional[float] = None,
    num_top_values: int = 20,
    frequency_threshold: int = 1,
    weighted_frequency_threshold: float = 1.0,
    num_rank_histogram_buckets: int = 1000,
    num_values_histogram_buckets: int = 10,
    num_histogram_buckets: int = 10,
    num_quantiles_histogram_buckets: int = 10,
    epsilon: float = 0.01,
    infer_type_from_schema: bool = False,
    desired_batch_size: Optional[int] = None,
    enable_semantic_domain_stats: bool = False,
    semantic_domain_stats_sample_rate: Optional[float] = None,
    per_feature_weight_override: Optional[Dict[types.FeaturePath,
                                               types.FeatureName]] = None,
    vocab_paths: Optional[Dict[types.VocabName, types.VocabPath]] = None,
    add_default_generators: bool = True,
    feature_allowlist: Optional[List[types.FeatureName]] = None,
    experimental_use_sketch_based_topk_uniques: bool = False,
    experimental_slice_functions: Optional[List[types.SliceFunction]] = None,
    experimental_slice_sqls: Optional[List[Text]] = None,
    experimental_result_partitions: int = 1):
  """Initializes statistics options.

  Args:
    generators: An optional list of statistics generators. A statistics
      generator must extend either CombinerStatsGenerator or
      TransformStatsGenerator.
    schema: An optional tensorflow_metadata Schema proto. Currently we use the
      schema to infer categorical and bytes features.
    label_feature: An optional feature name which represents the label.
    weight_feature: An optional feature name whose numeric value represents
      the weight of an example.
    slice_functions: DEPRECATED. Use `experimental_slice_functions`.
    sample_rate: An optional sampling rate. If specified, statistics is
      computed over the sample.
    num_top_values: An optional number of most frequent feature values to keep
      for string features.
    frequency_threshold: An optional minimum number of examples the most
      frequent values must be present in.
    weighted_frequency_threshold: An optional minimum weighted number of
      examples the most frequent weighted values must be present in. This
      option is only relevant when a weight_feature is specified.
    num_rank_histogram_buckets: An optional number of buckets in the rank
      histogram for string features.
    num_values_histogram_buckets: An optional number of buckets in a quantiles
      histogram for the number of values per Feature, which is stored in
      CommonStatistics.num_values_histogram.
    num_histogram_buckets: An optional number of buckets in a standard
      NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: An optional number of buckets in a
      quantiles NumericStatistics.histogram.
    epsilon: An optional error tolerance for the computation of quantiles,
      typically a small fraction close to zero (e.g. 0.01). Higher values of
      epsilon increase the quantile approximation, and hence result in more
      unequal buckets, but could improve performance, and resource
      consumption.
    infer_type_from_schema: A boolean to indicate whether the feature types
      should be inferred from the schema. If set to True, an input schema must
      be provided. This flag is used only when invoking TFDV through
      `tfdv.generate_statistics_from_csv`.
    desired_batch_size: An optional maximum number of examples to include in
      each batch that is passed to the statistics generators. When invoking
      TFDV using its end-to-end APIs (e.g.
      `generate_statistics_from_tfrecord`), this option also controls the
      decoder batch size -- if provided, the decoded RecordBatches that are to
      be fed to TFDV will have the fixed batch size. When invoking TFDV using
      `tfdv.GenerateStatistics`, this option only controls the maximum size of
      RecordBatches constructed within StatsGenerators (a generator may
      combine RecordBatches).
    enable_semantic_domain_stats: If True statistics for semantic domains are
      generated (e.g: image, text domains).
    semantic_domain_stats_sample_rate: An optional sampling rate for semantic
      domain statistics. If specified, semantic domain statistics is computed
      over a sample.
    per_feature_weight_override: If specified, the "example weight" paired
      with a feature will be first looked up in this map and if not found,
      fall back to `weight_feature`.
    vocab_paths: An optional dictionary mapping vocab names to paths. Used in
      the schema when specifying a NaturalLanguageDomain. The paths can either
      be to GZIP-compressed TF record files that have a tfrecord.gz suffix or
      to text files.
    add_default_generators: Whether to invoke the default set of stats
      generators in the run. Generators invoked consists of 1) the default
      generators (controlled by this option); 2) user-provided generators (
      controlled by the `generators` option); 3) semantic generators
      (controlled by `enable_semantic_domain_stats`) and 4) schema-based
      generators that are enabled based on information provided in the schema.
    feature_allowlist: An optional list of names of the features to calculate
      statistics for.
    experimental_use_sketch_based_topk_uniques: if True, use the sketch based
      top-k and uniques stats generator.
    experimental_slice_functions: An optional list of functions that generate
      slice keys for each example. Each slice function should take
      pyarrow.RecordBatch as input and return an Iterable[Tuple[Text,
      pyarrow.RecordBatch]]. Each tuple contains the slice key and the
      corresponding sliced RecordBatch. Only one of
      experimental_slice_functions or experimental_slice_sqls must be
      specified.
    experimental_slice_sqls: List of slicing SQL queries. The query must have
      the following pattern: "SELECT STRUCT({feature_name} [AS {slice_key}])
      [FROM example.feature_name [, example.feature_name, ... ] [WHERE ...
      ]]" The "example.feature_name" inside the FROM statement is used to
      flatten the repeated fields. For non-repeated fields, you can directly
      write the query as follows: "SELECT STRUCT(non_repeated_feature_a,
      non_repeated_feature_b)" In the query, the "example" is a key word
      that binds to each input "row". The semantics of this variable will
      depend on the decoding of the input data to the Arrow representation
      (e.g., for tf.Example, each key is decoded to a separate column).
      Thus, structured data can be readily accessed by iterating/unnesting
      the fields of the "example" variable.
      Example 1: Slice on each value of a feature "SELECT STRUCT(gender) FROM
      example.gender"
      Example 2: Slice on each value of one feature and a specified value of
      another. "SELECT STRUCT(gender, country) FROM example.gender,
      example.country WHERE country = 'USA'" Only one of
      experimental_slice_functions or experimental_slice_sqls must be
      specified. Note that this option is not supported on Windows.
    experimental_result_partitions: The number of feature partitions to
      combine output DatasetFeatureStatisticsLists into. If set to 1 (default)
      output is globally combined. If set to value greater than one, up to
      that many shards are returned, each containing a subset of features.

  Raises:
    ValueError: If both `slice_functions` and `experimental_slice_functions`
      are specified, or if slice functions (via either argument) are
      specified together with `experimental_slice_sqls`.
  """
  self.generators = generators
  self.feature_allowlist = feature_allowlist
  self.schema = schema
  self.label_feature = label_feature
  self.weight_feature = weight_feature
  # `slice_functions` is the deprecated spelling of
  # `experimental_slice_functions`; accept either, but not both.
  if slice_functions is not None and experimental_slice_functions is not None:
    raise ValueError(
        'Specify only one of slice_functions or experimental_slice_functions')
  self.experimental_slice_functions = None
  if slice_functions is not None:
    self.experimental_slice_functions = slice_functions
  elif experimental_slice_functions is not None:
    self.experimental_slice_functions = experimental_slice_functions
  # Enforce the documented contract: slice functions and slicing SQL
  # queries are mutually exclusive ways of specifying slices.
  if (self.experimental_slice_functions is not None and
      experimental_slice_sqls is not None):
    raise ValueError(
        'Specify only one of experimental_slice_functions or '
        'experimental_slice_sqls')
  self.sample_rate = sample_rate
  self.num_top_values = num_top_values
  self.frequency_threshold = frequency_threshold
  self.weighted_frequency_threshold = weighted_frequency_threshold
  self.num_rank_histogram_buckets = num_rank_histogram_buckets
  self.num_values_histogram_buckets = num_values_histogram_buckets
  self.num_histogram_buckets = num_histogram_buckets
  self.num_quantiles_histogram_buckets = num_quantiles_histogram_buckets
  self.epsilon = epsilon
  self.infer_type_from_schema = infer_type_from_schema
  self.desired_batch_size = desired_batch_size
  self.enable_semantic_domain_stats = enable_semantic_domain_stats
  self.semantic_domain_stats_sample_rate = semantic_domain_stats_sample_rate
  # Kept private; presumably exposed via a property/accessor elsewhere in
  # the class -- TODO confirm.
  self._per_feature_weight_override = per_feature_weight_override
  self.vocab_paths = vocab_paths
  self.add_default_generators = add_default_generators
  self.experimental_use_sketch_based_topk_uniques = (
      experimental_use_sketch_based_topk_uniques)
  self.experimental_slice_sqls = experimental_slice_sqls
  self.experimental_result_partitions = experimental_result_partitions