def __init__()

in tensorflow_data_validation/statistics/stats_options.py


  def __init__(
      self,
      generators: Optional[List[stats_generator.StatsGenerator]] = None,
      schema: Optional[schema_pb2.Schema] = None,
      label_feature: Optional[types.FeatureName] = None,
      weight_feature: Optional[types.FeatureName] = None,
      slice_functions: Optional[List[types.SliceFunction]] = None,
      sample_rate: Optional[float] = None,
      num_top_values: int = 20,
      frequency_threshold: int = 1,
      weighted_frequency_threshold: float = 1.0,
      num_rank_histogram_buckets: int = 1000,
      num_values_histogram_buckets: int = 10,
      num_histogram_buckets: int = 10,
      num_quantiles_histogram_buckets: int = 10,
      epsilon: float = 0.01,
      infer_type_from_schema: bool = False,
      desired_batch_size: Optional[int] = None,
      enable_semantic_domain_stats: bool = False,
      semantic_domain_stats_sample_rate: Optional[float] = None,
      per_feature_weight_override: Optional[Dict[types.FeaturePath,
                                                 types.FeatureName]] = None,
      vocab_paths: Optional[Dict[types.VocabName, types.VocabPath]] = None,
      add_default_generators: bool = True,
      feature_allowlist: Optional[List[types.FeatureName]] = None,
      experimental_use_sketch_based_topk_uniques: bool = False,
      experimental_slice_functions: Optional[List[types.SliceFunction]] = None,
      experimental_slice_sqls: Optional[List[Text]] = None,
      experimental_result_partitions: int = 1):
    """Initializes statistics options.

    Args:
      generators: An optional list of statistics generators. A statistics
        generator must extend either CombinerStatsGenerator or
        TransformStatsGenerator.
      schema: An optional tensorflow_metadata Schema proto. Currently we use the
        schema to infer categorical and bytes features.
      label_feature: An optional feature name which represents the label.
      weight_feature: An optional feature name whose numeric value represents
        the weight of an example.
      slice_functions: DEPRECATED. Use `experimental_slice_functions`.
      sample_rate: An optional sampling rate. If specified, statistics are
        computed over the sample.
      num_top_values: An optional number of most frequent feature values to keep
        for string features.
      frequency_threshold: An optional minimum number of examples in which the
        most frequent values must be present.
      weighted_frequency_threshold: An optional minimum weighted number of
        examples in which the most frequent weighted values must be present.
        This option is only relevant when a weight_feature is specified.
      num_rank_histogram_buckets: An optional number of buckets in the rank
        histogram for string features.
      num_values_histogram_buckets: An optional number of buckets in a quantiles
        histogram for the number of values per Feature, which is stored in
        CommonStatistics.num_values_histogram.
      num_histogram_buckets: An optional number of buckets in a standard
        NumericStatistics.histogram with equal-width buckets.
      num_quantiles_histogram_buckets: An optional number of buckets in a
        quantiles NumericStatistics.histogram.
      epsilon: An optional error tolerance for the computation of quantiles,
        typically a small fraction close to zero (e.g. 0.01). Higher values of
        epsilon increase the quantile approximation error, and hence result in
        more unequal buckets, but can improve performance and reduce resource
        consumption.
      infer_type_from_schema: A boolean to indicate whether the feature types
        should be inferred from the schema. If set to True, an input schema must
        be provided. This flag is used only when invoking TFDV through
        `tfdv.generate_statistics_from_csv`.
      desired_batch_size: An optional maximum number of examples to include in
        each batch that is passed to the statistics generators. When invoking
        TFDV using its end-to-end APIs (e.g.
        `generate_statistics_from_tfrecord`), this option also controls the
        decoder batch size -- if provided, the decoded RecordBatches fed to
        TFDV will have this fixed batch size. When invoking TFDV using
        `tfdv.GenerateStatistics`, this option only controls the maximum size of
        RecordBatches constructed within StatsGenerators (a generator may
        combine RecordBatches).
      enable_semantic_domain_stats: If True, statistics for semantic domains
        are generated (e.g., image and text domains).
      semantic_domain_stats_sample_rate: An optional sampling rate for semantic
        domain statistics. If specified, semantic domain statistics are
        computed over a sample.
      per_feature_weight_override: If specified, the "example weight" paired
        with a feature will first be looked up in this map and, if not found,
        will fall back to `weight_feature`.
      vocab_paths: An optional dictionary mapping vocab names to paths. Used in
        the schema when specifying a NaturalLanguageDomain. The paths can either
        be to GZIP-compressed TF record files that have a tfrecord.gz suffix or
        to text files.
      add_default_generators: Whether to invoke the default set of stats
        generators in the run. The generators invoked consist of: 1) the
        default generators (controlled by this option); 2) user-provided
        generators (controlled by the `generators` option); 3) semantic domain
        generators (controlled by `enable_semantic_domain_stats`); and 4)
        schema-based generators that are enabled based on information provided
        in the schema.
      feature_allowlist: An optional list of names of the features to calculate
        statistics for.
      experimental_use_sketch_based_topk_uniques: If True, use the sketch-based
        top-k and uniques stats generator.
      experimental_slice_functions: An optional list of functions that generate
        slice keys for each example. Each slice function should take a
        pyarrow.RecordBatch as input and return an Iterable[Tuple[Text,
        pyarrow.RecordBatch]], where each tuple contains the slice key and the
        corresponding sliced RecordBatch (see the sketch after this listing).
        Only one of experimental_slice_functions or experimental_slice_sqls may
        be specified.
      experimental_slice_sqls: List of slicing SQL queries. Each query must
        follow the pattern: "SELECT STRUCT({feature_name} [AS {slice_key}])
        [FROM example.feature_name [, example.feature_name, ...] [WHERE ...]]".
        The "example.feature_name" inside the FROM statement is used to flatten
        repeated fields. For non-repeated fields, you can write the query
        directly, e.g. "SELECT STRUCT(non_repeated_feature_a,
        non_repeated_feature_b)". In the query, "example" is a keyword that
        binds to each input "row". The semantics of this variable depend on
        how the input data is decoded to the Arrow representation (e.g., for
        tf.Example, each key is decoded to a separate column); thus, structured
        data can be readily accessed by iterating/unnesting the fields of the
        "example" variable.
        Example 1: Slice on each value of a feature: "SELECT STRUCT(gender)
          FROM example.gender"
        Example 2: Slice on each value of one feature and a specified value of
          another: "SELECT STRUCT(gender, country) FROM example.gender,
          example.country WHERE country = 'USA'"
        Only one of experimental_slice_functions or experimental_slice_sqls may
        be specified. Note that this option is not supported on Windows.
      experimental_result_partitions: The number of feature partitions to
        combine output DatasetFeatureStatisticsLists into. If set to 1 (the
        default), output is globally combined. If set to a value greater than
        one, up to that many shards are returned, each containing a subset of
        features.
    """
    self.generators = generators
    self.feature_allowlist = feature_allowlist
    self.schema = schema
    self.label_feature = label_feature
    self.weight_feature = weight_feature
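    # `slice_functions` is deprecated in favor of `experimental_slice_functions`;
    # accept either alias, but reject conflicting use of both.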
    if slice_functions is not None and experimental_slice_functions is not None:
      raise ValueError(
          'Specify only one of slice_functions or experimental_slice_functions')
    self.experimental_slice_functions = None
    if slice_functions is not None:
      self.experimental_slice_functions = slice_functions
    elif experimental_slice_functions is not None:
      self.experimental_slice_functions = experimental_slice_functions
    self.sample_rate = sample_rate
    self.num_top_values = num_top_values
    self.frequency_threshold = frequency_threshold
    self.weighted_frequency_threshold = weighted_frequency_threshold
    self.num_rank_histogram_buckets = num_rank_histogram_buckets
    self.num_values_histogram_buckets = num_values_histogram_buckets
    self.num_histogram_buckets = num_histogram_buckets
    self.num_quantiles_histogram_buckets = num_quantiles_histogram_buckets
    self.epsilon = epsilon
    self.infer_type_from_schema = infer_type_from_schema
    self.desired_batch_size = desired_batch_size
    self.enable_semantic_domain_stats = enable_semantic_domain_stats
    self.semantic_domain_stats_sample_rate = semantic_domain_stats_sample_rate
    self._per_feature_weight_override = per_feature_weight_override
    self.vocab_paths = vocab_paths
    self.add_default_generators = add_default_generators
    self.experimental_use_sketch_based_topk_uniques = (
        experimental_use_sketch_based_topk_uniques)
    self.experimental_slice_sqls = experimental_slice_sqls
    self.experimental_result_partitions = experimental_result_partitions
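
For context, here is a minimal sketch of constructing these options with a
custom slice function matching the signature documented above. It assumes
tensorflow_data_validation is importable as tfdv; the trivial slice function,
the slice key "all_examples", and the specific option values are illustrative
assumptions, not part of stats_options.py:

from typing import Iterable, Text, Tuple

import pyarrow as pa
import tensorflow_data_validation as tfdv


def slice_whole_dataset(
    record_batch: pa.RecordBatch) -> Iterable[Tuple[Text, pa.RecordBatch]]:
  # Matches the documented contract: take a pyarrow.RecordBatch and yield
  # (slice key, sliced RecordBatch) tuples. This trivial slicer emits a
  # single slice covering every example in the batch.
  yield ('all_examples', record_batch)


options = tfdv.StatsOptions(
    experimental_slice_functions=[slice_whole_dataset],
    sample_rate=0.1,    # compute statistics over a 10% sample
    num_top_values=50,  # keep the 50 most frequent values per string feature
)

Note that, per the validation in the constructor body above, passing both the
deprecated slice_functions argument and experimental_slice_functions raises a
ValueError, so only the experimental keyword is used here.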