def generate_equi_width

def generate_equi_width_buckets()

in tensorflow_data_validation/utils/quantiles_util.py [0:0]
41 lines of code
25 McCabe index (conditional complexity)

def generate_equi_width_buckets(quantiles: List[float],
                                finite_min: float,
                                finite_max: float,
                                total_count: float,
                                num_buckets: int) -> List[Bucket]:
  """Generate buckets for equi-width histogram.

  The quantile boundaries may contain -inf and +inf entries. Infinite
  boundaries are folded into the first and last returned buckets, whose low and
  high values are then set to -inf and +inf respectively.

  Args:
    quantiles: A sorted sequence containing the quantile boundaries. It is not
        modified. Must contain more entries than `num_buckets`.
    finite_min: The minimum finite value.
    finite_max: The maximum finite value.
    total_count: The total number of values over which the quantiles
        are computed.
    num_buckets: The required number of buckets in the equi-width histogram.

  Returns:
    A list containing the buckets. A single bucket is returned if the first and
    last quantile boundaries are equal, and two buckets (-inf, -inf) and
    (+inf, +inf) are returned if the boundaries contain no finite value.

  Raises:
    AssertionError: If there are not more quantile boundaries than
        `num_buckets`, or if the finite boundaries are not sorted.
  """
  # We assume that the number of quantiles is much higher than
  # the required number of buckets in the equi-width histogram.
  assert len(quantiles) > num_buckets, (
      'Expected more quantile boundaries (%d) than buckets (%d)' %
      (len(quantiles), num_buckets))

  # If all values of a feature are equal, have only a single bucket.
  if quantiles[0] == quantiles[-1]:
    return [Bucket(quantiles[0], quantiles[-1], total_count)]

  neg_inf = float('-inf')
  pos_inf = float('inf')
  last_index = len(quantiles) - 1

  # Find the index of the first and the last finite value. If there are only
  # -inf and +inf values, we generate two buckets (-inf, -inf) and (+inf, +inf).
  finite_min_index = np.searchsorted(quantiles, neg_inf, side='right')
  finite_max_index = np.searchsorted(quantiles, pos_inf, side='left') - 1

  # Compute sample count associated with a quantile interval. There is one
  # interval between each pair of adjacent boundaries; `last_index` is non-zero
  # here because a single boundary is handled by the early return above.
  sample_count = total_count / last_index

  if finite_max_index < finite_min_index:
    return [
        # Divide the intersecting bucket (-inf, +inf) sample count equally.
        Bucket(neg_inf, neg_inf, (finite_min_index - 0.5) * sample_count),
        Bucket(pos_inf, pos_inf,
               (last_index - finite_max_index - 0.5) * sample_count),
    ]

  # Sample count to account for (-inf, -inf) buckets.
  start_bucket_count = finite_min_index * sample_count
  # Sample count to account for (inf, inf) buckets.
  last_bucket_count = (last_index - finite_max_index) * sample_count
  # Copy the finite boundaries into a new list. This keeps the caller's
  # `quantiles` untouched and guarantees that `insert` / `append` below are
  # available even when `quantiles` is a NumPy array or a tuple rather than a
  # list.
  finite_values = list(quantiles[finite_min_index:finite_max_index + 1])
  # Insert finite minimum and maximum if first and last finite quantiles are
  # greater or lesser than the finite minimum and maximum respectively.
  # Note that if all values of a feature are finite, we will always have the
  # finite min and finite max as the first and last boundaries.
  if finite_min_index > 0 and finite_min < finite_values[0]:
    finite_values.insert(0, finite_min)
    # Since we are adding an extra boundary, we borrow the sample count from the
    # (-inf, -inf) buckets as the first bucket will anyhow be merged with all
    # the (-inf, -inf) buckets.
    start_bucket_count -= sample_count
  if finite_max_index < last_index and finite_max > finite_values[-1]:
    finite_values.append(finite_max)
    # Since we are adding an extra boundary, we borrow the sample count from the
    # (+inf, +inf) buckets as the last bucket will anyhow be merged with all
    # the (+inf, +inf) buckets.
    last_bucket_count -= sample_count

  # Cast finite boundaries from float32 to float64 to avoid precision errors.
  # This error can happen when both min and max in the boundaries are valid
  # float32 values, but when we compute (max-min) to compute the width it can
  # result in an overflow.
  # Example: min=-3.4e+38, max=3.4e+38
  finite_values = np.array(finite_values, dtype=np.float64)

  # Check if the finite quantile boundaries are sorted.
  assert np.all(np.diff(finite_values) >= 0), (
      'Quantiles output not sorted %r'  % ','.join(map(str, finite_values)))

  # Construct the list of buckets from finite boundaries.
  result = _generate_equi_width_buckets_from_finite_boundaries(
      finite_values, sample_count, num_buckets)

  # If we have -inf values, update first bucket's low value (to be -inf) and
  # sample count to account for remaining (-inf, -inf) buckets.
  if finite_min_index > 0:
    first = result[0]
    result[0] = Bucket(
        neg_inf, first.high_value, first.sample_count + start_bucket_count)
  # If we have +inf values, update last bucket's high value (to be +inf) and
  # sample count to account for remaining (+inf, +inf) buckets.
  if finite_max_index < last_index:
    last = result[-1]
    result[-1] = Bucket(
        last.low_value, pos_inf, last.sample_count + last_bucket_count)
  return result