in tensorflow_data_validation/utils/quantiles_util.py [0:0]
def generate_equi_width_buckets(quantiles: List[float],
                                finite_min: float,
                                finite_max: float,
                                total_count: float,
                                num_buckets: int) -> List[Bucket]:
  """Generate buckets for equi-width histogram.

  When at least one finite quantile boundary exists, infinite boundaries do
  not get buckets of their own: the sample counts of the (-inf, -inf) and
  (+inf, +inf) quantile intervals are added to the first and last finite
  buckets, whose low/high values are then set to -inf/+inf respectively.

  Args:
    quantiles: A list containing the quantile boundaries, expected to be
      sorted in ascending order. May contain -inf and +inf entries.
    finite_min: The minimum finite value.
    finite_max: The maximum finite value.
    total_count: The total number of values over which the quantiles
      are computed.
    num_buckets: The required number of buckets in the equi-width histogram.

  Returns:
    A list containing the buckets.

  Raises:
    AssertionError: If `quantiles` does not contain more entries than
      `num_buckets`, or if the finite quantile boundaries are not sorted.
  """
  num_quantiles = len(quantiles)

  # We assume that the number of quantiles is much higher than
  # the required number of buckets in the equi-width histogram.
  # This is raised explicitly (instead of using `assert`) so the check is not
  # stripped when Python runs with -O; the exception type is kept as
  # AssertionError for backward compatibility.
  if num_quantiles <= num_buckets:
    raise AssertionError(
        'Expected more quantile boundaries than buckets, got %r boundaries '
        'for %r buckets.' % (num_quantiles, num_buckets))

  # If all values of a feature are equal, have only a single bucket.
  if quantiles[0] == quantiles[-1]:
    return [Bucket(quantiles[0], quantiles[-1], total_count)]

  # Find the index of the first and the last finite value. If there are only
  # -inf and +inf values, we generate two buckets (-inf, -inf) and (+inf, +inf).
  finite_min_index = np.searchsorted(quantiles, float('-inf'), side='right')
  finite_max_index = np.searchsorted(quantiles, float('inf'), side='left') - 1

  # Compute sample count associated with a quantile interval.
  sample_count = total_count / (num_quantiles - 1)

  if finite_max_index < finite_min_index:
    return [
        # Divide the intersecting bucket (-inf, +inf) sample count equally.
        Bucket(float('-inf'), float('-inf'),
               (finite_min_index - 0.5) * sample_count),
        Bucket(float('inf'), float('inf'),
               (num_quantiles - finite_max_index - 1.5) * sample_count),
    ]

  # Sample count to account for (-inf, -inf) buckets.
  start_bucket_count = finite_min_index * sample_count
  # Sample count to account for (inf, inf) buckets.
  last_bucket_count = (num_quantiles - finite_max_index - 1) * sample_count

  # Copy the finite boundaries into a fresh list: the list is extended below,
  # which must not touch the caller's `quantiles`, and a plain slice of a
  # non-list sequence (e.g. a NumPy array) has no insert()/append().
  finite_values = list(quantiles[finite_min_index:finite_max_index + 1])

  # Insert finite minimum and maximum if first and last finite quantiles are
  # greater or lesser than the finite minimum and maximum respectively.
  # Note that if all values of a feature are finite, we will always have the
  # finite min and finite max as the first and last boundaries.
  if finite_min_index > 0 and finite_min < finite_values[0]:
    finite_values.insert(0, finite_min)
    # Since we are adding an extra boundary, we borrow the sample count from
    # the (-inf, -inf) buckets as the first bucket will anyhow be merged with
    # all the (-inf, -inf) buckets.
    start_bucket_count -= sample_count
  if finite_max_index < num_quantiles - 1 and finite_max > finite_values[-1]:
    finite_values.append(finite_max)
    # Since we are adding an extra boundary, we borrow the sample count from
    # the (+inf, +inf) buckets as the last bucket will anyhow be merged with
    # all the (+inf, +inf) buckets.
    last_bucket_count -= sample_count

  # Cast finite boundaries from float32 to float64 to avoid precision errors.
  # This error can happen when both min and max in the boundaries are valid
  # float32 values, but when we compute (max-min) to compute the width it can
  # result in an overflow.
  # Example: min=-3.4e+38, max=3.4e+38
  finite_values = np.array(finite_values, dtype=np.float64)

  # Check if the finite quantile boundaries are sorted. Raised explicitly so
  # the check also runs under -O.
  if not np.all(np.diff(finite_values) >= 0):
    raise AssertionError(
        'Quantiles output not sorted %r' % ','.join(map(str, finite_values)))

  # Construct the list of buckets from finite boundaries.
  result = _generate_equi_width_buckets_from_finite_boundaries(
      finite_values, sample_count, num_buckets)

  # If we have -inf values, update first bucket's low value (to be -inf) and
  # sample count to account for remaining (-inf, -inf) buckets.
  if finite_min_index > 0:
    result[0] = Bucket(
        float('-inf'), result[0].high_value,
        result[0].sample_count + start_bucket_count)

  # If we have +inf values, update last bucket's high value (to be +inf) and
  # sample count to account for remaining (+inf, +inf) buckets.
  if finite_max_index < num_quantiles - 1:
    result[-1] = Bucket(
        result[-1].low_value, float('inf'),
        result[-1].sample_count + last_bucket_count)

  return result