cpp/src/parquet/geospatial/statistics.h (51 lines of code) (raw):
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <array>
#include <cstdint>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "parquet/platform.h"
#include "parquet/types.h"
namespace parquet::geospatial {
/// \brief Upper limit on the number of coordinate dimensions a geospatial type can
/// carry (i.e., X, Y, Z, and M)
inline constexpr int kMaxDimensions{4};
/// \brief Quiet NaN, used as the value of bounds for which predicate pushdown cannot
/// be applied (e.g., because a writer did not provide bounds for a given dimension)
inline constexpr double kNaN{std::numeric_limits<double>::quiet_NaN()};
/// \brief Plain-data form of geospatial statistics as written to and read from the
/// Parquet serialized metadata.
///
/// See the Parquet Thrift definition and GeoStatistics for the specific definition
/// of field values.
struct PARQUET_EXPORT EncodedGeoStatistics {
  // X/Y bounds together with the flag recording whether they are present.
  // Every bound starts out as kNaN until a value is assigned.
  bool xy_bounds_present = false;
  double xmin = kNaN;
  double xmax = kNaN;
  double ymin = kNaN;
  double ymax = kNaN;

  // Z bounds and their presence flag.
  bool z_bounds_present = false;
  double zmin = kNaN;
  double zmax = kNaN;

  // M bounds and their presence flag.
  bool m_bounds_present = false;
  double mmin = kNaN;
  double mmax = kNaN;

  // Geometry type codes; an empty vector is reported as "not present" by
  // geospatial_types_present().
  std::vector<int32_t> geospatial_types;

  /// \brief Return true when at least one geometry type code is stored
  bool geospatial_types_present() const { return geospatial_types.size() != 0; }
};
// Forward declaration of the implementation type that GeoStatistics owns through its
// impl_ member; only the name is needed here, the definition is not visible in this
// header.
class GeoStatisticsImpl;
/// \brief Base type for computing geospatial column statistics while writing a file
/// or representing them when reading a file
///
/// Note that NaN values that were encountered within coordinates are omitted; however,
/// NaN values that were obtained via decoding encoded statistics are propagated. This
/// behaviour ensures C++ clients that are inspecting statistics via the column metadata
/// can detect the case where a writer generated NaNs (even though this implementation
/// does not generate them).
///
/// The handling of NaN values in coordinates is not well-defined among bounding
/// implementations except for the WKB convention for POINT EMPTY, which is consistently
/// represented as a point whose ordinates are all NaN. Any other geometry that contains
/// NaNs cannot expect defined behaviour here or elsewhere; however, a row group that
/// contains both NaN-containing and normal (completely finite) geometries should not be
/// excluded from predicate pushdown.
///
/// Instances can be neither copied nor moved: the only data member is a
/// std::unique_ptr (so the implicit copy operations are deleted) and the user-declared
/// destructor suppresses the implicit move operations.
///
/// EXPERIMENTAL
class PARQUET_EXPORT GeoStatistics {
public:
/// \brief Construct statistics to which no values have been added yet
///
/// NOTE(review): presumably the same empty state that Reset() restores; confirm in
/// the implementation.
GeoStatistics();
/// \brief Construct statistics from previously-encoded ones
///
/// NOTE(review): presumably equivalent to calling Decode(encoded) on a new instance;
/// confirm in the implementation.
explicit GeoStatistics(const EncodedGeoStatistics& encoded);
/// \brief Destructor
///
/// Declared here without a body because GeoStatisticsImpl is an incomplete type in
/// this header; the definition must live where that type is complete.
~GeoStatistics();
/// \brief Return true if bounds, geometry types, and validity are identical
bool Equals(const GeoStatistics& other) const;
/// \brief Update these statistics based on previously calculated or decoded statistics
///
/// Merging statistics with wraparound X values is not currently supported. Merging
/// two GeoStatistics where one or both has a wraparound X range will result in these
/// statistics having an X dimension marked as invalid.
void Merge(const GeoStatistics& other);
/// \brief Update these statistics based on values
///
/// NOTE(review): values presumably points to num_values contiguous ByteArray
/// elements; confirm against the implementation.
void Update(const ByteArray* values, int64_t num_values);
/// \brief Update these statistics based on the non-null elements of values
///
/// NOTE(review): valid_bits and valid_bits_offset presumably describe a validity
/// bitmap selecting the non-null slots among num_spaced_values entries; confirm
/// the exact meaning of each count against the implementation.
void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits,
int64_t valid_bits_offset, int64_t num_spaced_values,
int64_t num_values);
/// \brief Update these statistics based on the non-null elements of values
///
/// Currently, BinaryArray and LargeBinaryArray input is supported.
void Update(const ::arrow::Array& values);
/// \brief Return these statistics to an empty state
void Reset();
/// \brief Encode the statistics for serializing to Thrift
///
/// If invalid WKB was encountered or if the statistics contain NaN
/// for any reason, Encode() will return nullopt to indicate that
/// statistics should not be written to Thrift.
std::optional<EncodedGeoStatistics> Encode() const;
/// \brief Returns false if invalid WKB was encountered
bool is_valid() const;
/// \brief Reset existing statistics and populate them from previously-encoded ones
void Decode(const EncodedGeoStatistics& encoded);
/// \brief Minimum values in XYZM order
///
/// For dimensions where dimension_valid() is false, the value will be NaN. For
/// dimensions where dimension_empty() is true, the value will be +Inf.
///
/// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this
/// case, these bounds represent the union of the intervals [xmin, Inf] and [-Inf,
/// xmax]. This implementation does not yet generate these types of bounds but they may
/// be encountered in statistics when reading a Parquet file.
std::array<double, kMaxDimensions> lower_bound() const;
/// \brief Maximum values in XYZM order
///
/// For dimensions where dimension_valid() is false, the value will be NaN. For
/// dimensions where dimension_empty() is true, the value will be -Inf.
///
/// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this
/// case, these bounds represent the union of the intervals [xmin, Inf] and [-Inf,
/// xmax]. This implementation does not yet generate these types of bounds but they may
/// be encountered in statistics when reading a Parquet file.
std::array<double, kMaxDimensions> upper_bound() const;
/// \brief Dimension emptiness in XYZM order
///
/// True for a given dimension if and only if zero non-NaN values were encountered
/// in that dimension and dimension_valid() is true for that dimension.
///
/// When calculating statistics, zero or more of these values may be true because
/// this implementation calculates bounds for all dimensions; however, it may be
/// true that zero coordinates were encountered in a given dimension. For example,
/// dimension_empty() will return four true values if Update() was not called
/// or if Update() was called with only null values. If Update() was provided
/// one or more geometries with X and Y dimensions but not Z or M dimensions,
/// dimension_empty() will return false, false, true, true.
///
/// For statistics read from a Parquet file, dimension_empty() will always contain
/// false values because there is no mechanism to communicate an empty interval
/// in the Thrift metadata.
std::array<bool, kMaxDimensions> dimension_empty() const;
/// \brief Dimension validity (i.e. presence) in XYZM order
///
/// When calculating statistics, this will always be true because this implementation
/// calculates statistics for all dimensions. When reading a Parquet file, one or more
/// of these values may be false because the file may not have provided bounds for all
/// dimensions.
///
/// See documentation for dimension_empty(), lower_bound(), and/or upper_bound() for the
/// canonical values of those outputs for the dimensions where dimension_valid() is
/// false.
std::array<bool, kMaxDimensions> dimension_valid() const;
/// \brief Return the geometry type codes
///
/// This implementation always returns sorted output with no duplicates. When
/// calculating statistics, a value will always be returned (although the returned
/// vector may be empty if Update() was never called or was only called with null
/// values). When reading a Parquet file, std::nullopt may be returned because
/// the file may not have provided this information.
std::optional<std::vector<int32_t>> geometry_types() const;
/// \brief Return a string representation of these statistics
std::string ToString() const;
private:
// Owning pointer to the implementation object; GeoStatisticsImpl is only
// forward-declared in this header.
std::unique_ptr<GeoStatisticsImpl> impl_;
};
} // namespace parquet::geospatial