cpp/src/parquet/geospatial/statistics.h (51 lines of code) (raw):

// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #pragma once #include <cstdint> #include <memory> #include <optional> #include "parquet/platform.h" #include "parquet/types.h" namespace parquet::geospatial { /// \brief The maximum number of dimensions represented by a geospatial type /// (i.e., X, Y, Z, and M) inline constexpr int kMaxDimensions = 4; /// \brief NaN, used to represent bounds for which predicate pushdown cannnot /// be applied (e.g., because a writer did not provide bounds for a given dimension) inline constexpr double kNaN = std::numeric_limits<double>::quiet_NaN(); /// \brief Structure represented encoded statistics to be written to and read from Parquet /// serialized metadata. /// /// See the Parquet Thrift definition and GeoStatistics for the specific definition /// of field values. struct PARQUET_EXPORT EncodedGeoStatistics { bool xy_bounds_present{false}; double xmin{kNaN}; double xmax{kNaN}; double ymin{kNaN}; double ymax{kNaN}; bool z_bounds_present{false}; double zmin{kNaN}; double zmax{kNaN}; bool m_bounds_present{false}; double mmin{kNaN}; double mmax{kNaN}; bool geospatial_types_present() const { return !geospatial_types.empty(); } std::vector<int32_t> geospatial_types; }; class GeoStatisticsImpl; /// \brief Base type for computing geospatial column statistics while writing a file /// or representing them when reading a file /// /// Note that NaN values that were encountered within coordinates are omitted; however, /// NaN values that were obtained via decoding encoded statistics are propagated. This /// behaviour ensures C++ clients that are inspecting statistics via the column metadata /// can detect the case where a writer generated NaNs (even though this implementation /// does not generate them). /// /// The handling of NaN values in coordinates is not well-defined among bounding /// implementations except for the WKB convention for POINT EMPTY, which is consistently /// represented as a point whose ordinates are all NaN. Any other geometry that contains /// NaNs cannot expect defined behaviour here or elsewhere; however, a row group that /// contains both NaN-containing and normal (completely finite) geometries should not be /// excluded from predicate pushdown. /// /// EXPERIMENTAL class PARQUET_EXPORT GeoStatistics { public: GeoStatistics(); explicit GeoStatistics(const EncodedGeoStatistics& encoded); ~GeoStatistics(); /// \brief Return true if bounds, geometry types, and validity are identical bool Equals(const GeoStatistics& other) const; /// \brief Update these statistics based on previously calculated or decoded statistics /// /// Merging statistics with wraparound X values is not currently supported. Merging /// two GeoStatistics where one or both has a wraparound X range will result in these /// statistics having an X dimension marked as invalid. void Merge(const GeoStatistics& other); /// \brief Update these statistics based on values void Update(const ByteArray* values, int64_t num_values); /// \brief Update these statistics based on the non-null elements of values void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_spaced_values, int64_t num_values); /// \brief Update these statistics based on the non-null elements of values /// /// Currently, BinaryArray and LargeBinaryArray input is supported. void Update(const ::arrow::Array& values); /// \brief Return these statistics to an empty state void Reset(); /// \brief Encode the statistics for serializing to Thrift /// /// If invalid WKB was encountered or if the statistics contain NaN /// for any reason, Encode() will return nullopt to indicate that /// statistics should not be written to thrift. std::optional<EncodedGeoStatistics> Encode() const; /// \brief Returns false if invalid WKB was encountered bool is_valid() const; /// \brief Reset existing statistics and populate them from previously-encoded ones void Decode(const EncodedGeoStatistics& encoded); /// \brief Minimum values in XYZM order /// /// For dimensions where dimension_valid() is false, the value will be NaN. For /// dimensions where dimension_empty() is true, the value will be +Inf. /// /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this /// case, these bounds represent the union of the intervals [xmax, Inf] and [-Inf, /// xmin]. This implementation does not yet generate these types of bounds but they may /// be encountered in statistics when reading a Parquet file. std::array<double, kMaxDimensions> lower_bound() const; /// \brief Maximum values in XYZM order /// /// For dimensions where dimension_valid() is false, the value will be NaN. For /// dimensions where dimension_empty() is true, the value will be -Inf. /// /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this /// case, these bounds represent the union of the intervals [xmax, Inf] and [-Inf, /// xmin]. This implementation does not yet generate these types of bounds but they may /// be encountered in statistics when reading a Parquet file. std::array<double, kMaxDimensions> upper_bound() const; /// \brief Dimension emptiness in XYZM order /// /// True for a given dimension if and only if zero non-NaN values were encountered /// in that dimension and dimension_valid() is true for that dimension. /// /// When calculating statistics, zero or more of these values may be true because /// this implementation calculates bounds for all dimensions; however, it may be /// true that zero coordinates were encountered in a given dimension. For example, /// dimension_empty() will return four true values if Update() was not called /// or if Update() was called with only null values. If Update() was provided /// one or more geometries with X and Y dimensions but not Z or M dimensions, /// dimension_empty() will return true, true, false, false. /// /// For statistics read from a Parquet file, dimension_empty() will always contain /// false values because there is no mechanism to communicate an empty interval /// in the Thrift metadata. std::array<bool, kMaxDimensions> dimension_empty() const; /// \brief Dimension validity (i.e. presence) in XYZM order /// /// When calculating statistics, this will always be true because this implementation /// calculates statistics for all dimensions. When reading a Parquet file, one or more /// of these values may be false because the file may not have provided bounds for all /// dimensions. /// /// See documentation for dimension_empty(), lower_bound(), and/or upper_bound() for the /// canonical values of those outputs for the dimensions where dimension_valid() is /// false. std::array<bool, kMaxDimensions> dimension_valid() const; /// \brief Return the geometry type codes /// /// This implementation always returns sorted output with no duplicates. When /// calculating statistics, a value will always be returned (although the returned /// vector may be empty if Update() was never called or was only called with null /// values). When reading a Parquet file, std::nullopt may be returned because /// the file may not have provided this information. std::optional<std::vector<int32_t>> geometry_types() const; /// \brief Return a string representation of these statistics std::string ToString() const; private: std::unique_ptr<GeoStatisticsImpl> impl_; }; } // namespace parquet::geospatial