parquet/metadata/statistics_types.gen.go (2,123 lines of code) (raw):
// Code generated by statistics_types.gen.go.tmpl. DO NOT EDIT.
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package metadata
import (
"fmt"
"math"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/apache/arrow-go/v18/arrow/float16"
"github.com/apache/arrow-go/v18/arrow/memory"
"github.com/apache/arrow-go/v18/internal/bitutils"
shared_utils "github.com/apache/arrow-go/v18/internal/utils"
"github.com/apache/arrow-go/v18/parquet"
"github.com/apache/arrow-go/v18/parquet/internal/encoding"
"github.com/apache/arrow-go/v18/parquet/schema"
)
// minmaxPairInt32 holds a candidate (min, max) pair as passed to cleanStat.
type minmaxPairInt32 [2]int32

// Int32Statistics is the typed interface for managing stats for a column
// of Int32 type.
type Int32Statistics struct {
	statistics
	min int32 // current minimum; only meaningful when hasMinMax is true
	max int32 // current maximum; only meaningful when hasMinMax is true
	// bitSetReader is lazily created and reused across UpdateSpaced calls
	// to avoid reallocating a run reader per batch
	bitSetReader bitutils.SetBitRunReader
}
// NewInt32Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Int32
func NewInt32Statistics(descr *schema.Column, mem memory.Allocator) *Int32Statistics {
	if typ := descr.PhysicalType(); typ != parquet.Types.Int32 {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a Int32 stat object", typ))
	}
	base := statistics{
		descr:            descr,
		hasNullCount:     true,
		hasDistinctCount: true,
		order:            descr.SortOrder(),
		encoder:          encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
		mem:              mem,
	}
	return &Int32Statistics{statistics: base}
}
// NewInt32StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
func NewInt32StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int32Statistics {
	ret := NewInt32Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len of a nil slice is 0, so the previous explicit nil check was
	// redundant (staticcheck S1009).
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	// preserves existing semantics: min/max are considered present if either
	// bound was set in the encoded stats
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode serializes src with PLAIN encoding and returns a copy of the
// encoded bytes. Panics on encoder failure; Encode installs a recover handler
// that converts such panics into an error return.
func (s *Int32Statistics) plainEncode(src int32) []byte {
	s.encoder.(encoding.Int32Encoder).Put([]int32{src})
	buf, err := s.encoder.FlushValues()
	if err != nil {
		panic(err) // recovered by Encode
	}
	defer buf.Release()
	// copy out of the refcounted buffer before releasing it
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// plainDecode decodes a single int32 value from the PLAIN-encoded bytes in src.
func (s *Int32Statistics) plainDecode(src []byte) int32 {
	var buf [1]int32
	decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
	decoder.SetData(1, src)
	decoder.(encoding.Int32Decoder).Decode(buf[:])
	return buf[0]
}
// minval returns the smaller of a and b according to the column's
// sort-order comparator.
func (s *Int32Statistics) minval(a, b int32) int32 {
	if !s.less(a, b) {
		return b
	}
	return a
}
// maxval returns the larger of a and b according to the column's
// sort-order comparator.
func (s *Int32Statistics) maxval(a, b int32) int32 {
	if !s.less(a, b) {
		return a
	}
	return b
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *Int32Statistics) MinMaxEqual(rhs *Int32Statistics) bool {
	if !s.equal(s.min, rhs.min) {
		return false
	}
	return s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *Int32Statistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() {
		return false
	}
	rhs, ok := other.(*Int32Statistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	// NOTE(review): when neither side has min/max set, this reports false
	// even if all counts agree — behavior preserved from the template.
	if !s.hasMinMax || !s.MinMaxEqual(rhs) {
		return false
	}
	return s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// getMinMax returns the min and max of values according to the column's sort
// order: signed comparison for SortSIGNED, otherwise the slice is
// reinterpreted (zero-copy, not value-converted) as []uint32 and compared
// unsigned, with the results cast back to int32 bit patterns.
func (s *Int32Statistics) getMinMax(values []int32) (min, max int32) {
	if s.order == schema.SortSIGNED {
		min, max = shared_utils.GetMinMaxInt32(values)
	} else {
		umin, umax := shared_utils.GetMinMaxUint32(arrow.Uint32Traits.CastFromBytes(arrow.Int32Traits.CastToBytes(values)))
		min, max = int32(umin), int32(umax)
	}
	return
}
// getMinMaxSpaced computes min/max over only the values whose corresponding
// bit in validBits (starting at validBitsOffset) is set, i.e. the non-null
// slots, by iterating runs of set bits.
func (s *Int32Statistics) getMinMaxSpaced(values []int32, validBits []byte, validBitsOffset int64) (min, max int32) {
	min = s.defaultMin()
	max = s.defaultMax()
	// fn folds one contiguous run of non-null values into min/max, chosen
	// once up front based on the column's sort order
	var fn func([]int32)
	if s.order == schema.SortSIGNED {
		fn = func(v []int32) {
			localMin, localMax := shared_utils.GetMinMaxInt32(v)
			if min > localMin {
				min = localMin
			}
			if max < localMax {
				max = localMax
			}
		}
	} else {
		// unsigned ordering: reinterpret the run (and the running min/max)
		// as uint32 for comparison, storing results back as int32 bit patterns
		fn = func(v []int32) {
			umin, umax := shared_utils.GetMinMaxUint32(arrow.Uint32Traits.CastFromBytes(arrow.Int32Traits.CastToBytes(v)))
			if uint32(min) > umin {
				min = int32(umin)
			}
			if uint32(max) < umax {
				max = int32(umax)
			}
		}
	}
	// lazily create the run reader once and reuse it across calls to avoid
	// reallocating per batch
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		fn(values[int(run.Pos):int(run.Pos+run.Length)])
	}
	return
}
// Min returns the current minimum; only meaningful when HasMinMax is true.
func (s *Int32Statistics) Min() int32 { return s.min }

// Max returns the current maximum; only meaningful when HasMinMax is true.
func (s *Int32Statistics) Max() int32 { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *Int32Statistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*Int32Statistics)
	if !ok {
		panic("incompatible stat type merge")
	}
	s.statistics.merge(rhs)
	if !rhs.HasMinMax() {
		return
	}
	s.SetMinMax(rhs.Min(), rhs.Max())
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Int32Statistics) Update(values []int32, numNull int64) {
	s.IncNulls(numNull)
	count := len(values)
	s.nvalues += int64(count)
	if count == 0 {
		return
	}
	newMin, newMax := s.getMinMax(values)
	s.SetMinMax(newMin, newMax)
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Int32Statistics) UpdateSpaced(values []int32, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	nonNull := int64(len(values)) - numNull
	s.nvalues += nonNull
	if nonNull == 0 {
		return
	}
	newMin, newMax := s.getMinMaxSpaced(values, validBits, validBitsOffset)
	s.SetMinMax(newMin, newMax)
}
// UpdateFromArrow updates the stats from an arrow Array, which must be a
// fixed-width type of 4 bytes. If updateCounts is true, the null count and
// number of values are also incremented.
func (s *Int32Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	if values.NullN() == values.Len() {
		return nil
	}
	if values.DataType().(arrow.FixedWidthDataType).Bytes() != arrow.Int32SizeBytes {
		return fmt.Errorf("%w: cannot update int32 stats with %s arrow array",
			arrow.ErrInvalid, values.DataType())
	}
	// NOTE(review): the min/max scan runs over the raw data buffer from the
	// array's offset to the end of the buffer, which appears to include null
	// slots and any trailing capacity beyond values.Len() — confirm that
	// callers guarantee in-range values in those positions.
	rawBytes := values.Data().Buffers()[1].Bytes()[values.Data().Offset()*arrow.Int32SizeBytes:]
	s.SetMinMax(s.getMinMax(arrow.Int32Traits.CastFromBytes(rawBytes)))
	return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *Int32Statistics) SetMinMax(argMin, argMax int32) {
	// cleanStat filters the candidate pair; a nil result means there is
	// nothing valid to apply
	maybeMinMax := s.cleanStat([2]int32{argMin, argMax})
	if maybeMinMax == nil {
		return
	}
	min := (*maybeMinMax)[0]
	max := (*maybeMinMax)[1]
	if !s.hasMinMax {
		// first observed bounds: take them as-is
		s.hasMinMax = true
		s.min = min
		s.max = max
	} else {
		// widen the existing bounds using the column's sort-order comparator
		if !s.less(s.min, min) {
			s.min = min
		}
		if s.less(s.max, max) {
			s.max = max
		}
	}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *Int32Statistics) EncodeMin() []byte {
	if !s.HasMinMax() {
		return nil
	}
	return s.plainEncode(s.min)
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *Int32Statistics) EncodeMax() []byte {
	if !s.HasMinMax() {
		return nil
	}
	return s.plainEncode(s.max)
}
// Encode returns a populated EncodedStatistics object
func (s *Int32Statistics) Encode() (enc EncodedStatistics, err error) {
	// plainEncode (via EncodeMin/EncodeMax) panics on encoder failure;
	// convert any such panic into an error return instead of crashing
	defer func() {
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// AllNullValue signals every value in the column chunk was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
type minmaxPairInt64 [2]int64
// Int64Statistics is the typed interface for managing stats for a column
// of Int64 type.
type Int64Statistics struct {
statistics
min int64
max int64
bitSetReader bitutils.SetBitRunReader
}
// NewInt64Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Int64
func NewInt64Statistics(descr *schema.Column, mem memory.Allocator) *Int64Statistics {
if descr.PhysicalType() != parquet.Types.Int64 {
panic(fmt.Errorf("parquet: invalid type %s for constructing a Int64 stat object", descr.PhysicalType()))
}
return &Int64Statistics{
statistics: statistics{
descr: descr,
hasNullCount: true,
hasDistinctCount: true,
order: descr.SortOrder(),
encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
mem: mem,
},
}
}
// NewInt64StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
func NewInt64StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int64Statistics {
	ret := NewInt64Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len of a nil slice is 0, so the previous explicit nil check was
	// redundant (staticcheck S1009).
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	// preserves existing semantics: min/max are considered present if either
	// bound was set in the encoded stats
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
func (s *Int64Statistics) plainEncode(src int64) []byte {
s.encoder.(encoding.Int64Encoder).Put([]int64{src})
buf, err := s.encoder.FlushValues()
if err != nil {
panic(err) // recovered by Encode
}
defer buf.Release()
out := make([]byte, buf.Len())
copy(out, buf.Bytes())
return out
}
func (s *Int64Statistics) plainDecode(src []byte) int64 {
var buf [1]int64
decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
decoder.SetData(1, src)
decoder.(encoding.Int64Decoder).Decode(buf[:])
return buf[0]
}
func (s *Int64Statistics) minval(a, b int64) int64 {
if s.less(a, b) {
return a
}
return b
}
func (s *Int64Statistics) maxval(a, b int64) int64 {
if s.less(a, b) {
return b
}
return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *Int64Statistics) MinMaxEqual(rhs *Int64Statistics) bool {
return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *Int64Statistics) Equals(other TypedStatistics) bool {
if s.Type() != other.Type() {
return false
}
rhs, ok := other.(*Int64Statistics)
if !ok {
return false
}
if s.HasMinMax() != rhs.HasMinMax() {
return false
}
return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
s.NullCount() == rhs.NullCount() &&
s.DistinctCount() == rhs.DistinctCount() &&
s.NumValues() == rhs.NumValues()
}
func (s *Int64Statistics) getMinMax(values []int64) (min, max int64) {
if s.order == schema.SortSIGNED {
min, max = shared_utils.GetMinMaxInt64(values)
} else {
umin, umax := shared_utils.GetMinMaxUint64(arrow.Uint64Traits.CastFromBytes(arrow.Int64Traits.CastToBytes(values)))
min, max = int64(umin), int64(umax)
}
return
}
func (s *Int64Statistics) getMinMaxSpaced(values []int64, validBits []byte, validBitsOffset int64) (min, max int64) {
min = s.defaultMin()
max = s.defaultMax()
var fn func([]int64)
if s.order == schema.SortSIGNED {
fn = func(v []int64) {
localMin, localMax := shared_utils.GetMinMaxInt64(v)
if min > localMin {
min = localMin
}
if max < localMax {
max = localMax
}
}
} else {
fn = func(v []int64) {
umin, umax := shared_utils.GetMinMaxUint64(arrow.Uint64Traits.CastFromBytes(arrow.Int64Traits.CastToBytes(v)))
if uint64(min) > umin {
min = int64(umin)
}
if uint64(max) < umax {
max = int64(umax)
}
}
}
if s.bitSetReader == nil {
s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
} else {
s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
}
for {
run := s.bitSetReader.NextRun()
if run.Length == 0 {
break
}
fn(values[int(run.Pos):int(run.Pos+run.Length)])
}
return
}
func (s *Int64Statistics) Min() int64 { return s.min }
func (s *Int64Statistics) Max() int64 { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *Int64Statistics) Merge(other TypedStatistics) {
rhs, ok := other.(*Int64Statistics)
if !ok {
panic("incompatible stat type merge")
}
s.statistics.merge(rhs)
if rhs.HasMinMax() {
s.SetMinMax(rhs.Min(), rhs.Max())
}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Int64Statistics) Update(values []int64, numNull int64) {
s.IncNulls(numNull)
s.nvalues += int64(len(values))
if len(values) == 0 {
return
}
s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Int64Statistics) UpdateSpaced(values []int64, validBits []byte, validBitsOffset, numNull int64) {
s.IncNulls(numNull)
notnull := int64(len(values)) - numNull
s.nvalues += notnull
if notnull == 0 {
return
}
s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
func (s *Int64Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
if updateCounts {
s.IncNulls(int64(values.NullN()))
s.nvalues += int64(values.Len() - values.NullN())
}
if values.NullN() == values.Len() {
return nil
}
if values.DataType().(arrow.FixedWidthDataType).Bytes() != arrow.Int64SizeBytes {
return fmt.Errorf("%w: cannot update int64 stats with %s arrow array",
arrow.ErrInvalid, values.DataType())
}
rawBytes := values.Data().Buffers()[1].Bytes()[values.Data().Offset()*arrow.Int64SizeBytes:]
s.SetMinMax(s.getMinMax(arrow.Int64Traits.CastFromBytes(rawBytes)))
return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *Int64Statistics) SetMinMax(argMin, argMax int64) {
maybeMinMax := s.cleanStat([2]int64{argMin, argMax})
if maybeMinMax == nil {
return
}
min := (*maybeMinMax)[0]
max := (*maybeMinMax)[1]
if !s.hasMinMax {
s.hasMinMax = true
s.min = min
s.max = max
} else {
if !s.less(s.min, min) {
s.min = min
}
if s.less(s.max, max) {
s.max = max
}
}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *Int64Statistics) EncodeMin() []byte {
if s.HasMinMax() {
return s.plainEncode(s.min)
}
return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *Int64Statistics) EncodeMax() []byte {
if s.HasMinMax() {
return s.plainEncode(s.max)
}
return nil
}
// Encode returns a populated EncodedStatistics object
func (s *Int64Statistics) Encode() (enc EncodedStatistics, err error) {
defer func() {
if r := recover(); r != nil {
err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
}
}()
if s.HasMinMax() {
enc.SetMax(s.EncodeMax())
enc.SetMin(s.EncodeMin())
}
if s.HasNullCount() {
enc.SetNullCount(s.NullCount())
enc.AllNullValue = s.NumValues() == 0
}
if s.HasDistinctCount() {
enc.SetDistinctCount(s.DistinctCount())
}
return
}
type minmaxPairInt96 [2]parquet.Int96
// Int96Statistics is the typed interface for managing stats for a column
// of Int96 type.
type Int96Statistics struct {
statistics
min parquet.Int96
max parquet.Int96
bitSetReader bitutils.SetBitRunReader
}
// NewInt96Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Int96
func NewInt96Statistics(descr *schema.Column, mem memory.Allocator) *Int96Statistics {
if descr.PhysicalType() != parquet.Types.Int96 {
panic(fmt.Errorf("parquet: invalid type %s for constructing a Int96 stat object", descr.PhysicalType()))
}
return &Int96Statistics{
statistics: statistics{
descr: descr,
hasNullCount: true,
hasDistinctCount: true,
order: descr.SortOrder(),
encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
mem: mem,
},
}
}
// NewInt96StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
func NewInt96StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int96Statistics {
	ret := NewInt96Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len of a nil slice is 0, so the previous explicit nil check was
	// redundant (staticcheck S1009).
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	// preserves existing semantics: min/max are considered present if either
	// bound was set in the encoded stats
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
func (s *Int96Statistics) plainEncode(src parquet.Int96) []byte {
s.encoder.(encoding.Int96Encoder).Put([]parquet.Int96{src})
buf, err := s.encoder.FlushValues()
if err != nil {
panic(err) // recovered by Encode
}
defer buf.Release()
out := make([]byte, buf.Len())
copy(out, buf.Bytes())
return out
}
func (s *Int96Statistics) plainDecode(src []byte) parquet.Int96 {
var buf [1]parquet.Int96
decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
decoder.SetData(1, src)
decoder.(encoding.Int96Decoder).Decode(buf[:])
return buf[0]
}
func (s *Int96Statistics) minval(a, b parquet.Int96) parquet.Int96 {
if s.less(a, b) {
return a
}
return b
}
func (s *Int96Statistics) maxval(a, b parquet.Int96) parquet.Int96 {
if s.less(a, b) {
return b
}
return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *Int96Statistics) MinMaxEqual(rhs *Int96Statistics) bool {
return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *Int96Statistics) Equals(other TypedStatistics) bool {
if s.Type() != other.Type() {
return false
}
rhs, ok := other.(*Int96Statistics)
if !ok {
return false
}
if s.HasMinMax() != rhs.HasMinMax() {
return false
}
return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
s.NullCount() == rhs.NullCount() &&
s.DistinctCount() == rhs.DistinctCount() &&
s.NumValues() == rhs.NumValues()
}
// getMinMax computes the min and max of values using the column's
// sort-order comparator, starting from the type's default bounds.
func (s *Int96Statistics) getMinMax(values []parquet.Int96) (min, max parquet.Int96) {
	min, max = s.defaultMin(), s.defaultMax()
	for _, v := range values {
		min = s.minval(min, v)
		max = s.maxval(max, v)
	}
	return
}
func (s *Int96Statistics) getMinMaxSpaced(values []parquet.Int96, validBits []byte, validBitsOffset int64) (min, max parquet.Int96) {
min = s.defaultMin()
max = s.defaultMax()
if s.bitSetReader == nil {
s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
} else {
s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
}
for {
run := s.bitSetReader.NextRun()
if run.Length == 0 {
break
}
for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
min = s.minval(min, v)
max = s.maxval(max, v)
}
}
return
}
func (s *Int96Statistics) Min() parquet.Int96 { return s.min }
func (s *Int96Statistics) Max() parquet.Int96 { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *Int96Statistics) Merge(other TypedStatistics) {
rhs, ok := other.(*Int96Statistics)
if !ok {
panic("incompatible stat type merge")
}
s.statistics.merge(rhs)
if rhs.HasMinMax() {
s.SetMinMax(rhs.Min(), rhs.Max())
}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Int96Statistics) Update(values []parquet.Int96, numNull int64) {
s.IncNulls(numNull)
s.nvalues += int64(len(values))
if len(values) == 0 {
return
}
s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Int96Statistics) UpdateSpaced(values []parquet.Int96, validBits []byte, validBitsOffset, numNull int64) {
s.IncNulls(numNull)
notnull := int64(len(values)) - numNull
s.nvalues += notnull
if notnull == 0 {
return
}
s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
// UpdateFromArrow is not implemented for Int96 statistics and returns an
// arrow.ErrNotImplemented error whenever there are non-null values. Counts
// are still incremented first when updateCounts is true.
func (s *Int96Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	// an all-null array carries no min/max information, so it succeeds
	if values.NullN() == values.Len() {
		return nil
	}
	return fmt.Errorf("%w: update int96 stats from Arrow", arrow.ErrNotImplemented)
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *Int96Statistics) SetMinMax(argMin, argMax parquet.Int96) {
maybeMinMax := s.cleanStat([2]parquet.Int96{argMin, argMax})
if maybeMinMax == nil {
return
}
min := (*maybeMinMax)[0]
max := (*maybeMinMax)[1]
if !s.hasMinMax {
s.hasMinMax = true
s.min = min
s.max = max
} else {
if !s.less(s.min, min) {
s.min = min
}
if s.less(s.max, max) {
s.max = max
}
}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *Int96Statistics) EncodeMin() []byte {
if s.HasMinMax() {
return s.plainEncode(s.min)
}
return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *Int96Statistics) EncodeMax() []byte {
if s.HasMinMax() {
return s.plainEncode(s.max)
}
return nil
}
// Encode returns a populated EncodedStatistics object
func (s *Int96Statistics) Encode() (enc EncodedStatistics, err error) {
defer func() {
if r := recover(); r != nil {
err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
}
}()
if s.HasMinMax() {
enc.SetMax(s.EncodeMax())
enc.SetMin(s.EncodeMin())
}
if s.HasNullCount() {
enc.SetNullCount(s.NullCount())
enc.AllNullValue = s.NumValues() == 0
}
if s.HasDistinctCount() {
enc.SetDistinctCount(s.DistinctCount())
}
return
}
type minmaxPairFloat32 [2]float32
// Float32Statistics is the typed interface for managing stats for a column
// of Float32 type.
type Float32Statistics struct {
statistics
min float32
max float32
bitSetReader bitutils.SetBitRunReader
}
// NewFloat32Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Float
func NewFloat32Statistics(descr *schema.Column, mem memory.Allocator) *Float32Statistics {
if descr.PhysicalType() != parquet.Types.Float {
panic(fmt.Errorf("parquet: invalid type %s for constructing a Float32 stat object", descr.PhysicalType()))
}
return &Float32Statistics{
statistics: statistics{
descr: descr,
hasNullCount: true,
hasDistinctCount: true,
order: descr.SortOrder(),
encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
mem: mem,
},
}
}
// NewFloat32StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
func NewFloat32StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float32Statistics {
	ret := NewFloat32Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len of a nil slice is 0, so the previous explicit nil check was
	// redundant (staticcheck S1009).
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	// preserves existing semantics: min/max are considered present if either
	// bound was set in the encoded stats
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
func (s *Float32Statistics) plainEncode(src float32) []byte {
s.encoder.(encoding.Float32Encoder).Put([]float32{src})
buf, err := s.encoder.FlushValues()
if err != nil {
panic(err) // recovered by Encode
}
defer buf.Release()
out := make([]byte, buf.Len())
copy(out, buf.Bytes())
return out
}
func (s *Float32Statistics) plainDecode(src []byte) float32 {
var buf [1]float32
decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
decoder.SetData(1, src)
decoder.(encoding.Float32Decoder).Decode(buf[:])
return buf[0]
}
func (s *Float32Statistics) minval(a, b float32) float32 {
if s.less(a, b) {
return a
}
return b
}
func (s *Float32Statistics) maxval(a, b float32) float32 {
if s.less(a, b) {
return b
}
return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *Float32Statistics) MinMaxEqual(rhs *Float32Statistics) bool {
return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *Float32Statistics) Equals(other TypedStatistics) bool {
if s.Type() != other.Type() {
return false
}
rhs, ok := other.(*Float32Statistics)
if !ok {
return false
}
if s.HasMinMax() != rhs.HasMinMax() {
return false
}
return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
s.NullCount() == rhs.NullCount() &&
s.DistinctCount() == rhs.DistinctCount() &&
s.NumValues() == rhs.NumValues()
}
// coalesce returns fallback if val is NaN, otherwise val; used to keep NaN
// values from participating in min/max comparisons.
func (s *Float32Statistics) coalesce(val, fallback float32) float32 {
	if math.IsNaN(float64(val)) {
		return fallback
	}
	return val
}
func (s *Float32Statistics) getMinMax(values []float32) (min, max float32) {
defMin := s.defaultMin()
defMax := s.defaultMax()
min = defMin
max = defMax
for _, v := range values {
min = s.minval(min, s.coalesce(v, defMin))
max = s.maxval(max, s.coalesce(v, defMax))
}
return
}
// getMinMaxSpaced computes the min and max of the non-null values in values,
// using validBits and validBitsOffset to find the runs of set (non-null)
// bits. NaN values are excluded by coalescing them to the default bounds.
func (s *Float32Statistics) getMinMaxSpaced(values []float32, validBits []byte, validBitsOffset int64) (min, max float32) {
	// Use the typed coalesce method and hoist the default bounds out of the
	// loop: the previous code called the untyped coalesce helper, boxing
	// every element into an interface value (an allocation per element of
	// this hot loop), and re-invoked s.defaultMin()/s.defaultMax() on each
	// iteration even though they are loop-invariant.
	defMin, defMax := s.defaultMin(), s.defaultMax()
	min, max = defMin, defMax
	// lazily create the run reader once and reuse it across calls
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			min = s.minval(min, s.coalesce(v, defMin))
			max = s.maxval(max, s.coalesce(v, defMax))
		}
	}
	return
}
func (s *Float32Statistics) Min() float32 { return s.min }
func (s *Float32Statistics) Max() float32 { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *Float32Statistics) Merge(other TypedStatistics) {
rhs, ok := other.(*Float32Statistics)
if !ok {
panic("incompatible stat type merge")
}
s.statistics.merge(rhs)
if rhs.HasMinMax() {
s.SetMinMax(rhs.Min(), rhs.Max())
}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Float32Statistics) Update(values []float32, numNull int64) {
s.IncNulls(numNull)
s.nvalues += int64(len(values))
if len(values) == 0 {
return
}
s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Float32Statistics) UpdateSpaced(values []float32, validBits []byte, validBitsOffset, numNull int64) {
s.IncNulls(numNull)
notnull := int64(len(values)) - numNull
s.nvalues += notnull
if notnull == 0 {
return
}
s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
func (s *Float32Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
if updateCounts {
s.IncNulls(int64(values.NullN()))
s.nvalues += int64(values.Len() - values.NullN())
}
if values.NullN() == values.Len() {
return nil
}
if values.DataType().(arrow.FixedWidthDataType).Bytes() != arrow.Float32SizeBytes {
return fmt.Errorf("%w: cannot update float32 stats with %s arrow array",
arrow.ErrInvalid, values.DataType())
}
rawBytes := values.Data().Buffers()[1].Bytes()[values.Data().Offset()*arrow.Float32SizeBytes:]
s.SetMinMax(s.getMinMax(arrow.Float32Traits.CastFromBytes(rawBytes)))
return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *Float32Statistics) SetMinMax(argMin, argMax float32) {
maybeMinMax := s.cleanStat([2]float32{argMin, argMax})
if maybeMinMax == nil {
return
}
min := (*maybeMinMax)[0]
max := (*maybeMinMax)[1]
if !s.hasMinMax {
s.hasMinMax = true
s.min = min
s.max = max
} else {
if !s.less(s.min, min) {
s.min = min
}
if s.less(s.max, max) {
s.max = max
}
}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *Float32Statistics) EncodeMin() []byte {
if s.HasMinMax() {
return s.plainEncode(s.min)
}
return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *Float32Statistics) EncodeMax() []byte {
if s.HasMinMax() {
return s.plainEncode(s.max)
}
return nil
}
// Encode returns a populated EncodedStatistics object
func (s *Float32Statistics) Encode() (enc EncodedStatistics, err error) {
defer func() {
if r := recover(); r != nil {
err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
}
}()
if s.HasMinMax() {
enc.SetMax(s.EncodeMax())
enc.SetMin(s.EncodeMin())
}
if s.HasNullCount() {
enc.SetNullCount(s.NullCount())
enc.AllNullValue = s.NumValues() == 0
}
if s.HasDistinctCount() {
enc.SetDistinctCount(s.DistinctCount())
}
return
}
// minmaxPairFloat64 carries a candidate (min, max) pair through cleanStat.
type minmaxPairFloat64 [2]float64
// Float64Statistics is the typed interface for managing stats for a column
// of Float64 type.
type Float64Statistics struct {
	statistics
	min float64
	max float64
	// bitSetReader is lazily created and then reused by getMinMaxSpaced to
	// iterate runs of set bits in a validity bitmap.
	bitSetReader bitutils.SetBitRunReader
}
// NewFloat64Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Double
func NewFloat64Statistics(descr *schema.Column, mem memory.Allocator) *Float64Statistics {
	if descr.PhysicalType() != parquet.Types.Double {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a Float64 stat object", descr.PhysicalType()))
	}
	return &Float64Statistics{
		statistics: statistics{
			descr:            descr,
			hasNullCount:     true,
			hasDistinctCount: true,
			order:            descr.SortOrder(),
			// stats are always plain-encoded regardless of the column's data encoding
			encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
			mem:     mem,
		},
	}
}
// NewFloat64StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
//
// nvalues is added to the (zero) running value count; null/distinct counts and
// the plain-encoded min/max are taken from encoded when present.
func NewFloat64StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float64Statistics {
	ret := NewFloat64Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len(nil) == 0, so the former explicit nil checks were redundant (staticcheck S1009)
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode returns src encoded with plain encoding; panics on encoder
// failure (the panic is recovered by Encode).
func (s *Float64Statistics) plainEncode(src float64) []byte {
	s.encoder.(encoding.Float64Encoder).Put([]float64{src})
	buf, err := s.encoder.FlushValues()
	if err != nil {
		panic(err) // recovered by Encode
	}
	defer buf.Release()
	// copy out of the encoder-owned buffer before releasing it
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// plainDecode decodes a single plain-encoded value from src.
func (s *Float64Statistics) plainDecode(src []byte) float64 {
	var buf [1]float64
	decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
	decoder.SetData(1, src)
	decoder.(encoding.Float64Decoder).Decode(buf[:])
	return buf[0]
}
// minval returns the smaller of a and b per the column's sort order.
func (s *Float64Statistics) minval(a, b float64) float64 {
	if s.less(a, b) {
		return a
	}
	return b
}
// maxval returns the larger of a and b per the column's sort order.
func (s *Float64Statistics) maxval(a, b float64) float64 {
	if s.less(a, b) {
		return b
	}
	return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *Float64Statistics) MinMaxEqual(rhs *Float64Statistics) bool {
	return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *Float64Statistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() {
		return false
	}
	rhs, ok := other.(*Float64Statistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	// NOTE(review): when neither side has a min/max recorded this is false
	// even if all counts match — preserved from the original behavior.
	if !s.hasMinMax || !s.MinMaxEqual(rhs) {
		return false
	}
	return s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// coalesce returns fallback when val is NaN (NaNs are excluded from parquet
// float statistics), otherwise val.
func (s *Float64Statistics) coalesce(val, fallback float64) float64 {
	// val is already a float64; the previous float64(val) conversion was a no-op
	if math.IsNaN(val) {
		return fallback
	}
	return val
}
// getMinMax scans values and returns the observed minimum and maximum,
// substituting the sort-order default for any NaN encountered.
func (s *Float64Statistics) getMinMax(values []float64) (min, max float64) {
	fallbackMin, fallbackMax := s.defaultMin(), s.defaultMax()
	min, max = fallbackMin, fallbackMax
	for _, v := range values {
		min = s.minval(min, s.coalesce(v, fallbackMin))
		max = s.maxval(max, s.coalesce(v, fallbackMax))
	}
	return min, max
}
// getMinMaxSpaced is like getMinMax but only considers elements of values
// whose corresponding bit in validBits (starting at validBitsOffset) is set.
func (s *Float64Statistics) getMinMaxSpaced(values []float64, validBits []byte, validBitsOffset int64) (min, max float64) {
	defMin := s.defaultMin()
	defMax := s.defaultMax()
	min = defMin
	max = defMax
	// lazily create (or reset and reuse) the validity-bitmap run reader
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			// use the typed coalesce (same NaN handling as getMinMax) instead of
			// the untyped package-level coalesce, avoiding an interface boxing
			// allocation and a type assertion per element
			min = s.minval(min, s.coalesce(v, defMin))
			max = s.maxval(max, s.coalesce(v, defMax))
		}
	}
	return
}
// Min returns the current recorded minimum (meaningful only when HasMinMax is true).
func (s *Float64Statistics) Min() float64 { return s.min }
// Max returns the current recorded maximum (meaningful only when HasMinMax is true).
func (s *Float64Statistics) Max() float64 { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *Float64Statistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*Float64Statistics)
	if !ok {
		panic("incompatible stat type merge")
	}
	s.statistics.merge(rhs)
	if rhs.HasMinMax() {
		s.SetMinMax(rhs.Min(), rhs.Max())
	}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Float64Statistics) Update(values []float64, numNull int64) {
	s.IncNulls(numNull)
	s.nvalues += int64(len(values))
	if len(values) == 0 {
		return
	}
	s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Float64Statistics) UpdateSpaced(values []float64, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	// values includes the null slots here, so subtract them from the count
	notnull := int64(len(values)) - numNull
	s.nvalues += notnull
	if notnull == 0 {
		return
	}
	s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
// UpdateFromArrow updates the stats directly from an Arrow array, optionally
// also updating the null/value counts.
func (s *Float64Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	// nothing to do for min/max when every element is null
	if values.NullN() == values.Len() {
		return nil
	}
	if values.DataType().(arrow.FixedWidthDataType).Bytes() != arrow.Float64SizeBytes {
		return fmt.Errorf("%w: cannot update float64 stats with %s arrow array",
			arrow.ErrInvalid, values.DataType())
	}
	// reinterpret the array's value buffer (buffer 1), skipping the array offset
	rawBytes := values.Data().Buffers()[1].Bytes()[values.Data().Offset()*arrow.Float64SizeBytes:]
	// NOTE(review): null slots are not masked out here; getMinMax sees the raw
	// buffer contents of null positions — confirm those are always non-NaN-safe.
	s.SetMinMax(s.getMinMax(arrow.Float64Traits.CastFromBytes(rawBytes)))
	return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *Float64Statistics) SetMinMax(argMin, argMax float64) {
	// cleanStat filters out pairs that must not be recorded (e.g. invalid values)
	maybeMinMax := s.cleanStat([2]float64{argMin, argMax})
	if maybeMinMax == nil {
		return
	}
	min := (*maybeMinMax)[0]
	max := (*maybeMinMax)[1]
	if !s.hasMinMax {
		// first valid pair: seed the stats
		s.hasMinMax = true
		s.min = min
		s.max = max
	} else {
		// otherwise only tighten the existing bounds per the sort order
		if !s.less(s.min, min) {
			s.min = min
		}
		if s.less(s.max, max) {
			s.max = max
		}
	}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *Float64Statistics) EncodeMin() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.min)
	}
	// no min/max recorded yet
	return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *Float64Statistics) EncodeMax() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.max)
	}
	// no min/max recorded yet
	return nil
}
// Encode returns a populated EncodedStatistics object
func (s *Float64Statistics) Encode() (enc EncodedStatistics, err error) {
	defer func() {
		// plainEncode panics on encoder failures; surface them as err instead
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// true when every value in the column chunk was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
// minmaxPairBoolean carries a candidate (min, max) pair through cleanStat.
type minmaxPairBoolean [2]bool
// BooleanStatistics is the typed interface for managing stats for a column
// of Boolean type.
type BooleanStatistics struct {
	statistics
	min bool
	max bool
	// bitSetReader is lazily created and reused by getMinMaxSpaced to walk
	// runs of set bits in a validity bitmap.
	bitSetReader bitutils.SetBitRunReader
}
// NewBooleanStatistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Boolean
func NewBooleanStatistics(descr *schema.Column, mem memory.Allocator) *BooleanStatistics {
	if descr.PhysicalType() != parquet.Types.Boolean {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a Boolean stat object", descr.PhysicalType()))
	}
	return &BooleanStatistics{
		statistics: statistics{
			descr:            descr,
			hasNullCount:     true,
			hasDistinctCount: true,
			order:            descr.SortOrder(),
			// stats are always plain-encoded regardless of the column's data encoding
			encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
			mem:     mem,
		},
	}
}
// NewBooleanStatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
//
// nvalues is added to the (zero) running value count; null/distinct counts and
// the plain-encoded min/max are taken from encoded when present.
func NewBooleanStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *BooleanStatistics {
	ret := NewBooleanStatistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len(nil) == 0, so the former explicit nil checks were redundant (staticcheck S1009)
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode returns src encoded with plain encoding; panics on encoder
// failure (the panic is recovered by Encode).
func (s *BooleanStatistics) plainEncode(src bool) []byte {
	s.encoder.(encoding.BooleanEncoder).Put([]bool{src})
	buf, err := s.encoder.FlushValues()
	if err != nil {
		panic(err) // recovered by Encode
	}
	defer buf.Release()
	// copy out of the encoder-owned buffer before releasing it
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// plainDecode decodes a single plain-encoded value from src.
func (s *BooleanStatistics) plainDecode(src []byte) bool {
	var buf [1]bool
	decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
	decoder.SetData(1, src)
	decoder.(encoding.BooleanDecoder).Decode(buf[:])
	return buf[0]
}
// minval returns the smaller of a and b per the column's sort order.
func (s *BooleanStatistics) minval(a, b bool) bool {
	if s.less(a, b) {
		return a
	}
	return b
}
// maxval returns the larger of a and b per the column's sort order.
func (s *BooleanStatistics) maxval(a, b bool) bool {
	if s.less(a, b) {
		return b
	}
	return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *BooleanStatistics) MinMaxEqual(rhs *BooleanStatistics) bool {
	return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *BooleanStatistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() {
		return false
	}
	rhs, ok := other.(*BooleanStatistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	// NOTE(review): when neither side has a min/max recorded this returns
	// false even if all counts match — confirm that is intended.
	return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
		s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// getMinMax scans values and returns the observed minimum and maximum.
func (s *BooleanStatistics) getMinMax(values []bool) (min, max bool) {
	defMin := s.defaultMin()
	defMax := s.defaultMax()
	min = defMin
	max = defMax
	for _, v := range values {
		min = s.minval(min, v)
		max = s.maxval(max, v)
	}
	return
}
// getMinMaxSpaced is like getMinMax but only considers elements of values
// whose corresponding bit in validBits (starting at validBitsOffset) is set.
func (s *BooleanStatistics) getMinMaxSpaced(values []bool, validBits []byte, validBitsOffset int64) (min, max bool) {
	min = s.defaultMin()
	max = s.defaultMax()
	// lazily create (or reset and reuse) the validity-bitmap run reader
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			min = s.minval(min, v)
			max = s.maxval(max, v)
		}
	}
	return
}
// Min returns the current recorded minimum (meaningful only when HasMinMax is true).
func (s *BooleanStatistics) Min() bool { return s.min }
// Max returns the current recorded maximum (meaningful only when HasMinMax is true).
func (s *BooleanStatistics) Max() bool { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *BooleanStatistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*BooleanStatistics)
	if !ok {
		panic("incompatible stat type merge")
	}
	s.statistics.merge(rhs)
	if rhs.HasMinMax() {
		s.SetMinMax(rhs.Min(), rhs.Max())
	}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *BooleanStatistics) Update(values []bool, numNull int64) {
	s.IncNulls(numNull)
	s.nvalues += int64(len(values))
	if len(values) == 0 {
		return
	}
	s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *BooleanStatistics) UpdateSpaced(values []bool, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	// values includes the null slots here, so subtract them from the count
	notnull := int64(len(values)) - numNull
	s.nvalues += notnull
	if notnull == 0 {
		return
	}
	s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
// UpdateFromArrow updates the counts from an Arrow array; min/max update from
// Arrow boolean arrays is not implemented.
func (s *BooleanStatistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	// counts are still maintained even though min/max cannot be updated
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	if values.NullN() == values.Len() {
		return nil
	}
	return fmt.Errorf("%w: update boolean stats from Arrow", arrow.ErrNotImplemented)
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *BooleanStatistics) SetMinMax(argMin, argMax bool) {
	// cleanStat filters out pairs that must not be recorded
	maybeMinMax := s.cleanStat([2]bool{argMin, argMax})
	if maybeMinMax == nil {
		return
	}
	min := (*maybeMinMax)[0]
	max := (*maybeMinMax)[1]
	if !s.hasMinMax {
		// first valid pair: seed the stats
		s.hasMinMax = true
		s.min = min
		s.max = max
	} else {
		// otherwise only tighten the existing bounds per the sort order
		if !s.less(s.min, min) {
			s.min = min
		}
		if s.less(s.max, max) {
			s.max = max
		}
	}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *BooleanStatistics) EncodeMin() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.min)
	}
	// no min/max recorded yet
	return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *BooleanStatistics) EncodeMax() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.max)
	}
	// no min/max recorded yet
	return nil
}
// Encode returns a populated EncodedStatistics object
func (s *BooleanStatistics) Encode() (enc EncodedStatistics, err error) {
	defer func() {
		// plainEncode panics on encoder failures; surface them as err instead
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// true when every value in the column chunk was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
// minmaxPairByteArray carries a candidate (min, max) pair through cleanStat.
type minmaxPairByteArray [2]parquet.ByteArray
// ByteArrayStatistics is the typed interface for managing stats for a column
// of ByteArray type.
type ByteArrayStatistics struct {
	statistics
	min parquet.ByteArray
	max parquet.ByteArray
	// bitSetReader is lazily created and reused by getMinMaxSpaced to walk
	// runs of set bits in a validity bitmap.
	bitSetReader bitutils.SetBitRunReader
}
// NewByteArrayStatistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.ByteArray
func NewByteArrayStatistics(descr *schema.Column, mem memory.Allocator) *ByteArrayStatistics {
	if descr.PhysicalType() != parquet.Types.ByteArray {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a ByteArray stat object", descr.PhysicalType()))
	}
	return &ByteArrayStatistics{
		statistics: statistics{
			descr:            descr,
			hasNullCount:     true,
			hasDistinctCount: true,
			order:            descr.SortOrder(),
			// stats are always plain-encoded regardless of the column's data encoding
			encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
			mem:     mem,
		},
		// start with empty (non-nil) byte arrays rather than nil
		min: make([]byte, 0),
		max: make([]byte, 0),
	}
}
// NewByteArrayStatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
//
// nvalues is added to the (zero) running value count; null/distinct counts and
// the plain-encoded min/max are taken from encoded when present.
func NewByteArrayStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *ByteArrayStatistics {
	ret := NewByteArrayStatistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len(nil) == 0, so the former explicit nil checks were redundant (staticcheck S1009)
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode copies src; per the parquet spec, BYTE_ARRAY stats are encoded
// without the leading length prefix, so the raw bytes ARE the encoding.
func (s *ByteArrayStatistics) plainEncode(src parquet.ByteArray) []byte {
	out := make([]byte, len(src))
	copy(out, src)
	return out
}
// plainDecode returns src as-is (no length prefix to strip); note the result
// aliases the input slice rather than copying it.
func (s *ByteArrayStatistics) plainDecode(src []byte) parquet.ByteArray {
	return src
}
// minval returns the smaller of a and b per the column's sort order; a nil
// operand (no value yet) acts as the identity.
func (s *ByteArrayStatistics) minval(a, b parquet.ByteArray) parquet.ByteArray {
	switch {
	case a == nil:
		return b
	case b == nil:
		return a
	case s.less(a, b):
		return a
	default:
		return b
	}
}
// maxval returns the larger of a and b per the column's sort order; a nil
// operand (no value yet) acts as the identity.
func (s *ByteArrayStatistics) maxval(a, b parquet.ByteArray) parquet.ByteArray {
	switch {
	case a == nil:
		return b
	case b == nil:
		return a
	case s.less(a, b):
		return b
	default:
		return a
	}
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *ByteArrayStatistics) MinMaxEqual(rhs *ByteArrayStatistics) bool {
	return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *ByteArrayStatistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() {
		return false
	}
	rhs, ok := other.(*ByteArrayStatistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	// NOTE(review): when neither side has a min/max recorded this returns
	// false even if all counts match — confirm that is intended.
	return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
		s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// getMinMax scans values and returns the observed minimum and maximum.
func (s *ByteArrayStatistics) getMinMax(values []parquet.ByteArray) (min, max parquet.ByteArray) {
	defMin := s.defaultMin()
	defMax := s.defaultMax()
	min = defMin
	max = defMax
	for _, v := range values {
		min = s.minval(min, v)
		max = s.maxval(max, v)
	}
	return
}
// getMinMaxSpaced is like getMinMax but only considers elements of values
// whose corresponding bit in validBits (starting at validBitsOffset) is set.
func (s *ByteArrayStatistics) getMinMaxSpaced(values []parquet.ByteArray, validBits []byte, validBitsOffset int64) (min, max parquet.ByteArray) {
	min = s.defaultMin()
	max = s.defaultMax()
	// lazily create (or reset and reuse) the validity-bitmap run reader
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			min = s.minval(min, v)
			max = s.maxval(max, v)
		}
	}
	return
}
// Min returns the current recorded minimum (meaningful only when HasMinMax is true).
func (s *ByteArrayStatistics) Min() parquet.ByteArray { return s.min }
// Max returns the current recorded maximum (meaningful only when HasMinMax is true).
func (s *ByteArrayStatistics) Max() parquet.ByteArray { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *ByteArrayStatistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*ByteArrayStatistics)
	if !ok {
		panic("incompatible stat type merge")
	}
	s.statistics.merge(rhs)
	if rhs.HasMinMax() {
		s.SetMinMax(rhs.Min(), rhs.Max())
	}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *ByteArrayStatistics) Update(values []parquet.ByteArray, numNull int64) {
	s.IncNulls(numNull)
	s.nvalues += int64(len(values))
	if len(values) == 0 {
		return
	}
	s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *ByteArrayStatistics) UpdateSpaced(values []parquet.ByteArray, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	// values includes the null slots here, so subtract them from the count
	notnull := int64(len(values)) - numNull
	s.nvalues += notnull
	if notnull == 0 {
		return
	}
	s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
// UpdateFromArrow updates the stats directly from an Arrow binary-like array,
// optionally also updating the null/value counts.
func (s *ByteArrayStatistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	// nothing to do for min/max when every element is null
	if values.NullN() == values.Len() {
		return nil
	}
	if !arrow.IsBaseBinary(values.DataType().ID()) {
		return fmt.Errorf("%w: can only update ByteArray stats from binary or string array", arrow.ErrInvalid)
	}
	var (
		min       = s.defaultMin()
		max       = s.defaultMax()
		arr       = values.(array.BinaryLike)
		data      = arr.ValueBytes()
		curOffset = int64(0)
	)
	// walk the flat value buffer using per-element lengths instead of
	// materializing each element
	for i := 0; i < arr.Len(); i++ {
		nextOffset := curOffset + int64(arr.ValueLen(i))
		v := data[curOffset:nextOffset]
		curOffset = nextOffset
		// NOTE(review): zero-length values are skipped here (null slots have
		// zero length, but so do genuine empty strings) — confirm intended.
		if len(v) == 0 {
			continue
		}
		min = s.minval(min, v)
		max = s.maxval(max, v)
	}
	s.SetMinMax(min, max)
	return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *ByteArrayStatistics) SetMinMax(argMin, argMax parquet.ByteArray) {
	// cleanStat filters out pairs that must not be recorded
	maybeMinMax := s.cleanStat([2]parquet.ByteArray{argMin, argMax})
	if maybeMinMax == nil {
		return
	}
	min := (*maybeMinMax)[0]
	max := (*maybeMinMax)[1]
	if !s.hasMinMax {
		// first valid pair: seed the stats
		s.hasMinMax = true
		s.min = min
		s.max = max
	} else {
		// otherwise only tighten the existing bounds per the sort order
		if !s.less(s.min, min) {
			s.min = min
		}
		if s.less(s.max, max) {
			s.max = max
		}
	}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *ByteArrayStatistics) EncodeMin() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.min)
	}
	// no min/max recorded yet
	return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *ByteArrayStatistics) EncodeMax() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.max)
	}
	// no min/max recorded yet
	return nil
}
// Encode returns a populated EncodedStatistics object
func (s *ByteArrayStatistics) Encode() (enc EncodedStatistics, err error) {
	defer func() {
		// plainEncode panics on encoder failures; surface them as err instead
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// true when every value in the column chunk was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
// minmaxPairFixedLenByteArray carries a candidate (min, max) pair through cleanStat.
type minmaxPairFixedLenByteArray [2]parquet.FixedLenByteArray
// FixedLenByteArrayStatistics is the typed interface for managing stats for a column
// of FixedLenByteArray type.
type FixedLenByteArrayStatistics struct {
	statistics
	min parquet.FixedLenByteArray
	max parquet.FixedLenByteArray
	// bitSetReader is lazily created and reused by getMinMaxSpaced to walk
	// runs of set bits in a validity bitmap.
	bitSetReader bitutils.SetBitRunReader
}
// NewFixedLenByteArrayStatistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.FixedLenByteArray
func NewFixedLenByteArrayStatistics(descr *schema.Column, mem memory.Allocator) *FixedLenByteArrayStatistics {
	if descr.PhysicalType() != parquet.Types.FixedLenByteArray {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a FixedLenByteArray stat object", descr.PhysicalType()))
	}
	return &FixedLenByteArrayStatistics{
		statistics: statistics{
			descr:            descr,
			hasNullCount:     true,
			hasDistinctCount: true,
			order:            descr.SortOrder(),
			// stats are always plain-encoded regardless of the column's data encoding
			encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
			mem:     mem,
		},
	}
}
// NewFixedLenByteArrayStatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
//
// nvalues is added to the (zero) running value count; null/distinct counts and
// the plain-encoded min/max are taken from encoded when present.
func NewFixedLenByteArrayStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *FixedLenByteArrayStatistics {
	ret := NewFixedLenByteArrayStatistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len(nil) == 0, so the former explicit nil checks were redundant (staticcheck S1009)
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode returns src encoded with plain encoding; panics on encoder
// failure (the panic is recovered by Encode).
func (s *FixedLenByteArrayStatistics) plainEncode(src parquet.FixedLenByteArray) []byte {
	s.encoder.(encoding.FixedLenByteArrayEncoder).Put([]parquet.FixedLenByteArray{src})
	buf, err := s.encoder.FlushValues()
	if err != nil {
		panic(err) // recovered by Encode
	}
	defer buf.Release()
	// copy out of the encoder-owned buffer before releasing it
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// plainDecode decodes a single plain-encoded value from src.
func (s *FixedLenByteArrayStatistics) plainDecode(src []byte) parquet.FixedLenByteArray {
	var buf [1]parquet.FixedLenByteArray
	decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
	decoder.SetData(1, src)
	decoder.(encoding.FixedLenByteArrayDecoder).Decode(buf[:])
	return buf[0]
}
// minval returns the smaller of a and b per the column's sort order; a nil
// operand (no value yet) acts as the identity.
func (s *FixedLenByteArrayStatistics) minval(a, b parquet.FixedLenByteArray) parquet.FixedLenByteArray {
	switch {
	case a == nil:
		return b
	case b == nil:
		return a
	case s.less(a, b):
		return a
	default:
		return b
	}
}
// maxval returns the larger of a and b per the column's sort order; a nil
// operand (no value yet) acts as the identity.
func (s *FixedLenByteArrayStatistics) maxval(a, b parquet.FixedLenByteArray) parquet.FixedLenByteArray {
	switch {
	case a == nil:
		return b
	case b == nil:
		return a
	case s.less(a, b):
		return b
	default:
		return a
	}
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *FixedLenByteArrayStatistics) MinMaxEqual(rhs *FixedLenByteArrayStatistics) bool {
	return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *FixedLenByteArrayStatistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() {
		return false
	}
	rhs, ok := other.(*FixedLenByteArrayStatistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	// NOTE(review): when neither side has a min/max recorded this returns
	// false even if all counts match — confirm that is intended.
	return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
		s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// getMinMax scans values and returns the observed minimum and maximum.
func (s *FixedLenByteArrayStatistics) getMinMax(values []parquet.FixedLenByteArray) (min, max parquet.FixedLenByteArray) {
	defMin := s.defaultMin()
	defMax := s.defaultMax()
	min = defMin
	max = defMax
	for _, v := range values {
		min = s.minval(min, v)
		max = s.maxval(max, v)
	}
	return
}
// getMinMaxSpaced is like getMinMax but only considers elements of values
// whose corresponding bit in validBits (starting at validBitsOffset) is set.
func (s *FixedLenByteArrayStatistics) getMinMaxSpaced(values []parquet.FixedLenByteArray, validBits []byte, validBitsOffset int64) (min, max parquet.FixedLenByteArray) {
	min = s.defaultMin()
	max = s.defaultMax()
	// lazily create (or reset and reuse) the validity-bitmap run reader
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			min = s.minval(min, v)
			max = s.maxval(max, v)
		}
	}
	return
}
// Min returns the current recorded minimum (meaningful only when HasMinMax is true).
func (s *FixedLenByteArrayStatistics) Min() parquet.FixedLenByteArray { return s.min }
// Max returns the current recorded maximum (meaningful only when HasMinMax is true).
func (s *FixedLenByteArrayStatistics) Max() parquet.FixedLenByteArray { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *FixedLenByteArrayStatistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*FixedLenByteArrayStatistics)
	if !ok {
		panic("incompatible stat type merge")
	}
	s.statistics.merge(rhs)
	if rhs.HasMinMax() {
		s.SetMinMax(rhs.Min(), rhs.Max())
	}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *FixedLenByteArrayStatistics) Update(values []parquet.FixedLenByteArray, numNull int64) {
	s.IncNulls(numNull)
	s.nvalues += int64(len(values))
	if len(values) == 0 {
		return
	}
	s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *FixedLenByteArrayStatistics) UpdateSpaced(values []parquet.FixedLenByteArray, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	// values includes the null slots here, so subtract them from the count
	notnull := int64(len(values)) - numNull
	s.nvalues += notnull
	if notnull == 0 {
		return
	}
	s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
// UpdateFromArrow updates the stats directly from an Arrow fixed-size-binary
// or decimal128 array, optionally also updating the null/value counts.
func (s *FixedLenByteArrayStatistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	// nothing to do for min/max when every element is null
	if values.NullN() == values.Len() {
		return nil
	}
	dt := values.DataType()
	if dt.ID() != arrow.FIXED_SIZE_BINARY && dt.ID() != arrow.DECIMAL {
		return fmt.Errorf("%w: only fixed size binary and decimal128 arrays are supported to update stats from arrow",
			arrow.ErrInvalid)
	}
	var (
		width = dt.(arrow.FixedWidthDataType).Bytes()
		// value buffer (buffer 1), skipping the array offset
		data = values.Data().Buffers()[1].Bytes()[values.Data().Offset()*width:]
		min  = s.defaultMin()
		max  = s.defaultMax()
	)
	// NOTE(review): null slots are not skipped, so their (undefined) buffer
	// bytes participate in the comparisons — confirm against callers.
	for i := 0; i < values.Len(); i++ {
		v := data[i*width : (i+1)*width]
		min = s.minval(min, v)
		// BUG FIX: previously s.maxval(min, v) — comparing against the running
		// minimum instead of the running maximum produced an incorrect max.
		max = s.maxval(max, v)
	}
	s.SetMinMax(min, max)
	return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *FixedLenByteArrayStatistics) SetMinMax(argMin, argMax parquet.FixedLenByteArray) {
	// cleanStat filters out pairs that must not be recorded
	maybeMinMax := s.cleanStat([2]parquet.FixedLenByteArray{argMin, argMax})
	if maybeMinMax == nil {
		return
	}
	min := (*maybeMinMax)[0]
	max := (*maybeMinMax)[1]
	if !s.hasMinMax {
		// first valid pair: seed the stats
		s.hasMinMax = true
		s.min = min
		s.max = max
	} else {
		// otherwise only tighten the existing bounds per the sort order
		if !s.less(s.min, min) {
			s.min = min
		}
		if s.less(s.max, max) {
			s.max = max
		}
	}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *FixedLenByteArrayStatistics) EncodeMin() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.min)
	}
	// no min/max recorded yet
	return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *FixedLenByteArrayStatistics) EncodeMax() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.max)
	}
	// no min/max recorded yet
	return nil
}
// Encode returns a populated EncodedStatistics object
func (s *FixedLenByteArrayStatistics) Encode() (enc EncodedStatistics, err error) {
	defer func() {
		// plainEncode panics on encoder failures; surface them as err instead
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// true when every value in the column chunk was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
// minmaxPairFloat16 carries a candidate (min, max) pair through cleanStat;
// float16 values are stored as 2-byte FixedLenByteArrays.
type minmaxPairFloat16 [2]parquet.FixedLenByteArray
// Float16Statistics is the typed interface for managing stats for a column
// of Float16 type.
type Float16Statistics struct {
	statistics
	min parquet.FixedLenByteArray
	max parquet.FixedLenByteArray
	// bitSetReader is lazily created and reused by getMinMaxSpaced to walk
	// runs of set bits in a validity bitmap.
	bitSetReader bitutils.SetBitRunReader
}
// NewFloat16Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.FixedLenByteArray
// Panics if the logical type of descr is not schema.Float16LogicalType
func NewFloat16Statistics(descr *schema.Column, mem memory.Allocator) *Float16Statistics {
	if descr.PhysicalType() != parquet.Types.FixedLenByteArray {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a Float16 stat object", descr.PhysicalType()))
	}
	if !descr.LogicalType().Equals(schema.Float16LogicalType{}) {
		panic(fmt.Errorf("parquet: invalid logical type %s for constructing a Float16 stat object", descr.LogicalType().String()))
	}
	return &Float16Statistics{
		statistics: statistics{
			descr:            descr,
			hasNullCount:     true,
			hasDistinctCount: true,
			order:            descr.SortOrder(),
			// stats are always plain-encoded regardless of the column's data encoding
			encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
			mem:     mem,
		},
	}
}
// NewFloat16StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
//
// nvalues is added to the (zero) running value count; null/distinct counts and
// the plain-encoded min/max are taken from encoded when present.
func NewFloat16StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float16Statistics {
	ret := NewFloat16Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len(nil) == 0, so the former explicit nil checks were redundant (staticcheck S1009)
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode serializes src with the PLAIN encoding and returns a copy of
// the resulting bytes. It panics on encoder failure; Encode installs a
// recover handler that converts the panic into a returned error.
func (s *Float16Statistics) plainEncode(src parquet.FixedLenByteArray) []byte {
	s.encoder.(encoding.FixedLenByteArrayEncoder).Put([]parquet.FixedLenByteArray{src})
	buf, err := s.encoder.FlushValues()
	if err != nil {
		panic(err) // recovered by Encode
	}
	defer buf.Release()

	// copy out of the encoder-owned buffer before it is released
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// plainDecode decodes a single PLAIN-encoded value from src.
//
// NOTE(review): the count/error results of Decode are ignored here; this
// assumes src always contains at least one well-formed value — confirm
// against callers (min/max bytes produced by plainEncode).
func (s *Float16Statistics) plainDecode(src []byte) parquet.FixedLenByteArray {
	var buf [1]parquet.FixedLenByteArray

	decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
	decoder.SetData(1, src)
	decoder.(encoding.FixedLenByteArrayDecoder).Decode(buf[:])
	return buf[0]
}
// minval returns the smaller of a and b per the column's ordering, treating
// nil as "unset" so the non-nil operand always wins.
func (s *Float16Statistics) minval(a, b parquet.FixedLenByteArray) parquet.FixedLenByteArray {
	if a == nil {
		return b
	}
	if b == nil {
		return a
	}
	if s.less(a, b) {
		return a
	}
	return b
}
// maxval returns the larger of a and b per the column's ordering, treating
// nil as "unset" so the non-nil operand always wins.
func (s *Float16Statistics) maxval(a, b parquet.FixedLenByteArray) parquet.FixedLenByteArray {
	if a == nil {
		return b
	}
	if b == nil {
		return a
	}
	if s.less(a, b) {
		return b
	}
	return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values,
// as determined by the shared statistics equality helper.
func (s *Float16Statistics) MinMaxEqual(rhs *Float16Statistics) bool {
	return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
//
// Note that both sides must actually have min/max set: two stat objects that
// both lack min/max compare unequal, matching the original behavior.
func (s *Float16Statistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() || !s.descr.LogicalType().Equals(other.Descr().LogicalType()) {
		return false
	}
	rhs, ok := other.(*Float16Statistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	if !(s.hasMinMax && s.MinMaxEqual(rhs)) {
		return false
	}
	return s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// coalesce substitutes fallback for val when val decodes to a float16 NaN,
// so NaN never participates in min/max comparisons.
func (s *Float16Statistics) coalesce(val, fallback parquet.FixedLenByteArray) parquet.FixedLenByteArray {
	if float16.FromLEBytes(val).IsNaN() {
		return fallback
	}
	return val
}
// getMinMax scans values and returns the extrema, starting from the type's
// default sentinels. NaN entries are coalesced to the defaults so they do
// not affect the result.
func (s *Float16Statistics) getMinMax(values []parquet.FixedLenByteArray) (min, max parquet.FixedLenByteArray) {
	defMin, defMax := s.defaultMin(), s.defaultMax()
	min, max = defMin, defMax

	for _, v := range values {
		min = s.minval(min, s.coalesce(v, defMin))
		max = s.maxval(max, s.coalesce(v, defMax))
	}
	return min, max
}
// getMinMaxSpaced is like getMinMax but only considers values whose
// corresponding bit is set in the validity bitmap, iterating runs of set
// bits via the (lazily created, reused) bitSetReader.
func (s *Float16Statistics) getMinMaxSpaced(values []parquet.FixedLenByteArray, validBits []byte, validBitsOffset int64) (min, max parquet.FixedLenByteArray) {
	// hoist the loop-invariant defaults instead of recomputing them per
	// element, and use the typed s.coalesce (as getMinMax does) rather than
	// the interface-based package-level coalesce + type assertion — this
	// keeps float16 NaN handling consistent with the non-spaced path
	defMin := s.defaultMin()
	defMax := s.defaultMax()
	min = defMin
	max = defMax

	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}

	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			min = s.minval(min, s.coalesce(v, defMin))
			max = s.maxval(max, s.coalesce(v, defMax))
		}
	}
	return
}
// Min returns the current minimum value; callers should check HasMinMax()
// first, as the value is unset otherwise.
func (s *Float16Statistics) Min() parquet.FixedLenByteArray { return s.min }

// Max returns the current maximum value; callers should check HasMinMax()
// first, as the value is unset otherwise.
func (s *Float16Statistics) Max() parquet.FixedLenByteArray { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate. Panics if other is not a *Float16Statistics.
func (s *Float16Statistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*Float16Statistics)
	if !ok {
		panic("incompatible stat type merge")
	}

	// fold the counts first, then widen min/max only when rhs has them
	s.statistics.merge(rhs)
	if !rhs.HasMinMax() {
		return
	}
	s.SetMinMax(rhs.Min(), rhs.Max())
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Float16Statistics) Update(values []parquet.FixedLenByteArray, numNull int64) {
	s.IncNulls(numNull)
	s.nvalues += int64(len(values))

	// only touch min/max when there is at least one value to consider
	if len(values) > 0 {
		s.SetMinMax(s.getMinMax(values))
	}
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Float16Statistics) UpdateSpaced(values []parquet.FixedLenByteArray, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	notnull := int64(len(values)) - numNull
	s.nvalues += notnull

	// skip the min/max scan entirely when every slot was null
	if notnull != 0 {
		s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
	}
}
// UpdateFromArrow updates the null and value counts from the given Arrow
// array when updateCounts is true. Min/max tracking from Arrow data is not
// implemented for float16: unless the array is entirely null (nothing to
// track), a wrapped arrow.ErrNotImplemented is returned.
func (s *Float16Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}

	// an all-null array carries no min/max information, so it is not an error
	if values.NullN() == values.Len() {
		return nil
	}

	return fmt.Errorf("%w: update float16 stats from Arrow", arrow.ErrNotImplemented)
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max.
// The candidate pair is first passed through cleanStat, which may reject it
// entirely (nil result), in which case nothing changes.
func (s *Float16Statistics) SetMinMax(argMin, argMax parquet.FixedLenByteArray) {
	cleaned := s.cleanStat([2]parquet.FixedLenByteArray{argMin, argMax})
	if cleaned == nil {
		return
	}

	newMin, newMax := cleaned[0], cleaned[1]
	if !s.hasMinMax {
		// first observation: adopt both bounds as-is
		s.hasMinMax = true
		s.min = newMin
		s.max = newMax
		return
	}

	if !s.less(s.min, newMin) {
		s.min = newMin
	}
	if s.less(s.max, newMax) {
		s.max = newMax
	}
}
// EncodeMin returns the encoded min value with plain encoding, or nil when
// no min/max have been recorded.
//
// ByteArray stats do not include the length in the encoding.
func (s *Float16Statistics) EncodeMin() []byte {
	if !s.HasMinMax() {
		return nil
	}
	return s.plainEncode(s.min)
}
// EncodeMax returns the current encoded max value with plain encoding, or
// nil when no min/max have been recorded.
//
// ByteArray stats do not include the length in the encoding.
func (s *Float16Statistics) EncodeMax() []byte {
	if !s.HasMinMax() {
		return nil
	}
	return s.plainEncode(s.max)
}
// Encode returns a populated EncodedStatistics object
//
// Any panic raised while encoding (e.g. from plainEncode) is recovered here
// and surfaced through the named err return.
func (s *Float16Statistics) Encode() (enc EncodedStatistics, err error) {
	defer func() {
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// flag chunks where every value was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
// NewStatistics uses the type in the column descriptor to construct the appropriate
// typed stats object. If mem is nil, then memory.DefaultAllocator will be used.
//
// Panics for physical types with no stats implementation.
func NewStatistics(descr *schema.Column, mem memory.Allocator) TypedStatistics {
	if mem == nil {
		mem = memory.DefaultAllocator
	}
	switch descr.PhysicalType() {
	case parquet.Types.Int32:
		return NewInt32Statistics(descr, mem)
	case parquet.Types.Int64:
		return NewInt64Statistics(descr, mem)
	case parquet.Types.Int96:
		return NewInt96Statistics(descr, mem)
	case parquet.Types.Float:
		return NewFloat32Statistics(descr, mem)
	case parquet.Types.Double:
		return NewFloat64Statistics(descr, mem)
	case parquet.Types.Boolean:
		return NewBooleanStatistics(descr, mem)
	case parquet.Types.ByteArray:
		return NewByteArrayStatistics(descr, mem)
	case parquet.Types.FixedLenByteArray:
		// FLBA columns with the Float16 logical type get dedicated stats
		// with float16-aware ordering and NaN handling
		if descr.LogicalType().Equals(schema.Float16LogicalType{}) {
			return NewFloat16Statistics(descr, mem)
		}
		return NewFixedLenByteArrayStatistics(descr, mem)
	default:
		panic("not implemented")
	}
}
// NewStatisticsFromEncoded uses the provided information to initialize a typed stat object
// by checking the type of the provided column descriptor.
//
// If mem is nil, then memory.DefaultAllocator is used.
//
// Panics for physical types with no stats implementation.
func NewStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) TypedStatistics {
	if mem == nil {
		mem = memory.DefaultAllocator
	}
	switch descr.PhysicalType() {
	case parquet.Types.Int32:
		return NewInt32StatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.Int64:
		return NewInt64StatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.Int96:
		return NewInt96StatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.Float:
		return NewFloat32StatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.Double:
		return NewFloat64StatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.Boolean:
		return NewBooleanStatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.ByteArray:
		return NewByteArrayStatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.FixedLenByteArray:
		// mirror NewStatistics: Float16 logical type on FLBA columns gets
		// the dedicated float16 stats implementation
		if descr.LogicalType().Equals(schema.Float16LogicalType{}) {
			return NewFloat16StatisticsFromEncoded(descr, mem, nvalues, encoded)
		}
		return NewFixedLenByteArrayStatisticsFromEncoded(descr, mem, nvalues, encoded)
	default:
		panic("not implemented")
	}
}