parquet/metadata/statistics_types.gen.go (2,123 lines of code) (raw):
// Code generated by statistics_types.gen.go.tmpl. DO NOT EDIT.
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package metadata
import (
"fmt"
"math"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/apache/arrow-go/v18/arrow/float16"
"github.com/apache/arrow-go/v18/arrow/memory"
"github.com/apache/arrow-go/v18/internal/bitutils"
shared_utils "github.com/apache/arrow-go/v18/internal/utils"
"github.com/apache/arrow-go/v18/parquet"
"github.com/apache/arrow-go/v18/parquet/internal/encoding"
"github.com/apache/arrow-go/v18/parquet/schema"
)
// minmaxPairInt32 holds a candidate (min, max) pair as passed to cleanStat.
type minmaxPairInt32 [2]int32

// Int32Statistics is the typed interface for managing stats for a column
// of Int32 type.
type Int32Statistics struct {
	statistics
	min int32 // current minimum; only meaningful when hasMinMax is true
	max int32 // current maximum; only meaningful when hasMinMax is true
	// bitSetReader is lazily created and reused across UpdateSpaced calls
	// to avoid reallocating a run reader per batch
	bitSetReader bitutils.SetBitRunReader
}
// NewInt32Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Int32
func NewInt32Statistics(descr *schema.Column, mem memory.Allocator) *Int32Statistics {
	if typ := descr.PhysicalType(); typ != parquet.Types.Int32 {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a Int32 stat object", typ))
	}
	base := statistics{
		descr:            descr,
		hasNullCount:     true,
		hasDistinctCount: true,
		order:            descr.SortOrder(),
		encoder:          encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
		mem:              mem,
	}
	return &Int32Statistics{statistics: base}
}
// NewInt32StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
func NewInt32StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int32Statistics {
	ret := NewInt32Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len of a nil slice is 0, so the previous explicit nil check was
	// redundant (staticcheck S1009).
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	// preserves existing semantics: min/max are considered present if either
	// bound was set in the encoded stats
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode serializes src with PLAIN encoding and returns a copy of the
// encoded bytes. Panics on encoder failure; Encode installs a recover handler
// that converts such panics into an error return.
func (s *Int32Statistics) plainEncode(src int32) []byte {
	s.encoder.(encoding.Int32Encoder).Put([]int32{src})
	buf, err := s.encoder.FlushValues()
	if err != nil {
		panic(err) // recovered by Encode
	}
	defer buf.Release()
	// copy out of the refcounted buffer before releasing it
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// plainDecode decodes a single int32 value from the PLAIN-encoded bytes in src.
func (s *Int32Statistics) plainDecode(src []byte) int32 {
	var buf [1]int32
	decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
	decoder.SetData(1, src)
	decoder.(encoding.Int32Decoder).Decode(buf[:])
	return buf[0]
}
// minval returns the smaller of a and b according to the column's
// sort-order comparator.
func (s *Int32Statistics) minval(a, b int32) int32 {
	if !s.less(a, b) {
		return b
	}
	return a
}
// maxval returns the larger of a and b according to the column's
// sort-order comparator.
func (s *Int32Statistics) maxval(a, b int32) int32 {
	if !s.less(a, b) {
		return a
	}
	return b
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *Int32Statistics) MinMaxEqual(rhs *Int32Statistics) bool {
	if !s.equal(s.min, rhs.min) {
		return false
	}
	return s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *Int32Statistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() {
		return false
	}
	rhs, ok := other.(*Int32Statistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	// NOTE(review): when neither side has min/max set, this reports false
	// even if all counts agree — behavior preserved from the template.
	if !s.hasMinMax || !s.MinMaxEqual(rhs) {
		return false
	}
	return s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// getMinMax returns the min and max of values according to the column's sort
// order: signed comparison for SortSIGNED, otherwise the slice is
// reinterpreted (zero-copy, not value-converted) as []uint32 and compared
// unsigned, with the results cast back to int32 bit patterns.
func (s *Int32Statistics) getMinMax(values []int32) (min, max int32) {
	if s.order == schema.SortSIGNED {
		min, max = shared_utils.GetMinMaxInt32(values)
	} else {
		umin, umax := shared_utils.GetMinMaxUint32(arrow.Uint32Traits.CastFromBytes(arrow.Int32Traits.CastToBytes(values)))
		min, max = int32(umin), int32(umax)
	}
	return
}
// getMinMaxSpaced computes min/max over only the values whose corresponding
// bit in validBits (starting at validBitsOffset) is set, i.e. the non-null
// slots, by iterating runs of set bits.
func (s *Int32Statistics) getMinMaxSpaced(values []int32, validBits []byte, validBitsOffset int64) (min, max int32) {
	min = s.defaultMin()
	max = s.defaultMax()
	// fn folds one contiguous run of non-null values into min/max, chosen
	// once up front based on the column's sort order
	var fn func([]int32)
	if s.order == schema.SortSIGNED {
		fn = func(v []int32) {
			localMin, localMax := shared_utils.GetMinMaxInt32(v)
			if min > localMin {
				min = localMin
			}
			if max < localMax {
				max = localMax
			}
		}
	} else {
		// unsigned ordering: reinterpret the run (and the running min/max)
		// as uint32 for comparison, storing results back as int32 bit patterns
		fn = func(v []int32) {
			umin, umax := shared_utils.GetMinMaxUint32(arrow.Uint32Traits.CastFromBytes(arrow.Int32Traits.CastToBytes(v)))
			if uint32(min) > umin {
				min = int32(umin)
			}
			if uint32(max) < umax {
				max = int32(umax)
			}
		}
	}
	// lazily create the run reader once and reuse it across calls to avoid
	// reallocating per batch
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		fn(values[int(run.Pos):int(run.Pos+run.Length)])
	}
	return
}
// Min returns the current minimum; only meaningful when HasMinMax is true.
func (s *Int32Statistics) Min() int32 { return s.min }

// Max returns the current maximum; only meaningful when HasMinMax is true.
func (s *Int32Statistics) Max() int32 { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *Int32Statistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*Int32Statistics)
	if !ok {
		panic("incompatible stat type merge")
	}
	s.statistics.merge(rhs)
	if !rhs.HasMinMax() {
		return
	}
	s.SetMinMax(rhs.Min(), rhs.Max())
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Int32Statistics) Update(values []int32, numNull int64) {
	s.IncNulls(numNull)
	count := len(values)
	s.nvalues += int64(count)
	if count == 0 {
		return
	}
	newMin, newMax := s.getMinMax(values)
	s.SetMinMax(newMin, newMax)
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Int32Statistics) UpdateSpaced(values []int32, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	nonNull := int64(len(values)) - numNull
	s.nvalues += nonNull
	if nonNull == 0 {
		return
	}
	newMin, newMax := s.getMinMaxSpaced(values, validBits, validBitsOffset)
	s.SetMinMax(newMin, newMax)
}
// UpdateFromArrow updates the stats from an arrow Array, which must be a
// fixed-width type of 4 bytes. If updateCounts is true, the null count and
// number of values are also incremented.
func (s *Int32Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	if values.NullN() == values.Len() {
		return nil
	}
	if values.DataType().(arrow.FixedWidthDataType).Bytes() != arrow.Int32SizeBytes {
		return fmt.Errorf("%w: cannot update int32 stats with %s arrow array",
			arrow.ErrInvalid, values.DataType())
	}
	// NOTE(review): the min/max scan runs over the raw data buffer from the
	// array's offset to the end of the buffer, which appears to include null
	// slots and any trailing capacity beyond values.Len() — confirm that
	// callers guarantee in-range values in those positions.
	rawBytes := values.Data().Buffers()[1].Bytes()[values.Data().Offset()*arrow.Int32SizeBytes:]
	s.SetMinMax(s.getMinMax(arrow.Int32Traits.CastFromBytes(rawBytes)))
	return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *Int32Statistics) SetMinMax(argMin, argMax int32) {
	// cleanStat filters the candidate pair; a nil result means there is
	// nothing valid to apply
	maybeMinMax := s.cleanStat([2]int32{argMin, argMax})
	if maybeMinMax == nil {
		return
	}
	min := (*maybeMinMax)[0]
	max := (*maybeMinMax)[1]
	if !s.hasMinMax {
		// first observed bounds: take them as-is
		s.hasMinMax = true
		s.min = min
		s.max = max
	} else {
		// widen the existing bounds using the column's sort-order comparator
		if !s.less(s.min, min) {
			s.min = min
		}
		if s.less(s.max, max) {
			s.max = max
		}
	}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *Int32Statistics) EncodeMin() []byte {
	if !s.HasMinMax() {
		return nil
	}
	return s.plainEncode(s.min)
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *Int32Statistics) EncodeMax() []byte {
	if !s.HasMinMax() {
		return nil
	}
	return s.plainEncode(s.max)
}
// Encode returns a populated EncodedStatistics object
func (s *Int32Statistics) Encode() (enc EncodedStatistics, err error) {
	// plainEncode (via EncodeMin/EncodeMax) panics on encoder failure;
	// convert any such panic into an error return instead of crashing
	defer func() {
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// AllNullValue signals every value in the column chunk was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
type minmaxPairInt64 [2]int64
// Int64Statistics is the typed interface for managing stats for a column
// of Int64 type.
type Int64Statistics struct {
statistics
min int64
max int64
bitSetReader bitutils.SetBitRunReader
}
// NewInt64Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Int64
func NewInt64Statistics(descr *schema.Column, mem memory.Allocator) *Int64Statistics {
if descr.PhysicalType() != parquet.Types.Int64 {
panic(fmt.Errorf("parquet: invalid type %s for constructing a Int64 stat object", descr.PhysicalType()))
}
return &Int64Statistics{
statistics: statistics{
descr: descr,
hasNullCount: true,
hasDistinctCount: true,
order: descr.SortOrder(),
encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
mem: mem,
},
}
}
// NewInt64StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
func NewInt64StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int64Statistics {
	ret := NewInt64Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len of a nil slice is 0, so the previous explicit nil check was
	// redundant (staticcheck S1009).
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	// preserves existing semantics: min/max are considered present if either
	// bound was set in the encoded stats
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
func (s *Int64Statistics) plainEncode(src int64) []byte {
s.encoder.(encoding.Int64Encoder).Put([]int64{src})
buf, err := s.encoder.FlushValues()
if err != nil {
panic(err) // recovered by Encode
}
defer buf.Release()
out := make([]byte, buf.Len())
copy(out, buf.Bytes())
return out
}
func (s *Int64Statistics) plainDecode(src []byte) int64 {
var buf [1]int64
decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
decoder.SetData(1, src)
decoder.(encoding.Int64Decoder).Decode(buf[:])
return buf[0]
}
func (s *Int64Statistics) minval(a, b int64) int64 {
if s.less(a, b) {
return a
}
return b
}
func (s *Int64Statistics) maxval(a, b int64) int64 {
if s.less(a, b) {
return b
}
return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *Int64Statistics) MinMaxEqual(rhs *Int64Statistics) bool {
return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *Int64Statistics) Equals(other TypedStatistics) bool {
if s.Type() != other.Type() {
return false
}
rhs, ok := other.(*Int64Statistics)
if !ok {
return false
}
if s.HasMinMax() != rhs.HasMinMax() {
return false
}
return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
s.NullCount() == rhs.NullCount() &&
s.DistinctCount() == rhs.DistinctCount() &&
s.NumValues() == rhs.NumValues()
}
func (s *Int64Statistics) getMinMax(values []int64) (min, max int64) {
if s.order == schema.SortSIGNED {
min, max = shared_utils.GetMinMaxInt64(values)
} else {
umin, umax := shared_utils.GetMinMaxUint64(arrow.Uint64Traits.CastFromBytes(arrow.Int64Traits.CastToBytes(values)))
min, max = int64(umin), int64(umax)
}
return
}
func (s *Int64Statistics) getMinMaxSpaced(values []int64, validBits []byte, validBitsOffset int64) (min, max int64) {
min = s.defaultMin()
max = s.defaultMax()
var fn func([]int64)
if s.order == schema.SortSIGNED {
fn = func(v []int64) {
localMin, localMax := shared_utils.GetMinMaxInt64(v)
if min > localMin {
min = localMin
}
if max < localMax {
max = localMax
}
}
} else {
fn = func(v []int64) {
umin, umax := shared_utils.GetMinMaxUint64(arrow.Uint64Traits.CastFromBytes(arrow.Int64Traits.CastToBytes(v)))
if uint64(min) > umin {
min = int64(umin)
}
if uint64(max) < umax {
max = int64(umax)
}
}
}
if s.bitSetReader == nil {
s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
} else {
s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
}
for {
run := s.bitSetReader.NextRun()
if run.Length == 0 {
break
}
fn(values[int(run.Pos):int(run.Pos+run.Length)])
}
return
}
func (s *Int64Statistics) Min() int64 { return s.min }
func (s *Int64Statistics) Max() int64 { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *Int64Statistics) Merge(other TypedStatistics) {
rhs, ok := other.(*Int64Statistics)
if !ok {
panic("incompatible stat type merge")
}
s.statistics.merge(rhs)
if rhs.HasMinMax() {
s.SetMinMax(rhs.Min(), rhs.Max())
}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Int64Statistics) Update(values []int64, numNull int64) {
s.IncNulls(numNull)
s.nvalues += int64(len(values))
if len(values) == 0 {
return
}
s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Int64Statistics) UpdateSpaced(values []int64, validBits []byte, validBitsOffset, numNull int64) {
s.IncNulls(numNull)
notnull := int64(len(values)) - numNull
s.nvalues += notnull
if notnull == 0 {
return
}
s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
func (s *Int64Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
if updateCounts {
s.IncNulls(int64(values.NullN()))
s.nvalues += int64(values.Len() - values.NullN())
}
if values.NullN() == values.Len() {
return nil
}
if values.DataType().(arrow.FixedWidthDataType).Bytes() != arrow.Int64SizeBytes {
return fmt.Errorf("%w: cannot update int64 stats with %s arrow array",
arrow.ErrInvalid, values.DataType())
}
rawBytes := values.Data().Buffers()[1].Bytes()[values.Data().Offset()*arrow.Int64SizeBytes:]
s.SetMinMax(s.getMinMax(arrow.Int64Traits.CastFromBytes(rawBytes)))
return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *Int64Statistics) SetMinMax(argMin, argMax int64) {
maybeMinMax := s.cleanStat([2]int64{argMin, argMax})
if maybeMinMax == nil {
return
}
min := (*maybeMinMax)[0]
max := (*maybeMinMax)[1]
if !s.hasMinMax {
s.hasMinMax = true
s.min = min
s.max = max
} else {
if !s.less(s.min, min) {
s.min = min
}
if s.less(s.max, max) {
s.max = max
}
}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *Int64Statistics) EncodeMin() []byte {
if s.HasMinMax() {
return s.plainEncode(s.min)
}
return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *Int64Statistics) EncodeMax() []byte {
if s.HasMinMax() {
return s.plainEncode(s.max)
}
return nil
}
// Encode returns a populated EncodedStatistics object
func (s *Int64Statistics) Encode() (enc EncodedStatistics, err error) {
defer func() {
if r := recover(); r != nil {
err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
}
}()
if s.HasMinMax() {
enc.SetMax(s.EncodeMax())
enc.SetMin(s.EncodeMin())
}
if s.HasNullCount() {
enc.SetNullCount(s.NullCount())
enc.AllNullValue = s.NumValues() == 0
}
if s.HasDistinctCount() {
enc.SetDistinctCount(s.DistinctCount())
}
return
}
type minmaxPairInt96 [2]parquet.Int96
// Int96Statistics is the typed interface for managing stats for a column
// of Int96 type.
type Int96Statistics struct {
statistics
min parquet.Int96
max parquet.Int96
bitSetReader bitutils.SetBitRunReader
}
// NewInt96Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Int96
func NewInt96Statistics(descr *schema.Column, mem memory.Allocator) *Int96Statistics {
if descr.PhysicalType() != parquet.Types.Int96 {
panic(fmt.Errorf("parquet: invalid type %s for constructing a Int96 stat object", descr.PhysicalType()))
}
return &Int96Statistics{
statistics: statistics{
descr: descr,
hasNullCount: true,
hasDistinctCount: true,
order: descr.SortOrder(),
encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
mem: mem,
},
}
}
// NewInt96StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
func NewInt96StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int96Statistics {
	ret := NewInt96Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len of a nil slice is 0, so the previous explicit nil check was
	// redundant (staticcheck S1009).
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	// preserves existing semantics: min/max are considered present if either
	// bound was set in the encoded stats
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
func (s *Int96Statistics) plainEncode(src parquet.Int96) []byte {
s.encoder.(encoding.Int96Encoder).Put([]parquet.Int96{src})
buf, err := s.encoder.FlushValues()
if err != nil {
panic(err) // recovered by Encode
}
defer buf.Release()
out := make([]byte, buf.Len())
copy(out, buf.Bytes())
return out
}
func (s *Int96Statistics) plainDecode(src []byte) parquet.Int96 {
var buf [1]parquet.Int96
decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
decoder.SetData(1, src)
decoder.(encoding.Int96Decoder).Decode(buf[:])
return buf[0]
}
func (s *Int96Statistics) minval(a, b parquet.Int96) parquet.Int96 {
if s.less(a, b) {
return a
}
return b
}
func (s *Int96Statistics) maxval(a, b parquet.Int96) parquet.Int96 {
if s.less(a, b) {
return b
}
return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *Int96Statistics) MinMaxEqual(rhs *Int96Statistics) bool {
return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *Int96Statistics) Equals(other TypedStatistics) bool {
if s.Type() != other.Type() {
return false
}
rhs, ok := other.(*Int96Statistics)
if !ok {
return false
}
if s.HasMinMax() != rhs.HasMinMax() {
return false
}
return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
s.NullCount() == rhs.NullCount() &&
s.DistinctCount() == rhs.DistinctCount() &&
s.NumValues() == rhs.NumValues()
}
// getMinMax computes the min and max of values using the column's
// sort-order comparator, starting from the type's default bounds.
func (s *Int96Statistics) getMinMax(values []parquet.Int96) (min, max parquet.Int96) {
	min, max = s.defaultMin(), s.defaultMax()
	for _, v := range values {
		min = s.minval(min, v)
		max = s.maxval(max, v)
	}
	return
}
func (s *Int96Statistics) getMinMaxSpaced(values []parquet.Int96, validBits []byte, validBitsOffset int64) (min, max parquet.Int96) {
min = s.defaultMin()
max = s.defaultMax()
if s.bitSetReader == nil {
s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
} else {
s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
}
for {
run := s.bitSetReader.NextRun()
if run.Length == 0 {
break
}
for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
min = s.minval(min, v)
max = s.maxval(max, v)
}
}
return
}
func (s *Int96Statistics) Min() parquet.Int96 { return s.min }
func (s *Int96Statistics) Max() parquet.Int96 { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *Int96Statistics) Merge(other TypedStatistics) {
rhs, ok := other.(*Int96Statistics)
if !ok {
panic("incompatible stat type merge")
}
s.statistics.merge(rhs)
if rhs.HasMinMax() {
s.SetMinMax(rhs.Min(), rhs.Max())
}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Int96Statistics) Update(values []parquet.Int96, numNull int64) {
s.IncNulls(numNull)
s.nvalues += int64(len(values))
if len(values) == 0 {
return
}
s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Int96Statistics) UpdateSpaced(values []parquet.Int96, validBits []byte, validBitsOffset, numNull int64) {
s.IncNulls(numNull)
notnull := int64(len(values)) - numNull
s.nvalues += notnull
if notnull == 0 {
return
}
s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
// UpdateFromArrow is not implemented for Int96 statistics and returns an
// arrow.ErrNotImplemented error whenever there are non-null values. Counts
// are still incremented first when updateCounts is true.
func (s *Int96Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	// an all-null array carries no min/max information, so it succeeds
	if values.NullN() == values.Len() {
		return nil
	}
	return fmt.Errorf("%w: update int96 stats from Arrow", arrow.ErrNotImplemented)
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *Int96Statistics) SetMinMax(argMin, argMax parquet.Int96) {
maybeMinMax := s.cleanStat([2]parquet.Int96{argMin, argMax})
if maybeMinMax == nil {
return
}
min := (*maybeMinMax)[0]
max := (*maybeMinMax)[1]
if !s.hasMinMax {
s.hasMinMax = true
s.min = min
s.max = max
} else {
if !s.less(s.min, min) {
s.min = min
}
if s.less(s.max, max) {
s.max = max
}
}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *Int96Statistics) EncodeMin() []byte {
if s.HasMinMax() {
return s.plainEncode(s.min)
}
return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *Int96Statistics) EncodeMax() []byte {
if s.HasMinMax() {
return s.plainEncode(s.max)
}
return nil
}
// Encode returns a populated EncodedStatistics object
func (s *Int96Statistics) Encode() (enc EncodedStatistics, err error) {
defer func() {
if r := recover(); r != nil {
err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
}
}()
if s.HasMinMax() {
enc.SetMax(s.EncodeMax())
enc.SetMin(s.EncodeMin())
}
if s.HasNullCount() {
enc.SetNullCount(s.NullCount())
enc.AllNullValue = s.NumValues() == 0
}
if s.HasDistinctCount() {
enc.SetDistinctCount(s.DistinctCount())
}
return
}
type minmaxPairFloat32 [2]float32
// Float32Statistics is the typed interface for managing stats for a column
// of Float32 type.
type Float32Statistics struct {
statistics
min float32
max float32
bitSetReader bitutils.SetBitRunReader
}
// NewFloat32Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Float
func NewFloat32Statistics(descr *schema.Column, mem memory.Allocator) *Float32Statistics {
if descr.PhysicalType() != parquet.Types.Float {
panic(fmt.Errorf("parquet: invalid type %s for constructing a Float32 stat object", descr.PhysicalType()))
}
return &Float32Statistics{
statistics: statistics{
descr: descr,
hasNullCount: true,
hasDistinctCount: true,
order: descr.SortOrder(),
encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
mem: mem,
},
}
}
// NewFloat32StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
func NewFloat32StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float32Statistics {
	ret := NewFloat32Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len of a nil slice is 0, so the previous explicit nil check was
	// redundant (staticcheck S1009).
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	// preserves existing semantics: min/max are considered present if either
	// bound was set in the encoded stats
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
func (s *Float32Statistics) plainEncode(src float32) []byte {
s.encoder.(encoding.Float32Encoder).Put([]float32{src})
buf, err := s.encoder.FlushValues()
if err != nil {
panic(err) // recovered by Encode
}
defer buf.Release()
out := make([]byte, buf.Len())
copy(out, buf.Bytes())
return out
}
func (s *Float32Statistics) plainDecode(src []byte) float32 {
var buf [1]float32
decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
decoder.SetData(1, src)
decoder.(encoding.Float32Decoder).Decode(buf[:])
return buf[0]
}
func (s *Float32Statistics) minval(a, b float32) float32 {
if s.less(a, b) {
return a
}
return b
}
func (s *Float32Statistics) maxval(a, b float32) float32 {
if s.less(a, b) {
return b
}
return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *Float32Statistics) MinMaxEqual(rhs *Float32Statistics) bool {
return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *Float32Statistics) Equals(other TypedStatistics) bool {
if s.Type() != other.Type() {
return false
}
rhs, ok := other.(*Float32Statistics)
if !ok {
return false
}
if s.HasMinMax() != rhs.HasMinMax() {
return false
}
return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
s.NullCount() == rhs.NullCount() &&
s.DistinctCount() == rhs.DistinctCount() &&
s.NumValues() == rhs.NumValues()
}
// coalesce returns fallback if val is NaN, otherwise val; used to keep NaN
// values from participating in min/max comparisons.
func (s *Float32Statistics) coalesce(val, fallback float32) float32 {
	if math.IsNaN(float64(val)) {
		return fallback
	}
	return val
}
func (s *Float32Statistics) getMinMax(values []float32) (min, max float32) {
defMin := s.defaultMin()
defMax := s.defaultMax()
min = defMin
max = defMax
for _, v := range values {
min = s.minval(min, s.coalesce(v, defMin))
max = s.maxval(max, s.coalesce(v, defMax))
}
return
}
// getMinMaxSpaced computes the min and max of the non-null values in values,
// using validBits and validBitsOffset to find the runs of set (non-null)
// bits. NaN values are excluded by coalescing them to the default bounds.
func (s *Float32Statistics) getMinMaxSpaced(values []float32, validBits []byte, validBitsOffset int64) (min, max float32) {
	// Use the typed coalesce method and hoist the default bounds out of the
	// loop: the previous code called the untyped coalesce helper, boxing
	// every element into an interface value (an allocation per element of
	// this hot loop), and re-invoked s.defaultMin()/s.defaultMax() on each
	// iteration even though they are loop-invariant.
	defMin, defMax := s.defaultMin(), s.defaultMax()
	min, max = defMin, defMax
	// lazily create the run reader once and reuse it across calls
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			min = s.minval(min, s.coalesce(v, defMin))
			max = s.maxval(max, s.coalesce(v, defMax))
		}
	}
	return
}
func (s *Float32Statistics) Min() float32 { return s.min }
func (s *Float32Statistics) Max() float32 { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *Float32Statistics) Merge(other TypedStatistics) {
rhs, ok := other.(*Float32Statistics)
if !ok {
panic("incompatible stat type merge")
}
s.statistics.merge(rhs)
if rhs.HasMinMax() {
s.SetMinMax(rhs.Min(), rhs.Max())
}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Float32Statistics) Update(values []float32, numNull int64) {
s.IncNulls(numNull)
s.nvalues += int64(len(values))
if len(values) == 0 {
return
}
s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Float32Statistics) UpdateSpaced(values []float32, validBits []byte, validBitsOffset, numNull int64) {
s.IncNulls(numNull)
notnull := int64(len(values)) - numNull
s.nvalues += notnull
if notnull == 0 {
return
}
s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
func (s *Float32Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
if updateCounts {
s.IncNulls(int64(values.NullN()))
s.nvalues += int64(values.Len() - values.NullN())
}
if values.NullN() == values.Len() {
return nil
}
if values.DataType().(arrow.FixedWidthDataType).Bytes() != arrow.Float32SizeBytes {
return fmt.Errorf("%w: cannot update float32 stats with %s arrow array",
arrow.ErrInvalid, values.DataType())
}
rawBytes := values.Data().Buffers()[1].Bytes()[values.Data().Offset()*arrow.Float32SizeBytes:]
s.SetMinMax(s.getMinMax(arrow.Float32Traits.CastFromBytes(rawBytes)))
return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *Float32Statistics) SetMinMax(argMin, argMax float32) {
maybeMinMax := s.cleanStat([2]float32{argMin, argMax})
if maybeMinMax == nil {
return
}
min := (*maybeMinMax)[0]
max := (*maybeMinMax)[1]
if !s.hasMinMax {
s.hasMinMax = true
s.min = min
s.max = max
} else {
if !s.less(s.min, min) {
s.min = min
}
if s.less(s.max, max) {
s.max = max
}
}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *Float32Statistics) EncodeMin() []byte {
if s.HasMinMax() {
return s.plainEncode(s.min)
}
return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *Float32Statistics) EncodeMax() []byte {
if s.HasMinMax() {
return s.plainEncode(s.max)
}
return nil
}
// Encode returns a populated EncodedStatistics object
func (s *Float32Statistics) Encode() (enc EncodedStatistics, err error) {
defer func() {
if r := recover(); r != nil {
err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
}
}()
if s.HasMinMax() {
enc.SetMax(s.EncodeMax())
enc.SetMin(s.EncodeMin())
}
if s.HasNullCount() {
enc.SetNullCount(s.NullCount())
enc.AllNullValue = s.NumValues() == 0
}
if s.HasDistinctCount() {
enc.SetDistinctCount(s.DistinctCount())
}
return
}
// minmaxPairFloat64 carries a candidate (min, max) pair through cleanStat.
type minmaxPairFloat64 [2]float64
// Float64Statistics is the typed interface for managing stats for a column
// of Float64 type.
type Float64Statistics struct {
	statistics
	min float64
	max float64
	// bitSetReader is lazily created and then reused by getMinMaxSpaced to
	// iterate runs of set bits in a validity bitmap.
	bitSetReader bitutils.SetBitRunReader
}
// NewFloat64Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Double
func NewFloat64Statistics(descr *schema.Column, mem memory.Allocator) *Float64Statistics {
	if descr.PhysicalType() != parquet.Types.Double {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a Float64 stat object", descr.PhysicalType()))
	}
	return &Float64Statistics{
		statistics: statistics{
			descr:            descr,
			hasNullCount:     true,
			hasDistinctCount: true,
			order:            descr.SortOrder(),
			// stats are always plain-encoded regardless of the column's data encoding
			encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
			mem:     mem,
		},
	}
}
// NewFloat64StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
//
// nvalues is added to the (zero) running value count; null/distinct counts and
// the plain-encoded min/max are taken from encoded when present.
func NewFloat64StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float64Statistics {
	ret := NewFloat64Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len(nil) == 0, so the former explicit nil checks were redundant (staticcheck S1009)
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode returns src encoded with plain encoding; panics on encoder
// failure (the panic is recovered by Encode).
func (s *Float64Statistics) plainEncode(src float64) []byte {
	s.encoder.(encoding.Float64Encoder).Put([]float64{src})
	buf, err := s.encoder.FlushValues()
	if err != nil {
		panic(err) // recovered by Encode
	}
	defer buf.Release()
	// copy out of the encoder-owned buffer before releasing it
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// plainDecode decodes a single plain-encoded value from src.
func (s *Float64Statistics) plainDecode(src []byte) float64 {
	var buf [1]float64
	decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
	decoder.SetData(1, src)
	decoder.(encoding.Float64Decoder).Decode(buf[:])
	return buf[0]
}
// minval returns the smaller of a and b per the column's sort order.
func (s *Float64Statistics) minval(a, b float64) float64 {
	if s.less(a, b) {
		return a
	}
	return b
}
// maxval returns the larger of a and b per the column's sort order.
func (s *Float64Statistics) maxval(a, b float64) float64 {
	if s.less(a, b) {
		return b
	}
	return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *Float64Statistics) MinMaxEqual(rhs *Float64Statistics) bool {
	return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *Float64Statistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() {
		return false
	}
	rhs, ok := other.(*Float64Statistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	// NOTE(review): when neither side has a min/max recorded this is false
	// even if all counts match — preserved from the original behavior.
	if !s.hasMinMax || !s.MinMaxEqual(rhs) {
		return false
	}
	return s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// coalesce returns fallback when val is NaN (NaNs are excluded from parquet
// float statistics), otherwise val.
func (s *Float64Statistics) coalesce(val, fallback float64) float64 {
	// val is already a float64; the previous float64(val) conversion was a no-op
	if math.IsNaN(val) {
		return fallback
	}
	return val
}
// getMinMax scans values and returns the observed minimum and maximum,
// substituting the sort-order default for any NaN encountered.
func (s *Float64Statistics) getMinMax(values []float64) (min, max float64) {
	fallbackMin, fallbackMax := s.defaultMin(), s.defaultMax()
	min, max = fallbackMin, fallbackMax
	for _, v := range values {
		min = s.minval(min, s.coalesce(v, fallbackMin))
		max = s.maxval(max, s.coalesce(v, fallbackMax))
	}
	return min, max
}
// getMinMaxSpaced is like getMinMax but only considers elements of values
// whose corresponding bit in validBits (starting at validBitsOffset) is set.
func (s *Float64Statistics) getMinMaxSpaced(values []float64, validBits []byte, validBitsOffset int64) (min, max float64) {
	defMin := s.defaultMin()
	defMax := s.defaultMax()
	min = defMin
	max = defMax
	// lazily create (or reset and reuse) the validity-bitmap run reader
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			// use the typed coalesce (same NaN handling as getMinMax) instead of
			// the untyped package-level coalesce, avoiding an interface boxing
			// allocation and a type assertion per element
			min = s.minval(min, s.coalesce(v, defMin))
			max = s.maxval(max, s.coalesce(v, defMax))
		}
	}
	return
}
// Min returns the current recorded minimum (meaningful only when HasMinMax is true).
func (s *Float64Statistics) Min() float64 { return s.min }
// Max returns the current recorded maximum (meaningful only when HasMinMax is true).
func (s *Float64Statistics) Max() float64 { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *Float64Statistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*Float64Statistics)
	if !ok {
		panic("incompatible stat type merge")
	}
	s.statistics.merge(rhs)
	if rhs.HasMinMax() {
		s.SetMinMax(rhs.Min(), rhs.Max())
	}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Float64Statistics) Update(values []float64, numNull int64) {
	s.IncNulls(numNull)
	s.nvalues += int64(len(values))
	if len(values) == 0 {
		return
	}
	s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Float64Statistics) UpdateSpaced(values []float64, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	// values includes the null slots here, so subtract them from the count
	notnull := int64(len(values)) - numNull
	s.nvalues += notnull
	if notnull == 0 {
		return
	}
	s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
// UpdateFromArrow updates the stats directly from an Arrow array, optionally
// also updating the null/value counts.
func (s *Float64Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	// nothing to do for min/max when every element is null
	if values.NullN() == values.Len() {
		return nil
	}
	if values.DataType().(arrow.FixedWidthDataType).Bytes() != arrow.Float64SizeBytes {
		return fmt.Errorf("%w: cannot update float64 stats with %s arrow array",
			arrow.ErrInvalid, values.DataType())
	}
	// reinterpret the array's value buffer (buffer 1), skipping the array offset
	rawBytes := values.Data().Buffers()[1].Bytes()[values.Data().Offset()*arrow.Float64SizeBytes:]
	// NOTE(review): null slots are not masked out here; getMinMax sees the raw
	// buffer contents of null positions — confirm those are always non-NaN-safe.
	s.SetMinMax(s.getMinMax(arrow.Float64Traits.CastFromBytes(rawBytes)))
	return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *Float64Statistics) SetMinMax(argMin, argMax float64) {
	// cleanStat filters out pairs that must not be recorded (e.g. invalid values)
	maybeMinMax := s.cleanStat([2]float64{argMin, argMax})
	if maybeMinMax == nil {
		return
	}
	min := (*maybeMinMax)[0]
	max := (*maybeMinMax)[1]
	if !s.hasMinMax {
		// first valid pair: seed the stats
		s.hasMinMax = true
		s.min = min
		s.max = max
	} else {
		// otherwise only tighten the existing bounds per the sort order
		if !s.less(s.min, min) {
			s.min = min
		}
		if s.less(s.max, max) {
			s.max = max
		}
	}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *Float64Statistics) EncodeMin() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.min)
	}
	// no min/max recorded yet
	return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *Float64Statistics) EncodeMax() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.max)
	}
	// no min/max recorded yet
	return nil
}
// Encode returns a populated EncodedStatistics object
func (s *Float64Statistics) Encode() (enc EncodedStatistics, err error) {
	defer func() {
		// plainEncode panics on encoder failures; surface them as err instead
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// true when every value in the column chunk was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
// minmaxPairBoolean carries a candidate (min, max) pair through cleanStat.
type minmaxPairBoolean [2]bool
// BooleanStatistics is the typed interface for managing stats for a column
// of Boolean type.
type BooleanStatistics struct {
	statistics
	min bool
	max bool
	// bitSetReader is lazily created and reused by getMinMaxSpaced to walk
	// runs of set bits in a validity bitmap.
	bitSetReader bitutils.SetBitRunReader
}
// NewBooleanStatistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.Boolean
func NewBooleanStatistics(descr *schema.Column, mem memory.Allocator) *BooleanStatistics {
	if descr.PhysicalType() != parquet.Types.Boolean {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a Boolean stat object", descr.PhysicalType()))
	}
	return &BooleanStatistics{
		statistics: statistics{
			descr:            descr,
			hasNullCount:     true,
			hasDistinctCount: true,
			order:            descr.SortOrder(),
			// stats are always plain-encoded regardless of the column's data encoding
			encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
			mem:     mem,
		},
	}
}
// NewBooleanStatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
//
// nvalues is added to the (zero) running value count; null/distinct counts and
// the plain-encoded min/max are taken from encoded when present.
func NewBooleanStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *BooleanStatistics {
	ret := NewBooleanStatistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len(nil) == 0, so the former explicit nil checks were redundant (staticcheck S1009)
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode returns src encoded with plain encoding; panics on encoder
// failure (the panic is recovered by Encode).
func (s *BooleanStatistics) plainEncode(src bool) []byte {
	s.encoder.(encoding.BooleanEncoder).Put([]bool{src})
	buf, err := s.encoder.FlushValues()
	if err != nil {
		panic(err) // recovered by Encode
	}
	defer buf.Release()
	// copy out of the encoder-owned buffer before releasing it
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// plainDecode decodes a single plain-encoded value from src.
func (s *BooleanStatistics) plainDecode(src []byte) bool {
	var buf [1]bool
	decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
	decoder.SetData(1, src)
	decoder.(encoding.BooleanDecoder).Decode(buf[:])
	return buf[0]
}
// minval returns the smaller of a and b per the column's sort order.
func (s *BooleanStatistics) minval(a, b bool) bool {
	if s.less(a, b) {
		return a
	}
	return b
}
// maxval returns the larger of a and b per the column's sort order.
func (s *BooleanStatistics) maxval(a, b bool) bool {
	if s.less(a, b) {
		return b
	}
	return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *BooleanStatistics) MinMaxEqual(rhs *BooleanStatistics) bool {
	return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *BooleanStatistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() {
		return false
	}
	rhs, ok := other.(*BooleanStatistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	// NOTE(review): when neither side has a min/max recorded this returns
	// false even if all counts match — confirm that is intended.
	return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
		s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// getMinMax scans values and returns the observed minimum and maximum.
func (s *BooleanStatistics) getMinMax(values []bool) (min, max bool) {
	defMin := s.defaultMin()
	defMax := s.defaultMax()
	min = defMin
	max = defMax
	for _, v := range values {
		min = s.minval(min, v)
		max = s.maxval(max, v)
	}
	return
}
// getMinMaxSpaced is like getMinMax but only considers elements of values
// whose corresponding bit in validBits (starting at validBitsOffset) is set.
func (s *BooleanStatistics) getMinMaxSpaced(values []bool, validBits []byte, validBitsOffset int64) (min, max bool) {
	min = s.defaultMin()
	max = s.defaultMax()
	// lazily create (or reset and reuse) the validity-bitmap run reader
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			min = s.minval(min, v)
			max = s.maxval(max, v)
		}
	}
	return
}
// Min returns the current recorded minimum (meaningful only when HasMinMax is true).
func (s *BooleanStatistics) Min() bool { return s.min }
// Max returns the current recorded maximum (meaningful only when HasMinMax is true).
func (s *BooleanStatistics) Max() bool { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *BooleanStatistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*BooleanStatistics)
	if !ok {
		panic("incompatible stat type merge")
	}
	s.statistics.merge(rhs)
	if rhs.HasMinMax() {
		s.SetMinMax(rhs.Min(), rhs.Max())
	}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *BooleanStatistics) Update(values []bool, numNull int64) {
	s.IncNulls(numNull)
	s.nvalues += int64(len(values))
	if len(values) == 0 {
		return
	}
	s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *BooleanStatistics) UpdateSpaced(values []bool, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	// values includes the null slots here, so subtract them from the count
	notnull := int64(len(values)) - numNull
	s.nvalues += notnull
	if notnull == 0 {
		return
	}
	s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
// UpdateFromArrow updates the counts from an Arrow array; min/max update from
// Arrow boolean arrays is not implemented.
func (s *BooleanStatistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	// counts are still maintained even though min/max cannot be updated
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	if values.NullN() == values.Len() {
		return nil
	}
	return fmt.Errorf("%w: update boolean stats from Arrow", arrow.ErrNotImplemented)
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *BooleanStatistics) SetMinMax(argMin, argMax bool) {
	// cleanStat filters out pairs that must not be recorded
	maybeMinMax := s.cleanStat([2]bool{argMin, argMax})
	if maybeMinMax == nil {
		return
	}
	min := (*maybeMinMax)[0]
	max := (*maybeMinMax)[1]
	if !s.hasMinMax {
		// first valid pair: seed the stats
		s.hasMinMax = true
		s.min = min
		s.max = max
	} else {
		// otherwise only tighten the existing bounds per the sort order
		if !s.less(s.min, min) {
			s.min = min
		}
		if s.less(s.max, max) {
			s.max = max
		}
	}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *BooleanStatistics) EncodeMin() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.min)
	}
	// no min/max recorded yet
	return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *BooleanStatistics) EncodeMax() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.max)
	}
	// no min/max recorded yet
	return nil
}
// Encode returns a populated EncodedStatistics object
func (s *BooleanStatistics) Encode() (enc EncodedStatistics, err error) {
	defer func() {
		// plainEncode panics on encoder failures; surface them as err instead
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// true when every value in the column chunk was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
// minmaxPairByteArray carries a candidate (min, max) pair through cleanStat.
type minmaxPairByteArray [2]parquet.ByteArray
// ByteArrayStatistics is the typed interface for managing stats for a column
// of ByteArray type.
type ByteArrayStatistics struct {
	statistics
	min parquet.ByteArray
	max parquet.ByteArray
	// bitSetReader is lazily created and reused by getMinMaxSpaced to walk
	// runs of set bits in a validity bitmap.
	bitSetReader bitutils.SetBitRunReader
}
// NewByteArrayStatistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.ByteArray
func NewByteArrayStatistics(descr *schema.Column, mem memory.Allocator) *ByteArrayStatistics {
	if descr.PhysicalType() != parquet.Types.ByteArray {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a ByteArray stat object", descr.PhysicalType()))
	}
	return &ByteArrayStatistics{
		statistics: statistics{
			descr:            descr,
			hasNullCount:     true,
			hasDistinctCount: true,
			order:            descr.SortOrder(),
			// stats are always plain-encoded regardless of the column's data encoding
			encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
			mem:     mem,
		},
		// start with empty (non-nil) byte arrays rather than nil
		min: make([]byte, 0),
		max: make([]byte, 0),
	}
}
// NewByteArrayStatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
//
// nvalues is added to the (zero) running value count; null/distinct counts and
// the plain-encoded min/max are taken from encoded when present.
func NewByteArrayStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *ByteArrayStatistics {
	ret := NewByteArrayStatistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len(nil) == 0, so the former explicit nil checks were redundant (staticcheck S1009)
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode copies src; per the parquet spec, BYTE_ARRAY stats are encoded
// without the leading length prefix, so the raw bytes ARE the encoding.
func (s *ByteArrayStatistics) plainEncode(src parquet.ByteArray) []byte {
	out := make([]byte, len(src))
	copy(out, src)
	return out
}
// plainDecode returns src as-is (no length prefix to strip); note the result
// aliases the input slice rather than copying it.
func (s *ByteArrayStatistics) plainDecode(src []byte) parquet.ByteArray {
	return src
}
// minval returns the smaller of a and b per the column's sort order; a nil
// operand (no value yet) acts as the identity.
func (s *ByteArrayStatistics) minval(a, b parquet.ByteArray) parquet.ByteArray {
	switch {
	case a == nil:
		return b
	case b == nil:
		return a
	case s.less(a, b):
		return a
	default:
		return b
	}
}
// maxval returns the larger of a and b per the column's sort order; a nil
// operand (no value yet) acts as the identity.
func (s *ByteArrayStatistics) maxval(a, b parquet.ByteArray) parquet.ByteArray {
	switch {
	case a == nil:
		return b
	case b == nil:
		return a
	case s.less(a, b):
		return b
	default:
		return a
	}
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *ByteArrayStatistics) MinMaxEqual(rhs *ByteArrayStatistics) bool {
	return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *ByteArrayStatistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() {
		return false
	}
	rhs, ok := other.(*ByteArrayStatistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	// NOTE(review): when neither side has a min/max recorded this returns
	// false even if all counts match — confirm that is intended.
	return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
		s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// getMinMax scans values and returns the observed minimum and maximum.
func (s *ByteArrayStatistics) getMinMax(values []parquet.ByteArray) (min, max parquet.ByteArray) {
	defMin := s.defaultMin()
	defMax := s.defaultMax()
	min = defMin
	max = defMax
	for _, v := range values {
		min = s.minval(min, v)
		max = s.maxval(max, v)
	}
	return
}
// getMinMaxSpaced is like getMinMax but only considers elements of values
// whose corresponding bit in validBits (starting at validBitsOffset) is set.
func (s *ByteArrayStatistics) getMinMaxSpaced(values []parquet.ByteArray, validBits []byte, validBitsOffset int64) (min, max parquet.ByteArray) {
	min = s.defaultMin()
	max = s.defaultMax()
	// lazily create (or reset and reuse) the validity-bitmap run reader
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			min = s.minval(min, v)
			max = s.maxval(max, v)
		}
	}
	return
}
// Min returns the current recorded minimum (meaningful only when HasMinMax is true).
func (s *ByteArrayStatistics) Min() parquet.ByteArray { return s.min }
// Max returns the current recorded maximum (meaningful only when HasMinMax is true).
func (s *ByteArrayStatistics) Max() parquet.ByteArray { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *ByteArrayStatistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*ByteArrayStatistics)
	if !ok {
		panic("incompatible stat type merge")
	}
	s.statistics.merge(rhs)
	if rhs.HasMinMax() {
		s.SetMinMax(rhs.Min(), rhs.Max())
	}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *ByteArrayStatistics) Update(values []parquet.ByteArray, numNull int64) {
	s.IncNulls(numNull)
	s.nvalues += int64(len(values))
	if len(values) == 0 {
		return
	}
	s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *ByteArrayStatistics) UpdateSpaced(values []parquet.ByteArray, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	// values includes the null slots here, so subtract them from the count
	notnull := int64(len(values)) - numNull
	s.nvalues += notnull
	if notnull == 0 {
		return
	}
	s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
// UpdateFromArrow updates the stats directly from an Arrow binary-like array,
// optionally also updating the null/value counts.
func (s *ByteArrayStatistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	// nothing to do for min/max when every element is null
	if values.NullN() == values.Len() {
		return nil
	}
	if !arrow.IsBaseBinary(values.DataType().ID()) {
		return fmt.Errorf("%w: can only update ByteArray stats from binary or string array", arrow.ErrInvalid)
	}
	var (
		min       = s.defaultMin()
		max       = s.defaultMax()
		arr       = values.(array.BinaryLike)
		data      = arr.ValueBytes()
		curOffset = int64(0)
	)
	// walk the flat value buffer using per-element lengths instead of
	// materializing each element
	for i := 0; i < arr.Len(); i++ {
		nextOffset := curOffset + int64(arr.ValueLen(i))
		v := data[curOffset:nextOffset]
		curOffset = nextOffset
		// NOTE(review): zero-length values are skipped here (null slots have
		// zero length, but so do genuine empty strings) — confirm intended.
		if len(v) == 0 {
			continue
		}
		min = s.minval(min, v)
		max = s.maxval(max, v)
	}
	s.SetMinMax(min, max)
	return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *ByteArrayStatistics) SetMinMax(argMin, argMax parquet.ByteArray) {
	// cleanStat filters out pairs that must not be recorded
	maybeMinMax := s.cleanStat([2]parquet.ByteArray{argMin, argMax})
	if maybeMinMax == nil {
		return
	}
	min := (*maybeMinMax)[0]
	max := (*maybeMinMax)[1]
	if !s.hasMinMax {
		// first valid pair: seed the stats
		s.hasMinMax = true
		s.min = min
		s.max = max
	} else {
		// otherwise only tighten the existing bounds per the sort order
		if !s.less(s.min, min) {
			s.min = min
		}
		if s.less(s.max, max) {
			s.max = max
		}
	}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *ByteArrayStatistics) EncodeMin() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.min)
	}
	// no min/max recorded yet
	return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *ByteArrayStatistics) EncodeMax() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.max)
	}
	// no min/max recorded yet
	return nil
}
// Encode returns a populated EncodedStatistics object
func (s *ByteArrayStatistics) Encode() (enc EncodedStatistics, err error) {
	defer func() {
		// plainEncode panics on encoder failures; surface them as err instead
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// true when every value in the column chunk was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
// minmaxPairFixedLenByteArray carries a candidate (min, max) pair through cleanStat.
type minmaxPairFixedLenByteArray [2]parquet.FixedLenByteArray
// FixedLenByteArrayStatistics is the typed interface for managing stats for a column
// of FixedLenByteArray type.
type FixedLenByteArrayStatistics struct {
	statistics
	min parquet.FixedLenByteArray
	max parquet.FixedLenByteArray
	// bitSetReader is lazily created and reused by getMinMaxSpaced to walk
	// runs of set bits in a validity bitmap.
	bitSetReader bitutils.SetBitRunReader
}
// NewFixedLenByteArrayStatistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.FixedLenByteArray
func NewFixedLenByteArrayStatistics(descr *schema.Column, mem memory.Allocator) *FixedLenByteArrayStatistics {
	if descr.PhysicalType() != parquet.Types.FixedLenByteArray {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a FixedLenByteArray stat object", descr.PhysicalType()))
	}
	return &FixedLenByteArrayStatistics{
		statistics: statistics{
			descr:            descr,
			hasNullCount:     true,
			hasDistinctCount: true,
			order:            descr.SortOrder(),
			// stats are always plain-encoded regardless of the column's data encoding
			encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
			mem:     mem,
		},
	}
}
// NewFixedLenByteArrayStatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
//
// nvalues is added to the (zero) running value count; null/distinct counts and
// the plain-encoded min/max are taken from encoded when present.
func NewFixedLenByteArrayStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *FixedLenByteArrayStatistics {
	ret := NewFixedLenByteArrayStatistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len(nil) == 0, so the former explicit nil checks were redundant (staticcheck S1009)
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode returns src encoded with plain encoding; panics on encoder
// failure (the panic is recovered by Encode).
func (s *FixedLenByteArrayStatistics) plainEncode(src parquet.FixedLenByteArray) []byte {
	s.encoder.(encoding.FixedLenByteArrayEncoder).Put([]parquet.FixedLenByteArray{src})
	buf, err := s.encoder.FlushValues()
	if err != nil {
		panic(err) // recovered by Encode
	}
	defer buf.Release()
	// copy out of the encoder-owned buffer before releasing it
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// plainDecode decodes a single plain-encoded value from src.
func (s *FixedLenByteArrayStatistics) plainDecode(src []byte) parquet.FixedLenByteArray {
	var buf [1]parquet.FixedLenByteArray
	decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
	decoder.SetData(1, src)
	decoder.(encoding.FixedLenByteArrayDecoder).Decode(buf[:])
	return buf[0]
}
// minval returns the smaller of a and b per the column's sort order; a nil
// operand (no value yet) acts as the identity.
func (s *FixedLenByteArrayStatistics) minval(a, b parquet.FixedLenByteArray) parquet.FixedLenByteArray {
	switch {
	case a == nil:
		return b
	case b == nil:
		return a
	case s.less(a, b):
		return a
	default:
		return b
	}
}
// maxval returns the larger of a and b per the column's sort order; a nil
// operand (no value yet) acts as the identity.
func (s *FixedLenByteArrayStatistics) maxval(a, b parquet.FixedLenByteArray) parquet.FixedLenByteArray {
	switch {
	case a == nil:
		return b
	case b == nil:
		return a
	case s.less(a, b):
		return b
	default:
		return a
	}
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values
func (s *FixedLenByteArrayStatistics) MinMaxEqual(rhs *FixedLenByteArrayStatistics) bool {
	return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
func (s *FixedLenByteArrayStatistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() {
		return false
	}
	rhs, ok := other.(*FixedLenByteArrayStatistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	// NOTE(review): when neither side has a min/max recorded this returns
	// false even if all counts match — confirm that is intended.
	return (s.hasMinMax && s.MinMaxEqual(rhs)) &&
		s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// getMinMax scans values and returns the observed minimum and maximum.
func (s *FixedLenByteArrayStatistics) getMinMax(values []parquet.FixedLenByteArray) (min, max parquet.FixedLenByteArray) {
	defMin := s.defaultMin()
	defMax := s.defaultMax()
	min = defMin
	max = defMax
	for _, v := range values {
		min = s.minval(min, v)
		max = s.maxval(max, v)
	}
	return
}
// getMinMaxSpaced is like getMinMax but only considers elements of values
// whose corresponding bit in validBits (starting at validBitsOffset) is set.
func (s *FixedLenByteArrayStatistics) getMinMaxSpaced(values []parquet.FixedLenByteArray, validBits []byte, validBitsOffset int64) (min, max parquet.FixedLenByteArray) {
	min = s.defaultMin()
	max = s.defaultMax()
	// lazily create (or reset and reuse) the validity-bitmap run reader
	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}
	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			min = s.minval(min, v)
			max = s.maxval(max, v)
		}
	}
	return
}
// Min returns the current recorded minimum (meaningful only when HasMinMax is true).
func (s *FixedLenByteArrayStatistics) Min() parquet.FixedLenByteArray { return s.min }
// Max returns the current recorded maximum (meaningful only when HasMinMax is true).
func (s *FixedLenByteArrayStatistics) Max() parquet.FixedLenByteArray { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate.
func (s *FixedLenByteArrayStatistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*FixedLenByteArrayStatistics)
	if !ok {
		panic("incompatible stat type merge")
	}
	s.statistics.merge(rhs)
	if rhs.HasMinMax() {
		s.SetMinMax(rhs.Min(), rhs.Max())
	}
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *FixedLenByteArrayStatistics) Update(values []parquet.FixedLenByteArray, numNull int64) {
	s.IncNulls(numNull)
	s.nvalues += int64(len(values))
	if len(values) == 0 {
		return
	}
	s.SetMinMax(s.getMinMax(values))
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *FixedLenByteArrayStatistics) UpdateSpaced(values []parquet.FixedLenByteArray, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	// values includes the null slots here, so subtract them from the count
	notnull := int64(len(values)) - numNull
	s.nvalues += notnull
	if notnull == 0 {
		return
	}
	s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
}
// UpdateFromArrow updates the stats directly from an Arrow fixed-size-binary
// or decimal128 array, optionally also updating the null/value counts.
func (s *FixedLenByteArrayStatistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}
	// nothing to do for min/max when every element is null
	if values.NullN() == values.Len() {
		return nil
	}
	dt := values.DataType()
	if dt.ID() != arrow.FIXED_SIZE_BINARY && dt.ID() != arrow.DECIMAL {
		return fmt.Errorf("%w: only fixed size binary and decimal128 arrays are supported to update stats from arrow",
			arrow.ErrInvalid)
	}
	var (
		width = dt.(arrow.FixedWidthDataType).Bytes()
		// value buffer (buffer 1), skipping the array offset
		data = values.Data().Buffers()[1].Bytes()[values.Data().Offset()*width:]
		min  = s.defaultMin()
		max  = s.defaultMax()
	)
	// NOTE(review): null slots are not skipped, so their (undefined) buffer
	// bytes participate in the comparisons — confirm against callers.
	for i := 0; i < values.Len(); i++ {
		v := data[i*width : (i+1)*width]
		min = s.minval(min, v)
		// BUG FIX: previously s.maxval(min, v) — comparing against the running
		// minimum instead of the running maximum produced an incorrect max.
		max = s.maxval(max, v)
	}
	s.SetMinMax(min, max)
	return nil
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max
func (s *FixedLenByteArrayStatistics) SetMinMax(argMin, argMax parquet.FixedLenByteArray) {
	// cleanStat filters out pairs that must not be recorded
	maybeMinMax := s.cleanStat([2]parquet.FixedLenByteArray{argMin, argMax})
	if maybeMinMax == nil {
		return
	}
	min := (*maybeMinMax)[0]
	max := (*maybeMinMax)[1]
	if !s.hasMinMax {
		// first valid pair: seed the stats
		s.hasMinMax = true
		s.min = min
		s.max = max
	} else {
		// otherwise only tighten the existing bounds per the sort order
		if !s.less(s.min, min) {
			s.min = min
		}
		if s.less(s.max, max) {
			s.max = max
		}
	}
}
// EncodeMin returns the encoded min value with plain encoding.
//
// ByteArray stats do not include the length in the encoding.
func (s *FixedLenByteArrayStatistics) EncodeMin() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.min)
	}
	// no min/max recorded yet
	return nil
}
// EncodeMax returns the current encoded max value with plain encoding
//
// ByteArray stats do not include the length in the encoding
func (s *FixedLenByteArrayStatistics) EncodeMax() []byte {
	if s.HasMinMax() {
		return s.plainEncode(s.max)
	}
	// no min/max recorded yet
	return nil
}
// Encode returns a populated EncodedStatistics object
func (s *FixedLenByteArrayStatistics) Encode() (enc EncodedStatistics, err error) {
	defer func() {
		// plainEncode panics on encoder failures; surface them as err instead
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// true when every value in the column chunk was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
// minmaxPairFloat16 carries a candidate (min, max) pair through cleanStat;
// float16 values are stored as 2-byte FixedLenByteArrays.
type minmaxPairFloat16 [2]parquet.FixedLenByteArray
// Float16Statistics is the typed interface for managing stats for a column
// of Float16 type.
type Float16Statistics struct {
	statistics
	min parquet.FixedLenByteArray
	max parquet.FixedLenByteArray
	// bitSetReader is lazily created and reused by getMinMaxSpaced to walk
	// runs of set bits in a validity bitmap.
	bitSetReader bitutils.SetBitRunReader
}
// NewFloat16Statistics constructs an appropriate stat object type using the
// given column descriptor and allocator.
//
// Panics if the physical type of descr is not parquet.Type.FixedLenByteArray
// Panics if the logical type of descr is not schema.Float16LogicalType
func NewFloat16Statistics(descr *schema.Column, mem memory.Allocator) *Float16Statistics {
	if descr.PhysicalType() != parquet.Types.FixedLenByteArray {
		panic(fmt.Errorf("parquet: invalid type %s for constructing a Float16 stat object", descr.PhysicalType()))
	}
	if !descr.LogicalType().Equals(schema.Float16LogicalType{}) {
		panic(fmt.Errorf("parquet: invalid logical type %s for constructing a Float16 stat object", descr.LogicalType().String()))
	}
	return &Float16Statistics{
		statistics: statistics{
			descr:            descr,
			hasNullCount:     true,
			hasDistinctCount: true,
			order:            descr.SortOrder(),
			// stats are always plain-encoded regardless of the column's data encoding
			encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem),
			mem:     mem,
		},
	}
}
// NewFloat16StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
//
// nvalues is added to the (zero) running value count; null/distinct counts and
// the plain-encoded min/max are taken from encoded when present.
func NewFloat16StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float16Statistics {
	ret := NewFloat16Statistics(descr, mem)
	ret.nvalues += nvalues
	if encoded.IsSetNullCount() {
		ret.IncNulls(encoded.GetNullCount())
	}
	if encoded.IsSetDistinctCount() {
		ret.IncDistinct(encoded.GetDistinctCount())
	}
	// len(nil) == 0, so the former explicit nil checks were redundant (staticcheck S1009)
	if encodedMin := encoded.GetMin(); len(encodedMin) > 0 {
		ret.min = ret.plainDecode(encodedMin)
	}
	if encodedMax := encoded.GetMax(); len(encodedMax) > 0 {
		ret.max = ret.plainDecode(encodedMax)
	}
	ret.hasMinMax = encoded.IsSetMax() || encoded.IsSetMin()
	return ret
}
// plainEncode serializes src with the PLAIN encoding and returns a copy of
// the resulting bytes. It panics on encoder failure; Encode installs a
// recover handler that converts the panic into a returned error.
func (s *Float16Statistics) plainEncode(src parquet.FixedLenByteArray) []byte {
	s.encoder.(encoding.FixedLenByteArrayEncoder).Put([]parquet.FixedLenByteArray{src})
	buf, err := s.encoder.FlushValues()
	if err != nil {
		panic(err) // recovered by Encode
	}
	defer buf.Release()

	// copy out of the encoder-owned buffer before it is released
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// plainDecode decodes a single PLAIN-encoded value from src.
//
// NOTE(review): the count/error results of Decode are ignored here; this
// assumes src always contains at least one well-formed value — confirm
// against callers (min/max bytes produced by plainEncode).
func (s *Float16Statistics) plainDecode(src []byte) parquet.FixedLenByteArray {
	var buf [1]parquet.FixedLenByteArray

	decoder := encoding.NewDecoder(s.descr.PhysicalType(), parquet.Encodings.Plain, s.descr, s.mem)
	decoder.SetData(1, src)
	decoder.(encoding.FixedLenByteArrayDecoder).Decode(buf[:])
	return buf[0]
}
// minval returns the smaller of a and b per the column's ordering, treating
// nil as "unset" so the non-nil operand always wins.
func (s *Float16Statistics) minval(a, b parquet.FixedLenByteArray) parquet.FixedLenByteArray {
	if a == nil {
		return b
	}
	if b == nil {
		return a
	}
	if s.less(a, b) {
		return a
	}
	return b
}
// maxval returns the larger of a and b per the column's ordering, treating
// nil as "unset" so the non-nil operand always wins.
func (s *Float16Statistics) maxval(a, b parquet.FixedLenByteArray) parquet.FixedLenByteArray {
	if a == nil {
		return b
	}
	if b == nil {
		return a
	}
	if s.less(a, b) {
		return b
	}
	return a
}
// MinMaxEqual returns true if both stat objects have the same Min and Max values,
// as determined by the shared statistics equality helper.
func (s *Float16Statistics) MinMaxEqual(rhs *Float16Statistics) bool {
	return s.equal(s.min, rhs.min) && s.equal(s.max, rhs.max)
}
// Equals returns true only if both objects are the same type, have the same min and
// max values, null count, distinct count and number of values.
//
// Note that both sides must actually have min/max set: two stat objects that
// both lack min/max compare unequal, matching the original behavior.
func (s *Float16Statistics) Equals(other TypedStatistics) bool {
	if s.Type() != other.Type() || !s.descr.LogicalType().Equals(other.Descr().LogicalType()) {
		return false
	}
	rhs, ok := other.(*Float16Statistics)
	if !ok {
		return false
	}
	if s.HasMinMax() != rhs.HasMinMax() {
		return false
	}
	if !(s.hasMinMax && s.MinMaxEqual(rhs)) {
		return false
	}
	return s.NullCount() == rhs.NullCount() &&
		s.DistinctCount() == rhs.DistinctCount() &&
		s.NumValues() == rhs.NumValues()
}
// coalesce substitutes fallback for val when val decodes to a float16 NaN,
// so NaN never participates in min/max comparisons.
func (s *Float16Statistics) coalesce(val, fallback parquet.FixedLenByteArray) parquet.FixedLenByteArray {
	if float16.FromLEBytes(val).IsNaN() {
		return fallback
	}
	return val
}
// getMinMax scans values and returns the extrema, starting from the type's
// default sentinels. NaN entries are coalesced to the defaults so they do
// not affect the result.
func (s *Float16Statistics) getMinMax(values []parquet.FixedLenByteArray) (min, max parquet.FixedLenByteArray) {
	defMin, defMax := s.defaultMin(), s.defaultMax()
	min, max = defMin, defMax

	for _, v := range values {
		min = s.minval(min, s.coalesce(v, defMin))
		max = s.maxval(max, s.coalesce(v, defMax))
	}
	return min, max
}
// getMinMaxSpaced is like getMinMax but only considers values whose
// corresponding bit is set in the validity bitmap, iterating runs of set
// bits via the (lazily created, reused) bitSetReader.
func (s *Float16Statistics) getMinMaxSpaced(values []parquet.FixedLenByteArray, validBits []byte, validBitsOffset int64) (min, max parquet.FixedLenByteArray) {
	// hoist the loop-invariant defaults instead of recomputing them per
	// element, and use the typed s.coalesce (as getMinMax does) rather than
	// the interface-based package-level coalesce + type assertion — this
	// keeps float16 NaN handling consistent with the non-spaced path
	defMin := s.defaultMin()
	defMax := s.defaultMax()
	min = defMin
	max = defMax

	if s.bitSetReader == nil {
		s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(values)))
	} else {
		s.bitSetReader.Reset(validBits, validBitsOffset, int64(len(values)))
	}

	for {
		run := s.bitSetReader.NextRun()
		if run.Length == 0 {
			break
		}
		for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
			min = s.minval(min, s.coalesce(v, defMin))
			max = s.maxval(max, s.coalesce(v, defMax))
		}
	}
	return
}
// Min returns the current minimum value; callers should check HasMinMax()
// first, as the value is unset otherwise.
func (s *Float16Statistics) Min() parquet.FixedLenByteArray { return s.min }

// Max returns the current maximum value; callers should check HasMinMax()
// first, as the value is unset otherwise.
func (s *Float16Statistics) Max() parquet.FixedLenByteArray { return s.max }
// Merge merges the stats from other into this stat object, updating
// the null count, distinct count, number of values and the min/max if
// appropriate. Panics if other is not a *Float16Statistics.
func (s *Float16Statistics) Merge(other TypedStatistics) {
	rhs, ok := other.(*Float16Statistics)
	if !ok {
		panic("incompatible stat type merge")
	}

	// fold the counts first, then widen min/max only when rhs has them
	s.statistics.merge(rhs)
	if !rhs.HasMinMax() {
		return
	}
	s.SetMinMax(rhs.Min(), rhs.Max())
}
// Update is used to add more values to the current stat object, finding the
// min and max values etc.
func (s *Float16Statistics) Update(values []parquet.FixedLenByteArray, numNull int64) {
	s.IncNulls(numNull)
	s.nvalues += int64(len(values))

	// only touch min/max when there is at least one value to consider
	if len(values) > 0 {
		s.SetMinMax(s.getMinMax(values))
	}
}
// UpdateSpaced is just like Update, but for spaced values using validBits to determine
// and skip null values.
func (s *Float16Statistics) UpdateSpaced(values []parquet.FixedLenByteArray, validBits []byte, validBitsOffset, numNull int64) {
	s.IncNulls(numNull)
	notnull := int64(len(values)) - numNull
	s.nvalues += notnull

	// skip the min/max scan entirely when every slot was null
	if notnull != 0 {
		s.SetMinMax(s.getMinMaxSpaced(values, validBits, validBitsOffset))
	}
}
// UpdateFromArrow updates the null and value counts from the given Arrow
// array when updateCounts is true. Min/max tracking from Arrow data is not
// implemented for float16: unless the array is entirely null (nothing to
// track), a wrapped arrow.ErrNotImplemented is returned.
func (s *Float16Statistics) UpdateFromArrow(values arrow.Array, updateCounts bool) error {
	if updateCounts {
		s.IncNulls(int64(values.NullN()))
		s.nvalues += int64(values.Len() - values.NullN())
	}

	// an all-null array carries no min/max information, so it is not an error
	if values.NullN() == values.Len() {
		return nil
	}

	return fmt.Errorf("%w: update float16 stats from Arrow", arrow.ErrNotImplemented)
}
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the current max.
// The candidate pair is first passed through cleanStat, which may reject it
// entirely (nil result), in which case nothing changes.
func (s *Float16Statistics) SetMinMax(argMin, argMax parquet.FixedLenByteArray) {
	cleaned := s.cleanStat([2]parquet.FixedLenByteArray{argMin, argMax})
	if cleaned == nil {
		return
	}

	newMin, newMax := cleaned[0], cleaned[1]
	if !s.hasMinMax {
		// first observation: adopt both bounds as-is
		s.hasMinMax = true
		s.min = newMin
		s.max = newMax
		return
	}

	if !s.less(s.min, newMin) {
		s.min = newMin
	}
	if s.less(s.max, newMax) {
		s.max = newMax
	}
}
// EncodeMin returns the encoded min value with plain encoding, or nil when
// no min/max have been recorded.
//
// ByteArray stats do not include the length in the encoding.
func (s *Float16Statistics) EncodeMin() []byte {
	if !s.HasMinMax() {
		return nil
	}
	return s.plainEncode(s.min)
}
// EncodeMax returns the current encoded max value with plain encoding, or
// nil when no min/max have been recorded.
//
// ByteArray stats do not include the length in the encoding.
func (s *Float16Statistics) EncodeMax() []byte {
	if !s.HasMinMax() {
		return nil
	}
	return s.plainEncode(s.max)
}
// Encode returns a populated EncodedStatistics object
//
// Any panic raised while encoding (e.g. from plainEncode) is recovered here
// and surfaced through the named err return.
func (s *Float16Statistics) Encode() (enc EncodedStatistics, err error) {
	defer func() {
		if r := recover(); r != nil {
			err = shared_utils.FormatRecoveredError("unknown error type thrown from panic", r)
		}
	}()
	if s.HasMinMax() {
		enc.SetMax(s.EncodeMax())
		enc.SetMin(s.EncodeMin())
	}
	if s.HasNullCount() {
		enc.SetNullCount(s.NullCount())
		// flag chunks where every value was null
		enc.AllNullValue = s.NumValues() == 0
	}
	if s.HasDistinctCount() {
		enc.SetDistinctCount(s.DistinctCount())
	}
	return
}
// NewStatistics uses the type in the column descriptor to construct the appropriate
// typed stats object. If mem is nil, then memory.DefaultAllocator will be used.
//
// Panics for physical types with no stats implementation.
func NewStatistics(descr *schema.Column, mem memory.Allocator) TypedStatistics {
	if mem == nil {
		mem = memory.DefaultAllocator
	}
	switch descr.PhysicalType() {
	case parquet.Types.Int32:
		return NewInt32Statistics(descr, mem)
	case parquet.Types.Int64:
		return NewInt64Statistics(descr, mem)
	case parquet.Types.Int96:
		return NewInt96Statistics(descr, mem)
	case parquet.Types.Float:
		return NewFloat32Statistics(descr, mem)
	case parquet.Types.Double:
		return NewFloat64Statistics(descr, mem)
	case parquet.Types.Boolean:
		return NewBooleanStatistics(descr, mem)
	case parquet.Types.ByteArray:
		return NewByteArrayStatistics(descr, mem)
	case parquet.Types.FixedLenByteArray:
		// FLBA columns with the Float16 logical type get dedicated stats
		// with float16-aware ordering and NaN handling
		if descr.LogicalType().Equals(schema.Float16LogicalType{}) {
			return NewFloat16Statistics(descr, mem)
		}
		return NewFixedLenByteArrayStatistics(descr, mem)
	default:
		panic("not implemented")
	}
}
// NewStatisticsFromEncoded uses the provided information to initialize a typed stat object
// by checking the type of the provided column descriptor.
//
// If mem is nil, then memory.DefaultAllocator is used.
//
// Panics for physical types with no stats implementation.
func NewStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) TypedStatistics {
	if mem == nil {
		mem = memory.DefaultAllocator
	}
	switch descr.PhysicalType() {
	case parquet.Types.Int32:
		return NewInt32StatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.Int64:
		return NewInt64StatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.Int96:
		return NewInt96StatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.Float:
		return NewFloat32StatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.Double:
		return NewFloat64StatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.Boolean:
		return NewBooleanStatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.ByteArray:
		return NewByteArrayStatisticsFromEncoded(descr, mem, nvalues, encoded)
	case parquet.Types.FixedLenByteArray:
		// mirror NewStatistics: Float16 logical type on FLBA columns gets
		// the dedicated float16 stats implementation
		if descr.LogicalType().Equals(schema.Float16LogicalType{}) {
			return NewFloat16StatisticsFromEncoded(descr, mem, nvalues, encoded)
		}
		return NewFixedLenByteArrayStatisticsFromEncoded(descr, mem, nvalues, encoded)
	default:
		panic("not implemented")
	}
}