hll/utils.go (119 lines of code) (raw):

/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package hll import ( "fmt" "math" "github.com/apache/datasketches-go/internal" ) const ( defaultLgK = 12 lgInitListSize = 3 lgInitSetSize = 5 ) const ( minLogK = 4 maxLogK = 21 empty = 0 keyBits26 = 26 valBits6 = 6 keyMask26 = (1 << keyBits26) - 1 valMask6 = (1 << valBits6) - 1 resizeNumber = 3 resizeDenom = 4 couponRSEFactor = .409 //at transition point not the asymptote couponRSE = couponRSEFactor / (1 << 13) hiNibbleMask = 0xf0 loNibbleMask = 0x0f auxToken = 0xf ) var ( hllNonHipRSEFactor = math.Sqrt((3.0 * math.Log(2.0)) - 1.0) //1.03896 hllHipRSEFActor = math.Sqrt(math.Log(2.0)) //.8325546 ) type TgtHllType int type curMode int const ( curModeList curMode = 0 curModeSet curMode = 1 curModeHll curMode = 2 ) // Specifies the target type of HLL sketch to be created. It is a target in that the actual // allocation of the HLL array is deferred until sufficient number of items have been received by // the warm-up phases. // // These three target types are isomorphic representations of the same underlying HLL algorithm. // Thus, given the same value of <i>lgConfigK</i> and the same input, all three HLL target types // will produce identical estimates and have identical error distributions. // // The memory (and also the serialization) of the sketch during this early warmup phase starts // out very small (8 bytes, when empty) and then grows in increments of 4 bytes as required // until the full HLL array is allocated. This transition point occurs at about 10% of K for // sketches where lgConfigK is > 8. // // - Hll 8 This uses an 8-bit byte per HLL bucket. It is generally the // fastest in terms of update time, but has the largest storage footprint of about // K bytes. // // - Hll 6 This uses a 6-bit field per HLL bucket. It is the generally the next fastest // in terms of update time with a storage footprint of about 3/4 * K bytes. // // - Hll 4 This uses a 4-bit field per HLL bucket and for large counts may require // the use of a small internal auxiliary array for storing statistical exceptions, which are rare. // For the values of lgConfigK > 13 (K = 8192), // this additional array adds about 3% to the overall storage. It is generally the slowest in // terms of update time, but has the smallest storage footprint of about // K/2 * 1.03 bytes. const ( TgtHllTypeHll4 = TgtHllType(0) TgtHllTypeHll6 = TgtHllType(1) TgtHllTypeHll8 = TgtHllType(2) TgtHllTypeDefault = TgtHllTypeHll4 ) var ( // lgAuxArrInts is the Log2 table sizes for exceptions based on lgK from 0 to 26. //However, only lgK from 4 to 21 are used. lgAuxArrInts = []int{ 0, 2, 2, 2, 2, 2, 2, 3, 3, 3, //0 - 9 4, 4, 5, 5, 6, 7, 8, 9, 10, 11, //10 - 19 12, 13, 14, 15, 16, 17, 18, //20 - 26 } ) // CheckLgK checks the given lgK and returns it if it is valid and return an error otherwise. func checkLgK(lgK int) (int, error) { if lgK >= minLogK && lgK <= maxLogK { return lgK, nil } return 0, fmt.Errorf("log K must be between 4 and 21, inclusive: %d", lgK) } // pair returns a value where the lower 26 bits are the slotNo and the upper 6 bits are the value. func pair(slotNo int, value int) int { return (value << keyBits26) | (slotNo & keyMask26) } // pairString returns a string representation of the pair. func pairString(pair int) string { return fmt.Sprintf("SlotNo: %d, Value: %d", getPairLow26(pair), getPairValue(pair)) } // getPairLow26 returns the pair, the lower 26 bits of the pair. func getPairLow26(pair int) int { return pair & keyMask26 } // getPairValue returns the value of the pair. // The value is the upper 6 bits of the pair. func getPairValue(pair int) int { return pair >> keyBits26 } func checkNumStdDev(numStdDev int) error { if numStdDev < 1 || numStdDev > 3 { return fmt.Errorf("NumStdDev may not be less than 1 or greater than 3: %d", numStdDev) } return nil } // checkPreamble checks the given preamble and returns the curMode if it is valid and return an error otherwise. func checkPreamble(preamble []byte) (curMode, error) { if len(preamble) == 0 { return 0, fmt.Errorf("preamble cannot be nil or empty") } preInts := extractPreInts(preamble) if len(preamble) < (preInts * 4) { return 0, fmt.Errorf("preamble length mismatch: %d, %d", len(preamble), preInts) } serVer := extractSerVer(preamble) famId := extractFamilyID(preamble) curMode := extractCurMode(preamble) if famId != internal.FamilyEnum.HLL.Id { return 0, fmt.Errorf("possible Corruption: Invalid Family: %d", famId) } if serVer != 1 { return 0, fmt.Errorf("possible Corruption: Invalid Serialization Version: %d", serVer) } if preInts != listPreInts && preInts != hashSetPreInts && preInts != hllPreInts { return 0, fmt.Errorf("possible Corruption: Invalid Preamble Ints: %d", preInts) } if curMode == curModeList && preInts != listPreInts { return 0, fmt.Errorf("possible Corruption: Invalid Preamble Ints: %d", preInts) } if curMode == curModeSet && preInts != hashSetPreInts { return 0, fmt.Errorf("possible Corruption: Invalid Preamble Ints: %d", preInts) } if curMode == curModeHll && preInts != hllPreInts { return 0, fmt.Errorf("possible Corruption: Invalid Preamble Ints: %d", preInts) } return curMode, nil } func getMaxUpdatableSerializationBytes(lgConfigK int, tgtHllType TgtHllType) int { var arrBytes int if tgtHllType == TgtHllTypeHll4 { auxBytes := 4 << lgAuxArrInts[lgConfigK] arrBytes = (1 << (lgConfigK - 1)) + auxBytes } else if tgtHllType == TgtHllTypeHll6 { numSlots := 1 << lgConfigK arrBytes = ((numSlots * 3) >> 2) + 1 } else { arrBytes = 1 << lgConfigK } return hllByteArrStart + arrBytes }