common/item_sketch_string.go (103 lines of code) (raw):

/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package common import ( "encoding/binary" "errors" "unsafe" "github.com/twmb/murmur3" ) type ItemSketchStringHasher struct{} type ItemSketchStringSerDe struct{} var ItemSketchStringComparator = func(reverseOrder bool) CompareFn[string] { return func(a, b string) bool { if reverseOrder { return a > b } return a < b } } func (f ItemSketchStringHasher) Hash(item string) uint64 { datum := unsafe.Slice(unsafe.StringData(item), len(item)) return murmur3.SeedSum64(defaultSerdeHashSeed, datum[:]) } func (f ItemSketchStringSerDe) SizeOf(item string) int { if len(item) == 0 { return int(unsafe.Sizeof(uint32(0))) } return len(item) + int(unsafe.Sizeof(uint32(0))) } func (f ItemSketchStringSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) { if numItems <= 0 { return 0, nil } reqLen := 4 offset := offsetBytes memCap := len(mem) for i := 0; i < numItems; i++ { if !checkBounds(offset, reqLen, memCap) { return 0, errors.New("offset out of bounds") } itemLenBytes := int(binary.LittleEndian.Uint32(mem[offset:])) offset += 4 if offset+itemLenBytes > memCap { return 0, errors.New("offset out of bounds") } offset += itemLenBytes } return offset - offsetBytes, nil } func (f ItemSketchStringSerDe) SerializeOneToSlice(item string) []byte { if len(item) == 0 { return []byte{} } utf8len := len(item) bytesOut := make([]byte, utf8len+4) binary.LittleEndian.PutUint32(bytesOut, uint32(utf8len)) copy(bytesOut[4:], []byte(item)) return bytesOut } func (f ItemSketchStringSerDe) SerializeManyToSlice(item []string) []byte { if len(item) == 0 { return []byte{} } totalBytes := 0 numItems := len(item) serialized2DArray := make([][]byte, numItems) for i := 0; i < numItems; i++ { serialized2DArray[i] = []byte(item[i]) totalBytes += len(serialized2DArray[i]) + 4 } bytesOut := make([]byte, totalBytes) offset := 0 for i := 0; i < numItems; i++ { utf8len := len(serialized2DArray[i]) binary.LittleEndian.PutUint32(bytesOut[offset:], uint32(utf8len)) offset += 4 copy(bytesOut[offset:], serialized2DArray[i]) offset += utf8len } return bytesOut } func (f ItemSketchStringSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]string, error) { if numItems <= 0 { return []string{}, nil } array := make([]string, numItems) offset := offsetBytes intSize := int(unsafe.Sizeof(uint32(0))) memCap := len(mem) for i := 0; i < numItems; i++ { if !checkBounds(offset, intSize, memCap) { return nil, errors.New("offset out of bounds") } strLength := int(binary.LittleEndian.Uint32(mem[offset:])) offset += intSize utf8Bytes := make([]byte, strLength) if !checkBounds(offset, strLength, memCap) { return nil, errors.New("offset out of bounds") } copy(utf8Bytes, mem[offset:offset+strLength]) offset += strLength array[i] = string(utf8Bytes) } return array, nil }