internal/hashing/xxh3_memo

// Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT. // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hashing import ( "math" "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/bitutil" "github.com/apache/arrow-go/v18/internal/utils" ) type payloadInt8 struct { val int8 memoIdx int32 } type entryInt8 struct { h uint64 payload payloadInt8 } func (e entryInt8) Valid() bool { return e.h != sentinel } // Int8HashTable is a hashtable specifically for int8 that // is utilized with the MemoTable to generalize interactions for easier // implementation of dictionaries without losing performance. type Int8HashTable struct { cap uint64 capMask uint64 size uint64 entries []entryInt8 } // NewInt8HashTable returns a new hash table for int8 values // initialized with the passed in capacity or 32 whichever is larger. func NewInt8HashTable(cap uint64) *Int8HashTable { initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) ret := &Int8HashTable{cap: initCap, capMask: initCap - 1, size: 0} ret.entries = make([]entryInt8, initCap) return ret } // Reset drops all of the values in this hash table and re-initializes it // with the specified initial capacity as if by calling New, but without having // to reallocate the object. func (h *Int8HashTable) Reset(cap uint64) { h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) h.capMask = h.cap - 1 h.size = 0 h.entries = make([]entryInt8, h.cap) } // CopyValues is used for copying the values out of the hash table into the // passed in slice, in the order that they were first inserted func (h *Int8HashTable) CopyValues(out []int8) { h.CopyValuesSubset(0, out) } // CopyValuesSubset copies a subset of the values in the hashtable out, starting // with the value at start, in the order that they were inserted. func (h *Int8HashTable) CopyValuesSubset(start int, out []int8) { h.VisitEntries(func(e *entryInt8) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { out[idx] = e.payload.val } }) } func (h *Int8HashTable) WriteOut(out []byte) { h.WriteOutSubset(0, out) } func (h *Int8HashTable) WriteOutSubset(start int, out []byte) { data := arrow.Int8Traits.CastFromBytes(out) h.VisitEntries(func(e *entryInt8) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { data[idx] = e.payload.val } }) } func (h *Int8HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } func (Int8HashTable) fixHash(v uint64) uint64 { if v == sentinel { return 42 } return v } // Lookup retrieves the entry for a given hash value assuming it's payload value returns // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. func (h *Int8HashTable) Lookup(v uint64, cmp func(int8) bool) (*entryInt8, bool) { idx, ok := h.lookup(v, h.capMask, cmp) return &h.entries[idx], ok } func (h *Int8HashTable) lookup(v uint64, szMask uint64, cmp func(int8) bool) (uint64, bool) { const perturbShift uint8 = 5 var ( idx uint64 perturb uint64 e *entryInt8 ) v = h.fixHash(v) idx = v & szMask perturb = (v >> uint64(perturbShift)) + 1 for { e = &h.entries[idx] if e.h == v && cmp(e.payload.val) { return idx, true } if e.h == sentinel { return idx, false } // perturbation logic inspired from CPython's set/dict object // the goal is that all 64 bits of unmasked hash value eventually // participate int he probing sequence, to minimize clustering idx = (idx + perturb) & szMask perturb = (perturb >> uint64(perturbShift)) + 1 } } func (h *Int8HashTable) upsize(newcap uint64) error { newMask := newcap - 1 oldEntries := h.entries h.entries = make([]entryInt8, newcap) for _, e := range oldEntries { if e.Valid() { idx, _ := h.lookup(e.h, newMask, func(int8) bool { return false }) h.entries[idx] = e } } h.cap = newcap h.capMask = newMask return nil } // Insert updates the given entry with the provided hash value, payload value and memo index. // The entry pointer must have been retrieved via lookup in order to actually insert properly. func (h *Int8HashTable) Insert(e *entryInt8, v uint64, val int8, memoIdx int32) error { e.h = h.fixHash(v) e.payload.val = val e.payload.memoIdx = memoIdx h.size++ if h.needUpsize() { h.upsize(h.cap * uint64(loadFactor) * 2) } return nil } // VisitEntries will call the passed in function on each *valid* entry in the hash table, // a valid entry being one which has had a value inserted into it. func (h *Int8HashTable) VisitEntries(visit func(*entryInt8)) { for _, e := range h.entries { if e.Valid() { visit(&e) } } } // Int8MemoTable is a wrapper over the appropriate hashtable to provide an interface // conforming to the MemoTable interface defined in the encoding package for general interactions // regarding dictionaries. type Int8MemoTable struct { tbl *Int8HashTable nullIdx int32 } // NewInt8MemoTable returns a new memotable with num entries pre-allocated to reduce further // allocations when inserting. func NewInt8MemoTable(num int64) *Int8MemoTable { return &Int8MemoTable{tbl: NewInt8HashTable(uint64(num)), nullIdx: KeyNotFound} } func (Int8MemoTable) TypeTraits() TypeTraits { return arrow.Int8Traits } // Reset allows this table to be re-used by dumping all the data currently in the table. func (s *Int8MemoTable) Reset() { s.tbl.Reset(32) s.nullIdx = KeyNotFound } // Size returns the current number of inserted elements into the table including if a null // has been inserted. func (s *Int8MemoTable) Size() int { sz := int(s.tbl.size) if _, ok := s.GetNull(); ok { sz++ } return sz } // GetNull returns the index of an inserted null or KeyNotFound along with a bool // that will be true if found and false if not. func (s *Int8MemoTable) GetNull() (int, bool) { return int(s.nullIdx), s.nullIdx != KeyNotFound } // GetOrInsertNull will return the index of the null entry or insert a null entry // if one currently doesn't exist. The found value will be true if there was already // a null in the table, and false if it inserted one. func (s *Int8MemoTable) GetOrInsertNull() (idx int, found bool) { idx, found = s.GetNull() if !found { idx = s.Size() s.nullIdx = int32(idx) } return } // CopyValues will copy the values from the memo table out into the passed in slice // which must be of the appropriate type. func (s *Int8MemoTable) CopyValues(out interface{}) { s.CopyValuesSubset(0, out) } // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the provided start index func (s *Int8MemoTable) CopyValuesSubset(start int, out interface{}) { s.tbl.CopyValuesSubset(start, out.([]int8)) } func (s *Int8MemoTable) WriteOut(out []byte) { s.tbl.CopyValues(arrow.Int8Traits.CastFromBytes(out)) } func (s *Int8MemoTable) WriteOutSubset(start int, out []byte) { s.tbl.CopyValuesSubset(start, arrow.Int8Traits.CastFromBytes(out)) } func (s *Int8MemoTable) WriteOutLE(out []byte) { s.tbl.WriteOut(out) } func (s *Int8MemoTable) WriteOutSubsetLE(start int, out []byte) { s.tbl.WriteOutSubset(start, out) } func (s *Int8MemoTable) Exists(val int8) bool { _, ok := s.Get(val) return ok } // Get returns the index of the requested value in the hash table or KeyNotFound // along with a boolean indicating if it was found or not. func (s *Int8MemoTable) Get(val interface{}) (int, bool) { h := hashInt(uint64(val.(int8)), 0) if e, ok := s.tbl.Lookup(h, func(v int8) bool { return val.(int8) == v }); ok { return int(e.payload.memoIdx), ok } return KeyNotFound, false } // GetOrInsert will return the index of the specified value in the table, or insert the // value into the table and return the new index. found indicates whether or not it already // existed in the table (true) or was inserted by this call (false). func (s *Int8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { return s.InsertOrGet(val.(int8)) } func (s *Int8MemoTable) InsertOrGet(val int8) (idx int, found bool, err error) { h := hashInt(uint64(val), 0) e, ok := s.tbl.Lookup(h, func(v int8) bool { return val == v }) if ok { idx = int(e.payload.memoIdx) found = true } else { idx = s.Size() s.tbl.Insert(e, h, val, int32(idx)) } return } // GetOrInsertBytes is unimplemented func (s *Int8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { panic("unimplemented") } type payloadUint8 struct { val uint8 memoIdx int32 } type entryUint8 struct { h uint64 payload payloadUint8 } func (e entryUint8) Valid() bool { return e.h != sentinel } // Uint8HashTable is a hashtable specifically for uint8 that // is utilized with the MemoTable to generalize interactions for easier // implementation of dictionaries without losing performance. type Uint8HashTable struct { cap uint64 capMask uint64 size uint64 entries []entryUint8 } // NewUint8HashTable returns a new hash table for uint8 values // initialized with the passed in capacity or 32 whichever is larger. func NewUint8HashTable(cap uint64) *Uint8HashTable { initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) ret := &Uint8HashTable{cap: initCap, capMask: initCap - 1, size: 0} ret.entries = make([]entryUint8, initCap) return ret } // Reset drops all of the values in this hash table and re-initializes it // with the specified initial capacity as if by calling New, but without having // to reallocate the object. func (h *Uint8HashTable) Reset(cap uint64) { h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) h.capMask = h.cap - 1 h.size = 0 h.entries = make([]entryUint8, h.cap) } // CopyValues is used for copying the values out of the hash table into the // passed in slice, in the order that they were first inserted func (h *Uint8HashTable) CopyValues(out []uint8) { h.CopyValuesSubset(0, out) } // CopyValuesSubset copies a subset of the values in the hashtable out, starting // with the value at start, in the order that they were inserted. func (h *Uint8HashTable) CopyValuesSubset(start int, out []uint8) { h.VisitEntries(func(e *entryUint8) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { out[idx] = e.payload.val } }) } func (h *Uint8HashTable) WriteOut(out []byte) { h.WriteOutSubset(0, out) } func (h *Uint8HashTable) WriteOutSubset(start int, out []byte) { data := arrow.Uint8Traits.CastFromBytes(out) h.VisitEntries(func(e *entryUint8) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { data[idx] = e.payload.val } }) } func (h *Uint8HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } func (Uint8HashTable) fixHash(v uint64) uint64 { if v == sentinel { return 42 } return v } // Lookup retrieves the entry for a given hash value assuming it's payload value returns // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. func (h *Uint8HashTable) Lookup(v uint64, cmp func(uint8) bool) (*entryUint8, bool) { idx, ok := h.lookup(v, h.capMask, cmp) return &h.entries[idx], ok } func (h *Uint8HashTable) lookup(v uint64, szMask uint64, cmp func(uint8) bool) (uint64, bool) { const perturbShift uint8 = 5 var ( idx uint64 perturb uint64 e *entryUint8 ) v = h.fixHash(v) idx = v & szMask perturb = (v >> uint64(perturbShift)) + 1 for { e = &h.entries[idx] if e.h == v && cmp(e.payload.val) { return idx, true } if e.h == sentinel { return idx, false } // perturbation logic inspired from CPython's set/dict object // the goal is that all 64 bits of unmasked hash value eventually // participate int he probing sequence, to minimize clustering idx = (idx + perturb) & szMask perturb = (perturb >> uint64(perturbShift)) + 1 } } func (h *Uint8HashTable) upsize(newcap uint64) error { newMask := newcap - 1 oldEntries := h.entries h.entries = make([]entryUint8, newcap) for _, e := range oldEntries { if e.Valid() { idx, _ := h.lookup(e.h, newMask, func(uint8) bool { return false }) h.entries[idx] = e } } h.cap = newcap h.capMask = newMask return nil } // Insert updates the given entry with the provided hash value, payload value and memo index. // The entry pointer must have been retrieved via lookup in order to actually insert properly. func (h *Uint8HashTable) Insert(e *entryUint8, v uint64, val uint8, memoIdx int32) error { e.h = h.fixHash(v) e.payload.val = val e.payload.memoIdx = memoIdx h.size++ if h.needUpsize() { h.upsize(h.cap * uint64(loadFactor) * 2) } return nil } // VisitEntries will call the passed in function on each *valid* entry in the hash table, // a valid entry being one which has had a value inserted into it. func (h *Uint8HashTable) VisitEntries(visit func(*entryUint8)) { for _, e := range h.entries { if e.Valid() { visit(&e) } } } // Uint8MemoTable is a wrapper over the appropriate hashtable to provide an interface // conforming to the MemoTable interface defined in the encoding package for general interactions // regarding dictionaries. type Uint8MemoTable struct { tbl *Uint8HashTable nullIdx int32 } // NewUint8MemoTable returns a new memotable with num entries pre-allocated to reduce further // allocations when inserting. func NewUint8MemoTable(num int64) *Uint8MemoTable { return &Uint8MemoTable{tbl: NewUint8HashTable(uint64(num)), nullIdx: KeyNotFound} } func (Uint8MemoTable) TypeTraits() TypeTraits { return arrow.Uint8Traits } // Reset allows this table to be re-used by dumping all the data currently in the table. func (s *Uint8MemoTable) Reset() { s.tbl.Reset(32) s.nullIdx = KeyNotFound } // Size returns the current number of inserted elements into the table including if a null // has been inserted. func (s *Uint8MemoTable) Size() int { sz := int(s.tbl.size) if _, ok := s.GetNull(); ok { sz++ } return sz } // GetNull returns the index of an inserted null or KeyNotFound along with a bool // that will be true if found and false if not. func (s *Uint8MemoTable) GetNull() (int, bool) { return int(s.nullIdx), s.nullIdx != KeyNotFound } // GetOrInsertNull will return the index of the null entry or insert a null entry // if one currently doesn't exist. The found value will be true if there was already // a null in the table, and false if it inserted one. func (s *Uint8MemoTable) GetOrInsertNull() (idx int, found bool) { idx, found = s.GetNull() if !found { idx = s.Size() s.nullIdx = int32(idx) } return } // CopyValues will copy the values from the memo table out into the passed in slice // which must be of the appropriate type. func (s *Uint8MemoTable) CopyValues(out interface{}) { s.CopyValuesSubset(0, out) } // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the provided start index func (s *Uint8MemoTable) CopyValuesSubset(start int, out interface{}) { s.tbl.CopyValuesSubset(start, out.([]uint8)) } func (s *Uint8MemoTable) WriteOut(out []byte) { s.tbl.CopyValues(arrow.Uint8Traits.CastFromBytes(out)) } func (s *Uint8MemoTable) WriteOutSubset(start int, out []byte) { s.tbl.CopyValuesSubset(start, arrow.Uint8Traits.CastFromBytes(out)) } func (s *Uint8MemoTable) WriteOutLE(out []byte) { s.tbl.WriteOut(out) } func (s *Uint8MemoTable) WriteOutSubsetLE(start int, out []byte) { s.tbl.WriteOutSubset(start, out) } func (s *Uint8MemoTable) Exists(val uint8) bool { _, ok := s.Get(val) return ok } // Get returns the index of the requested value in the hash table or KeyNotFound // along with a boolean indicating if it was found or not. func (s *Uint8MemoTable) Get(val interface{}) (int, bool) { h := hashInt(uint64(val.(uint8)), 0) if e, ok := s.tbl.Lookup(h, func(v uint8) bool { return val.(uint8) == v }); ok { return int(e.payload.memoIdx), ok } return KeyNotFound, false } // GetOrInsert will return the index of the specified value in the table, or insert the // value into the table and return the new index. found indicates whether or not it already // existed in the table (true) or was inserted by this call (false). func (s *Uint8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { return s.InsertOrGet(val.(uint8)) } func (s *Uint8MemoTable) InsertOrGet(val uint8) (idx int, found bool, err error) { h := hashInt(uint64(val), 0) e, ok := s.tbl.Lookup(h, func(v uint8) bool { return val == v }) if ok { idx = int(e.payload.memoIdx) found = true } else { idx = s.Size() s.tbl.Insert(e, h, val, int32(idx)) } return } // GetOrInsertBytes is unimplemented func (s *Uint8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { panic("unimplemented") } type payloadInt16 struct { val int16 memoIdx int32 } type entryInt16 struct { h uint64 payload payloadInt16 } func (e entryInt16) Valid() bool { return e.h != sentinel } // Int16HashTable is a hashtable specifically for int16 that // is utilized with the MemoTable to generalize interactions for easier // implementation of dictionaries without losing performance. type Int16HashTable struct { cap uint64 capMask uint64 size uint64 entries []entryInt16 } // NewInt16HashTable returns a new hash table for int16 values // initialized with the passed in capacity or 32 whichever is larger. func NewInt16HashTable(cap uint64) *Int16HashTable { initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) ret := &Int16HashTable{cap: initCap, capMask: initCap - 1, size: 0} ret.entries = make([]entryInt16, initCap) return ret } // Reset drops all of the values in this hash table and re-initializes it // with the specified initial capacity as if by calling New, but without having // to reallocate the object. func (h *Int16HashTable) Reset(cap uint64) { h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) h.capMask = h.cap - 1 h.size = 0 h.entries = make([]entryInt16, h.cap) } // CopyValues is used for copying the values out of the hash table into the // passed in slice, in the order that they were first inserted func (h *Int16HashTable) CopyValues(out []int16) { h.CopyValuesSubset(0, out) } // CopyValuesSubset copies a subset of the values in the hashtable out, starting // with the value at start, in the order that they were inserted. func (h *Int16HashTable) CopyValuesSubset(start int, out []int16) { h.VisitEntries(func(e *entryInt16) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { out[idx] = e.payload.val } }) } func (h *Int16HashTable) WriteOut(out []byte) { h.WriteOutSubset(0, out) } func (h *Int16HashTable) WriteOutSubset(start int, out []byte) { data := arrow.Int16Traits.CastFromBytes(out) h.VisitEntries(func(e *entryInt16) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { data[idx] = utils.ToLEInt16(e.payload.val) } }) } func (h *Int16HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } func (Int16HashTable) fixHash(v uint64) uint64 { if v == sentinel { return 42 } return v } // Lookup retrieves the entry for a given hash value assuming it's payload value returns // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. func (h *Int16HashTable) Lookup(v uint64, cmp func(int16) bool) (*entryInt16, bool) { idx, ok := h.lookup(v, h.capMask, cmp) return &h.entries[idx], ok } func (h *Int16HashTable) lookup(v uint64, szMask uint64, cmp func(int16) bool) (uint64, bool) { const perturbShift uint8 = 5 var ( idx uint64 perturb uint64 e *entryInt16 ) v = h.fixHash(v) idx = v & szMask perturb = (v >> uint64(perturbShift)) + 1 for { e = &h.entries[idx] if e.h == v && cmp(e.payload.val) { return idx, true } if e.h == sentinel { return idx, false } // perturbation logic inspired from CPython's set/dict object // the goal is that all 64 bits of unmasked hash value eventually // participate int he probing sequence, to minimize clustering idx = (idx + perturb) & szMask perturb = (perturb >> uint64(perturbShift)) + 1 } } func (h *Int16HashTable) upsize(newcap uint64) error { newMask := newcap - 1 oldEntries := h.entries h.entries = make([]entryInt16, newcap) for _, e := range oldEntries { if e.Valid() { idx, _ := h.lookup(e.h, newMask, func(int16) bool { return false }) h.entries[idx] = e } } h.cap = newcap h.capMask = newMask return nil } // Insert updates the given entry with the provided hash value, payload value and memo index. // The entry pointer must have been retrieved via lookup in order to actually insert properly. func (h *Int16HashTable) Insert(e *entryInt16, v uint64, val int16, memoIdx int32) error { e.h = h.fixHash(v) e.payload.val = val e.payload.memoIdx = memoIdx h.size++ if h.needUpsize() { h.upsize(h.cap * uint64(loadFactor) * 2) } return nil } // VisitEntries will call the passed in function on each *valid* entry in the hash table, // a valid entry being one which has had a value inserted into it. func (h *Int16HashTable) VisitEntries(visit func(*entryInt16)) { for _, e := range h.entries { if e.Valid() { visit(&e) } } } // Int16MemoTable is a wrapper over the appropriate hashtable to provide an interface // conforming to the MemoTable interface defined in the encoding package for general interactions // regarding dictionaries. type Int16MemoTable struct { tbl *Int16HashTable nullIdx int32 } // NewInt16MemoTable returns a new memotable with num entries pre-allocated to reduce further // allocations when inserting. func NewInt16MemoTable(num int64) *Int16MemoTable { return &Int16MemoTable{tbl: NewInt16HashTable(uint64(num)), nullIdx: KeyNotFound} } func (Int16MemoTable) TypeTraits() TypeTraits { return arrow.Int16Traits } // Reset allows this table to be re-used by dumping all the data currently in the table. func (s *Int16MemoTable) Reset() { s.tbl.Reset(32) s.nullIdx = KeyNotFound } // Size returns the current number of inserted elements into the table including if a null // has been inserted. func (s *Int16MemoTable) Size() int { sz := int(s.tbl.size) if _, ok := s.GetNull(); ok { sz++ } return sz } // GetNull returns the index of an inserted null or KeyNotFound along with a bool // that will be true if found and false if not. func (s *Int16MemoTable) GetNull() (int, bool) { return int(s.nullIdx), s.nullIdx != KeyNotFound } // GetOrInsertNull will return the index of the null entry or insert a null entry // if one currently doesn't exist. The found value will be true if there was already // a null in the table, and false if it inserted one. func (s *Int16MemoTable) GetOrInsertNull() (idx int, found bool) { idx, found = s.GetNull() if !found { idx = s.Size() s.nullIdx = int32(idx) } return } // CopyValues will copy the values from the memo table out into the passed in slice // which must be of the appropriate type. func (s *Int16MemoTable) CopyValues(out interface{}) { s.CopyValuesSubset(0, out) } // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the provided start index func (s *Int16MemoTable) CopyValuesSubset(start int, out interface{}) { s.tbl.CopyValuesSubset(start, out.([]int16)) } func (s *Int16MemoTable) WriteOut(out []byte) { s.tbl.CopyValues(arrow.Int16Traits.CastFromBytes(out)) } func (s *Int16MemoTable) WriteOutSubset(start int, out []byte) { s.tbl.CopyValuesSubset(start, arrow.Int16Traits.CastFromBytes(out)) } func (s *Int16MemoTable) WriteOutLE(out []byte) { s.tbl.WriteOut(out) } func (s *Int16MemoTable) WriteOutSubsetLE(start int, out []byte) { s.tbl.WriteOutSubset(start, out) } func (s *Int16MemoTable) Exists(val int16) bool { _, ok := s.Get(val) return ok } // Get returns the index of the requested value in the hash table or KeyNotFound // along with a boolean indicating if it was found or not. func (s *Int16MemoTable) Get(val interface{}) (int, bool) { h := hashInt(uint64(val.(int16)), 0) if e, ok := s.tbl.Lookup(h, func(v int16) bool { return val.(int16) == v }); ok { return int(e.payload.memoIdx), ok } return KeyNotFound, false } // GetOrInsert will return the index of the specified value in the table, or insert the // value into the table and return the new index. found indicates whether or not it already // existed in the table (true) or was inserted by this call (false). func (s *Int16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { return s.InsertOrGet(val.(int16)) } func (s *Int16MemoTable) InsertOrGet(val int16) (idx int, found bool, err error) { h := hashInt(uint64(val), 0) e, ok := s.tbl.Lookup(h, func(v int16) bool { return val == v }) if ok { idx = int(e.payload.memoIdx) found = true } else { idx = s.Size() s.tbl.Insert(e, h, val, int32(idx)) } return } // GetOrInsertBytes is unimplemented func (s *Int16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { panic("unimplemented") } type payloadUint16 struct { val uint16 memoIdx int32 } type entryUint16 struct { h uint64 payload payloadUint16 } func (e entryUint16) Valid() bool { return e.h != sentinel } // Uint16HashTable is a hashtable specifically for uint16 that // is utilized with the MemoTable to generalize interactions for easier // implementation of dictionaries without losing performance. type Uint16HashTable struct { cap uint64 capMask uint64 size uint64 entries []entryUint16 } // NewUint16HashTable returns a new hash table for uint16 values // initialized with the passed in capacity or 32 whichever is larger. func NewUint16HashTable(cap uint64) *Uint16HashTable { initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) ret := &Uint16HashTable{cap: initCap, capMask: initCap - 1, size: 0} ret.entries = make([]entryUint16, initCap) return ret } // Reset drops all of the values in this hash table and re-initializes it // with the specified initial capacity as if by calling New, but without having // to reallocate the object. func (h *Uint16HashTable) Reset(cap uint64) { h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) h.capMask = h.cap - 1 h.size = 0 h.entries = make([]entryUint16, h.cap) } // CopyValues is used for copying the values out of the hash table into the // passed in slice, in the order that they were first inserted func (h *Uint16HashTable) CopyValues(out []uint16) { h.CopyValuesSubset(0, out) } // CopyValuesSubset copies a subset of the values in the hashtable out, starting // with the value at start, in the order that they were inserted. func (h *Uint16HashTable) CopyValuesSubset(start int, out []uint16) { h.VisitEntries(func(e *entryUint16) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { out[idx] = e.payload.val } }) } func (h *Uint16HashTable) WriteOut(out []byte) { h.WriteOutSubset(0, out) } func (h *Uint16HashTable) WriteOutSubset(start int, out []byte) { data := arrow.Uint16Traits.CastFromBytes(out) h.VisitEntries(func(e *entryUint16) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { data[idx] = utils.ToLEUint16(e.payload.val) } }) } func (h *Uint16HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } func (Uint16HashTable) fixHash(v uint64) uint64 { if v == sentinel { return 42 } return v } // Lookup retrieves the entry for a given hash value assuming it's payload value returns // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. func (h *Uint16HashTable) Lookup(v uint64, cmp func(uint16) bool) (*entryUint16, bool) { idx, ok := h.lookup(v, h.capMask, cmp) return &h.entries[idx], ok } func (h *Uint16HashTable) lookup(v uint64, szMask uint64, cmp func(uint16) bool) (uint64, bool) { const perturbShift uint8 = 5 var ( idx uint64 perturb uint64 e *entryUint16 ) v = h.fixHash(v) idx = v & szMask perturb = (v >> uint64(perturbShift)) + 1 for { e = &h.entries[idx] if e.h == v && cmp(e.payload.val) { return idx, true } if e.h == sentinel { return idx, false } // perturbation logic inspired from CPython's set/dict object // the goal is that all 64 bits of unmasked hash value eventually // participate int he probing sequence, to minimize clustering idx = (idx + perturb) & szMask perturb = (perturb >> uint64(perturbShift)) + 1 } } func (h *Uint16HashTable) upsize(newcap uint64) error { newMask := newcap - 1 oldEntries := h.entries h.entries = make([]entryUint16, newcap) for _, e := range oldEntries { if e.Valid() { idx, _ := h.lookup(e.h, newMask, func(uint16) bool { return false }) h.entries[idx] = e } } h.cap = newcap h.capMask = newMask return nil } // Insert updates the given entry with the provided hash value, payload value and memo index. // The entry pointer must have been retrieved via lookup in order to actually insert properly. func (h *Uint16HashTable) Insert(e *entryUint16, v uint64, val uint16, memoIdx int32) error { e.h = h.fixHash(v) e.payload.val = val e.payload.memoIdx = memoIdx h.size++ if h.needUpsize() { h.upsize(h.cap * uint64(loadFactor) * 2) } return nil } // VisitEntries will call the passed in function on each *valid* entry in the hash table, // a valid entry being one which has had a value inserted into it. func (h *Uint16HashTable) VisitEntries(visit func(*entryUint16)) { for _, e := range h.entries { if e.Valid() { visit(&e) } } } // Uint16MemoTable is a wrapper over the appropriate hashtable to provide an interface // conforming to the MemoTable interface defined in the encoding package for general interactions // regarding dictionaries. type Uint16MemoTable struct { tbl *Uint16HashTable nullIdx int32 } // NewUint16MemoTable returns a new memotable with num entries pre-allocated to reduce further // allocations when inserting. func NewUint16MemoTable(num int64) *Uint16MemoTable { return &Uint16MemoTable{tbl: NewUint16HashTable(uint64(num)), nullIdx: KeyNotFound} } func (Uint16MemoTable) TypeTraits() TypeTraits { return arrow.Uint16Traits } // Reset allows this table to be re-used by dumping all the data currently in the table. func (s *Uint16MemoTable) Reset() { s.tbl.Reset(32) s.nullIdx = KeyNotFound } // Size returns the current number of inserted elements into the table including if a null // has been inserted. func (s *Uint16MemoTable) Size() int { sz := int(s.tbl.size) if _, ok := s.GetNull(); ok { sz++ } return sz } // GetNull returns the index of an inserted null or KeyNotFound along with a bool // that will be true if found and false if not. func (s *Uint16MemoTable) GetNull() (int, bool) { return int(s.nullIdx), s.nullIdx != KeyNotFound } // GetOrInsertNull will return the index of the null entry or insert a null entry // if one currently doesn't exist. The found value will be true if there was already // a null in the table, and false if it inserted one. func (s *Uint16MemoTable) GetOrInsertNull() (idx int, found bool) { idx, found = s.GetNull() if !found { idx = s.Size() s.nullIdx = int32(idx) } return } // CopyValues will copy the values from the memo table out into the passed in slice // which must be of the appropriate type. func (s *Uint16MemoTable) CopyValues(out interface{}) { s.CopyValuesSubset(0, out) } // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the provided start index func (s *Uint16MemoTable) CopyValuesSubset(start int, out interface{}) { s.tbl.CopyValuesSubset(start, out.([]uint16)) } func (s *Uint16MemoTable) WriteOut(out []byte) { s.tbl.CopyValues(arrow.Uint16Traits.CastFromBytes(out)) } func (s *Uint16MemoTable) WriteOutSubset(start int, out []byte) { s.tbl.CopyValuesSubset(start, arrow.Uint16Traits.CastFromBytes(out)) } func (s *Uint16MemoTable) WriteOutLE(out []byte) { s.tbl.WriteOut(out) } func (s *Uint16MemoTable) WriteOutSubsetLE(start int, out []byte) { s.tbl.WriteOutSubset(start, out) } func (s *Uint16MemoTable) Exists(val uint16) bool { _, ok := s.Get(val) return ok } // Get returns the index of the requested value in the hash table or KeyNotFound // along with a boolean indicating if it was found or not. func (s *Uint16MemoTable) Get(val interface{}) (int, bool) { h := hashInt(uint64(val.(uint16)), 0) if e, ok := s.tbl.Lookup(h, func(v uint16) bool { return val.(uint16) == v }); ok { return int(e.payload.memoIdx), ok } return KeyNotFound, false } // GetOrInsert will return the index of the specified value in the table, or insert the // value into the table and return the new index. found indicates whether or not it already // existed in the table (true) or was inserted by this call (false). func (s *Uint16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { return s.InsertOrGet(val.(uint16)) } func (s *Uint16MemoTable) InsertOrGet(val uint16) (idx int, found bool, err error) { h := hashInt(uint64(val), 0) e, ok := s.tbl.Lookup(h, func(v uint16) bool { return val == v }) if ok { idx = int(e.payload.memoIdx) found = true } else { idx = s.Size() s.tbl.Insert(e, h, val, int32(idx)) } return } // GetOrInsertBytes is unimplemented func (s *Uint16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { panic("unimplemented") } type payloadInt32 struct { val int32 memoIdx int32 } type entryInt32 struct { h uint64 payload payloadInt32 } func (e entryInt32) Valid() bool { return e.h != sentinel } // Int32HashTable is a hashtable specifically for int32 that // is utilized with the MemoTable to generalize interactions for easier // implementation of dictionaries without losing performance. type Int32HashTable struct { cap uint64 capMask uint64 size uint64 entries []entryInt32 } // NewInt32HashTable returns a new hash table for int32 values // initialized with the passed in capacity or 32 whichever is larger. func NewInt32HashTable(cap uint64) *Int32HashTable { initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0} ret.entries = make([]entryInt32, initCap) return ret } // Reset drops all of the values in this hash table and re-initializes it // with the specified initial capacity as if by calling New, but without having // to reallocate the object. func (h *Int32HashTable) Reset(cap uint64) { h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) h.capMask = h.cap - 1 h.size = 0 h.entries = make([]entryInt32, h.cap) } // CopyValues is used for copying the values out of the hash table into the // passed in slice, in the order that they were first inserted func (h *Int32HashTable) CopyValues(out []int32) { h.CopyValuesSubset(0, out) } // CopyValuesSubset copies a subset of the values in the hashtable out, starting // with the value at start, in the order that they were inserted. func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) { h.VisitEntries(func(e *entryInt32) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { out[idx] = e.payload.val } }) } func (h *Int32HashTable) WriteOut(out []byte) { h.WriteOutSubset(0, out) } func (h *Int32HashTable) WriteOutSubset(start int, out []byte) { data := arrow.Int32Traits.CastFromBytes(out) h.VisitEntries(func(e *entryInt32) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { data[idx] = utils.ToLEInt32(e.payload.val) } }) } func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } func (Int32HashTable) fixHash(v uint64) uint64 { if v == sentinel { return 42 } return v } // Lookup retrieves the entry for a given hash value assuming it's payload value returns // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) { idx, ok := h.lookup(v, h.capMask, cmp) return &h.entries[idx], ok } func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) { const perturbShift uint8 = 5 var ( idx uint64 perturb uint64 e *entryInt32 ) v = h.fixHash(v) idx = v & szMask perturb = (v >> uint64(perturbShift)) + 1 for { e = &h.entries[idx] if e.h == v && cmp(e.payload.val) { return idx, true } if e.h == sentinel { return idx, false } // perturbation logic inspired from CPython's set/dict object // the goal is that all 64 bits of unmasked hash value eventually // participate int he probing sequence, to minimize clustering idx = (idx + perturb) & szMask perturb = (perturb >> uint64(perturbShift)) + 1 } } func (h *Int32HashTable) upsize(newcap uint64) error { newMask := newcap - 1 oldEntries := h.entries h.entries = make([]entryInt32, newcap) for _, e := range oldEntries { if e.Valid() { idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false }) h.entries[idx] = e } } h.cap = newcap h.capMask = newMask return nil } // Insert updates the given entry with the provided hash value, payload value and memo index. // The entry pointer must have been retrieved via lookup in order to actually insert properly. func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error { e.h = h.fixHash(v) e.payload.val = val e.payload.memoIdx = memoIdx h.size++ if h.needUpsize() { h.upsize(h.cap * uint64(loadFactor) * 2) } return nil } // VisitEntries will call the passed in function on each *valid* entry in the hash table, // a valid entry being one which has had a value inserted into it. func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) { for _, e := range h.entries { if e.Valid() { visit(&e) } } } // Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface // conforming to the MemoTable interface defined in the encoding package for general interactions // regarding dictionaries. type Int32MemoTable struct { tbl *Int32HashTable nullIdx int32 } // NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further // allocations when inserting. func NewInt32MemoTable(num int64) *Int32MemoTable { return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound} } func (Int32MemoTable) TypeTraits() TypeTraits { return arrow.Int32Traits } // Reset allows this table to be re-used by dumping all the data currently in the table. func (s *Int32MemoTable) Reset() { s.tbl.Reset(32) s.nullIdx = KeyNotFound } // Size returns the current number of inserted elements into the table including if a null // has been inserted. func (s *Int32MemoTable) Size() int { sz := int(s.tbl.size) if _, ok := s.GetNull(); ok { sz++ } return sz } // GetNull returns the index of an inserted null or KeyNotFound along with a bool // that will be true if found and false if not. func (s *Int32MemoTable) GetNull() (int, bool) { return int(s.nullIdx), s.nullIdx != KeyNotFound } // GetOrInsertNull will return the index of the null entry or insert a null entry // if one currently doesn't exist. The found value will be true if there was already // a null in the table, and false if it inserted one. func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) { idx, found = s.GetNull() if !found { idx = s.Size() s.nullIdx = int32(idx) } return } // CopyValues will copy the values from the memo table out into the passed in slice // which must be of the appropriate type. func (s *Int32MemoTable) CopyValues(out interface{}) { s.CopyValuesSubset(0, out) } // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the provided start index func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) { s.tbl.CopyValuesSubset(start, out.([]int32)) } func (s *Int32MemoTable) WriteOut(out []byte) { s.tbl.CopyValues(arrow.Int32Traits.CastFromBytes(out)) } func (s *Int32MemoTable) WriteOutSubset(start int, out []byte) { s.tbl.CopyValuesSubset(start, arrow.Int32Traits.CastFromBytes(out)) } func (s *Int32MemoTable) WriteOutLE(out []byte) { s.tbl.WriteOut(out) } func (s *Int32MemoTable) WriteOutSubsetLE(start int, out []byte) { s.tbl.WriteOutSubset(start, out) } func (s *Int32MemoTable) Exists(val int32) bool { _, ok := s.Get(val) return ok } // Get returns the index of the requested value in the hash table or KeyNotFound // along with a boolean indicating if it was found or not. func (s *Int32MemoTable) Get(val interface{}) (int, bool) { h := hashInt(uint64(val.(int32)), 0) if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok { return int(e.payload.memoIdx), ok } return KeyNotFound, false } // GetOrInsert will return the index of the specified value in the table, or insert the // value into the table and return the new index. found indicates whether or not it already // existed in the table (true) or was inserted by this call (false). func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { return s.InsertOrGet(val.(int32)) } func (s *Int32MemoTable) InsertOrGet(val int32) (idx int, found bool, err error) { h := hashInt(uint64(val), 0) e, ok := s.tbl.Lookup(h, func(v int32) bool { return val == v }) if ok { idx = int(e.payload.memoIdx) found = true } else { idx = s.Size() s.tbl.Insert(e, h, val, int32(idx)) } return } // GetOrInsertBytes is unimplemented func (s *Int32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { panic("unimplemented") } type payloadInt64 struct { val int64 memoIdx int32 } type entryInt64 struct { h uint64 payload payloadInt64 } func (e entryInt64) Valid() bool { return e.h != sentinel } // Int64HashTable is a hashtable specifically for int64 that // is utilized with the MemoTable to generalize interactions for easier // implementation of dictionaries without losing performance. type Int64HashTable struct { cap uint64 capMask uint64 size uint64 entries []entryInt64 } // NewInt64HashTable returns a new hash table for int64 values // initialized with the passed in capacity or 32 whichever is larger. func NewInt64HashTable(cap uint64) *Int64HashTable { initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0} ret.entries = make([]entryInt64, initCap) return ret } // Reset drops all of the values in this hash table and re-initializes it // with the specified initial capacity as if by calling New, but without having // to reallocate the object. func (h *Int64HashTable) Reset(cap uint64) { h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) h.capMask = h.cap - 1 h.size = 0 h.entries = make([]entryInt64, h.cap) } // CopyValues is used for copying the values out of the hash table into the // passed in slice, in the order that they were first inserted func (h *Int64HashTable) CopyValues(out []int64) { h.CopyValuesSubset(0, out) } // CopyValuesSubset copies a subset of the values in the hashtable out, starting // with the value at start, in the order that they were inserted. func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) { h.VisitEntries(func(e *entryInt64) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { out[idx] = e.payload.val } }) } func (h *Int64HashTable) WriteOut(out []byte) { h.WriteOutSubset(0, out) } func (h *Int64HashTable) WriteOutSubset(start int, out []byte) { data := arrow.Int64Traits.CastFromBytes(out) h.VisitEntries(func(e *entryInt64) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { data[idx] = utils.ToLEInt64(e.payload.val) } }) } func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } func (Int64HashTable) fixHash(v uint64) uint64 { if v == sentinel { return 42 } return v } // Lookup retrieves the entry for a given hash value assuming it's payload value returns // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) { idx, ok := h.lookup(v, h.capMask, cmp) return &h.entries[idx], ok } func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) { const perturbShift uint8 = 5 var ( idx uint64 perturb uint64 e *entryInt64 ) v = h.fixHash(v) idx = v & szMask perturb = (v >> uint64(perturbShift)) + 1 for { e = &h.entries[idx] if e.h == v && cmp(e.payload.val) { return idx, true } if e.h == sentinel { return idx, false } // perturbation logic inspired from CPython's set/dict object // the goal is that all 64 bits of unmasked hash value eventually // participate int he probing sequence, to minimize clustering idx = (idx + perturb) & szMask perturb = (perturb >> uint64(perturbShift)) + 1 } } func (h *Int64HashTable) upsize(newcap uint64) error { newMask := newcap - 1 oldEntries := h.entries h.entries = make([]entryInt64, newcap) for _, e := range oldEntries { if e.Valid() { idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false }) h.entries[idx] = e } } h.cap = newcap h.capMask = newMask return nil } // Insert updates the given entry with the provided hash value, payload value and memo index. // The entry pointer must have been retrieved via lookup in order to actually insert properly. func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error { e.h = h.fixHash(v) e.payload.val = val e.payload.memoIdx = memoIdx h.size++ if h.needUpsize() { h.upsize(h.cap * uint64(loadFactor) * 2) } return nil } // VisitEntries will call the passed in function on each *valid* entry in the hash table, // a valid entry being one which has had a value inserted into it. func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) { for _, e := range h.entries { if e.Valid() { visit(&e) } } } // Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface // conforming to the MemoTable interface defined in the encoding package for general interactions // regarding dictionaries. type Int64MemoTable struct { tbl *Int64HashTable nullIdx int32 } // NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further // allocations when inserting. func NewInt64MemoTable(num int64) *Int64MemoTable { return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound} } func (Int64MemoTable) TypeTraits() TypeTraits { return arrow.Int64Traits } // Reset allows this table to be re-used by dumping all the data currently in the table. func (s *Int64MemoTable) Reset() { s.tbl.Reset(32) s.nullIdx = KeyNotFound } // Size returns the current number of inserted elements into the table including if a null // has been inserted. func (s *Int64MemoTable) Size() int { sz := int(s.tbl.size) if _, ok := s.GetNull(); ok { sz++ } return sz } // GetNull returns the index of an inserted null or KeyNotFound along with a bool // that will be true if found and false if not. func (s *Int64MemoTable) GetNull() (int, bool) { return int(s.nullIdx), s.nullIdx != KeyNotFound } // GetOrInsertNull will return the index of the null entry or insert a null entry // if one currently doesn't exist. The found value will be true if there was already // a null in the table, and false if it inserted one. func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) { idx, found = s.GetNull() if !found { idx = s.Size() s.nullIdx = int32(idx) } return } // CopyValues will copy the values from the memo table out into the passed in slice // which must be of the appropriate type. func (s *Int64MemoTable) CopyValues(out interface{}) { s.CopyValuesSubset(0, out) } // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the provided start index func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) { s.tbl.CopyValuesSubset(start, out.([]int64)) } func (s *Int64MemoTable) WriteOut(out []byte) { s.tbl.CopyValues(arrow.Int64Traits.CastFromBytes(out)) } func (s *Int64MemoTable) WriteOutSubset(start int, out []byte) { s.tbl.CopyValuesSubset(start, arrow.Int64Traits.CastFromBytes(out)) } func (s *Int64MemoTable) WriteOutLE(out []byte) { s.tbl.WriteOut(out) } func (s *Int64MemoTable) WriteOutSubsetLE(start int, out []byte) { s.tbl.WriteOutSubset(start, out) } func (s *Int64MemoTable) Exists(val int64) bool { _, ok := s.Get(val) return ok } // Get returns the index of the requested value in the hash table or KeyNotFound // along with a boolean indicating if it was found or not. func (s *Int64MemoTable) Get(val interface{}) (int, bool) { h := hashInt(uint64(val.(int64)), 0) if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok { return int(e.payload.memoIdx), ok } return KeyNotFound, false } // GetOrInsert will return the index of the specified value in the table, or insert the // value into the table and return the new index. found indicates whether or not it already // existed in the table (true) or was inserted by this call (false). func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { return s.InsertOrGet(val.(int64)) } func (s *Int64MemoTable) InsertOrGet(val int64) (idx int, found bool, err error) { h := hashInt(uint64(val), 0) e, ok := s.tbl.Lookup(h, func(v int64) bool { return val == v }) if ok { idx = int(e.payload.memoIdx) found = true } else { idx = s.Size() s.tbl.Insert(e, h, val, int32(idx)) } return } // GetOrInsertBytes is unimplemented func (s *Int64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { panic("unimplemented") } type payloadUint32 struct { val uint32 memoIdx int32 } type entryUint32 struct { h uint64 payload payloadUint32 } func (e entryUint32) Valid() bool { return e.h != sentinel } // Uint32HashTable is a hashtable specifically for uint32 that // is utilized with the MemoTable to generalize interactions for easier // implementation of dictionaries without losing performance. type Uint32HashTable struct { cap uint64 capMask uint64 size uint64 entries []entryUint32 } // NewUint32HashTable returns a new hash table for uint32 values // initialized with the passed in capacity or 32 whichever is larger. func NewUint32HashTable(cap uint64) *Uint32HashTable { initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) ret := &Uint32HashTable{cap: initCap, capMask: initCap - 1, size: 0} ret.entries = make([]entryUint32, initCap) return ret } // Reset drops all of the values in this hash table and re-initializes it // with the specified initial capacity as if by calling New, but without having // to reallocate the object. func (h *Uint32HashTable) Reset(cap uint64) { h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) h.capMask = h.cap - 1 h.size = 0 h.entries = make([]entryUint32, h.cap) } // CopyValues is used for copying the values out of the hash table into the // passed in slice, in the order that they were first inserted func (h *Uint32HashTable) CopyValues(out []uint32) { h.CopyValuesSubset(0, out) } // CopyValuesSubset copies a subset of the values in the hashtable out, starting // with the value at start, in the order that they were inserted. func (h *Uint32HashTable) CopyValuesSubset(start int, out []uint32) { h.VisitEntries(func(e *entryUint32) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { out[idx] = e.payload.val } }) } func (h *Uint32HashTable) WriteOut(out []byte) { h.WriteOutSubset(0, out) } func (h *Uint32HashTable) WriteOutSubset(start int, out []byte) { data := arrow.Uint32Traits.CastFromBytes(out) h.VisitEntries(func(e *entryUint32) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { data[idx] = utils.ToLEUint32(e.payload.val) } }) } func (h *Uint32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } func (Uint32HashTable) fixHash(v uint64) uint64 { if v == sentinel { return 42 } return v } // Lookup retrieves the entry for a given hash value assuming it's payload value returns // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. func (h *Uint32HashTable) Lookup(v uint64, cmp func(uint32) bool) (*entryUint32, bool) { idx, ok := h.lookup(v, h.capMask, cmp) return &h.entries[idx], ok } func (h *Uint32HashTable) lookup(v uint64, szMask uint64, cmp func(uint32) bool) (uint64, bool) { const perturbShift uint8 = 5 var ( idx uint64 perturb uint64 e *entryUint32 ) v = h.fixHash(v) idx = v & szMask perturb = (v >> uint64(perturbShift)) + 1 for { e = &h.entries[idx] if e.h == v && cmp(e.payload.val) { return idx, true } if e.h == sentinel { return idx, false } // perturbation logic inspired from CPython's set/dict object // the goal is that all 64 bits of unmasked hash value eventually // participate int he probing sequence, to minimize clustering idx = (idx + perturb) & szMask perturb = (perturb >> uint64(perturbShift)) + 1 } } func (h *Uint32HashTable) upsize(newcap uint64) error { newMask := newcap - 1 oldEntries := h.entries h.entries = make([]entryUint32, newcap) for _, e := range oldEntries { if e.Valid() { idx, _ := h.lookup(e.h, newMask, func(uint32) bool { return false }) h.entries[idx] = e } } h.cap = newcap h.capMask = newMask return nil } // Insert updates the given entry with the provided hash value, payload value and memo index. // The entry pointer must have been retrieved via lookup in order to actually insert properly. func (h *Uint32HashTable) Insert(e *entryUint32, v uint64, val uint32, memoIdx int32) error { e.h = h.fixHash(v) e.payload.val = val e.payload.memoIdx = memoIdx h.size++ if h.needUpsize() { h.upsize(h.cap * uint64(loadFactor) * 2) } return nil } // VisitEntries will call the passed in function on each *valid* entry in the hash table, // a valid entry being one which has had a value inserted into it. func (h *Uint32HashTable) VisitEntries(visit func(*entryUint32)) { for _, e := range h.entries { if e.Valid() { visit(&e) } } } // Uint32MemoTable is a wrapper over the appropriate hashtable to provide an interface // conforming to the MemoTable interface defined in the encoding package for general interactions // regarding dictionaries. type Uint32MemoTable struct { tbl *Uint32HashTable nullIdx int32 } // NewUint32MemoTable returns a new memotable with num entries pre-allocated to reduce further // allocations when inserting. func NewUint32MemoTable(num int64) *Uint32MemoTable { return &Uint32MemoTable{tbl: NewUint32HashTable(uint64(num)), nullIdx: KeyNotFound} } func (Uint32MemoTable) TypeTraits() TypeTraits { return arrow.Uint32Traits } // Reset allows this table to be re-used by dumping all the data currently in the table. func (s *Uint32MemoTable) Reset() { s.tbl.Reset(32) s.nullIdx = KeyNotFound } // Size returns the current number of inserted elements into the table including if a null // has been inserted. func (s *Uint32MemoTable) Size() int { sz := int(s.tbl.size) if _, ok := s.GetNull(); ok { sz++ } return sz } // GetNull returns the index of an inserted null or KeyNotFound along with a bool // that will be true if found and false if not. func (s *Uint32MemoTable) GetNull() (int, bool) { return int(s.nullIdx), s.nullIdx != KeyNotFound } // GetOrInsertNull will return the index of the null entry or insert a null entry // if one currently doesn't exist. The found value will be true if there was already // a null in the table, and false if it inserted one. func (s *Uint32MemoTable) GetOrInsertNull() (idx int, found bool) { idx, found = s.GetNull() if !found { idx = s.Size() s.nullIdx = int32(idx) } return } // CopyValues will copy the values from the memo table out into the passed in slice // which must be of the appropriate type. func (s *Uint32MemoTable) CopyValues(out interface{}) { s.CopyValuesSubset(0, out) } // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the provided start index func (s *Uint32MemoTable) CopyValuesSubset(start int, out interface{}) { s.tbl.CopyValuesSubset(start, out.([]uint32)) } func (s *Uint32MemoTable) WriteOut(out []byte) { s.tbl.CopyValues(arrow.Uint32Traits.CastFromBytes(out)) } func (s *Uint32MemoTable) WriteOutSubset(start int, out []byte) { s.tbl.CopyValuesSubset(start, arrow.Uint32Traits.CastFromBytes(out)) } func (s *Uint32MemoTable) WriteOutLE(out []byte) { s.tbl.WriteOut(out) } func (s *Uint32MemoTable) WriteOutSubsetLE(start int, out []byte) { s.tbl.WriteOutSubset(start, out) } func (s *Uint32MemoTable) Exists(val uint32) bool { _, ok := s.Get(val) return ok } // Get returns the index of the requested value in the hash table or KeyNotFound // along with a boolean indicating if it was found or not. func (s *Uint32MemoTable) Get(val interface{}) (int, bool) { h := hashInt(uint64(val.(uint32)), 0) if e, ok := s.tbl.Lookup(h, func(v uint32) bool { return val.(uint32) == v }); ok { return int(e.payload.memoIdx), ok } return KeyNotFound, false } // GetOrInsert will return the index of the specified value in the table, or insert the // value into the table and return the new index. found indicates whether or not it already // existed in the table (true) or was inserted by this call (false). func (s *Uint32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { return s.InsertOrGet(val.(uint32)) } func (s *Uint32MemoTable) InsertOrGet(val uint32) (idx int, found bool, err error) { h := hashInt(uint64(val), 0) e, ok := s.tbl.Lookup(h, func(v uint32) bool { return val == v }) if ok { idx = int(e.payload.memoIdx) found = true } else { idx = s.Size() s.tbl.Insert(e, h, val, int32(idx)) } return } // GetOrInsertBytes is unimplemented func (s *Uint32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { panic("unimplemented") } type payloadUint64 struct { val uint64 memoIdx int32 } type entryUint64 struct { h uint64 payload payloadUint64 } func (e entryUint64) Valid() bool { return e.h != sentinel } // Uint64HashTable is a hashtable specifically for uint64 that // is utilized with the MemoTable to generalize interactions for easier // implementation of dictionaries without losing performance. type Uint64HashTable struct { cap uint64 capMask uint64 size uint64 entries []entryUint64 } // NewUint64HashTable returns a new hash table for uint64 values // initialized with the passed in capacity or 32 whichever is larger. func NewUint64HashTable(cap uint64) *Uint64HashTable { initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) ret := &Uint64HashTable{cap: initCap, capMask: initCap - 1, size: 0} ret.entries = make([]entryUint64, initCap) return ret } // Reset drops all of the values in this hash table and re-initializes it // with the specified initial capacity as if by calling New, but without having // to reallocate the object. func (h *Uint64HashTable) Reset(cap uint64) { h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) h.capMask = h.cap - 1 h.size = 0 h.entries = make([]entryUint64, h.cap) } // CopyValues is used for copying the values out of the hash table into the // passed in slice, in the order that they were first inserted func (h *Uint64HashTable) CopyValues(out []uint64) { h.CopyValuesSubset(0, out) } // CopyValuesSubset copies a subset of the values in the hashtable out, starting // with the value at start, in the order that they were inserted. func (h *Uint64HashTable) CopyValuesSubset(start int, out []uint64) { h.VisitEntries(func(e *entryUint64) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { out[idx] = e.payload.val } }) } func (h *Uint64HashTable) WriteOut(out []byte) { h.WriteOutSubset(0, out) } func (h *Uint64HashTable) WriteOutSubset(start int, out []byte) { data := arrow.Uint64Traits.CastFromBytes(out) h.VisitEntries(func(e *entryUint64) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { data[idx] = utils.ToLEUint64(e.payload.val) } }) } func (h *Uint64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } func (Uint64HashTable) fixHash(v uint64) uint64 { if v == sentinel { return 42 } return v } // Lookup retrieves the entry for a given hash value assuming it's payload value returns // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. func (h *Uint64HashTable) Lookup(v uint64, cmp func(uint64) bool) (*entryUint64, bool) { idx, ok := h.lookup(v, h.capMask, cmp) return &h.entries[idx], ok } func (h *Uint64HashTable) lookup(v uint64, szMask uint64, cmp func(uint64) bool) (uint64, bool) { const perturbShift uint8 = 5 var ( idx uint64 perturb uint64 e *entryUint64 ) v = h.fixHash(v) idx = v & szMask perturb = (v >> uint64(perturbShift)) + 1 for { e = &h.entries[idx] if e.h == v && cmp(e.payload.val) { return idx, true } if e.h == sentinel { return idx, false } // perturbation logic inspired from CPython's set/dict object // the goal is that all 64 bits of unmasked hash value eventually // participate int he probing sequence, to minimize clustering idx = (idx + perturb) & szMask perturb = (perturb >> uint64(perturbShift)) + 1 } } func (h *Uint64HashTable) upsize(newcap uint64) error { newMask := newcap - 1 oldEntries := h.entries h.entries = make([]entryUint64, newcap) for _, e := range oldEntries { if e.Valid() { idx, _ := h.lookup(e.h, newMask, func(uint64) bool { return false }) h.entries[idx] = e } } h.cap = newcap h.capMask = newMask return nil } // Insert updates the given entry with the provided hash value, payload value and memo index. // The entry pointer must have been retrieved via lookup in order to actually insert properly. func (h *Uint64HashTable) Insert(e *entryUint64, v uint64, val uint64, memoIdx int32) error { e.h = h.fixHash(v) e.payload.val = val e.payload.memoIdx = memoIdx h.size++ if h.needUpsize() { h.upsize(h.cap * uint64(loadFactor) * 2) } return nil } // VisitEntries will call the passed in function on each *valid* entry in the hash table, // a valid entry being one which has had a value inserted into it. func (h *Uint64HashTable) VisitEntries(visit func(*entryUint64)) { for _, e := range h.entries { if e.Valid() { visit(&e) } } } // Uint64MemoTable is a wrapper over the appropriate hashtable to provide an interface // conforming to the MemoTable interface defined in the encoding package for general interactions // regarding dictionaries. type Uint64MemoTable struct { tbl *Uint64HashTable nullIdx int32 } // NewUint64MemoTable returns a new memotable with num entries pre-allocated to reduce further // allocations when inserting. func NewUint64MemoTable(num int64) *Uint64MemoTable { return &Uint64MemoTable{tbl: NewUint64HashTable(uint64(num)), nullIdx: KeyNotFound} } func (Uint64MemoTable) TypeTraits() TypeTraits { return arrow.Uint64Traits } // Reset allows this table to be re-used by dumping all the data currently in the table. func (s *Uint64MemoTable) Reset() { s.tbl.Reset(32) s.nullIdx = KeyNotFound } // Size returns the current number of inserted elements into the table including if a null // has been inserted. func (s *Uint64MemoTable) Size() int { sz := int(s.tbl.size) if _, ok := s.GetNull(); ok { sz++ } return sz } // GetNull returns the index of an inserted null or KeyNotFound along with a bool // that will be true if found and false if not. func (s *Uint64MemoTable) GetNull() (int, bool) { return int(s.nullIdx), s.nullIdx != KeyNotFound } // GetOrInsertNull will return the index of the null entry or insert a null entry // if one currently doesn't exist. The found value will be true if there was already // a null in the table, and false if it inserted one. func (s *Uint64MemoTable) GetOrInsertNull() (idx int, found bool) { idx, found = s.GetNull() if !found { idx = s.Size() s.nullIdx = int32(idx) } return } // CopyValues will copy the values from the memo table out into the passed in slice // which must be of the appropriate type. func (s *Uint64MemoTable) CopyValues(out interface{}) { s.CopyValuesSubset(0, out) } // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the provided start index func (s *Uint64MemoTable) CopyValuesSubset(start int, out interface{}) { s.tbl.CopyValuesSubset(start, out.([]uint64)) } func (s *Uint64MemoTable) WriteOut(out []byte) { s.tbl.CopyValues(arrow.Uint64Traits.CastFromBytes(out)) } func (s *Uint64MemoTable) WriteOutSubset(start int, out []byte) { s.tbl.CopyValuesSubset(start, arrow.Uint64Traits.CastFromBytes(out)) } func (s *Uint64MemoTable) WriteOutLE(out []byte) { s.tbl.WriteOut(out) } func (s *Uint64MemoTable) WriteOutSubsetLE(start int, out []byte) { s.tbl.WriteOutSubset(start, out) } func (s *Uint64MemoTable) Exists(val uint64) bool { _, ok := s.Get(val) return ok } // Get returns the index of the requested value in the hash table or KeyNotFound // along with a boolean indicating if it was found or not. func (s *Uint64MemoTable) Get(val interface{}) (int, bool) { h := hashInt(uint64(val.(uint64)), 0) if e, ok := s.tbl.Lookup(h, func(v uint64) bool { return val.(uint64) == v }); ok { return int(e.payload.memoIdx), ok } return KeyNotFound, false } // GetOrInsert will return the index of the specified value in the table, or insert the // value into the table and return the new index. found indicates whether or not it already // existed in the table (true) or was inserted by this call (false). func (s *Uint64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { return s.InsertOrGet(val.(uint64)) } func (s *Uint64MemoTable) InsertOrGet(val uint64) (idx int, found bool, err error) { h := hashInt(uint64(val), 0) e, ok := s.tbl.Lookup(h, func(v uint64) bool { return val == v }) if ok { idx = int(e.payload.memoIdx) found = true } else { idx = s.Size() s.tbl.Insert(e, h, val, int32(idx)) } return } // GetOrInsertBytes is unimplemented func (s *Uint64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { panic("unimplemented") } type payloadFloat32 struct { val float32 memoIdx int32 } type entryFloat32 struct { h uint64 payload payloadFloat32 } func (e entryFloat32) Valid() bool { return e.h != sentinel } // Float32HashTable is a hashtable specifically for float32 that // is utilized with the MemoTable to generalize interactions for easier // implementation of dictionaries without losing performance. type Float32HashTable struct { cap uint64 capMask uint64 size uint64 entries []entryFloat32 } // NewFloat32HashTable returns a new hash table for float32 values // initialized with the passed in capacity or 32 whichever is larger. func NewFloat32HashTable(cap uint64) *Float32HashTable { initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0} ret.entries = make([]entryFloat32, initCap) return ret } // Reset drops all of the values in this hash table and re-initializes it // with the specified initial capacity as if by calling New, but without having // to reallocate the object. func (h *Float32HashTable) Reset(cap uint64) { h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) h.capMask = h.cap - 1 h.size = 0 h.entries = make([]entryFloat32, h.cap) } // CopyValues is used for copying the values out of the hash table into the // passed in slice, in the order that they were first inserted func (h *Float32HashTable) CopyValues(out []float32) { h.CopyValuesSubset(0, out) } // CopyValuesSubset copies a subset of the values in the hashtable out, starting // with the value at start, in the order that they were inserted. func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) { h.VisitEntries(func(e *entryFloat32) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { out[idx] = e.payload.val } }) } func (h *Float32HashTable) WriteOut(out []byte) { h.WriteOutSubset(0, out) } func (h *Float32HashTable) WriteOutSubset(start int, out []byte) { data := arrow.Float32Traits.CastFromBytes(out) h.VisitEntries(func(e *entryFloat32) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { data[idx] = utils.ToLEFloat32(e.payload.val) } }) } func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } func (Float32HashTable) fixHash(v uint64) uint64 { if v == sentinel { return 42 } return v } // Lookup retrieves the entry for a given hash value assuming it's payload value returns // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) { idx, ok := h.lookup(v, h.capMask, cmp) return &h.entries[idx], ok } func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) { const perturbShift uint8 = 5 var ( idx uint64 perturb uint64 e *entryFloat32 ) v = h.fixHash(v) idx = v & szMask perturb = (v >> uint64(perturbShift)) + 1 for { e = &h.entries[idx] if e.h == v && cmp(e.payload.val) { return idx, true } if e.h == sentinel { return idx, false } // perturbation logic inspired from CPython's set/dict object // the goal is that all 64 bits of unmasked hash value eventually // participate int he probing sequence, to minimize clustering idx = (idx + perturb) & szMask perturb = (perturb >> uint64(perturbShift)) + 1 } } func (h *Float32HashTable) upsize(newcap uint64) error { newMask := newcap - 1 oldEntries := h.entries h.entries = make([]entryFloat32, newcap) for _, e := range oldEntries { if e.Valid() { idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false }) h.entries[idx] = e } } h.cap = newcap h.capMask = newMask return nil } // Insert updates the given entry with the provided hash value, payload value and memo index. // The entry pointer must have been retrieved via lookup in order to actually insert properly. func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error { e.h = h.fixHash(v) e.payload.val = val e.payload.memoIdx = memoIdx h.size++ if h.needUpsize() { h.upsize(h.cap * uint64(loadFactor) * 2) } return nil } // VisitEntries will call the passed in function on each *valid* entry in the hash table, // a valid entry being one which has had a value inserted into it. func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) { for _, e := range h.entries { if e.Valid() { visit(&e) } } } // Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface // conforming to the MemoTable interface defined in the encoding package for general interactions // regarding dictionaries. type Float32MemoTable struct { tbl *Float32HashTable nullIdx int32 } // NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further // allocations when inserting. func NewFloat32MemoTable(num int64) *Float32MemoTable { return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound} } func (Float32MemoTable) TypeTraits() TypeTraits { return arrow.Float32Traits } // Reset allows this table to be re-used by dumping all the data currently in the table. func (s *Float32MemoTable) Reset() { s.tbl.Reset(32) s.nullIdx = KeyNotFound } // Size returns the current number of inserted elements into the table including if a null // has been inserted. func (s *Float32MemoTable) Size() int { sz := int(s.tbl.size) if _, ok := s.GetNull(); ok { sz++ } return sz } // GetNull returns the index of an inserted null or KeyNotFound along with a bool // that will be true if found and false if not. func (s *Float32MemoTable) GetNull() (int, bool) { return int(s.nullIdx), s.nullIdx != KeyNotFound } // GetOrInsertNull will return the index of the null entry or insert a null entry // if one currently doesn't exist. The found value will be true if there was already // a null in the table, and false if it inserted one. func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) { idx, found = s.GetNull() if !found { idx = s.Size() s.nullIdx = int32(idx) } return } // CopyValues will copy the values from the memo table out into the passed in slice // which must be of the appropriate type. func (s *Float32MemoTable) CopyValues(out interface{}) { s.CopyValuesSubset(0, out) } // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the provided start index func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) { s.tbl.CopyValuesSubset(start, out.([]float32)) } func (s *Float32MemoTable) WriteOut(out []byte) { s.tbl.CopyValues(arrow.Float32Traits.CastFromBytes(out)) } func (s *Float32MemoTable) WriteOutSubset(start int, out []byte) { s.tbl.CopyValuesSubset(start, arrow.Float32Traits.CastFromBytes(out)) } func (s *Float32MemoTable) WriteOutLE(out []byte) { s.tbl.WriteOut(out) } func (s *Float32MemoTable) WriteOutSubsetLE(start int, out []byte) { s.tbl.WriteOutSubset(start, out) } func (s *Float32MemoTable) Exists(val float32) bool { _, ok := s.Get(val) return ok } // Get returns the index of the requested value in the hash table or KeyNotFound // along with a boolean indicating if it was found or not. func (s *Float32MemoTable) Get(val interface{}) (int, bool) { var cmp func(float32) bool if math.IsNaN(float64(val.(float32))) { cmp = isNan32Cmp // use consistent internal bit pattern for NaN regardless of the pattern // that is passed to us. NaN is NaN is NaN val = float32(math.NaN()) } else { cmp = func(v float32) bool { return val.(float32) == v } } h := hashFloat32(val.(float32), 0) if e, ok := s.tbl.Lookup(h, cmp); ok { return int(e.payload.memoIdx), ok } return KeyNotFound, false } // GetOrInsert will return the index of the specified value in the table, or insert the // value into the table and return the new index. found indicates whether or not it already // existed in the table (true) or was inserted by this call (false). func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { return s.InsertOrGet(val.(float32)) } func (s *Float32MemoTable) InsertOrGet(val float32) (idx int, found bool, err error) { var cmp func(float32) bool if math.IsNaN(float64(val)) { cmp = isNan32Cmp // use consistent internal bit pattern for NaN regardless of the pattern // that is passed to us. NaN is NaN is NaN val = float32(math.NaN()) } else { cmp = func(v float32) bool { return val == v } } h := hashFloat32(val, 0) e, ok := s.tbl.Lookup(h, cmp) if ok { idx = int(e.payload.memoIdx) found = true } else { idx = s.Size() s.tbl.Insert(e, h, val, int32(idx)) } return } // GetOrInsertBytes is unimplemented func (s *Float32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { panic("unimplemented") } type payloadFloat64 struct { val float64 memoIdx int32 } type entryFloat64 struct { h uint64 payload payloadFloat64 } func (e entryFloat64) Valid() bool { return e.h != sentinel } // Float64HashTable is a hashtable specifically for float64 that // is utilized with the MemoTable to generalize interactions for easier // implementation of dictionaries without losing performance. type Float64HashTable struct { cap uint64 capMask uint64 size uint64 entries []entryFloat64 } // NewFloat64HashTable returns a new hash table for float64 values // initialized with the passed in capacity or 32 whichever is larger. func NewFloat64HashTable(cap uint64) *Float64HashTable { initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0} ret.entries = make([]entryFloat64, initCap) return ret } // Reset drops all of the values in this hash table and re-initializes it // with the specified initial capacity as if by calling New, but without having // to reallocate the object. func (h *Float64HashTable) Reset(cap uint64) { h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) h.capMask = h.cap - 1 h.size = 0 h.entries = make([]entryFloat64, h.cap) } // CopyValues is used for copying the values out of the hash table into the // passed in slice, in the order that they were first inserted func (h *Float64HashTable) CopyValues(out []float64) { h.CopyValuesSubset(0, out) } // CopyValuesSubset copies a subset of the values in the hashtable out, starting // with the value at start, in the order that they were inserted. func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) { h.VisitEntries(func(e *entryFloat64) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { out[idx] = e.payload.val } }) } func (h *Float64HashTable) WriteOut(out []byte) { h.WriteOutSubset(0, out) } func (h *Float64HashTable) WriteOutSubset(start int, out []byte) { data := arrow.Float64Traits.CastFromBytes(out) h.VisitEntries(func(e *entryFloat64) { idx := e.payload.memoIdx - int32(start) if idx >= 0 { data[idx] = utils.ToLEFloat64(e.payload.val) } }) } func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } func (Float64HashTable) fixHash(v uint64) uint64 { if v == sentinel { return 42 } return v } // Lookup retrieves the entry for a given hash value assuming it's payload value returns // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) { idx, ok := h.lookup(v, h.capMask, cmp) return &h.entries[idx], ok } func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) { const perturbShift uint8 = 5 var ( idx uint64 perturb uint64 e *entryFloat64 ) v = h.fixHash(v) idx = v & szMask perturb = (v >> uint64(perturbShift)) + 1 for { e = &h.entries[idx] if e.h == v && cmp(e.payload.val) { return idx, true } if e.h == sentinel { return idx, false } // perturbation logic inspired from CPython's set/dict object // the goal is that all 64 bits of unmasked hash value eventually // participate int he probing sequence, to minimize clustering idx = (idx + perturb) & szMask perturb = (perturb >> uint64(perturbShift)) + 1 } } func (h *Float64HashTable) upsize(newcap uint64) error { newMask := newcap - 1 oldEntries := h.entries h.entries = make([]entryFloat64, newcap) for _, e := range oldEntries { if e.Valid() { idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false }) h.entries[idx] = e } } h.cap = newcap h.capMask = newMask return nil } // Insert updates the given entry with the provided hash value, payload value and memo index. // The entry pointer must have been retrieved via lookup in order to actually insert properly. func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error { e.h = h.fixHash(v) e.payload.val = val e.payload.memoIdx = memoIdx h.size++ if h.needUpsize() { h.upsize(h.cap * uint64(loadFactor) * 2) } return nil } // VisitEntries will call the passed in function on each *valid* entry in the hash table, // a valid entry being one which has had a value inserted into it. func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) { for _, e := range h.entries { if e.Valid() { visit(&e) } } } // Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface // conforming to the MemoTable interface defined in the encoding package for general interactions // regarding dictionaries. type Float64MemoTable struct { tbl *Float64HashTable nullIdx int32 } // NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further // allocations when inserting. func NewFloat64MemoTable(num int64) *Float64MemoTable { return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound} } func (Float64MemoTable) TypeTraits() TypeTraits { return arrow.Float64Traits } // Reset allows this table to be re-used by dumping all the data currently in the table. func (s *Float64MemoTable) Reset() { s.tbl.Reset(32) s.nullIdx = KeyNotFound } // Size returns the current number of inserted elements into the table including if a null // has been inserted. func (s *Float64MemoTable) Size() int { sz := int(s.tbl.size) if _, ok := s.GetNull(); ok { sz++ } return sz } // GetNull returns the index of an inserted null or KeyNotFound along with a bool // that will be true if found and false if not. func (s *Float64MemoTable) GetNull() (int, bool) { return int(s.nullIdx), s.nullIdx != KeyNotFound } // GetOrInsertNull will return the index of the null entry or insert a null entry // if one currently doesn't exist. The found value will be true if there was already // a null in the table, and false if it inserted one. func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) { idx, found = s.GetNull() if !found { idx = s.Size() s.nullIdx = int32(idx) } return } // CopyValues will copy the values from the memo table out into the passed in slice // which must be of the appropriate type. func (s *Float64MemoTable) CopyValues(out interface{}) { s.CopyValuesSubset(0, out) } // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the provided start index func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) { s.tbl.CopyValuesSubset(start, out.([]float64)) } func (s *Float64MemoTable) WriteOut(out []byte) { s.tbl.CopyValues(arrow.Float64Traits.CastFromBytes(out)) } func (s *Float64MemoTable) WriteOutSubset(start int, out []byte) { s.tbl.CopyValuesSubset(start, arrow.Float64Traits.CastFromBytes(out)) } func (s *Float64MemoTable) WriteOutLE(out []byte) { s.tbl.WriteOut(out) } func (s *Float64MemoTable) WriteOutSubsetLE(start int, out []byte) { s.tbl.WriteOutSubset(start, out) } func (s *Float64MemoTable) Exists(val float64) bool { _, ok := s.Get(val) return ok } // Get returns the index of the requested value in the hash table or KeyNotFound // along with a boolean indicating if it was found or not. func (s *Float64MemoTable) Get(val interface{}) (int, bool) { var cmp func(float64) bool if math.IsNaN(val.(float64)) { cmp = math.IsNaN // use consistent internal bit pattern for NaN regardless of the pattern // that is passed to us. NaN is NaN is NaN val = math.NaN() } else { cmp = func(v float64) bool { return val.(float64) == v } } h := hashFloat64(val.(float64), 0) if e, ok := s.tbl.Lookup(h, cmp); ok { return int(e.payload.memoIdx), ok } return KeyNotFound, false } // GetOrInsert will return the index of the specified value in the table, or insert the // value into the table and return the new index. found indicates whether or not it already // existed in the table (true) or was inserted by this call (false). func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { return s.InsertOrGet(val.(float64)) } func (s *Float64MemoTable) InsertOrGet(val float64) (idx int, found bool, err error) { var cmp func(float64) bool if math.IsNaN(val) { cmp = math.IsNaN // use consistent internal bit pattern for NaN regardless of the pattern // that is passed to us. NaN is NaN is NaN val = math.NaN() } else { cmp = func(v float64) bool { return val == v } } h := hashFloat64(val, 0) e, ok := s.tbl.Lookup(h, cmp) if ok { idx = int(e.payload.memoIdx) found = true } else { idx = s.Size() s.tbl.Insert(e, h, val, int32(idx)) } return } // GetOrInsertBytes is unimplemented func (s *Float64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { panic("unimplemented") }

internal/hashing/xxh3_memo_table.gen.go (1,981 lines of code) (raw):