internal/hashing/xxh3_memo_table.gen.go (1,981 lines of code) (raw):
// Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT.
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hashing
import (
"math"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/bitutil"
"github.com/apache/arrow-go/v18/internal/utils"
)
type payloadInt8 struct {
val int8
memoIdx int32
}
type entryInt8 struct {
h uint64
payload payloadInt8
}
func (e entryInt8) Valid() bool { return e.h != sentinel }
// Int8HashTable is a hashtable specifically for int8 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Int8HashTable struct {
cap uint64
capMask uint64
size uint64
entries []entryInt8
}
// NewInt8HashTable returns a new hash table for int8 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewInt8HashTable(cap uint64) *Int8HashTable {
initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
ret := &Int8HashTable{cap: initCap, capMask: initCap - 1, size: 0}
ret.entries = make([]entryInt8, initCap)
return ret
}
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Int8HashTable) Reset(cap uint64) {
h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
h.capMask = h.cap - 1
h.size = 0
h.entries = make([]entryInt8, h.cap)
}
// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Int8HashTable) CopyValues(out []int8) {
h.CopyValuesSubset(0, out)
}
// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Int8HashTable) CopyValuesSubset(start int, out []int8) {
h.VisitEntries(func(e *entryInt8) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
out[idx] = e.payload.val
}
})
}
func (h *Int8HashTable) WriteOut(out []byte) {
h.WriteOutSubset(0, out)
}
func (h *Int8HashTable) WriteOutSubset(start int, out []byte) {
data := arrow.Int8Traits.CastFromBytes(out)
h.VisitEntries(func(e *entryInt8) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
data[idx] = e.payload.val
}
})
}
func (h *Int8HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
func (Int8HashTable) fixHash(v uint64) uint64 {
if v == sentinel {
return 42
}
return v
}
// Lookup retrieves the entry for a given hash value assuming it's payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Int8HashTable) Lookup(v uint64, cmp func(int8) bool) (*entryInt8, bool) {
idx, ok := h.lookup(v, h.capMask, cmp)
return &h.entries[idx], ok
}
func (h *Int8HashTable) lookup(v uint64, szMask uint64, cmp func(int8) bool) (uint64, bool) {
const perturbShift uint8 = 5
var (
idx uint64
perturb uint64
e *entryInt8
)
v = h.fixHash(v)
idx = v & szMask
perturb = (v >> uint64(perturbShift)) + 1
for {
e = &h.entries[idx]
if e.h == v && cmp(e.payload.val) {
return idx, true
}
if e.h == sentinel {
return idx, false
}
// perturbation logic inspired from CPython's set/dict object
// the goal is that all 64 bits of unmasked hash value eventually
// participate int he probing sequence, to minimize clustering
idx = (idx + perturb) & szMask
perturb = (perturb >> uint64(perturbShift)) + 1
}
}
func (h *Int8HashTable) upsize(newcap uint64) error {
newMask := newcap - 1
oldEntries := h.entries
h.entries = make([]entryInt8, newcap)
for _, e := range oldEntries {
if e.Valid() {
idx, _ := h.lookup(e.h, newMask, func(int8) bool { return false })
h.entries[idx] = e
}
}
h.cap = newcap
h.capMask = newMask
return nil
}
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Int8HashTable) Insert(e *entryInt8, v uint64, val int8, memoIdx int32) error {
e.h = h.fixHash(v)
e.payload.val = val
e.payload.memoIdx = memoIdx
h.size++
if h.needUpsize() {
h.upsize(h.cap * uint64(loadFactor) * 2)
}
return nil
}
// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Int8HashTable) VisitEntries(visit func(*entryInt8)) {
for _, e := range h.entries {
if e.Valid() {
visit(&e)
}
}
}
// Int8MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Int8MemoTable struct {
tbl *Int8HashTable
nullIdx int32
}
// NewInt8MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewInt8MemoTable(num int64) *Int8MemoTable {
return &Int8MemoTable{tbl: NewInt8HashTable(uint64(num)), nullIdx: KeyNotFound}
}
func (Int8MemoTable) TypeTraits() TypeTraits {
return arrow.Int8Traits
}
// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Int8MemoTable) Reset() {
s.tbl.Reset(32)
s.nullIdx = KeyNotFound
}
// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Int8MemoTable) Size() int {
sz := int(s.tbl.size)
if _, ok := s.GetNull(); ok {
sz++
}
return sz
}
// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Int8MemoTable) GetNull() (int, bool) {
return int(s.nullIdx), s.nullIdx != KeyNotFound
}
// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Int8MemoTable) GetOrInsertNull() (idx int, found bool) {
idx, found = s.GetNull()
if !found {
idx = s.Size()
s.nullIdx = int32(idx)
}
return
}
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Int8MemoTable) CopyValues(out interface{}) {
s.CopyValuesSubset(0, out)
}
// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Int8MemoTable) CopyValuesSubset(start int, out interface{}) {
s.tbl.CopyValuesSubset(start, out.([]int8))
}
func (s *Int8MemoTable) WriteOut(out []byte) {
s.tbl.CopyValues(arrow.Int8Traits.CastFromBytes(out))
}
func (s *Int8MemoTable) WriteOutSubset(start int, out []byte) {
s.tbl.CopyValuesSubset(start, arrow.Int8Traits.CastFromBytes(out))
}
func (s *Int8MemoTable) WriteOutLE(out []byte) {
s.tbl.WriteOut(out)
}
func (s *Int8MemoTable) WriteOutSubsetLE(start int, out []byte) {
s.tbl.WriteOutSubset(start, out)
}
func (s *Int8MemoTable) Exists(val int8) bool {
_, ok := s.Get(val)
return ok
}
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Int8MemoTable) Get(val interface{}) (int, bool) {
h := hashInt(uint64(val.(int8)), 0)
if e, ok := s.tbl.Lookup(h, func(v int8) bool { return val.(int8) == v }); ok {
return int(e.payload.memoIdx), ok
}
return KeyNotFound, false
}
// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Int8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
return s.InsertOrGet(val.(int8))
}
func (s *Int8MemoTable) InsertOrGet(val int8) (idx int, found bool, err error) {
h := hashInt(uint64(val), 0)
e, ok := s.tbl.Lookup(h, func(v int8) bool {
return val == v
})
if ok {
idx = int(e.payload.memoIdx)
found = true
} else {
idx = s.Size()
s.tbl.Insert(e, h, val, int32(idx))
}
return
}
// GetOrInsertBytes is unimplemented
func (s *Int8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
panic("unimplemented")
}
type payloadUint8 struct {
val uint8
memoIdx int32
}
type entryUint8 struct {
h uint64
payload payloadUint8
}
func (e entryUint8) Valid() bool { return e.h != sentinel }
// Uint8HashTable is a hashtable specifically for uint8 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Uint8HashTable struct {
cap uint64
capMask uint64
size uint64
entries []entryUint8
}
// NewUint8HashTable returns a new hash table for uint8 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewUint8HashTable(cap uint64) *Uint8HashTable {
initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
ret := &Uint8HashTable{cap: initCap, capMask: initCap - 1, size: 0}
ret.entries = make([]entryUint8, initCap)
return ret
}
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Uint8HashTable) Reset(cap uint64) {
h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
h.capMask = h.cap - 1
h.size = 0
h.entries = make([]entryUint8, h.cap)
}
// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Uint8HashTable) CopyValues(out []uint8) {
h.CopyValuesSubset(0, out)
}
// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Uint8HashTable) CopyValuesSubset(start int, out []uint8) {
h.VisitEntries(func(e *entryUint8) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
out[idx] = e.payload.val
}
})
}
func (h *Uint8HashTable) WriteOut(out []byte) {
h.WriteOutSubset(0, out)
}
func (h *Uint8HashTable) WriteOutSubset(start int, out []byte) {
data := arrow.Uint8Traits.CastFromBytes(out)
h.VisitEntries(func(e *entryUint8) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
data[idx] = e.payload.val
}
})
}
func (h *Uint8HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
func (Uint8HashTable) fixHash(v uint64) uint64 {
if v == sentinel {
return 42
}
return v
}
// Lookup retrieves the entry for a given hash value assuming it's payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Uint8HashTable) Lookup(v uint64, cmp func(uint8) bool) (*entryUint8, bool) {
idx, ok := h.lookup(v, h.capMask, cmp)
return &h.entries[idx], ok
}
func (h *Uint8HashTable) lookup(v uint64, szMask uint64, cmp func(uint8) bool) (uint64, bool) {
const perturbShift uint8 = 5
var (
idx uint64
perturb uint64
e *entryUint8
)
v = h.fixHash(v)
idx = v & szMask
perturb = (v >> uint64(perturbShift)) + 1
for {
e = &h.entries[idx]
if e.h == v && cmp(e.payload.val) {
return idx, true
}
if e.h == sentinel {
return idx, false
}
// perturbation logic inspired from CPython's set/dict object
// the goal is that all 64 bits of unmasked hash value eventually
// participate int he probing sequence, to minimize clustering
idx = (idx + perturb) & szMask
perturb = (perturb >> uint64(perturbShift)) + 1
}
}
func (h *Uint8HashTable) upsize(newcap uint64) error {
newMask := newcap - 1
oldEntries := h.entries
h.entries = make([]entryUint8, newcap)
for _, e := range oldEntries {
if e.Valid() {
idx, _ := h.lookup(e.h, newMask, func(uint8) bool { return false })
h.entries[idx] = e
}
}
h.cap = newcap
h.capMask = newMask
return nil
}
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Uint8HashTable) Insert(e *entryUint8, v uint64, val uint8, memoIdx int32) error {
e.h = h.fixHash(v)
e.payload.val = val
e.payload.memoIdx = memoIdx
h.size++
if h.needUpsize() {
h.upsize(h.cap * uint64(loadFactor) * 2)
}
return nil
}
// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Uint8HashTable) VisitEntries(visit func(*entryUint8)) {
for _, e := range h.entries {
if e.Valid() {
visit(&e)
}
}
}
// Uint8MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Uint8MemoTable struct {
tbl *Uint8HashTable
nullIdx int32
}
// NewUint8MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewUint8MemoTable(num int64) *Uint8MemoTable {
return &Uint8MemoTable{tbl: NewUint8HashTable(uint64(num)), nullIdx: KeyNotFound}
}
func (Uint8MemoTable) TypeTraits() TypeTraits {
return arrow.Uint8Traits
}
// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Uint8MemoTable) Reset() {
s.tbl.Reset(32)
s.nullIdx = KeyNotFound
}
// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Uint8MemoTable) Size() int {
sz := int(s.tbl.size)
if _, ok := s.GetNull(); ok {
sz++
}
return sz
}
// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Uint8MemoTable) GetNull() (int, bool) {
return int(s.nullIdx), s.nullIdx != KeyNotFound
}
// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Uint8MemoTable) GetOrInsertNull() (idx int, found bool) {
idx, found = s.GetNull()
if !found {
idx = s.Size()
s.nullIdx = int32(idx)
}
return
}
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Uint8MemoTable) CopyValues(out interface{}) {
s.CopyValuesSubset(0, out)
}
// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Uint8MemoTable) CopyValuesSubset(start int, out interface{}) {
s.tbl.CopyValuesSubset(start, out.([]uint8))
}
func (s *Uint8MemoTable) WriteOut(out []byte) {
s.tbl.CopyValues(arrow.Uint8Traits.CastFromBytes(out))
}
func (s *Uint8MemoTable) WriteOutSubset(start int, out []byte) {
s.tbl.CopyValuesSubset(start, arrow.Uint8Traits.CastFromBytes(out))
}
func (s *Uint8MemoTable) WriteOutLE(out []byte) {
s.tbl.WriteOut(out)
}
func (s *Uint8MemoTable) WriteOutSubsetLE(start int, out []byte) {
s.tbl.WriteOutSubset(start, out)
}
func (s *Uint8MemoTable) Exists(val uint8) bool {
_, ok := s.Get(val)
return ok
}
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Uint8MemoTable) Get(val interface{}) (int, bool) {
h := hashInt(uint64(val.(uint8)), 0)
if e, ok := s.tbl.Lookup(h, func(v uint8) bool { return val.(uint8) == v }); ok {
return int(e.payload.memoIdx), ok
}
return KeyNotFound, false
}
// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Uint8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
return s.InsertOrGet(val.(uint8))
}
func (s *Uint8MemoTable) InsertOrGet(val uint8) (idx int, found bool, err error) {
h := hashInt(uint64(val), 0)
e, ok := s.tbl.Lookup(h, func(v uint8) bool {
return val == v
})
if ok {
idx = int(e.payload.memoIdx)
found = true
} else {
idx = s.Size()
s.tbl.Insert(e, h, val, int32(idx))
}
return
}
// GetOrInsertBytes is unimplemented
func (s *Uint8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
panic("unimplemented")
}
type payloadInt16 struct {
val int16
memoIdx int32
}
type entryInt16 struct {
h uint64
payload payloadInt16
}
func (e entryInt16) Valid() bool { return e.h != sentinel }
// Int16HashTable is a hashtable specifically for int16 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Int16HashTable struct {
cap uint64
capMask uint64
size uint64
entries []entryInt16
}
// NewInt16HashTable returns a new hash table for int16 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewInt16HashTable(cap uint64) *Int16HashTable {
initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
ret := &Int16HashTable{cap: initCap, capMask: initCap - 1, size: 0}
ret.entries = make([]entryInt16, initCap)
return ret
}
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Int16HashTable) Reset(cap uint64) {
h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
h.capMask = h.cap - 1
h.size = 0
h.entries = make([]entryInt16, h.cap)
}
// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Int16HashTable) CopyValues(out []int16) {
h.CopyValuesSubset(0, out)
}
// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Int16HashTable) CopyValuesSubset(start int, out []int16) {
h.VisitEntries(func(e *entryInt16) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
out[idx] = e.payload.val
}
})
}
func (h *Int16HashTable) WriteOut(out []byte) {
h.WriteOutSubset(0, out)
}
func (h *Int16HashTable) WriteOutSubset(start int, out []byte) {
data := arrow.Int16Traits.CastFromBytes(out)
h.VisitEntries(func(e *entryInt16) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
data[idx] = utils.ToLEInt16(e.payload.val)
}
})
}
func (h *Int16HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
func (Int16HashTable) fixHash(v uint64) uint64 {
if v == sentinel {
return 42
}
return v
}
// Lookup retrieves the entry for a given hash value assuming it's payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Int16HashTable) Lookup(v uint64, cmp func(int16) bool) (*entryInt16, bool) {
idx, ok := h.lookup(v, h.capMask, cmp)
return &h.entries[idx], ok
}
func (h *Int16HashTable) lookup(v uint64, szMask uint64, cmp func(int16) bool) (uint64, bool) {
const perturbShift uint8 = 5
var (
idx uint64
perturb uint64
e *entryInt16
)
v = h.fixHash(v)
idx = v & szMask
perturb = (v >> uint64(perturbShift)) + 1
for {
e = &h.entries[idx]
if e.h == v && cmp(e.payload.val) {
return idx, true
}
if e.h == sentinel {
return idx, false
}
// perturbation logic inspired from CPython's set/dict object
// the goal is that all 64 bits of unmasked hash value eventually
// participate int he probing sequence, to minimize clustering
idx = (idx + perturb) & szMask
perturb = (perturb >> uint64(perturbShift)) + 1
}
}
func (h *Int16HashTable) upsize(newcap uint64) error {
newMask := newcap - 1
oldEntries := h.entries
h.entries = make([]entryInt16, newcap)
for _, e := range oldEntries {
if e.Valid() {
idx, _ := h.lookup(e.h, newMask, func(int16) bool { return false })
h.entries[idx] = e
}
}
h.cap = newcap
h.capMask = newMask
return nil
}
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Int16HashTable) Insert(e *entryInt16, v uint64, val int16, memoIdx int32) error {
e.h = h.fixHash(v)
e.payload.val = val
e.payload.memoIdx = memoIdx
h.size++
if h.needUpsize() {
h.upsize(h.cap * uint64(loadFactor) * 2)
}
return nil
}
// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Int16HashTable) VisitEntries(visit func(*entryInt16)) {
for _, e := range h.entries {
if e.Valid() {
visit(&e)
}
}
}
// Int16MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Int16MemoTable struct {
tbl *Int16HashTable
nullIdx int32
}
// NewInt16MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewInt16MemoTable(num int64) *Int16MemoTable {
return &Int16MemoTable{tbl: NewInt16HashTable(uint64(num)), nullIdx: KeyNotFound}
}
func (Int16MemoTable) TypeTraits() TypeTraits {
return arrow.Int16Traits
}
// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Int16MemoTable) Reset() {
s.tbl.Reset(32)
s.nullIdx = KeyNotFound
}
// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Int16MemoTable) Size() int {
sz := int(s.tbl.size)
if _, ok := s.GetNull(); ok {
sz++
}
return sz
}
// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Int16MemoTable) GetNull() (int, bool) {
return int(s.nullIdx), s.nullIdx != KeyNotFound
}
// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Int16MemoTable) GetOrInsertNull() (idx int, found bool) {
idx, found = s.GetNull()
if !found {
idx = s.Size()
s.nullIdx = int32(idx)
}
return
}
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Int16MemoTable) CopyValues(out interface{}) {
s.CopyValuesSubset(0, out)
}
// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Int16MemoTable) CopyValuesSubset(start int, out interface{}) {
s.tbl.CopyValuesSubset(start, out.([]int16))
}
func (s *Int16MemoTable) WriteOut(out []byte) {
s.tbl.CopyValues(arrow.Int16Traits.CastFromBytes(out))
}
func (s *Int16MemoTable) WriteOutSubset(start int, out []byte) {
s.tbl.CopyValuesSubset(start, arrow.Int16Traits.CastFromBytes(out))
}
func (s *Int16MemoTable) WriteOutLE(out []byte) {
s.tbl.WriteOut(out)
}
func (s *Int16MemoTable) WriteOutSubsetLE(start int, out []byte) {
s.tbl.WriteOutSubset(start, out)
}
func (s *Int16MemoTable) Exists(val int16) bool {
_, ok := s.Get(val)
return ok
}
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Int16MemoTable) Get(val interface{}) (int, bool) {
h := hashInt(uint64(val.(int16)), 0)
if e, ok := s.tbl.Lookup(h, func(v int16) bool { return val.(int16) == v }); ok {
return int(e.payload.memoIdx), ok
}
return KeyNotFound, false
}
// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Int16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
return s.InsertOrGet(val.(int16))
}
func (s *Int16MemoTable) InsertOrGet(val int16) (idx int, found bool, err error) {
h := hashInt(uint64(val), 0)
e, ok := s.tbl.Lookup(h, func(v int16) bool {
return val == v
})
if ok {
idx = int(e.payload.memoIdx)
found = true
} else {
idx = s.Size()
s.tbl.Insert(e, h, val, int32(idx))
}
return
}
// GetOrInsertBytes is unimplemented
func (s *Int16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
panic("unimplemented")
}
type payloadUint16 struct {
val uint16
memoIdx int32
}
type entryUint16 struct {
h uint64
payload payloadUint16
}
func (e entryUint16) Valid() bool { return e.h != sentinel }
// Uint16HashTable is a hashtable specifically for uint16 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Uint16HashTable struct {
cap uint64
capMask uint64
size uint64
entries []entryUint16
}
// NewUint16HashTable returns a new hash table for uint16 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewUint16HashTable(cap uint64) *Uint16HashTable {
initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
ret := &Uint16HashTable{cap: initCap, capMask: initCap - 1, size: 0}
ret.entries = make([]entryUint16, initCap)
return ret
}
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Uint16HashTable) Reset(cap uint64) {
h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
h.capMask = h.cap - 1
h.size = 0
h.entries = make([]entryUint16, h.cap)
}
// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Uint16HashTable) CopyValues(out []uint16) {
h.CopyValuesSubset(0, out)
}
// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Uint16HashTable) CopyValuesSubset(start int, out []uint16) {
h.VisitEntries(func(e *entryUint16) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
out[idx] = e.payload.val
}
})
}
func (h *Uint16HashTable) WriteOut(out []byte) {
h.WriteOutSubset(0, out)
}
func (h *Uint16HashTable) WriteOutSubset(start int, out []byte) {
data := arrow.Uint16Traits.CastFromBytes(out)
h.VisitEntries(func(e *entryUint16) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
data[idx] = utils.ToLEUint16(e.payload.val)
}
})
}
func (h *Uint16HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
func (Uint16HashTable) fixHash(v uint64) uint64 {
if v == sentinel {
return 42
}
return v
}
// Lookup retrieves the entry for a given hash value assuming it's payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Uint16HashTable) Lookup(v uint64, cmp func(uint16) bool) (*entryUint16, bool) {
idx, ok := h.lookup(v, h.capMask, cmp)
return &h.entries[idx], ok
}
func (h *Uint16HashTable) lookup(v uint64, szMask uint64, cmp func(uint16) bool) (uint64, bool) {
const perturbShift uint8 = 5
var (
idx uint64
perturb uint64
e *entryUint16
)
v = h.fixHash(v)
idx = v & szMask
perturb = (v >> uint64(perturbShift)) + 1
for {
e = &h.entries[idx]
if e.h == v && cmp(e.payload.val) {
return idx, true
}
if e.h == sentinel {
return idx, false
}
// perturbation logic inspired from CPython's set/dict object
// the goal is that all 64 bits of unmasked hash value eventually
// participate int he probing sequence, to minimize clustering
idx = (idx + perturb) & szMask
perturb = (perturb >> uint64(perturbShift)) + 1
}
}
func (h *Uint16HashTable) upsize(newcap uint64) error {
newMask := newcap - 1
oldEntries := h.entries
h.entries = make([]entryUint16, newcap)
for _, e := range oldEntries {
if e.Valid() {
idx, _ := h.lookup(e.h, newMask, func(uint16) bool { return false })
h.entries[idx] = e
}
}
h.cap = newcap
h.capMask = newMask
return nil
}
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Uint16HashTable) Insert(e *entryUint16, v uint64, val uint16, memoIdx int32) error {
e.h = h.fixHash(v)
e.payload.val = val
e.payload.memoIdx = memoIdx
h.size++
if h.needUpsize() {
h.upsize(h.cap * uint64(loadFactor) * 2)
}
return nil
}
// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Uint16HashTable) VisitEntries(visit func(*entryUint16)) {
for _, e := range h.entries {
if e.Valid() {
visit(&e)
}
}
}
// Uint16MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Uint16MemoTable struct {
tbl *Uint16HashTable
nullIdx int32
}
// NewUint16MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewUint16MemoTable(num int64) *Uint16MemoTable {
return &Uint16MemoTable{tbl: NewUint16HashTable(uint64(num)), nullIdx: KeyNotFound}
}
func (Uint16MemoTable) TypeTraits() TypeTraits {
return arrow.Uint16Traits
}
// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Uint16MemoTable) Reset() {
s.tbl.Reset(32)
s.nullIdx = KeyNotFound
}
// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Uint16MemoTable) Size() int {
sz := int(s.tbl.size)
if _, ok := s.GetNull(); ok {
sz++
}
return sz
}
// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Uint16MemoTable) GetNull() (int, bool) {
return int(s.nullIdx), s.nullIdx != KeyNotFound
}
// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Uint16MemoTable) GetOrInsertNull() (idx int, found bool) {
idx, found = s.GetNull()
if !found {
idx = s.Size()
s.nullIdx = int32(idx)
}
return
}
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Uint16MemoTable) CopyValues(out interface{}) {
s.CopyValuesSubset(0, out)
}
// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Uint16MemoTable) CopyValuesSubset(start int, out interface{}) {
s.tbl.CopyValuesSubset(start, out.([]uint16))
}
func (s *Uint16MemoTable) WriteOut(out []byte) {
s.tbl.CopyValues(arrow.Uint16Traits.CastFromBytes(out))
}
func (s *Uint16MemoTable) WriteOutSubset(start int, out []byte) {
s.tbl.CopyValuesSubset(start, arrow.Uint16Traits.CastFromBytes(out))
}
func (s *Uint16MemoTable) WriteOutLE(out []byte) {
s.tbl.WriteOut(out)
}
func (s *Uint16MemoTable) WriteOutSubsetLE(start int, out []byte) {
s.tbl.WriteOutSubset(start, out)
}
func (s *Uint16MemoTable) Exists(val uint16) bool {
_, ok := s.Get(val)
return ok
}
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Uint16MemoTable) Get(val interface{}) (int, bool) {
h := hashInt(uint64(val.(uint16)), 0)
if e, ok := s.tbl.Lookup(h, func(v uint16) bool { return val.(uint16) == v }); ok {
return int(e.payload.memoIdx), ok
}
return KeyNotFound, false
}
// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Uint16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
return s.InsertOrGet(val.(uint16))
}
func (s *Uint16MemoTable) InsertOrGet(val uint16) (idx int, found bool, err error) {
h := hashInt(uint64(val), 0)
e, ok := s.tbl.Lookup(h, func(v uint16) bool {
return val == v
})
if ok {
idx = int(e.payload.memoIdx)
found = true
} else {
idx = s.Size()
s.tbl.Insert(e, h, val, int32(idx))
}
return
}
// GetOrInsertBytes is unimplemented
func (s *Uint16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
panic("unimplemented")
}
type payloadInt32 struct {
val int32
memoIdx int32
}
type entryInt32 struct {
h uint64
payload payloadInt32
}
func (e entryInt32) Valid() bool { return e.h != sentinel }
// Int32HashTable is a hashtable specifically for int32 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Int32HashTable struct {
cap uint64
capMask uint64
size uint64
entries []entryInt32
}
// NewInt32HashTable returns a new hash table for int32 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewInt32HashTable(cap uint64) *Int32HashTable {
initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
ret.entries = make([]entryInt32, initCap)
return ret
}
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Int32HashTable) Reset(cap uint64) {
h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
h.capMask = h.cap - 1
h.size = 0
h.entries = make([]entryInt32, h.cap)
}
// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Int32HashTable) CopyValues(out []int32) {
h.CopyValuesSubset(0, out)
}
// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) {
h.VisitEntries(func(e *entryInt32) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
out[idx] = e.payload.val
}
})
}
func (h *Int32HashTable) WriteOut(out []byte) {
h.WriteOutSubset(0, out)
}
func (h *Int32HashTable) WriteOutSubset(start int, out []byte) {
data := arrow.Int32Traits.CastFromBytes(out)
h.VisitEntries(func(e *entryInt32) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
data[idx] = utils.ToLEInt32(e.payload.val)
}
})
}
func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
func (Int32HashTable) fixHash(v uint64) uint64 {
if v == sentinel {
return 42
}
return v
}
// Lookup retrieves the entry for a given hash value assuming it's payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) {
idx, ok := h.lookup(v, h.capMask, cmp)
return &h.entries[idx], ok
}
func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) {
const perturbShift uint8 = 5
var (
idx uint64
perturb uint64
e *entryInt32
)
v = h.fixHash(v)
idx = v & szMask
perturb = (v >> uint64(perturbShift)) + 1
for {
e = &h.entries[idx]
if e.h == v && cmp(e.payload.val) {
return idx, true
}
if e.h == sentinel {
return idx, false
}
// perturbation logic inspired from CPython's set/dict object
// the goal is that all 64 bits of unmasked hash value eventually
// participate int he probing sequence, to minimize clustering
idx = (idx + perturb) & szMask
perturb = (perturb >> uint64(perturbShift)) + 1
}
}
func (h *Int32HashTable) upsize(newcap uint64) error {
newMask := newcap - 1
oldEntries := h.entries
h.entries = make([]entryInt32, newcap)
for _, e := range oldEntries {
if e.Valid() {
idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false })
h.entries[idx] = e
}
}
h.cap = newcap
h.capMask = newMask
return nil
}
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error {
e.h = h.fixHash(v)
e.payload.val = val
e.payload.memoIdx = memoIdx
h.size++
if h.needUpsize() {
h.upsize(h.cap * uint64(loadFactor) * 2)
}
return nil
}
// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) {
for _, e := range h.entries {
if e.Valid() {
visit(&e)
}
}
}
// Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Int32MemoTable struct {
tbl *Int32HashTable
nullIdx int32
}
// NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewInt32MemoTable(num int64) *Int32MemoTable {
return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound}
}
func (Int32MemoTable) TypeTraits() TypeTraits {
return arrow.Int32Traits
}
// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Int32MemoTable) Reset() {
s.tbl.Reset(32)
s.nullIdx = KeyNotFound
}
// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Int32MemoTable) Size() int {
sz := int(s.tbl.size)
if _, ok := s.GetNull(); ok {
sz++
}
return sz
}
// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Int32MemoTable) GetNull() (int, bool) {
return int(s.nullIdx), s.nullIdx != KeyNotFound
}
// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) {
idx, found = s.GetNull()
if !found {
idx = s.Size()
s.nullIdx = int32(idx)
}
return
}
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Int32MemoTable) CopyValues(out interface{}) {
s.CopyValuesSubset(0, out)
}
// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) {
s.tbl.CopyValuesSubset(start, out.([]int32))
}
func (s *Int32MemoTable) WriteOut(out []byte) {
s.tbl.CopyValues(arrow.Int32Traits.CastFromBytes(out))
}
func (s *Int32MemoTable) WriteOutSubset(start int, out []byte) {
s.tbl.CopyValuesSubset(start, arrow.Int32Traits.CastFromBytes(out))
}
func (s *Int32MemoTable) WriteOutLE(out []byte) {
s.tbl.WriteOut(out)
}
func (s *Int32MemoTable) WriteOutSubsetLE(start int, out []byte) {
s.tbl.WriteOutSubset(start, out)
}
func (s *Int32MemoTable) Exists(val int32) bool {
_, ok := s.Get(val)
return ok
}
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Int32MemoTable) Get(val interface{}) (int, bool) {
h := hashInt(uint64(val.(int32)), 0)
if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok {
return int(e.payload.memoIdx), ok
}
return KeyNotFound, false
}
// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
return s.InsertOrGet(val.(int32))
}
func (s *Int32MemoTable) InsertOrGet(val int32) (idx int, found bool, err error) {
h := hashInt(uint64(val), 0)
e, ok := s.tbl.Lookup(h, func(v int32) bool {
return val == v
})
if ok {
idx = int(e.payload.memoIdx)
found = true
} else {
idx = s.Size()
s.tbl.Insert(e, h, val, int32(idx))
}
return
}
// GetOrInsertBytes is unimplemented
func (s *Int32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
panic("unimplemented")
}
type payloadInt64 struct {
val int64
memoIdx int32
}
type entryInt64 struct {
h uint64
payload payloadInt64
}
func (e entryInt64) Valid() bool { return e.h != sentinel }
// Int64HashTable is a hashtable specifically for int64 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Int64HashTable struct {
cap uint64
capMask uint64
size uint64
entries []entryInt64
}
// NewInt64HashTable returns a new hash table for int64 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewInt64HashTable(cap uint64) *Int64HashTable {
initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
ret.entries = make([]entryInt64, initCap)
return ret
}
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Int64HashTable) Reset(cap uint64) {
h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
h.capMask = h.cap - 1
h.size = 0
h.entries = make([]entryInt64, h.cap)
}
// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Int64HashTable) CopyValues(out []int64) {
h.CopyValuesSubset(0, out)
}
// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) {
h.VisitEntries(func(e *entryInt64) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
out[idx] = e.payload.val
}
})
}
func (h *Int64HashTable) WriteOut(out []byte) {
h.WriteOutSubset(0, out)
}
func (h *Int64HashTable) WriteOutSubset(start int, out []byte) {
data := arrow.Int64Traits.CastFromBytes(out)
h.VisitEntries(func(e *entryInt64) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
data[idx] = utils.ToLEInt64(e.payload.val)
}
})
}
func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
func (Int64HashTable) fixHash(v uint64) uint64 {
if v == sentinel {
return 42
}
return v
}
// Lookup retrieves the entry for a given hash value assuming it's payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) {
idx, ok := h.lookup(v, h.capMask, cmp)
return &h.entries[idx], ok
}
func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) {
const perturbShift uint8 = 5
var (
idx uint64
perturb uint64
e *entryInt64
)
v = h.fixHash(v)
idx = v & szMask
perturb = (v >> uint64(perturbShift)) + 1
for {
e = &h.entries[idx]
if e.h == v && cmp(e.payload.val) {
return idx, true
}
if e.h == sentinel {
return idx, false
}
// perturbation logic inspired from CPython's set/dict object
// the goal is that all 64 bits of unmasked hash value eventually
// participate int he probing sequence, to minimize clustering
idx = (idx + perturb) & szMask
perturb = (perturb >> uint64(perturbShift)) + 1
}
}
func (h *Int64HashTable) upsize(newcap uint64) error {
newMask := newcap - 1
oldEntries := h.entries
h.entries = make([]entryInt64, newcap)
for _, e := range oldEntries {
if e.Valid() {
idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false })
h.entries[idx] = e
}
}
h.cap = newcap
h.capMask = newMask
return nil
}
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error {
e.h = h.fixHash(v)
e.payload.val = val
e.payload.memoIdx = memoIdx
h.size++
if h.needUpsize() {
h.upsize(h.cap * uint64(loadFactor) * 2)
}
return nil
}
// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) {
for _, e := range h.entries {
if e.Valid() {
visit(&e)
}
}
}
// Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Int64MemoTable struct {
tbl *Int64HashTable
nullIdx int32
}
// NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewInt64MemoTable(num int64) *Int64MemoTable {
return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound}
}
func (Int64MemoTable) TypeTraits() TypeTraits {
return arrow.Int64Traits
}
// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Int64MemoTable) Reset() {
s.tbl.Reset(32)
s.nullIdx = KeyNotFound
}
// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Int64MemoTable) Size() int {
sz := int(s.tbl.size)
if _, ok := s.GetNull(); ok {
sz++
}
return sz
}
// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Int64MemoTable) GetNull() (int, bool) {
return int(s.nullIdx), s.nullIdx != KeyNotFound
}
// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) {
idx, found = s.GetNull()
if !found {
idx = s.Size()
s.nullIdx = int32(idx)
}
return
}
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Int64MemoTable) CopyValues(out interface{}) {
s.CopyValuesSubset(0, out)
}
// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) {
s.tbl.CopyValuesSubset(start, out.([]int64))
}
func (s *Int64MemoTable) WriteOut(out []byte) {
s.tbl.CopyValues(arrow.Int64Traits.CastFromBytes(out))
}
func (s *Int64MemoTable) WriteOutSubset(start int, out []byte) {
s.tbl.CopyValuesSubset(start, arrow.Int64Traits.CastFromBytes(out))
}
func (s *Int64MemoTable) WriteOutLE(out []byte) {
s.tbl.WriteOut(out)
}
func (s *Int64MemoTable) WriteOutSubsetLE(start int, out []byte) {
s.tbl.WriteOutSubset(start, out)
}
func (s *Int64MemoTable) Exists(val int64) bool {
_, ok := s.Get(val)
return ok
}
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Int64MemoTable) Get(val interface{}) (int, bool) {
h := hashInt(uint64(val.(int64)), 0)
if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok {
return int(e.payload.memoIdx), ok
}
return KeyNotFound, false
}
// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
return s.InsertOrGet(val.(int64))
}
func (s *Int64MemoTable) InsertOrGet(val int64) (idx int, found bool, err error) {
h := hashInt(uint64(val), 0)
e, ok := s.tbl.Lookup(h, func(v int64) bool {
return val == v
})
if ok {
idx = int(e.payload.memoIdx)
found = true
} else {
idx = s.Size()
s.tbl.Insert(e, h, val, int32(idx))
}
return
}
// GetOrInsertBytes is unimplemented
func (s *Int64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
panic("unimplemented")
}
type payloadUint32 struct {
val uint32
memoIdx int32
}
type entryUint32 struct {
h uint64
payload payloadUint32
}
func (e entryUint32) Valid() bool { return e.h != sentinel }
// Uint32HashTable is a hashtable specifically for uint32 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Uint32HashTable struct {
cap uint64
capMask uint64
size uint64
entries []entryUint32
}
// NewUint32HashTable returns a new hash table for uint32 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewUint32HashTable(cap uint64) *Uint32HashTable {
initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
ret := &Uint32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
ret.entries = make([]entryUint32, initCap)
return ret
}
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Uint32HashTable) Reset(cap uint64) {
h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
h.capMask = h.cap - 1
h.size = 0
h.entries = make([]entryUint32, h.cap)
}
// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Uint32HashTable) CopyValues(out []uint32) {
h.CopyValuesSubset(0, out)
}
// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Uint32HashTable) CopyValuesSubset(start int, out []uint32) {
h.VisitEntries(func(e *entryUint32) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
out[idx] = e.payload.val
}
})
}
func (h *Uint32HashTable) WriteOut(out []byte) {
h.WriteOutSubset(0, out)
}
func (h *Uint32HashTable) WriteOutSubset(start int, out []byte) {
data := arrow.Uint32Traits.CastFromBytes(out)
h.VisitEntries(func(e *entryUint32) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
data[idx] = utils.ToLEUint32(e.payload.val)
}
})
}
func (h *Uint32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
func (Uint32HashTable) fixHash(v uint64) uint64 {
if v == sentinel {
return 42
}
return v
}
// Lookup retrieves the entry for a given hash value assuming it's payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Uint32HashTable) Lookup(v uint64, cmp func(uint32) bool) (*entryUint32, bool) {
idx, ok := h.lookup(v, h.capMask, cmp)
return &h.entries[idx], ok
}
func (h *Uint32HashTable) lookup(v uint64, szMask uint64, cmp func(uint32) bool) (uint64, bool) {
const perturbShift uint8 = 5
var (
idx uint64
perturb uint64
e *entryUint32
)
v = h.fixHash(v)
idx = v & szMask
perturb = (v >> uint64(perturbShift)) + 1
for {
e = &h.entries[idx]
if e.h == v && cmp(e.payload.val) {
return idx, true
}
if e.h == sentinel {
return idx, false
}
// perturbation logic inspired from CPython's set/dict object
// the goal is that all 64 bits of unmasked hash value eventually
// participate int he probing sequence, to minimize clustering
idx = (idx + perturb) & szMask
perturb = (perturb >> uint64(perturbShift)) + 1
}
}
func (h *Uint32HashTable) upsize(newcap uint64) error {
newMask := newcap - 1
oldEntries := h.entries
h.entries = make([]entryUint32, newcap)
for _, e := range oldEntries {
if e.Valid() {
idx, _ := h.lookup(e.h, newMask, func(uint32) bool { return false })
h.entries[idx] = e
}
}
h.cap = newcap
h.capMask = newMask
return nil
}
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Uint32HashTable) Insert(e *entryUint32, v uint64, val uint32, memoIdx int32) error {
e.h = h.fixHash(v)
e.payload.val = val
e.payload.memoIdx = memoIdx
h.size++
if h.needUpsize() {
h.upsize(h.cap * uint64(loadFactor) * 2)
}
return nil
}
// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Uint32HashTable) VisitEntries(visit func(*entryUint32)) {
for _, e := range h.entries {
if e.Valid() {
visit(&e)
}
}
}
// Uint32MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Uint32MemoTable struct {
tbl *Uint32HashTable
nullIdx int32
}
// NewUint32MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewUint32MemoTable(num int64) *Uint32MemoTable {
return &Uint32MemoTable{tbl: NewUint32HashTable(uint64(num)), nullIdx: KeyNotFound}
}
func (Uint32MemoTable) TypeTraits() TypeTraits {
return arrow.Uint32Traits
}
// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Uint32MemoTable) Reset() {
s.tbl.Reset(32)
s.nullIdx = KeyNotFound
}
// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Uint32MemoTable) Size() int {
sz := int(s.tbl.size)
if _, ok := s.GetNull(); ok {
sz++
}
return sz
}
// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Uint32MemoTable) GetNull() (int, bool) {
return int(s.nullIdx), s.nullIdx != KeyNotFound
}
// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Uint32MemoTable) GetOrInsertNull() (idx int, found bool) {
idx, found = s.GetNull()
if !found {
idx = s.Size()
s.nullIdx = int32(idx)
}
return
}
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Uint32MemoTable) CopyValues(out interface{}) {
s.CopyValuesSubset(0, out)
}
// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Uint32MemoTable) CopyValuesSubset(start int, out interface{}) {
s.tbl.CopyValuesSubset(start, out.([]uint32))
}
func (s *Uint32MemoTable) WriteOut(out []byte) {
s.tbl.CopyValues(arrow.Uint32Traits.CastFromBytes(out))
}
func (s *Uint32MemoTable) WriteOutSubset(start int, out []byte) {
s.tbl.CopyValuesSubset(start, arrow.Uint32Traits.CastFromBytes(out))
}
func (s *Uint32MemoTable) WriteOutLE(out []byte) {
s.tbl.WriteOut(out)
}
func (s *Uint32MemoTable) WriteOutSubsetLE(start int, out []byte) {
s.tbl.WriteOutSubset(start, out)
}
func (s *Uint32MemoTable) Exists(val uint32) bool {
_, ok := s.Get(val)
return ok
}
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Uint32MemoTable) Get(val interface{}) (int, bool) {
h := hashInt(uint64(val.(uint32)), 0)
if e, ok := s.tbl.Lookup(h, func(v uint32) bool { return val.(uint32) == v }); ok {
return int(e.payload.memoIdx), ok
}
return KeyNotFound, false
}
// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Uint32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
return s.InsertOrGet(val.(uint32))
}
func (s *Uint32MemoTable) InsertOrGet(val uint32) (idx int, found bool, err error) {
h := hashInt(uint64(val), 0)
e, ok := s.tbl.Lookup(h, func(v uint32) bool {
return val == v
})
if ok {
idx = int(e.payload.memoIdx)
found = true
} else {
idx = s.Size()
s.tbl.Insert(e, h, val, int32(idx))
}
return
}
// GetOrInsertBytes is unimplemented
func (s *Uint32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
panic("unimplemented")
}
type payloadUint64 struct {
val uint64
memoIdx int32
}
type entryUint64 struct {
h uint64
payload payloadUint64
}
func (e entryUint64) Valid() bool { return e.h != sentinel }
// Uint64HashTable is a hashtable specifically for uint64 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Uint64HashTable struct {
cap uint64
capMask uint64
size uint64
entries []entryUint64
}
// NewUint64HashTable returns a new hash table for uint64 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewUint64HashTable(cap uint64) *Uint64HashTable {
initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
ret := &Uint64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
ret.entries = make([]entryUint64, initCap)
return ret
}
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Uint64HashTable) Reset(cap uint64) {
h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
h.capMask = h.cap - 1
h.size = 0
h.entries = make([]entryUint64, h.cap)
}
// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Uint64HashTable) CopyValues(out []uint64) {
h.CopyValuesSubset(0, out)
}
// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Uint64HashTable) CopyValuesSubset(start int, out []uint64) {
h.VisitEntries(func(e *entryUint64) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
out[idx] = e.payload.val
}
})
}
func (h *Uint64HashTable) WriteOut(out []byte) {
h.WriteOutSubset(0, out)
}
func (h *Uint64HashTable) WriteOutSubset(start int, out []byte) {
data := arrow.Uint64Traits.CastFromBytes(out)
h.VisitEntries(func(e *entryUint64) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
data[idx] = utils.ToLEUint64(e.payload.val)
}
})
}
func (h *Uint64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
func (Uint64HashTable) fixHash(v uint64) uint64 {
if v == sentinel {
return 42
}
return v
}
// Lookup retrieves the entry for a given hash value assuming it's payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Uint64HashTable) Lookup(v uint64, cmp func(uint64) bool) (*entryUint64, bool) {
idx, ok := h.lookup(v, h.capMask, cmp)
return &h.entries[idx], ok
}
func (h *Uint64HashTable) lookup(v uint64, szMask uint64, cmp func(uint64) bool) (uint64, bool) {
const perturbShift uint8 = 5
var (
idx uint64
perturb uint64
e *entryUint64
)
v = h.fixHash(v)
idx = v & szMask
perturb = (v >> uint64(perturbShift)) + 1
for {
e = &h.entries[idx]
if e.h == v && cmp(e.payload.val) {
return idx, true
}
if e.h == sentinel {
return idx, false
}
// perturbation logic inspired from CPython's set/dict object
// the goal is that all 64 bits of unmasked hash value eventually
// participate int he probing sequence, to minimize clustering
idx = (idx + perturb) & szMask
perturb = (perturb >> uint64(perturbShift)) + 1
}
}
func (h *Uint64HashTable) upsize(newcap uint64) error {
newMask := newcap - 1
oldEntries := h.entries
h.entries = make([]entryUint64, newcap)
for _, e := range oldEntries {
if e.Valid() {
idx, _ := h.lookup(e.h, newMask, func(uint64) bool { return false })
h.entries[idx] = e
}
}
h.cap = newcap
h.capMask = newMask
return nil
}
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Uint64HashTable) Insert(e *entryUint64, v uint64, val uint64, memoIdx int32) error {
e.h = h.fixHash(v)
e.payload.val = val
e.payload.memoIdx = memoIdx
h.size++
if h.needUpsize() {
h.upsize(h.cap * uint64(loadFactor) * 2)
}
return nil
}
// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Uint64HashTable) VisitEntries(visit func(*entryUint64)) {
for _, e := range h.entries {
if e.Valid() {
visit(&e)
}
}
}
// Uint64MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Uint64MemoTable struct {
tbl *Uint64HashTable
nullIdx int32
}
// NewUint64MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewUint64MemoTable(num int64) *Uint64MemoTable {
return &Uint64MemoTable{tbl: NewUint64HashTable(uint64(num)), nullIdx: KeyNotFound}
}
func (Uint64MemoTable) TypeTraits() TypeTraits {
return arrow.Uint64Traits
}
// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Uint64MemoTable) Reset() {
s.tbl.Reset(32)
s.nullIdx = KeyNotFound
}
// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Uint64MemoTable) Size() int {
sz := int(s.tbl.size)
if _, ok := s.GetNull(); ok {
sz++
}
return sz
}
// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Uint64MemoTable) GetNull() (int, bool) {
return int(s.nullIdx), s.nullIdx != KeyNotFound
}
// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Uint64MemoTable) GetOrInsertNull() (idx int, found bool) {
idx, found = s.GetNull()
if !found {
idx = s.Size()
s.nullIdx = int32(idx)
}
return
}
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Uint64MemoTable) CopyValues(out interface{}) {
s.CopyValuesSubset(0, out)
}
// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Uint64MemoTable) CopyValuesSubset(start int, out interface{}) {
s.tbl.CopyValuesSubset(start, out.([]uint64))
}
func (s *Uint64MemoTable) WriteOut(out []byte) {
s.tbl.CopyValues(arrow.Uint64Traits.CastFromBytes(out))
}
func (s *Uint64MemoTable) WriteOutSubset(start int, out []byte) {
s.tbl.CopyValuesSubset(start, arrow.Uint64Traits.CastFromBytes(out))
}
func (s *Uint64MemoTable) WriteOutLE(out []byte) {
s.tbl.WriteOut(out)
}
func (s *Uint64MemoTable) WriteOutSubsetLE(start int, out []byte) {
s.tbl.WriteOutSubset(start, out)
}
func (s *Uint64MemoTable) Exists(val uint64) bool {
_, ok := s.Get(val)
return ok
}
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Uint64MemoTable) Get(val interface{}) (int, bool) {
h := hashInt(uint64(val.(uint64)), 0)
if e, ok := s.tbl.Lookup(h, func(v uint64) bool { return val.(uint64) == v }); ok {
return int(e.payload.memoIdx), ok
}
return KeyNotFound, false
}
// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Uint64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
return s.InsertOrGet(val.(uint64))
}
func (s *Uint64MemoTable) InsertOrGet(val uint64) (idx int, found bool, err error) {
h := hashInt(uint64(val), 0)
e, ok := s.tbl.Lookup(h, func(v uint64) bool {
return val == v
})
if ok {
idx = int(e.payload.memoIdx)
found = true
} else {
idx = s.Size()
s.tbl.Insert(e, h, val, int32(idx))
}
return
}
// GetOrInsertBytes is unimplemented
func (s *Uint64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
panic("unimplemented")
}
type payloadFloat32 struct {
val float32
memoIdx int32
}
type entryFloat32 struct {
h uint64
payload payloadFloat32
}
func (e entryFloat32) Valid() bool { return e.h != sentinel }
// Float32HashTable is a hashtable specifically for float32 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Float32HashTable struct {
cap uint64
capMask uint64
size uint64
entries []entryFloat32
}
// NewFloat32HashTable returns a new hash table for float32 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewFloat32HashTable(cap uint64) *Float32HashTable {
initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
ret.entries = make([]entryFloat32, initCap)
return ret
}
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Float32HashTable) Reset(cap uint64) {
h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
h.capMask = h.cap - 1
h.size = 0
h.entries = make([]entryFloat32, h.cap)
}
// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Float32HashTable) CopyValues(out []float32) {
h.CopyValuesSubset(0, out)
}
// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) {
h.VisitEntries(func(e *entryFloat32) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
out[idx] = e.payload.val
}
})
}
func (h *Float32HashTable) WriteOut(out []byte) {
h.WriteOutSubset(0, out)
}
func (h *Float32HashTable) WriteOutSubset(start int, out []byte) {
data := arrow.Float32Traits.CastFromBytes(out)
h.VisitEntries(func(e *entryFloat32) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
data[idx] = utils.ToLEFloat32(e.payload.val)
}
})
}
func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
func (Float32HashTable) fixHash(v uint64) uint64 {
if v == sentinel {
return 42
}
return v
}
// Lookup retrieves the entry for a given hash value assuming it's payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) {
idx, ok := h.lookup(v, h.capMask, cmp)
return &h.entries[idx], ok
}
func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) {
const perturbShift uint8 = 5
var (
idx uint64
perturb uint64
e *entryFloat32
)
v = h.fixHash(v)
idx = v & szMask
perturb = (v >> uint64(perturbShift)) + 1
for {
e = &h.entries[idx]
if e.h == v && cmp(e.payload.val) {
return idx, true
}
if e.h == sentinel {
return idx, false
}
// perturbation logic inspired from CPython's set/dict object
// the goal is that all 64 bits of unmasked hash value eventually
// participate int he probing sequence, to minimize clustering
idx = (idx + perturb) & szMask
perturb = (perturb >> uint64(perturbShift)) + 1
}
}
func (h *Float32HashTable) upsize(newcap uint64) error {
newMask := newcap - 1
oldEntries := h.entries
h.entries = make([]entryFloat32, newcap)
for _, e := range oldEntries {
if e.Valid() {
idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false })
h.entries[idx] = e
}
}
h.cap = newcap
h.capMask = newMask
return nil
}
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error {
e.h = h.fixHash(v)
e.payload.val = val
e.payload.memoIdx = memoIdx
h.size++
if h.needUpsize() {
h.upsize(h.cap * uint64(loadFactor) * 2)
}
return nil
}
// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) {
for _, e := range h.entries {
if e.Valid() {
visit(&e)
}
}
}
// Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Float32MemoTable struct {
tbl *Float32HashTable
nullIdx int32
}
// NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewFloat32MemoTable(num int64) *Float32MemoTable {
return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound}
}
func (Float32MemoTable) TypeTraits() TypeTraits {
return arrow.Float32Traits
}
// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Float32MemoTable) Reset() {
s.tbl.Reset(32)
s.nullIdx = KeyNotFound
}
// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Float32MemoTable) Size() int {
sz := int(s.tbl.size)
if _, ok := s.GetNull(); ok {
sz++
}
return sz
}
// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Float32MemoTable) GetNull() (int, bool) {
return int(s.nullIdx), s.nullIdx != KeyNotFound
}
// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) {
idx, found = s.GetNull()
if !found {
idx = s.Size()
s.nullIdx = int32(idx)
}
return
}
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Float32MemoTable) CopyValues(out interface{}) {
s.CopyValuesSubset(0, out)
}
// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) {
s.tbl.CopyValuesSubset(start, out.([]float32))
}
func (s *Float32MemoTable) WriteOut(out []byte) {
s.tbl.CopyValues(arrow.Float32Traits.CastFromBytes(out))
}
func (s *Float32MemoTable) WriteOutSubset(start int, out []byte) {
s.tbl.CopyValuesSubset(start, arrow.Float32Traits.CastFromBytes(out))
}
func (s *Float32MemoTable) WriteOutLE(out []byte) {
s.tbl.WriteOut(out)
}
func (s *Float32MemoTable) WriteOutSubsetLE(start int, out []byte) {
s.tbl.WriteOutSubset(start, out)
}
func (s *Float32MemoTable) Exists(val float32) bool {
_, ok := s.Get(val)
return ok
}
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Float32MemoTable) Get(val interface{}) (int, bool) {
var cmp func(float32) bool
if math.IsNaN(float64(val.(float32))) {
cmp = isNan32Cmp
// use consistent internal bit pattern for NaN regardless of the pattern
// that is passed to us. NaN is NaN is NaN
val = float32(math.NaN())
} else {
cmp = func(v float32) bool { return val.(float32) == v }
}
h := hashFloat32(val.(float32), 0)
if e, ok := s.tbl.Lookup(h, cmp); ok {
return int(e.payload.memoIdx), ok
}
return KeyNotFound, false
}
// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
return s.InsertOrGet(val.(float32))
}
func (s *Float32MemoTable) InsertOrGet(val float32) (idx int, found bool, err error) {
var cmp func(float32) bool
if math.IsNaN(float64(val)) {
cmp = isNan32Cmp
// use consistent internal bit pattern for NaN regardless of the pattern
// that is passed to us. NaN is NaN is NaN
val = float32(math.NaN())
} else {
cmp = func(v float32) bool { return val == v }
}
h := hashFloat32(val, 0)
e, ok := s.tbl.Lookup(h, cmp)
if ok {
idx = int(e.payload.memoIdx)
found = true
} else {
idx = s.Size()
s.tbl.Insert(e, h, val, int32(idx))
}
return
}
// GetOrInsertBytes is unimplemented
func (s *Float32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
panic("unimplemented")
}
type payloadFloat64 struct {
val float64
memoIdx int32
}
type entryFloat64 struct {
h uint64
payload payloadFloat64
}
func (e entryFloat64) Valid() bool { return e.h != sentinel }
// Float64HashTable is a hashtable specifically for float64 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Float64HashTable struct {
cap uint64
capMask uint64
size uint64
entries []entryFloat64
}
// NewFloat64HashTable returns a new hash table for float64 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewFloat64HashTable(cap uint64) *Float64HashTable {
initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
ret.entries = make([]entryFloat64, initCap)
return ret
}
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Float64HashTable) Reset(cap uint64) {
h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
h.capMask = h.cap - 1
h.size = 0
h.entries = make([]entryFloat64, h.cap)
}
// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Float64HashTable) CopyValues(out []float64) {
h.CopyValuesSubset(0, out)
}
// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) {
h.VisitEntries(func(e *entryFloat64) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
out[idx] = e.payload.val
}
})
}
func (h *Float64HashTable) WriteOut(out []byte) {
h.WriteOutSubset(0, out)
}
func (h *Float64HashTable) WriteOutSubset(start int, out []byte) {
data := arrow.Float64Traits.CastFromBytes(out)
h.VisitEntries(func(e *entryFloat64) {
idx := e.payload.memoIdx - int32(start)
if idx >= 0 {
data[idx] = utils.ToLEFloat64(e.payload.val)
}
})
}
func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
func (Float64HashTable) fixHash(v uint64) uint64 {
if v == sentinel {
return 42
}
return v
}
// Lookup retrieves the entry for a given hash value assuming it's payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) {
idx, ok := h.lookup(v, h.capMask, cmp)
return &h.entries[idx], ok
}
func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) {
const perturbShift uint8 = 5
var (
idx uint64
perturb uint64
e *entryFloat64
)
v = h.fixHash(v)
idx = v & szMask
perturb = (v >> uint64(perturbShift)) + 1
for {
e = &h.entries[idx]
if e.h == v && cmp(e.payload.val) {
return idx, true
}
if e.h == sentinel {
return idx, false
}
// perturbation logic inspired from CPython's set/dict object
// the goal is that all 64 bits of unmasked hash value eventually
// participate int he probing sequence, to minimize clustering
idx = (idx + perturb) & szMask
perturb = (perturb >> uint64(perturbShift)) + 1
}
}
func (h *Float64HashTable) upsize(newcap uint64) error {
newMask := newcap - 1
oldEntries := h.entries
h.entries = make([]entryFloat64, newcap)
for _, e := range oldEntries {
if e.Valid() {
idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false })
h.entries[idx] = e
}
}
h.cap = newcap
h.capMask = newMask
return nil
}
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error {
e.h = h.fixHash(v)
e.payload.val = val
e.payload.memoIdx = memoIdx
h.size++
if h.needUpsize() {
h.upsize(h.cap * uint64(loadFactor) * 2)
}
return nil
}
// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) {
for _, e := range h.entries {
if e.Valid() {
visit(&e)
}
}
}
// Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Float64MemoTable struct {
tbl *Float64HashTable
nullIdx int32
}
// NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewFloat64MemoTable(num int64) *Float64MemoTable {
return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound}
}
func (Float64MemoTable) TypeTraits() TypeTraits {
return arrow.Float64Traits
}
// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Float64MemoTable) Reset() {
s.tbl.Reset(32)
s.nullIdx = KeyNotFound
}
// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Float64MemoTable) Size() int {
sz := int(s.tbl.size)
if _, ok := s.GetNull(); ok {
sz++
}
return sz
}
// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Float64MemoTable) GetNull() (int, bool) {
return int(s.nullIdx), s.nullIdx != KeyNotFound
}
// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) {
idx, found = s.GetNull()
if !found {
idx = s.Size()
s.nullIdx = int32(idx)
}
return
}
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Float64MemoTable) CopyValues(out interface{}) {
s.CopyValuesSubset(0, out)
}
// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) {
s.tbl.CopyValuesSubset(start, out.([]float64))
}
func (s *Float64MemoTable) WriteOut(out []byte) {
s.tbl.CopyValues(arrow.Float64Traits.CastFromBytes(out))
}
func (s *Float64MemoTable) WriteOutSubset(start int, out []byte) {
s.tbl.CopyValuesSubset(start, arrow.Float64Traits.CastFromBytes(out))
}
func (s *Float64MemoTable) WriteOutLE(out []byte) {
s.tbl.WriteOut(out)
}
func (s *Float64MemoTable) WriteOutSubsetLE(start int, out []byte) {
s.tbl.WriteOutSubset(start, out)
}
func (s *Float64MemoTable) Exists(val float64) bool {
_, ok := s.Get(val)
return ok
}
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Float64MemoTable) Get(val interface{}) (int, bool) {
var cmp func(float64) bool
if math.IsNaN(val.(float64)) {
cmp = math.IsNaN
// use consistent internal bit pattern for NaN regardless of the pattern
// that is passed to us. NaN is NaN is NaN
val = math.NaN()
} else {
cmp = func(v float64) bool { return val.(float64) == v }
}
h := hashFloat64(val.(float64), 0)
if e, ok := s.tbl.Lookup(h, cmp); ok {
return int(e.payload.memoIdx), ok
}
return KeyNotFound, false
}
// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
return s.InsertOrGet(val.(float64))
}
func (s *Float64MemoTable) InsertOrGet(val float64) (idx int, found bool, err error) {
var cmp func(float64) bool
if math.IsNaN(val) {
cmp = math.IsNaN
// use consistent internal bit pattern for NaN regardless of the pattern
// that is passed to us. NaN is NaN is NaN
val = math.NaN()
} else {
cmp = func(v float64) bool { return val == v }
}
h := hashFloat64(val, 0)
e, ok := s.tbl.Lookup(h, cmp)
if ok {
idx = int(e.payload.memoIdx)
found = true
} else {
idx = s.Size()
s.tbl.Insert(e, h, val, int32(idx))
}
return
}
// GetOrInsertBytes is unimplemented
func (s *Float64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
panic("unimplemented")
}