arrow/array/list.go (1,278 lines of code) (raw):
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package array
import (
"bytes"
"fmt"
"strings"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/bitutil"
"github.com/apache/arrow-go/v18/arrow/internal/debug"
"github.com/apache/arrow-go/v18/arrow/memory"
"github.com/apache/arrow-go/v18/internal/json"
)
type ListLike interface {
arrow.Array
ListValues() arrow.Array
ValueOffsets(i int) (start, end int64)
}
type VarLenListLike interface {
ListLike
}
// List represents an immutable sequence of array values.
type List struct {
array
values arrow.Array
offsets []int32
}
var _ ListLike = (*List)(nil)
// NewListData returns a new List array value, from data.
func NewListData(data arrow.ArrayData) *List {
a := &List{}
a.refCount.Add(1)
a.setData(data.(*Data))
return a
}
func (a *List) ListValues() arrow.Array { return a.values }
func (a *List) ValueStr(i int) string {
if !a.IsValid(i) {
return NullValueStr
}
return string(a.GetOneForMarshal(i).(json.RawMessage))
}
func (a *List) String() string {
o := new(strings.Builder)
o.WriteString("[")
for i := 0; i < a.Len(); i++ {
if i > 0 {
o.WriteString(" ")
}
if a.IsNull(i) {
o.WriteString(NullValueStr)
continue
}
sub := a.newListValue(i)
fmt.Fprintf(o, "%v", sub)
sub.Release()
}
o.WriteString("]")
return o.String()
}
func (a *List) newListValue(i int) arrow.Array {
beg, end := a.ValueOffsets(i)
return NewSlice(a.values, beg, end)
}
func (a *List) setData(data *Data) {
debug.Assert(len(data.buffers) >= 2, "list data should have 2 buffers")
a.array.setData(data)
vals := data.buffers[1]
if vals != nil {
a.offsets = arrow.Int32Traits.CastFromBytes(vals.Bytes())
}
a.values = MakeFromData(data.childData[0])
}
func (a *List) GetOneForMarshal(i int) interface{} {
if a.IsNull(i) {
return nil
}
slice := a.newListValue(i)
defer slice.Release()
v, err := json.Marshal(slice)
if err != nil {
panic(err)
}
return json.RawMessage(v)
}
func (a *List) MarshalJSON() ([]byte, error) {
var buf bytes.Buffer
enc := json.NewEncoder(&buf)
buf.WriteByte('[')
for i := 0; i < a.Len(); i++ {
if i != 0 {
buf.WriteByte(',')
}
if err := enc.Encode(a.GetOneForMarshal(i)); err != nil {
return nil, err
}
}
buf.WriteByte(']')
return buf.Bytes(), nil
}
func arrayEqualList(left, right *List) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
continue
}
o := func() bool {
l := left.newListValue(i)
defer l.Release()
r := right.newListValue(i)
defer r.Release()
return Equal(l, r)
}()
if !o {
return false
}
}
return true
}
// Len returns the number of elements in the array.
func (a *List) Len() int { return a.array.Len() }
func (a *List) Offsets() []int32 { return a.offsets }
func (a *List) Retain() {
a.array.Retain()
a.values.Retain()
}
func (a *List) Release() {
a.array.Release()
a.values.Release()
}
func (a *List) ValueOffsets(i int) (start, end int64) {
debug.Assert(i >= 0 && i < a.array.data.length, "index out of range")
j := i + a.array.data.offset
start, end = int64(a.offsets[j]), int64(a.offsets[j+1])
return
}
// LargeList represents an immutable sequence of array values.
type LargeList struct {
array
values arrow.Array
offsets []int64
}
var _ ListLike = (*LargeList)(nil)
// NewLargeListData returns a new LargeList array value, from data.
func NewLargeListData(data arrow.ArrayData) *LargeList {
a := new(LargeList)
a.refCount.Add(1)
a.setData(data.(*Data))
return a
}
func (a *LargeList) ListValues() arrow.Array { return a.values }
func (a *LargeList) ValueStr(i int) string {
if !a.IsValid(i) {
return NullValueStr
}
return string(a.GetOneForMarshal(i).(json.RawMessage))
}
func (a *LargeList) String() string {
o := new(strings.Builder)
o.WriteString("[")
for i := 0; i < a.Len(); i++ {
if i > 0 {
o.WriteString(" ")
}
if a.IsNull(i) {
o.WriteString(NullValueStr)
continue
}
sub := a.newListValue(i)
fmt.Fprintf(o, "%v", sub)
sub.Release()
}
o.WriteString("]")
return o.String()
}
func (a *LargeList) newListValue(i int) arrow.Array {
beg, end := a.ValueOffsets(i)
return NewSlice(a.values, beg, end)
}
func (a *LargeList) setData(data *Data) {
debug.Assert(len(data.buffers) >= 2, "list data should have 2 buffers")
a.array.setData(data)
vals := data.buffers[1]
if vals != nil {
a.offsets = arrow.Int64Traits.CastFromBytes(vals.Bytes())
}
a.values = MakeFromData(data.childData[0])
}
func (a *LargeList) GetOneForMarshal(i int) interface{} {
if a.IsNull(i) {
return nil
}
slice := a.newListValue(i)
defer slice.Release()
v, err := json.Marshal(slice)
if err != nil {
panic(err)
}
return json.RawMessage(v)
}
func (a *LargeList) MarshalJSON() ([]byte, error) {
var buf bytes.Buffer
enc := json.NewEncoder(&buf)
buf.WriteByte('[')
for i := 0; i < a.Len(); i++ {
if i != 0 {
buf.WriteByte(',')
}
if err := enc.Encode(a.GetOneForMarshal(i)); err != nil {
return nil, err
}
}
buf.WriteByte(']')
return buf.Bytes(), nil
}
func arrayEqualLargeList(left, right *LargeList) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
continue
}
o := func() bool {
l := left.newListValue(i)
defer l.Release()
r := right.newListValue(i)
defer r.Release()
return Equal(l, r)
}()
if !o {
return false
}
}
return true
}
// Len returns the number of elements in the array.
func (a *LargeList) Len() int { return a.array.Len() }
func (a *LargeList) Offsets() []int64 { return a.offsets }
func (a *LargeList) ValueOffsets(i int) (start, end int64) {
debug.Assert(i >= 0 && i < a.array.data.length, "index out of range")
j := i + a.array.data.offset
start, end = a.offsets[j], a.offsets[j+1]
return
}
func (a *LargeList) Retain() {
a.array.Retain()
a.values.Retain()
}
func (a *LargeList) Release() {
a.array.Release()
a.values.Release()
}
type baseListBuilder struct {
builder
values Builder // value builder for the list's elements.
offsets Builder
// actual list type
dt arrow.DataType
appendOffsetVal func(int)
}
type ListLikeBuilder interface {
Builder
ValueBuilder() Builder
Append(bool)
}
type VarLenListLikeBuilder interface {
ListLikeBuilder
AppendWithSize(bool, int)
}
type ListBuilder struct {
baseListBuilder
}
type LargeListBuilder struct {
baseListBuilder
}
// NewListBuilder returns a builder, using the provided memory allocator.
// The created list builder will create a list whose elements will be of type etype.
func NewListBuilder(mem memory.Allocator, etype arrow.DataType) *ListBuilder {
offsetBldr := NewInt32Builder(mem)
lb := &ListBuilder{
baseListBuilder{
builder: builder{mem: mem},
values: NewBuilder(mem, etype),
offsets: offsetBldr,
dt: arrow.ListOf(etype),
appendOffsetVal: func(o int) { offsetBldr.Append(int32(o)) },
},
}
lb.refCount.Add(1)
return lb
}
// NewListBuilderWithField takes a field to use for the child rather than just
// a datatype to allow for more customization.
func NewListBuilderWithField(mem memory.Allocator, field arrow.Field) *ListBuilder {
offsetBldr := NewInt32Builder(mem)
lb := &ListBuilder{
baseListBuilder{
builder: builder{mem: mem},
values: NewBuilder(mem, field.Type),
offsets: offsetBldr,
dt: arrow.ListOfField(field),
appendOffsetVal: func(o int) { offsetBldr.Append(int32(o)) },
},
}
lb.refCount.Add(1)
return lb
}
func (b *baseListBuilder) Type() arrow.DataType {
switch dt := b.dt.(type) {
case *arrow.ListType:
f := dt.ElemField()
f.Type = b.values.Type()
return arrow.ListOfField(f)
case *arrow.LargeListType:
f := dt.ElemField()
f.Type = b.values.Type()
return arrow.LargeListOfField(f)
}
return nil
}
// NewLargeListBuilder returns a builder, using the provided memory allocator.
// The created list builder will create a list whose elements will be of type etype.
func NewLargeListBuilder(mem memory.Allocator, etype arrow.DataType) *LargeListBuilder {
offsetBldr := NewInt64Builder(mem)
llb := &LargeListBuilder{
baseListBuilder{
builder: builder{mem: mem},
values: NewBuilder(mem, etype),
offsets: offsetBldr,
dt: arrow.LargeListOf(etype),
appendOffsetVal: func(o int) { offsetBldr.Append(int64(o)) },
},
}
llb.refCount.Add(1)
return llb
}
// NewLargeListBuilderWithField takes a field rather than just an element type
// to allow for more customization of the final type of the LargeList Array
func NewLargeListBuilderWithField(mem memory.Allocator, field arrow.Field) *LargeListBuilder {
offsetBldr := NewInt64Builder(mem)
llb := &LargeListBuilder{
baseListBuilder{
builder: builder{mem: mem},
values: NewBuilder(mem, field.Type),
offsets: offsetBldr,
dt: arrow.LargeListOfField(field),
appendOffsetVal: func(o int) { offsetBldr.Append(int64(o)) },
},
}
llb.refCount.Add(1)
return llb
}
// Release decreases the reference count by 1.
// When the reference count goes to zero, the memory is freed.
func (b *baseListBuilder) Release() {
debug.Assert(b.refCount.Load() > 0, "too many releases")
if b.refCount.Add(-1) == 0 {
if b.nullBitmap != nil {
b.nullBitmap.Release()
b.nullBitmap = nil
}
b.values.Release()
b.offsets.Release()
}
}
func (b *baseListBuilder) appendNextOffset() {
b.appendOffsetVal(b.values.Len())
}
func (b *baseListBuilder) Append(v bool) {
b.Reserve(1)
b.unsafeAppendBoolToBitmap(v)
b.appendNextOffset()
}
func (b *baseListBuilder) AppendWithSize(v bool, _ int) {
b.Append(v)
}
func (b *baseListBuilder) AppendNull() {
b.Reserve(1)
b.unsafeAppendBoolToBitmap(false)
b.appendNextOffset()
}
func (b *baseListBuilder) AppendNulls(n int) {
for i := 0; i < n; i++ {
b.AppendNull()
}
}
func (b *baseListBuilder) AppendEmptyValue() {
b.Append(true)
}
func (b *baseListBuilder) AppendEmptyValues(n int) {
for i := 0; i < n; i++ {
b.AppendEmptyValue()
}
}
func (b *ListBuilder) AppendValues(offsets []int32, valid []bool) {
b.Reserve(len(valid))
b.offsets.(*Int32Builder).AppendValues(offsets, nil)
b.builder.unsafeAppendBoolsToBitmap(valid, len(valid))
}
func (b *LargeListBuilder) AppendValues(offsets []int64, valid []bool) {
b.Reserve(len(valid))
b.offsets.(*Int64Builder).AppendValues(offsets, nil)
b.builder.unsafeAppendBoolsToBitmap(valid, len(valid))
}
func (b *baseListBuilder) unsafeAppendBoolToBitmap(isValid bool) {
if isValid {
bitutil.SetBit(b.nullBitmap.Bytes(), b.length)
} else {
b.nulls++
}
b.length++
}
func (b *baseListBuilder) init(capacity int) {
b.builder.init(capacity)
b.offsets.init(capacity + 1)
}
// Reserve ensures there is enough space for appending n elements
// by checking the capacity and calling Resize if necessary.
func (b *baseListBuilder) Reserve(n int) {
b.builder.reserve(n, b.resizeHelper)
b.offsets.Reserve(n)
}
// Resize adjusts the space allocated by b to n elements. If n is greater than b.Cap(),
// additional memory will be allocated. If n is smaller, the allocated memory may reduced.
func (b *baseListBuilder) Resize(n int) {
b.resizeHelper(n)
b.offsets.Resize(n)
}
func (b *baseListBuilder) resizeHelper(n int) {
if n < minBuilderCapacity {
n = minBuilderCapacity
}
if b.capacity == 0 {
b.init(n)
} else {
b.builder.resize(n, b.builder.init)
}
}
func (b *baseListBuilder) ValueBuilder() Builder {
return b.values
}
// NewArray creates a List array from the memory buffers used by the builder and resets the ListBuilder
// so it can be used to build a new array.
func (b *ListBuilder) NewArray() arrow.Array {
return b.NewListArray()
}
// NewArray creates a LargeList array from the memory buffers used by the builder and resets the LargeListBuilder
// so it can be used to build a new array.
func (b *LargeListBuilder) NewArray() arrow.Array {
return b.NewLargeListArray()
}
// NewListArray creates a List array from the memory buffers used by the builder and resets the ListBuilder
// so it can be used to build a new array.
func (b *ListBuilder) NewListArray() (a *List) {
data := b.newData()
a = NewListData(data)
data.Release()
return
}
// NewLargeListArray creates a List array from the memory buffers used by the builder and resets the LargeListBuilder
// so it can be used to build a new array.
func (b *LargeListBuilder) NewLargeListArray() (a *LargeList) {
data := b.newData()
a = NewLargeListData(data)
data.Release()
return
}
func (b *baseListBuilder) newData() (data *Data) {
if b.offsets.Len() != b.length+1 {
b.appendNextOffset()
}
values := b.values.NewArray()
defer values.Release()
var offsets *memory.Buffer
if b.offsets != nil {
arr := b.offsets.NewArray()
defer arr.Release()
offsets = arr.Data().Buffers()[1]
}
data = NewData(
b.Type(), b.length,
[]*memory.Buffer{
b.nullBitmap,
offsets,
},
[]arrow.ArrayData{values.Data()},
b.nulls,
0,
)
b.reset()
return
}
func (b *baseListBuilder) AppendValueFromString(s string) error {
if s == NullValueStr {
b.AppendNull()
return nil
}
return b.UnmarshalOne(json.NewDecoder(strings.NewReader(s)))
}
func (b *baseListBuilder) UnmarshalOne(dec *json.Decoder) error {
t, err := dec.Token()
if err != nil {
return err
}
switch t {
case json.Delim('['):
b.Append(true)
if err := b.values.Unmarshal(dec); err != nil {
return err
}
// consume ']'
_, err := dec.Token()
return err
case nil:
b.AppendNull()
default:
return &json.UnmarshalTypeError{
Value: fmt.Sprint(t),
Struct: b.dt.String(),
}
}
return nil
}
func (b *baseListBuilder) Unmarshal(dec *json.Decoder) error {
for dec.More() {
if err := b.UnmarshalOne(dec); err != nil {
return err
}
}
return nil
}
func (b *baseListBuilder) UnmarshalJSON(data []byte) error {
dec := json.NewDecoder(bytes.NewReader(data))
t, err := dec.Token()
if err != nil {
return err
}
if delim, ok := t.(json.Delim); !ok || delim != '[' {
return fmt.Errorf("list builder must unpack from json array, found %s", delim)
}
return b.Unmarshal(dec)
}
// ListView represents an immutable sequence of array values defined by an
// offset into a child array and a length.
type ListView struct {
array
values arrow.Array
offsets []int32
sizes []int32
}
var _ VarLenListLike = (*ListView)(nil)
func NewListViewData(data arrow.ArrayData) *ListView {
a := &ListView{}
a.refCount.Add(1)
a.setData(data.(*Data))
return a
}
func (a *ListView) ListValues() arrow.Array { return a.values }
func (a *ListView) ValueStr(i int) string {
if !a.IsValid(i) {
return NullValueStr
}
return string(a.GetOneForMarshal(i).(json.RawMessage))
}
func (a *ListView) String() string {
o := new(strings.Builder)
o.WriteString("[")
for i := 0; i < a.Len(); i++ {
if i > 0 {
o.WriteString(" ")
}
if a.IsNull(i) {
o.WriteString(NullValueStr)
continue
}
sub := a.newListValue(i)
fmt.Fprintf(o, "%v", sub)
sub.Release()
}
o.WriteString("]")
return o.String()
}
func (a *ListView) newListValue(i int) arrow.Array {
beg, end := a.ValueOffsets(i)
return NewSlice(a.values, beg, end)
}
func (a *ListView) setData(data *Data) {
debug.Assert(len(data.buffers) >= 3, "list-view data should have 3 buffers")
a.array.setData(data)
offsets := data.buffers[1]
if offsets != nil {
a.offsets = arrow.Int32Traits.CastFromBytes(offsets.Bytes())
}
sizes := data.buffers[2]
if sizes != nil {
a.sizes = arrow.Int32Traits.CastFromBytes(sizes.Bytes())
}
a.values = MakeFromData(data.childData[0])
}
func (a *ListView) GetOneForMarshal(i int) interface{} {
if a.IsNull(i) {
return nil
}
slice := a.newListValue(i)
defer slice.Release()
v, err := json.Marshal(slice)
if err != nil {
panic(err)
}
return json.RawMessage(v)
}
func (a *ListView) MarshalJSON() ([]byte, error) {
var buf bytes.Buffer
enc := json.NewEncoder(&buf)
buf.WriteByte('[')
for i := 0; i < a.Len(); i++ {
if i != 0 {
buf.WriteByte(',')
}
if err := enc.Encode(a.GetOneForMarshal(i)); err != nil {
return nil, err
}
}
buf.WriteByte(']')
return buf.Bytes(), nil
}
func arrayEqualListView(left, right *ListView) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
continue
}
o := func() bool {
l := left.newListValue(i)
defer l.Release()
r := right.newListValue(i)
defer r.Release()
return Equal(l, r)
}()
if !o {
return false
}
}
return true
}
// Len returns the number of elements in the array.
func (a *ListView) Len() int { return a.array.Len() }
func (a *ListView) Offsets() []int32 { return a.offsets }
func (a *ListView) Sizes() []int32 { return a.sizes }
func (a *ListView) Retain() {
a.array.Retain()
a.values.Retain()
}
func (a *ListView) Release() {
a.array.Release()
a.values.Release()
}
func (a *ListView) ValueOffsets(i int) (start, end int64) {
debug.Assert(i >= 0 && i < a.array.data.length, "index out of range")
j := i + a.array.data.offset
size := int64(a.sizes[j])
// If size is 0, skip accessing offsets.
if size == 0 {
start, end = 0, 0
return
}
start = int64(a.offsets[j])
end = start + size
return
}
// LargeListView represents an immutable sequence of array values defined by an
// offset into a child array and a length.
type LargeListView struct {
array
values arrow.Array
offsets []int64
sizes []int64
}
var _ VarLenListLike = (*LargeListView)(nil)
// NewLargeListViewData returns a new LargeListView array value, from data.
func NewLargeListViewData(data arrow.ArrayData) *LargeListView {
a := new(LargeListView)
a.refCount.Add(1)
a.setData(data.(*Data))
return a
}
func (a *LargeListView) ListValues() arrow.Array { return a.values }
func (a *LargeListView) ValueStr(i int) string {
if !a.IsValid(i) {
return NullValueStr
}
return string(a.GetOneForMarshal(i).(json.RawMessage))
}
func (a *LargeListView) String() string {
o := new(strings.Builder)
o.WriteString("[")
for i := 0; i < a.Len(); i++ {
if i > 0 {
o.WriteString(" ")
}
if a.IsNull(i) {
o.WriteString(NullValueStr)
continue
}
sub := a.newListValue(i)
fmt.Fprintf(o, "%v", sub)
sub.Release()
}
o.WriteString("]")
return o.String()
}
func (a *LargeListView) newListValue(i int) arrow.Array {
beg, end := a.ValueOffsets(i)
return NewSlice(a.values, beg, end)
}
func (a *LargeListView) setData(data *Data) {
debug.Assert(len(data.buffers) >= 3, "list-view data should have 3 buffers")
a.array.setData(data)
offsets := data.buffers[1]
if offsets != nil {
a.offsets = arrow.Int64Traits.CastFromBytes(offsets.Bytes())
}
sizes := data.buffers[2]
if sizes != nil {
a.sizes = arrow.Int64Traits.CastFromBytes(sizes.Bytes())
}
a.values = MakeFromData(data.childData[0])
}
func (a *LargeListView) GetOneForMarshal(i int) interface{} {
if a.IsNull(i) {
return nil
}
slice := a.newListValue(i)
defer slice.Release()
v, err := json.Marshal(slice)
if err != nil {
panic(err)
}
return json.RawMessage(v)
}
func (a *LargeListView) MarshalJSON() ([]byte, error) {
var buf bytes.Buffer
enc := json.NewEncoder(&buf)
buf.WriteByte('[')
for i := 0; i < a.Len(); i++ {
if i != 0 {
buf.WriteByte(',')
}
if err := enc.Encode(a.GetOneForMarshal(i)); err != nil {
return nil, err
}
}
buf.WriteByte(']')
return buf.Bytes(), nil
}
func arrayEqualLargeListView(left, right *LargeListView) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
continue
}
o := func() bool {
l := left.newListValue(i)
defer l.Release()
r := right.newListValue(i)
defer r.Release()
return Equal(l, r)
}()
if !o {
return false
}
}
return true
}
// Len returns the number of elements in the array.
func (a *LargeListView) Len() int { return a.array.Len() }
func (a *LargeListView) Offsets() []int64 { return a.offsets }
func (a *LargeListView) Sizes() []int64 { return a.sizes }
func (a *LargeListView) ValueOffsets(i int) (start, end int64) {
debug.Assert(i >= 0 && i < a.array.data.length, "index out of range")
j := i + a.array.data.offset
size := a.sizes[j]
// If size is 0, skip accessing offsets.
if size == 0 {
return 0, 0
}
start = a.offsets[j]
end = start + size
return
}
func (a *LargeListView) Retain() {
a.array.Retain()
a.values.Retain()
}
func (a *LargeListView) Release() {
a.array.Release()
a.values.Release()
}
// Accessors for offsets and sizes to make ListView and LargeListView validation generic.
type offsetsAndSizes interface {
offsetAt(slot int64) int64
sizeAt(slot int64) int64
}
var (
_ offsetsAndSizes = (*ListView)(nil)
_ offsetsAndSizes = (*LargeListView)(nil)
)
func (a *ListView) offsetAt(slot int64) int64 { return int64(a.offsets[int64(a.data.offset)+slot]) }
func (a *ListView) sizeAt(slot int64) int64 { return int64(a.sizes[int64(a.data.offset)+slot]) }
func (a *LargeListView) offsetAt(slot int64) int64 { return a.offsets[int64(a.data.offset)+slot] }
func (a *LargeListView) sizeAt(slot int64) int64 { return a.sizes[int64(a.data.offset)+slot] }
func outOfBoundsListViewOffset(l offsetsAndSizes, slot int64, offsetLimit int64) error {
offset := l.offsetAt(slot)
return fmt.Errorf("%w: Offset invariant failure: offset for slot %d out of bounds. Expected %d to be at least 0 and less than %d", arrow.ErrInvalid, slot, offset, offsetLimit)
}
func outOfBoundsListViewSize(l offsetsAndSizes, slot int64, offsetLimit int64) error {
size := l.sizeAt(slot)
if size < 0 {
return fmt.Errorf("%w: Offset invariant failure: size for slot %d out of bounds: %d < 0", arrow.ErrInvalid, slot, size)
}
offset := l.offsetAt(slot)
return fmt.Errorf("%w: Offset invariant failure: size for slot %d out of bounds: %d + %d > %d", arrow.ErrInvalid, slot, offset, size, offsetLimit)
}
// Pre-condition: Basic validation has already been performed
func (a *array) fullyValidateOffsetsAndSizes(l offsetsAndSizes, offsetLimit int64) error {
for slot := int64(0); slot < int64(a.Len()); slot += 1 {
size := l.sizeAt(slot)
if size > 0 {
offset := l.offsetAt(slot)
if offset < 0 || offset > offsetLimit {
return outOfBoundsListViewOffset(l, slot, offsetLimit)
}
if size > offsetLimit-int64(offset) {
return outOfBoundsListViewSize(l, slot, offsetLimit)
}
} else if size < 0 {
return outOfBoundsListViewSize(l, slot, offsetLimit)
}
}
return nil
}
func (a *array) validateOffsetsAndMaybeSizes(l offsetsAndSizes, offsetByteWidth int, isListView bool, offsetLimit int64, fullValidation bool) error {
nonEmpty := a.Len() > 0
if a.data.buffers[1] == nil {
// For length 0, an empty offsets buffer is accepted (ARROW-544).
if nonEmpty {
return fmt.Errorf("non-empty array but offsets are null")
}
return nil
}
if isListView && a.data.buffers[2] == nil {
if nonEmpty {
return fmt.Errorf("non-empty array but sizes are null")
}
return nil
}
var requiredOffsets int
if nonEmpty {
requiredOffsets = a.Len() + a.Offset()
if !isListView {
requiredOffsets += 1
}
} else {
requiredOffsets = 0
}
offsetsByteSize := a.data.buffers[1].Len()
if offsetsByteSize/offsetByteWidth < requiredOffsets {
return fmt.Errorf("offsets buffer size (bytes): %d isn't large enough for length: %d and offset: %d",
offsetsByteSize, a.Len(), a.Offset())
}
if isListView {
requiredSizes := a.Len() + a.Offset()
sizesBytesSize := a.data.buffers[2].Len()
if sizesBytesSize/offsetByteWidth < requiredSizes {
return fmt.Errorf("sizes buffer size (bytes): %d isn't large enough for length: %d and offset: %d",
sizesBytesSize, a.Len(), a.Offset())
}
}
if fullValidation && requiredOffsets > 0 {
if isListView {
return a.fullyValidateOffsetsAndSizes(l, offsetLimit)
}
// TODO: implement validation of List and LargeList
// return fullyValidateOffsets(offset_limit)
return nil
}
return nil
}
func (a *ListView) validate(fullValidation bool) error {
values := a.array.data.childData[0]
offsetLimit := values.Len()
return a.array.validateOffsetsAndMaybeSizes(a, 4, true, int64(offsetLimit), fullValidation)
}
func (a *ListView) Validate() error {
return a.validate(false)
}
func (a *ListView) ValidateFull() error {
return a.validate(true)
}
func (a *LargeListView) validate(fullValidation bool) error {
values := a.array.data.childData[0]
offsetLimit := values.Len()
return a.array.validateOffsetsAndMaybeSizes(a, 8, true, int64(offsetLimit), fullValidation)
}
func (a *LargeListView) Validate() error {
return a.validate(false)
}
func (a *LargeListView) ValidateFull() error {
return a.validate(true)
}
type baseListViewBuilder struct {
builder
values Builder // value builder for the list-view's elements.
offsets Builder
sizes Builder
// actual list-view type
dt arrow.DataType
appendOffsetVal func(int)
appendSizeVal func(int)
}
type ListViewBuilder struct {
baseListViewBuilder
}
type LargeListViewBuilder struct {
baseListViewBuilder
}
// NewListViewBuilder returns a builder, using the provided memory allocator.
// The created list-view builder will create a list whose elements will be
// of type etype.
func NewListViewBuilder(mem memory.Allocator, etype arrow.DataType) *ListViewBuilder {
offsetBldr := NewInt32Builder(mem)
sizeBldr := NewInt32Builder(mem)
lvb := &ListViewBuilder{
baseListViewBuilder{
builder: builder{mem: mem},
values: NewBuilder(mem, etype),
offsets: offsetBldr,
sizes: sizeBldr,
dt: arrow.ListViewOf(etype),
appendOffsetVal: func(o int) { offsetBldr.Append(int32(o)) },
appendSizeVal: func(s int) { sizeBldr.Append(int32(s)) },
},
}
lvb.refCount.Add(1)
return lvb
}
// NewListViewBuilderWithField takes a field to use for the child rather than just
// a datatype to allow for more customization.
func NewListViewBuilderWithField(mem memory.Allocator, field arrow.Field) *ListViewBuilder {
offsetBldr := NewInt32Builder(mem)
sizeBldr := NewInt32Builder(mem)
lvb := &ListViewBuilder{
baseListViewBuilder{
builder: builder{mem: mem},
values: NewBuilder(mem, field.Type),
offsets: offsetBldr,
sizes: sizeBldr,
dt: arrow.ListViewOfField(field),
appendOffsetVal: func(o int) { offsetBldr.Append(int32(o)) },
appendSizeVal: func(s int) { sizeBldr.Append(int32(s)) },
},
}
lvb.refCount.Add(1)
return lvb
}
func (b *baseListViewBuilder) Type() arrow.DataType {
switch dt := b.dt.(type) {
case *arrow.ListViewType:
f := dt.ElemField()
f.Type = b.values.Type()
return arrow.ListViewOfField(f)
case *arrow.LargeListViewType:
f := dt.ElemField()
f.Type = b.values.Type()
return arrow.LargeListViewOfField(f)
}
return nil
}
// NewLargeListViewBuilder returns a builder, using the provided memory allocator.
// The created list-view builder will create a list whose elements will be of type etype.
func NewLargeListViewBuilder(mem memory.Allocator, etype arrow.DataType) *LargeListViewBuilder {
offsetBldr := NewInt64Builder(mem)
sizeBldr := NewInt64Builder(mem)
llvb := &LargeListViewBuilder{
baseListViewBuilder{
builder: builder{mem: mem},
values: NewBuilder(mem, etype),
offsets: offsetBldr,
sizes: sizeBldr,
dt: arrow.LargeListViewOf(etype),
appendOffsetVal: func(o int) { offsetBldr.Append(int64(o)) },
appendSizeVal: func(s int) { sizeBldr.Append(int64(s)) },
},
}
llvb.refCount.Add(1)
return llvb
}
// NewLargeListViewBuilderWithField takes a field rather than just an element type
// to allow for more customization of the final type of the LargeListView Array
func NewLargeListViewBuilderWithField(mem memory.Allocator, field arrow.Field) *LargeListViewBuilder {
offsetBldr := NewInt64Builder(mem)
sizeBldr := NewInt64Builder(mem)
llvb := &LargeListViewBuilder{
baseListViewBuilder{
builder: builder{mem: mem},
values: NewBuilder(mem, field.Type),
offsets: offsetBldr,
sizes: sizeBldr,
dt: arrow.LargeListViewOfField(field),
appendOffsetVal: func(o int) { offsetBldr.Append(int64(o)) },
appendSizeVal: func(o int) { sizeBldr.Append(int64(o)) },
},
}
llvb.refCount.Add(1)
return llvb
}
// Release decreases the reference count by 1.
// When the reference count goes to zero, the memory is freed.
func (b *baseListViewBuilder) Release() {
debug.Assert(b.refCount.Load() > 0, "too many releases")
if b.refCount.Add(-1) == 0 {
if b.nullBitmap != nil {
b.nullBitmap.Release()
b.nullBitmap = nil
}
b.values.Release()
b.offsets.Release()
b.sizes.Release()
}
}
func (b *baseListViewBuilder) AppendDimensions(offset int, listSize int) {
b.Reserve(1)
b.unsafeAppendBoolToBitmap(true)
b.appendOffsetVal(offset)
b.appendSizeVal(listSize)
}
func (b *baseListViewBuilder) Append(v bool) {
debug.Assert(false, "baseListViewBuilder.Append should never be called -- use AppendWithSize instead")
}
func (b *baseListViewBuilder) AppendWithSize(v bool, listSize int) {
debug.Assert(v || listSize == 0, "invalid list-view should have size 0")
b.Reserve(1)
b.unsafeAppendBoolToBitmap(v)
b.appendOffsetVal(b.values.Len())
b.appendSizeVal(listSize)
}
func (b *baseListViewBuilder) AppendNull() {
b.AppendWithSize(false, 0)
}
func (b *baseListViewBuilder) AppendNulls(n int) {
for i := 0; i < n; i++ {
b.AppendNull()
}
}
func (b *baseListViewBuilder) AppendEmptyValue() {
b.AppendWithSize(true, 0)
}
func (b *baseListViewBuilder) AppendEmptyValues(n int) {
for i := 0; i < n; i++ {
b.AppendEmptyValue()
}
}
func (b *ListViewBuilder) AppendValuesWithSizes(offsets []int32, sizes []int32, valid []bool) {
b.Reserve(len(valid))
b.offsets.(*Int32Builder).AppendValues(offsets, nil)
b.sizes.(*Int32Builder).AppendValues(sizes, nil)
b.builder.unsafeAppendBoolsToBitmap(valid, len(valid))
}
func (b *LargeListViewBuilder) AppendValuesWithSizes(offsets []int64, sizes []int64, valid []bool) {
b.Reserve(len(valid))
b.offsets.(*Int64Builder).AppendValues(offsets, nil)
b.sizes.(*Int64Builder).AppendValues(sizes, nil)
b.builder.unsafeAppendBoolsToBitmap(valid, len(valid))
}
func (b *baseListViewBuilder) unsafeAppendBoolToBitmap(isValid bool) {
if isValid {
bitutil.SetBit(b.nullBitmap.Bytes(), b.length)
} else {
b.nulls++
}
b.length++
}
func (b *baseListViewBuilder) init(capacity int) {
b.builder.init(capacity)
b.offsets.init(capacity)
b.sizes.init(capacity)
}
// Reserve ensures there is enough space for appending n elements
// by checking the capacity and calling Resize if necessary.
func (b *baseListViewBuilder) Reserve(n int) {
b.builder.reserve(n, b.resizeHelper)
b.offsets.Reserve(n)
b.sizes.Reserve(n)
}
// Resize adjusts the space allocated by b to n elements. If n is greater than b.Cap(),
// additional memory will be allocated. If n is smaller, the allocated memory may reduced.
func (b *baseListViewBuilder) Resize(n int) {
b.resizeHelper(n)
b.offsets.Resize(n)
b.sizes.Resize(n)
}
func (b *baseListViewBuilder) resizeHelper(n int) {
if n < minBuilderCapacity {
n = minBuilderCapacity
}
if b.capacity == 0 {
b.init(n)
} else {
b.builder.resize(n, b.builder.init)
}
}
func (b *baseListViewBuilder) ValueBuilder() Builder {
return b.values
}
// NewArray creates a ListView array from the memory buffers used by the builder and
// resets the ListViewBuilder so it can be used to build a new array.
func (b *ListViewBuilder) NewArray() arrow.Array {
return b.NewListViewArray()
}
// NewArray creates a LargeListView array from the memory buffers used by the builder
// and resets the LargeListViewBuilder so it can be used to build a new array.
func (b *LargeListViewBuilder) NewArray() arrow.Array {
return b.NewLargeListViewArray()
}
// NewListViewArray creates a ListView array from the memory buffers used by the builder
// and resets the ListViewBuilder so it can be used to build a new array.
func (b *ListViewBuilder) NewListViewArray() (a *ListView) {
data := b.newData()
a = NewListViewData(data)
data.Release()
return
}
// NewLargeListViewArray creates a ListView array from the memory buffers used by the
// builder and resets the LargeListViewBuilder so it can be used to build a new array.
func (b *LargeListViewBuilder) NewLargeListViewArray() (a *LargeListView) {
data := b.newData()
a = NewLargeListViewData(data)
data.Release()
return
}
func (b *baseListViewBuilder) newData() (data *Data) {
values := b.values.NewArray()
defer values.Release()
var offsets *memory.Buffer
if b.offsets != nil {
arr := b.offsets.NewArray()
defer arr.Release()
offsets = arr.Data().Buffers()[1]
}
var sizes *memory.Buffer
if b.sizes != nil {
arr := b.sizes.NewArray()
defer arr.Release()
sizes = arr.Data().Buffers()[1]
}
data = NewData(
b.Type(), b.length,
[]*memory.Buffer{
b.nullBitmap,
offsets,
sizes,
},
[]arrow.ArrayData{values.Data()},
b.nulls,
0,
)
b.reset()
return
}
func (b *baseListViewBuilder) AppendValueFromString(s string) error {
if s == NullValueStr {
b.AppendNull()
return nil
}
return b.UnmarshalOne(json.NewDecoder(strings.NewReader(s)))
}
func (b *baseListViewBuilder) UnmarshalOne(dec *json.Decoder) error {
t, err := dec.Token()
if err != nil {
return err
}
switch t {
case json.Delim('['):
offset := b.values.Len()
// 0 is a placeholder size as we don't know the actual size yet
b.AppendWithSize(true, 0)
if err := b.values.Unmarshal(dec); err != nil {
return err
}
// consume ']'
_, err := dec.Token()
// replace the last size with the actual size
switch b.sizes.(type) {
case *Int32Builder:
b.sizes.(*Int32Builder).rawData[b.sizes.Len()-1] = int32(b.values.Len() - offset)
case *Int64Builder:
b.sizes.(*Int64Builder).rawData[b.sizes.Len()-1] = int64(b.values.Len() - offset)
}
return err
case nil:
b.AppendNull()
default:
return &json.UnmarshalTypeError{
Value: fmt.Sprint(t),
Struct: b.dt.String(),
}
}
return nil
}
func (b *baseListViewBuilder) Unmarshal(dec *json.Decoder) error {
for dec.More() {
if err := b.UnmarshalOne(dec); err != nil {
return err
}
}
return nil
}
func (b *baseListViewBuilder) UnmarshalJSON(data []byte) error {
dec := json.NewDecoder(bytes.NewReader(data))
t, err := dec.Token()
if err != nil {
return err
}
if delim, ok := t.(json.Delim); !ok || delim != '[' {
return fmt.Errorf("list-view builder must unpack from json array, found %s", delim)
}
return b.Unmarshal(dec)
}
// Find the minimum offset+size in a LIST_VIEW/LARGE_LIST_VIEW array.
//
// Pre-conditions:
//
// input.DataType() is ListViewType if Offset=int32 or LargeListViewType if Offset=int64
// input.Len() > 0 && input.NullN() != input.Len()
func minListViewOffset[Offset int32 | int64](input arrow.ArrayData) Offset {
var bitmap []byte
if input.Buffers()[0] != nil {
bitmap = input.Buffers()[0].Bytes()
}
offsets := arrow.GetData[Offset](input.Buffers()[1].Bytes())[input.Offset():]
sizes := arrow.GetData[Offset](input.Buffers()[2].Bytes())[input.Offset():]
isNull := func(i int) bool {
return bitmap != nil && bitutil.BitIsNotSet(bitmap, input.Offset()+i)
}
// It's very likely that the first non-null non-empty list-view starts at
// offset 0 of the child array.
i := 0
for i < input.Len() && (isNull(i) || sizes[i] == 0) {
i += 1
}
if i >= input.Len() {
return 0
}
minOffset := offsets[i]
if minOffset == 0 {
// early exit: offset 0 found already
return 0
}
// Slow path: scan the buffers entirely.
i += 1
for ; i < input.Len(); i += 1 {
if isNull(i) {
continue
}
offset := offsets[i]
if offset < minOffset && sizes[i] > 0 {
minOffset = offset
}
}
return minOffset
}
// Find the maximum offset+size in a LIST_VIEW/LARGE_LIST_VIEW array.
//
// Pre-conditions:
//
// input.DataType() is ListViewType if Offset=int32 or LargeListViewType if Offset=int64
// input.Len() > 0 && input.NullN() != input.Len()
func maxListViewEnd[Offset int32 | int64](input arrow.ArrayData) Offset {
inputOffset := input.Offset()
var bitmap []byte
if input.Buffers()[0] != nil {
bitmap = input.Buffers()[0].Bytes()
}
offsets := arrow.GetData[Offset](input.Buffers()[1].Bytes())[inputOffset:]
sizes := arrow.GetData[Offset](input.Buffers()[2].Bytes())[inputOffset:]
isNull := func(i int) bool {
return bitmap != nil && bitutil.BitIsNotSet(bitmap, inputOffset+i)
}
i := input.Len() - 1 // safe because input.Len() > 0
for i != 0 && (isNull(i) || sizes[i] == 0) {
i -= 1
}
offset := offsets[i]
size := sizes[i]
if i == 0 {
if isNull(i) || sizes[i] == 0 {
return 0
} else {
return offset + size
}
}
values := input.Children()[0]
maxEnd := offsets[i] + sizes[i]
if maxEnd == Offset(values.Len()) {
// Early-exit: maximum possible view-end found already.
return maxEnd
}
// Slow path: scan the buffers entirely.
for ; i >= 0; i -= 1 {
offset := offsets[i]
size := sizes[i]
if size > 0 && !isNull(i) {
if offset+size > maxEnd {
maxEnd = offset + size
if maxEnd == Offset(values.Len()) {
return maxEnd
}
}
}
}
return maxEnd
}
func rangeOfValuesUsed(input arrow.ArrayData) (int, int) {
if input.Len() == 0 || input.NullN() == input.Len() {
return 0, 0
}
var minOffset, maxEnd int
switch input.DataType().(type) {
case *arrow.ListViewType:
minOffset = int(minListViewOffset[int32](input))
maxEnd = int(maxListViewEnd[int32](input))
case *arrow.LargeListViewType:
minOffset = int(minListViewOffset[int64](input))
maxEnd = int(maxListViewEnd[int64](input))
case *arrow.ListType:
offsets := arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():]
minOffset = int(offsets[0])
maxEnd = int(offsets[len(offsets)-1])
case *arrow.LargeListType:
offsets := arrow.Int64Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():]
minOffset = int(offsets[0])
maxEnd = int(offsets[len(offsets)-1])
case *arrow.MapType:
offsets := arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():]
minOffset = int(offsets[0])
maxEnd = int(offsets[len(offsets)-1])
}
return minOffset, maxEnd - minOffset
}
// Returns the smallest contiguous range of values of the child array that are
// referenced by all the list values in the input array.
func RangeOfValuesUsed(input VarLenListLike) (int, int) {
return rangeOfValuesUsed(input.Data())
}
var (
_ arrow.Array = (*List)(nil)
_ arrow.Array = (*LargeList)(nil)
_ arrow.Array = (*ListView)(nil)
_ arrow.Array = (*LargeListView)(nil)
_ Builder = (*ListBuilder)(nil)
_ Builder = (*LargeListBuilder)(nil)
_ Builder = (*ListViewBuilder)(nil)
_ Builder = (*LargeListViewBuilder)(nil)
_ VarLenListLike = (*List)(nil)
_ VarLenListLike = (*LargeList)(nil)
_ VarLenListLike = (*Map)(nil)
_ VarLenListLike = (*ListView)(nil)
_ VarLenListLike = (*LargeListView)(nil)
_ ListLike = (*FixedSizeList)(nil)
_ VarLenListLikeBuilder = (*ListBuilder)(nil)
_ VarLenListLikeBuilder = (*LargeListBuilder)(nil)
_ VarLenListLikeBuilder = (*ListBuilder)(nil)
_ VarLenListLikeBuilder = (*LargeListBuilder)(nil)
_ VarLenListLikeBuilder = (*MapBuilder)(nil)
_ ListLikeBuilder = (*FixedSizeListBuilder)(nil)
)