arrow/array/string.go (569 lines of code) (raw):

// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package array import ( "bytes" "fmt" "reflect" "strings" "unsafe" "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/memory" "github.com/apache/arrow-go/v18/internal/json" ) type StringLike interface { arrow.Array Value(int) string ValueLen(int) int } // String represents an immutable sequence of variable-length UTF-8 strings. type String struct { array offsets []int32 values string } // NewStringData constructs a new String array from data. func NewStringData(data arrow.ArrayData) *String { a := &String{} a.refCount.Add(1) a.setData(data.(*Data)) return a } // Reset resets the String with a different set of Data. func (a *String) Reset(data arrow.ArrayData) { a.setData(data.(*Data)) } // Value returns the slice at index i. This value should not be mutated. func (a *String) Value(i int) string { i = i + a.array.data.offset return a.values[a.offsets[i]:a.offsets[i+1]] } func (a *String) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } return a.Value(i) } // ValueOffset returns the offset of the value at index i. func (a *String) ValueOffset(i int) int { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } return int(a.offsets[i+a.array.data.offset]) } func (a *String) ValueOffset64(i int) int64 { return int64(a.ValueOffset(i)) } func (a *String) ValueLen(i int) int { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } beg := a.array.data.offset + i return int(a.offsets[beg+1] - a.offsets[beg]) } func (a *String) ValueOffsets() []int32 { beg := a.array.data.offset end := beg + a.array.data.length + 1 return a.offsets[beg:end] } func (a *String) ValueBytes() []byte { beg := a.array.data.offset end := beg + a.array.data.length if a.array.data.buffers[2] != nil { return a.array.data.buffers[2].Bytes()[a.offsets[beg]:a.offsets[end]] } return nil } func (a *String) String() string { o := new(strings.Builder) o.WriteString("[") for i := 0; i < a.Len(); i++ { if i > 0 { o.WriteString(" ") } switch { case a.IsNull(i): o.WriteString(NullValueStr) default: fmt.Fprintf(o, "%q", a.Value(i)) } } o.WriteString("]") return o.String() } func (a *String) setData(data *Data) { if len(data.buffers) != 3 { panic("arrow/array: len(data.buffers) != 3") } a.array.setData(data) if vdata := data.buffers[2]; vdata != nil { b := vdata.Bytes() a.values = *(*string)(unsafe.Pointer(&b)) } if offsets := data.buffers[1]; offsets != nil { a.offsets = arrow.Int32Traits.CastFromBytes(offsets.Bytes()) } if a.array.data.length < 1 { return } expNumOffsets := a.array.data.offset + a.array.data.length + 1 if len(a.offsets) < expNumOffsets { panic(fmt.Errorf("arrow/array: string offset buffer must have at least %d values", expNumOffsets)) } if int(a.offsets[expNumOffsets-1]) > len(a.values) { panic("arrow/array: string offsets out of bounds of data buffer") } } func (a *String) GetOneForMarshal(i int) interface{} { if a.IsValid(i) { return a.Value(i) } return nil } func (a *String) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { if a.IsValid(i) { vals[i] = a.Value(i) } else { vals[i] = nil } } return json.Marshal(vals) } func arrayEqualString(left, right *String) bool { for i := 0; i < left.Len(); i++ { if left.IsNull(i) { continue } if left.Value(i) != right.Value(i) { return false } } return true } // String represents an immutable sequence of variable-length UTF-8 strings. type LargeString struct { array offsets []int64 values string } // NewStringData constructs a new String array from data. func NewLargeStringData(data arrow.ArrayData) *LargeString { a := &LargeString{} a.refCount.Add(1) a.setData(data.(*Data)) return a } // Reset resets the String with a different set of Data. func (a *LargeString) Reset(data arrow.ArrayData) { a.setData(data.(*Data)) } // Value returns the slice at index i. This value should not be mutated. func (a *LargeString) Value(i int) string { i = i + a.array.data.offset return a.values[a.offsets[i]:a.offsets[i+1]] } func (a *LargeString) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } return a.Value(i) } // ValueOffset returns the offset of the value at index i. func (a *LargeString) ValueOffset(i int) int64 { if i < 0 || i > a.array.data.length { panic("arrow/array: index out of range") } return a.offsets[i+a.array.data.offset] } func (a *LargeString) ValueOffset64(i int) int64 { return a.ValueOffset(i) } func (a *LargeString) ValueLen(i int) int { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } beg := a.array.data.offset + i return int(a.offsets[beg+1] - a.offsets[beg]) } func (a *LargeString) ValueOffsets() []int64 { beg := a.array.data.offset end := beg + a.array.data.length + 1 return a.offsets[beg:end] } func (a *LargeString) ValueBytes() []byte { beg := a.array.data.offset end := beg + a.array.data.length if a.array.data.buffers[2] != nil { return a.array.data.buffers[2].Bytes()[a.offsets[beg]:a.offsets[end]] } return nil } func (a *LargeString) String() string { o := new(strings.Builder) o.WriteString("[") for i := 0; i < a.Len(); i++ { if i > 0 { o.WriteString(" ") } switch { case a.IsNull(i): o.WriteString(NullValueStr) default: fmt.Fprintf(o, "%q", a.Value(i)) } } o.WriteString("]") return o.String() } func (a *LargeString) setData(data *Data) { if len(data.buffers) != 3 { panic("arrow/array: len(data.buffers) != 3") } a.array.setData(data) if vdata := data.buffers[2]; vdata != nil { b := vdata.Bytes() a.values = *(*string)(unsafe.Pointer(&b)) } if offsets := data.buffers[1]; offsets != nil { a.offsets = arrow.Int64Traits.CastFromBytes(offsets.Bytes()) } if a.array.data.length < 1 { return } expNumOffsets := a.array.data.offset + a.array.data.length + 1 if len(a.offsets) < expNumOffsets { panic(fmt.Errorf("arrow/array: string offset buffer must have at least %d values", expNumOffsets)) } if int(a.offsets[expNumOffsets-1]) > len(a.values) { panic("arrow/array: string offsets out of bounds of data buffer") } } func (a *LargeString) GetOneForMarshal(i int) interface{} { if a.IsValid(i) { return a.Value(i) } return nil } func (a *LargeString) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { vals[i] = a.GetOneForMarshal(i) } return json.Marshal(vals) } func arrayEqualLargeString(left, right *LargeString) bool { for i := 0; i < left.Len(); i++ { if left.IsNull(i) { continue } if left.Value(i) != right.Value(i) { return false } } return true } type StringView struct { array values []arrow.ViewHeader dataBuffers []*memory.Buffer } func NewStringViewData(data arrow.ArrayData) *StringView { a := &StringView{} a.refCount.Add(1) a.setData(data.(*Data)) return a } // Reset resets the String with a different set of Data. func (a *StringView) Reset(data arrow.ArrayData) { a.setData(data.(*Data)) } func (a *StringView) setData(data *Data) { if len(data.buffers) < 2 { panic("len(data.buffers) < 2") } a.array.setData(data) if valueData := data.buffers[1]; valueData != nil { a.values = arrow.ViewHeaderTraits.CastFromBytes(valueData.Bytes()) } a.dataBuffers = data.buffers[2:] } func (a *StringView) ValueHeader(i int) *arrow.ViewHeader { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } return &a.values[a.array.data.offset+i] } func (a *StringView) Value(i int) string { s := a.ValueHeader(i) if s.IsInline() { return s.InlineString() } start := s.BufferOffset() buf := a.dataBuffers[s.BufferIndex()] value := buf.Bytes()[start : start+int32(s.Len())] return *(*string)(unsafe.Pointer(&value)) } func (a *StringView) ValueLen(i int) int { s := a.ValueHeader(i) return s.Len() } func (a *StringView) String() string { var o strings.Builder o.WriteString("[") for i := 0; i < a.Len(); i++ { if i > 0 { o.WriteString(" ") } switch { case a.IsNull(i): o.WriteString(NullValueStr) default: fmt.Fprintf(&o, "%q", a.Value(i)) } } o.WriteString("]") return o.String() } func (a *StringView) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } return a.Value(i) } func (a *StringView) GetOneForMarshal(i int) interface{} { if a.IsNull(i) { return nil } return a.Value(i) } func (a *StringView) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { vals[i] = a.GetOneForMarshal(i) } return json.Marshal(vals) } func arrayEqualStringView(left, right *StringView) bool { leftBufs, rightBufs := left.dataBuffers, right.dataBuffers for i := 0; i < left.Len(); i++ { if left.IsNull(i) { continue } if !left.ValueHeader(i).Equals(leftBufs, right.ValueHeader(i), rightBufs) { return false } } return true } // A StringBuilder is used to build a String array using the Append methods. type StringBuilder struct { *BinaryBuilder } // NewStringBuilder creates a new StringBuilder. func NewStringBuilder(mem memory.Allocator) *StringBuilder { b := &StringBuilder{ BinaryBuilder: NewBinaryBuilder(mem, arrow.BinaryTypes.String), } return b } func (b *StringBuilder) Type() arrow.DataType { return arrow.BinaryTypes.String } // Append appends a string to the builder. func (b *StringBuilder) Append(v string) { b.BinaryBuilder.Append([]byte(v)) } // AppendValues will append the values in the v slice. The valid slice determines which values // in v are valid (not null). The valid slice must either be empty or be equal in length to v. If empty, // all values in v are appended and considered valid. func (b *StringBuilder) AppendValues(v []string, valid []bool) { b.BinaryBuilder.AppendStringValues(v, valid) } // Value returns the string at index i. func (b *StringBuilder) Value(i int) string { return string(b.BinaryBuilder.Value(i)) } // NewArray creates a String array from the memory buffers used by the builder and resets the StringBuilder // so it can be used to build a new array. func (b *StringBuilder) NewArray() arrow.Array { return b.NewStringArray() } // NewStringArray creates a String array from the memory buffers used by the builder and resets the StringBuilder // so it can be used to build a new array. func (b *StringBuilder) NewStringArray() (a *String) { data := b.newData() a = NewStringData(data) data.Release() return } func (b *StringBuilder) UnmarshalOne(dec *json.Decoder) error { t, err := dec.Token() if err != nil { return err } switch v := t.(type) { case nil: b.AppendNull() case string: b.Append(v) default: return &json.UnmarshalTypeError{ Value: fmt.Sprint(v), Type: reflect.TypeOf(string("")), Offset: dec.InputOffset(), } } return nil } func (b *StringBuilder) Unmarshal(dec *json.Decoder) error { for dec.More() { if err := b.UnmarshalOne(dec); err != nil { return err } } return nil } func (b *StringBuilder) UnmarshalJSON(data []byte) error { dec := json.NewDecoder(bytes.NewReader(data)) t, err := dec.Token() if err != nil { return err } if delim, ok := t.(json.Delim); !ok || delim != '[' { return fmt.Errorf("string builder must unpack from json array, found %s", delim) } return b.Unmarshal(dec) } // A LargeStringBuilder is used to build a LargeString array using the Append methods. // LargeString is for when you need the offset buffer to be 64-bit integers // instead of 32-bit integers. type LargeStringBuilder struct { *BinaryBuilder } // NewStringBuilder creates a new StringBuilder. func NewLargeStringBuilder(mem memory.Allocator) *LargeStringBuilder { b := &LargeStringBuilder{ BinaryBuilder: NewBinaryBuilder(mem, arrow.BinaryTypes.LargeString), } return b } func (b *LargeStringBuilder) Type() arrow.DataType { return arrow.BinaryTypes.LargeString } // Append appends a string to the builder. func (b *LargeStringBuilder) Append(v string) { b.BinaryBuilder.Append([]byte(v)) } // AppendValues will append the values in the v slice. The valid slice determines which values // in v are valid (not null). The valid slice must either be empty or be equal in length to v. If empty, // all values in v are appended and considered valid. func (b *LargeStringBuilder) AppendValues(v []string, valid []bool) { b.BinaryBuilder.AppendStringValues(v, valid) } // Value returns the string at index i. func (b *LargeStringBuilder) Value(i int) string { return string(b.BinaryBuilder.Value(i)) } // NewArray creates a String array from the memory buffers used by the builder and resets the StringBuilder // so it can be used to build a new array. func (b *LargeStringBuilder) NewArray() arrow.Array { return b.NewLargeStringArray() } // NewStringArray creates a String array from the memory buffers used by the builder and resets the StringBuilder // so it can be used to build a new array. func (b *LargeStringBuilder) NewLargeStringArray() (a *LargeString) { data := b.newData() a = NewLargeStringData(data) data.Release() return } func (b *LargeStringBuilder) UnmarshalOne(dec *json.Decoder) error { t, err := dec.Token() if err != nil { return err } switch v := t.(type) { case nil: b.AppendNull() case string: b.Append(v) default: return &json.UnmarshalTypeError{ Value: fmt.Sprint(v), Type: reflect.TypeOf(string("")), Offset: dec.InputOffset(), } } return nil } func (b *LargeStringBuilder) Unmarshal(dec *json.Decoder) error { for dec.More() { if err := b.UnmarshalOne(dec); err != nil { return err } } return nil } func (b *LargeStringBuilder) UnmarshalJSON(data []byte) error { dec := json.NewDecoder(bytes.NewReader(data)) t, err := dec.Token() if err != nil { return err } if delim, ok := t.(json.Delim); !ok || delim != '[' { return fmt.Errorf("string builder must unpack from json array, found %s", delim) } return b.Unmarshal(dec) } type StringViewBuilder struct { *BinaryViewBuilder } func NewStringViewBuilder(mem memory.Allocator) *StringViewBuilder { bldr := &StringViewBuilder{ BinaryViewBuilder: NewBinaryViewBuilder(mem), } bldr.dtype = arrow.BinaryTypes.StringView return bldr } func (b *StringViewBuilder) Append(v string) { b.BinaryViewBuilder.AppendString(v) } func (b *StringViewBuilder) AppendValues(v []string, valid []bool) { b.BinaryViewBuilder.AppendStringValues(v, valid) } func (b *StringViewBuilder) UnmarshalOne(dec *json.Decoder) error { t, err := dec.Token() if err != nil { return err } switch v := t.(type) { case string: b.Append(v) case []byte: b.BinaryViewBuilder.Append(v) case nil: b.AppendNull() default: return &json.UnmarshalTypeError{ Value: fmt.Sprint(t), Type: reflect.TypeOf([]byte{}), Offset: dec.InputOffset(), } } return nil } func (b *StringViewBuilder) Unmarshal(dec *json.Decoder) error { for dec.More() { if err := b.UnmarshalOne(dec); err != nil { return err } } return nil } func (b *StringViewBuilder) UnmarshalJSON(data []byte) error { dec := json.NewDecoder(bytes.NewReader(data)) t, err := dec.Token() if err != nil { return err } if delim, ok := t.(json.Delim); !ok || delim != '[' { return fmt.Errorf("binary view builder must unpack from json array, found %s", delim) } return b.Unmarshal(dec) } func (b *StringViewBuilder) NewArray() arrow.Array { return b.NewStringViewArray() } func (b *StringViewBuilder) NewStringViewArray() (a *StringView) { data := b.newData() a = NewStringViewData(data) data.Release() return } type StringLikeBuilder interface { Builder Append(string) AppendValues([]string, []bool) UnsafeAppend([]byte) ReserveData(int) } var ( _ arrow.Array = (*String)(nil) _ arrow.Array = (*LargeString)(nil) _ arrow.Array = (*StringView)(nil) _ Builder = (*StringBuilder)(nil) _ Builder = (*LargeStringBuilder)(nil) _ Builder = (*StringViewBuilder)(nil) _ StringLikeBuilder = (*StringBuilder)(nil) _ StringLikeBuilder = (*LargeStringBuilder)(nil) _ StringLikeBuilder = (*StringViewBuilder)(nil) _ StringLike = (*String)(nil) _ StringLike = (*LargeString)(nil) _ StringLike = (*StringView)(nil) _ arrow.TypedArray[string] = (*String)(nil) _ arrow.TypedArray[string] = (*LargeString)(nil) _ arrow.TypedArray[string] = (*StringView)(nil) )