arrow/array/binary.go (357 lines of code) (raw):

// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package array import ( "bytes" "encoding/base64" "fmt" "strings" "unsafe" "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/memory" "github.com/apache/arrow-go/v18/internal/json" ) type BinaryLike interface { arrow.Array ValueLen(int) int ValueBytes() []byte ValueOffset64(int) int64 } // A type which represents an immutable sequence of variable-length binary strings. type Binary struct { array valueOffsets []int32 valueBytes []byte } // NewBinaryData constructs a new Binary array from data. func NewBinaryData(data arrow.ArrayData) *Binary { a := &Binary{} a.refCount.Add(1) a.setData(data.(*Data)) return a } // Value returns the slice at index i. This value should not be mutated. func (a *Binary) Value(i int) []byte { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } idx := a.array.data.offset + i return a.valueBytes[a.valueOffsets[idx]:a.valueOffsets[idx+1]] } // ValueStr returns a copy of the base64-encoded string value or NullValueStr func (a *Binary) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } return base64.StdEncoding.EncodeToString(a.Value(i)) } // ValueString returns the string at index i without performing additional allocations. // The string is only valid for the lifetime of the Binary array. func (a *Binary) ValueString(i int) string { b := a.Value(i) return *(*string)(unsafe.Pointer(&b)) } func (a *Binary) ValueOffset(i int) int { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } return int(a.valueOffsets[a.array.data.offset+i]) } func (a *Binary) ValueOffset64(i int) int64 { return int64(a.ValueOffset(i)) } func (a *Binary) ValueLen(i int) int { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } beg := a.array.data.offset + i return int(a.valueOffsets[beg+1] - a.valueOffsets[beg]) } func (a *Binary) ValueOffsets() []int32 { beg := a.array.data.offset end := beg + a.array.data.length + 1 return a.valueOffsets[beg:end] } func (a *Binary) ValueBytes() []byte { beg := a.array.data.offset end := beg + a.array.data.length return a.valueBytes[a.valueOffsets[beg]:a.valueOffsets[end]] } func (a *Binary) String() string { o := new(strings.Builder) o.WriteString("[") for i := 0; i < a.Len(); i++ { if i > 0 { o.WriteString(" ") } switch { case a.IsNull(i): o.WriteString(NullValueStr) default: fmt.Fprintf(o, "%q", a.ValueString(i)) } } o.WriteString("]") return o.String() } func (a *Binary) setData(data *Data) { if len(data.buffers) != 3 { panic("len(data.buffers) != 3") } a.array.setData(data) if valueData := data.buffers[2]; valueData != nil { a.valueBytes = valueData.Bytes() } if valueOffsets := data.buffers[1]; valueOffsets != nil { a.valueOffsets = arrow.Int32Traits.CastFromBytes(valueOffsets.Bytes()) } if a.array.data.length < 1 { return } expNumOffsets := a.array.data.offset + a.array.data.length + 1 if len(a.valueOffsets) < expNumOffsets { panic(fmt.Errorf("arrow/array: binary offset buffer must have at least %d values", expNumOffsets)) } if int(a.valueOffsets[expNumOffsets-1]) > len(a.valueBytes) { panic("arrow/array: binary offsets out of bounds of data buffer") } } func (a *Binary) GetOneForMarshal(i int) interface{} { if a.IsNull(i) { return nil } return a.Value(i) } func (a *Binary) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { vals[i] = a.GetOneForMarshal(i) } // golang marshal standard says that []byte will be marshalled // as a base64-encoded string return json.Marshal(vals) } func arrayEqualBinary(left, right *Binary) bool { for i := 0; i < left.Len(); i++ { if left.IsNull(i) { continue } if !bytes.Equal(left.Value(i), right.Value(i)) { return false } } return true } type LargeBinary struct { array valueOffsets []int64 valueBytes []byte } func NewLargeBinaryData(data arrow.ArrayData) *LargeBinary { a := &LargeBinary{} a.refCount.Add(1) a.setData(data.(*Data)) return a } func (a *LargeBinary) Value(i int) []byte { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } idx := a.array.data.offset + i return a.valueBytes[a.valueOffsets[idx]:a.valueOffsets[idx+1]] } func (a *LargeBinary) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } return base64.StdEncoding.EncodeToString(a.Value(i)) } func (a *LargeBinary) ValueString(i int) string { b := a.Value(i) return *(*string)(unsafe.Pointer(&b)) } func (a *LargeBinary) ValueOffset(i int) int64 { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } return a.valueOffsets[a.array.data.offset+i] } func (a *LargeBinary) ValueOffset64(i int) int64 { return a.ValueOffset(i) } func (a *LargeBinary) ValueLen(i int) int { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } beg := a.array.data.offset + i return int(a.valueOffsets[beg+1] - a.valueOffsets[beg]) } func (a *LargeBinary) ValueOffsets() []int64 { beg := a.array.data.offset end := beg + a.array.data.length + 1 return a.valueOffsets[beg:end] } func (a *LargeBinary) ValueBytes() []byte { beg := a.array.data.offset end := beg + a.array.data.length return a.valueBytes[a.valueOffsets[beg]:a.valueOffsets[end]] } func (a *LargeBinary) String() string { var o strings.Builder o.WriteString("[") for i := 0; i < a.Len(); i++ { if i > 0 { o.WriteString(" ") } switch { case a.IsNull(i): o.WriteString(NullValueStr) default: fmt.Fprintf(&o, "%q", a.ValueString(i)) } } o.WriteString("]") return o.String() } func (a *LargeBinary) setData(data *Data) { if len(data.buffers) != 3 { panic("len(data.buffers) != 3") } a.array.setData(data) if valueData := data.buffers[2]; valueData != nil { a.valueBytes = valueData.Bytes() } if valueOffsets := data.buffers[1]; valueOffsets != nil { a.valueOffsets = arrow.Int64Traits.CastFromBytes(valueOffsets.Bytes()) } if a.array.data.length < 1 { return } expNumOffsets := a.array.data.offset + a.array.data.length + 1 if len(a.valueOffsets) < expNumOffsets { panic(fmt.Errorf("arrow/array: large binary offset buffer must have at least %d values", expNumOffsets)) } if int(a.valueOffsets[expNumOffsets-1]) > len(a.valueBytes) { panic("arrow/array: large binary offsets out of bounds of data buffer") } } func (a *LargeBinary) GetOneForMarshal(i int) interface{} { if a.IsNull(i) { return nil } return a.Value(i) } func (a *LargeBinary) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { vals[i] = a.GetOneForMarshal(i) } // golang marshal standard says that []byte will be marshalled // as a base64-encoded string return json.Marshal(vals) } func arrayEqualLargeBinary(left, right *LargeBinary) bool { for i := 0; i < left.Len(); i++ { if left.IsNull(i) { continue } if !bytes.Equal(left.Value(i), right.Value(i)) { return false } } return true } type ViewLike interface { arrow.Array ValueHeader(int) *arrow.ViewHeader } type BinaryView struct { array values []arrow.ViewHeader dataBuffers []*memory.Buffer } func NewBinaryViewData(data arrow.ArrayData) *BinaryView { a := &BinaryView{} a.refCount.Add(1) a.setData(data.(*Data)) return a } func (a *BinaryView) setData(data *Data) { if len(data.buffers) < 2 { panic("len(data.buffers) < 2") } a.array.setData(data) if valueData := data.buffers[1]; valueData != nil { a.values = arrow.ViewHeaderTraits.CastFromBytes(valueData.Bytes()) } a.dataBuffers = data.buffers[2:] } func (a *BinaryView) ValueHeader(i int) *arrow.ViewHeader { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } return &a.values[a.array.data.offset+i] } func (a *BinaryView) Value(i int) []byte { s := a.ValueHeader(i) if s.IsInline() { return s.InlineBytes() } start := s.BufferOffset() buf := a.dataBuffers[s.BufferIndex()] return buf.Bytes()[start : start+int32(s.Len())] } func (a *BinaryView) ValueLen(i int) int { s := a.ValueHeader(i) return s.Len() } // ValueString returns the value at index i as a string instead of // a byte slice, without copying the underlying data. func (a *BinaryView) ValueString(i int) string { b := a.Value(i) return *(*string)(unsafe.Pointer(&b)) } func (a *BinaryView) String() string { var o strings.Builder o.WriteString("[") for i := 0; i < a.Len(); i++ { if i > 0 { o.WriteString(" ") } switch { case a.IsNull(i): o.WriteString(NullValueStr) default: fmt.Fprintf(&o, "%q", a.ValueString(i)) } } o.WriteString("]") return o.String() } // ValueStr is paired with AppendValueFromString in that it returns // the value at index i as a string: Semantically this means that for // a null value it will return the string "(null)", otherwise it will // return the value as a base64 encoded string suitable for CSV/JSON. // // This is always going to be less performant than just using ValueString // and exists to fulfill the Array interface to provide a method which // can produce a human readable string for a given index. func (a *BinaryView) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } return base64.StdEncoding.EncodeToString(a.Value(i)) } func (a *BinaryView) GetOneForMarshal(i int) interface{} { if a.IsNull(i) { return nil } return a.Value(i) } func (a *BinaryView) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { vals[i] = a.GetOneForMarshal(i) } // golang marshal standard says that []byte will be marshalled // as a base64-encoded string return json.Marshal(vals) } func arrayEqualBinaryView(left, right *BinaryView) bool { leftBufs, rightBufs := left.dataBuffers, right.dataBuffers for i := 0; i < left.Len(); i++ { if left.IsNull(i) { continue } if !left.ValueHeader(i).Equals(leftBufs, right.ValueHeader(i), rightBufs) { return false } } return true } var ( _ arrow.Array = (*Binary)(nil) _ arrow.Array = (*LargeBinary)(nil) _ arrow.Array = (*BinaryView)(nil) _ BinaryLike = (*Binary)(nil) _ BinaryLike = (*LargeBinary)(nil) _ arrow.TypedArray[[]byte] = (*Binary)(nil) _ arrow.TypedArray[[]byte] = (*LargeBinary)(nil) _ arrow.TypedArray[[]byte] = (*BinaryView)(nil) )