arrow-array/src/builder/struct_builder.rs (482 lines of code) (raw):
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::*;
use crate::StructArray;
use arrow_buffer::NullBufferBuilder;
use arrow_schema::{Fields, SchemaBuilder};
use std::sync::Arc;
/// Builder for [`StructArray`]
///
/// Note that callers should make sure that methods of all the child field builders are
/// properly called to maintain the consistency of the data structure.
///
///
/// Handling arrays with complex layouts, such as `List<Struct<List<Struct>>>`, in Rust can be challenging due to its strong typing system.
/// To construct a collection builder ([`ListBuilder`], [`LargeListBuilder`], or [`MapBuilder`]) using [`make_builder`], multiple calls are required. This complexity arises from the recursive approach utilized by [`StructBuilder::from_fields`].
///
/// Initially, [`StructBuilder::from_fields`] invokes [`make_builder`], which returns a `Box<dyn ArrayBuilder>`. To obtain the specific collection builder, one must first use [`StructBuilder::field_builder`] to get a `Collection<[Box<dyn ArrayBuilder>]>`. Subsequently, the `values()` result from this operation can be downcast to the desired builder type.
///
/// For example, when working with [`ListBuilder`], you would first call [`StructBuilder::field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>`] and then downcast the [`Box<dyn ArrayBuilder>`] to the specific [`StructBuilder`] you need.
///
/// For a practical example see the code below:
///
/// ```rust
/// use arrow_array::builder::{ArrayBuilder, ListBuilder, StringBuilder, StructBuilder};
/// use arrow_schema::{DataType, Field, Fields};
/// use std::sync::Arc;
///
/// // This is an example column that has a List<Struct<List<Struct>>> layout
/// let mut example_col = ListBuilder::new(StructBuilder::from_fields(
/// vec![Field::new(
/// "value_list",
/// DataType::List(Arc::new(Field::new_list_field(
/// DataType::Struct(Fields::from(vec![
/// Field::new("key", DataType::Utf8, true),
/// Field::new("value", DataType::Utf8, true),
/// ])), //In this example we are trying to get to this builder and insert key/value pairs
/// true,
/// ))),
/// true,
/// )],
/// 0,
/// ));
///
/// // We can obtain the StructBuilder without issues, because example_col was created with StructBuilder
/// let col_struct_builder: &mut StructBuilder = example_col.values();
///
/// // We can't obtain the ListBuilder<StructBuilder> with the expected generic types, because under the hood
/// // the StructBuilder was returned as a Box<dyn ArrayBuilder> and passed as such to the ListBuilder constructor
///
/// // This panics in runtime, even though we know that the builder is a ListBuilder<StructBuilder>.
/// // let sb = col_struct_builder
/// // .field_builder::<ListBuilder<StructBuilder>>(0)
/// // .as_mut()
/// // .unwrap();
///
/// //To keep in line with Rust's strong typing, we fetch a ListBuilder<Box<dyn ArrayBuilder>> from the column StructBuilder first...
/// let mut list_builder_option =
/// col_struct_builder.field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(0);
///
/// let list_builder = list_builder_option.as_mut().unwrap();
///
/// // ... and then downcast the key/value pair values to a StructBuilder
/// let struct_builder = list_builder
/// .values()
/// .as_any_mut()
/// .downcast_mut::<StructBuilder>()
/// .unwrap();
///
/// // We can now append values to the StructBuilder
/// let key_builder = struct_builder.field_builder::<StringBuilder>(0).unwrap();
/// key_builder.append_value("my key");
///
/// let value_builder = struct_builder.field_builder::<StringBuilder>(1).unwrap();
/// value_builder.append_value("my value");
///
/// struct_builder.append(true);
/// list_builder.append(true);
/// col_struct_builder.append(true);
/// example_col.append(true);
///
/// let array = example_col.finish();
///
/// println!("My array: {:?}", array);
/// ```
///
pub struct StructBuilder {
fields: Fields,
field_builders: Vec<Box<dyn ArrayBuilder>>,
null_buffer_builder: NullBufferBuilder,
}
impl std::fmt::Debug for StructBuilder {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("StructBuilder")
.field("fields", &self.fields)
.field("bitmap_builder", &self.null_buffer_builder)
.field("len", &self.len())
.finish()
}
}
impl ArrayBuilder for StructBuilder {
/// Returns the number of array slots in the builder.
///
/// Note that this always return the first child field builder's length, and it is
/// the caller's responsibility to maintain the consistency that all the child field
/// builder should have the equal number of elements.
fn len(&self) -> usize {
self.null_buffer_builder.len()
}
/// Builds the array.
fn finish(&mut self) -> ArrayRef {
Arc::new(self.finish())
}
/// Builds the array without resetting the builder.
fn finish_cloned(&self) -> ArrayRef {
Arc::new(self.finish_cloned())
}
/// Returns the builder as a non-mutable `Any` reference.
///
/// This is most useful when one wants to call non-mutable APIs on a specific builder
/// type. In this case, one can first cast this into a `Any`, and then use
/// `downcast_ref` to get a reference on the specific builder.
fn as_any(&self) -> &dyn Any {
self
}
/// Returns the builder as a mutable `Any` reference.
///
/// This is most useful when one wants to call mutable APIs on a specific builder
/// type. In this case, one can first cast this into a `Any`, and then use
/// `downcast_mut` to get a reference on the specific builder.
fn as_any_mut(&mut self) -> &mut dyn Any {
self
}
/// Returns the boxed builder as a box of `Any`.
fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
self
}
}
impl StructBuilder {
/// Creates a new `StructBuilder`
pub fn new(fields: impl Into<Fields>, field_builders: Vec<Box<dyn ArrayBuilder>>) -> Self {
Self {
field_builders,
fields: fields.into(),
null_buffer_builder: NullBufferBuilder::new(0),
}
}
/// Creates a new `StructBuilder` from [`Fields`] and `capacity`
pub fn from_fields(fields: impl Into<Fields>, capacity: usize) -> Self {
let fields = fields.into();
let mut builders = Vec::with_capacity(fields.len());
for field in &fields {
builders.push(make_builder(field.data_type(), capacity));
}
Self::new(fields, builders)
}
/// Returns a mutable reference to the child field builder at index `i`.
/// Result will be `None` if the input type `T` provided doesn't match the actual
/// field builder's type.
pub fn field_builder<T: ArrayBuilder>(&mut self, i: usize) -> Option<&mut T> {
self.field_builders[i].as_any_mut().downcast_mut::<T>()
}
/// Returns a reference to field builders
pub fn field_builders(&self) -> &[Box<dyn ArrayBuilder>] {
&self.field_builders
}
/// Returns a mutable reference to field builders
pub fn field_builders_mut(&mut self) -> &mut [Box<dyn ArrayBuilder>] {
&mut self.field_builders
}
/// Returns the number of fields for the struct this builder is building.
pub fn num_fields(&self) -> usize {
self.field_builders.len()
}
/// Appends an element (either null or non-null) to the struct. The actual elements
/// should be appended for each child sub-array in a consistent way.
#[inline]
pub fn append(&mut self, is_valid: bool) {
self.null_buffer_builder.append(is_valid);
}
/// Appends a null element to the struct.
#[inline]
pub fn append_null(&mut self) {
self.append(false)
}
/// Builds the `StructArray` and reset this builder.
pub fn finish(&mut self) -> StructArray {
self.validate_content();
if self.fields.is_empty() {
return StructArray::new_empty_fields(self.len(), self.null_buffer_builder.finish());
}
let arrays = self.field_builders.iter_mut().map(|f| f.finish()).collect();
let nulls = self.null_buffer_builder.finish();
StructArray::new(self.fields.clone(), arrays, nulls)
}
/// Builds the `StructArray` without resetting the builder.
pub fn finish_cloned(&self) -> StructArray {
self.validate_content();
if self.fields.is_empty() {
return StructArray::new_empty_fields(
self.len(),
self.null_buffer_builder.finish_cloned(),
);
}
let arrays = self
.field_builders
.iter()
.map(|f| f.finish_cloned())
.collect();
let nulls = self.null_buffer_builder.finish_cloned();
StructArray::new(self.fields.clone(), arrays, nulls)
}
/// Constructs and validates contents in the builder to ensure that
/// - fields and field_builders are of equal length
/// - the number of items in individual field_builders are equal to self.len()
fn validate_content(&self) {
if self.fields.len() != self.field_builders.len() {
panic!("Number of fields is not equal to the number of field_builders.");
}
self.field_builders.iter().enumerate().for_each(|(idx, x)| {
if x.len() != self.len() {
let builder = SchemaBuilder::from(&self.fields);
let schema = builder.finish();
panic!("{}", format!(
"StructBuilder ({:?}) and field_builder with index {} ({:?}) are of unequal lengths: ({} != {}).",
schema,
idx,
self.fields[idx].data_type(),
self.len(),
x.len()
));
}
});
}
/// Returns the current null buffer as a slice
pub fn validity_slice(&self) -> Option<&[u8]> {
self.null_buffer_builder.as_slice()
}
}
#[cfg(test)]
mod tests {
use std::any::type_name;
use super::*;
use arrow_buffer::Buffer;
use arrow_data::ArrayData;
use arrow_schema::Field;
use crate::{array::Array, types::ArrowDictionaryKeyType};
#[test]
fn test_struct_array_builder() {
let string_builder = StringBuilder::new();
let int_builder = Int32Builder::new();
let fields = vec![
Field::new("f1", DataType::Utf8, true),
Field::new("f2", DataType::Int32, true),
];
let field_builders = vec![
Box::new(string_builder) as Box<dyn ArrayBuilder>,
Box::new(int_builder) as Box<dyn ArrayBuilder>,
];
let mut builder = StructBuilder::new(fields, field_builders);
assert_eq!(2, builder.num_fields());
let string_builder = builder
.field_builder::<StringBuilder>(0)
.expect("builder at field 0 should be string builder");
string_builder.append_value("joe");
string_builder.append_null();
string_builder.append_null();
string_builder.append_value("mark");
let int_builder = builder
.field_builder::<Int32Builder>(1)
.expect("builder at field 1 should be int builder");
int_builder.append_value(1);
int_builder.append_value(2);
int_builder.append_null();
int_builder.append_value(4);
builder.append(true);
builder.append(true);
builder.append_null();
builder.append(true);
let struct_data = builder.finish().into_data();
assert_eq!(4, struct_data.len());
assert_eq!(1, struct_data.null_count());
assert_eq!(&[11_u8], struct_data.nulls().unwrap().validity());
let expected_string_data = ArrayData::builder(DataType::Utf8)
.len(4)
.null_bit_buffer(Some(Buffer::from(&[9_u8])))
.add_buffer(Buffer::from_slice_ref([0, 3, 3, 3, 7]))
.add_buffer(Buffer::from_slice_ref(b"joemark"))
.build()
.unwrap();
let expected_int_data = ArrayData::builder(DataType::Int32)
.len(4)
.null_bit_buffer(Some(Buffer::from_slice_ref([11_u8])))
.add_buffer(Buffer::from_slice_ref([1, 2, 0, 4]))
.build()
.unwrap();
assert_eq!(expected_string_data, struct_data.child_data()[0]);
assert_eq!(expected_int_data, struct_data.child_data()[1]);
}
#[test]
fn test_struct_array_builder_finish() {
let int_builder = Int32Builder::new();
let bool_builder = BooleanBuilder::new();
let fields = vec![
Field::new("f1", DataType::Int32, false),
Field::new("f2", DataType::Boolean, false),
];
let field_builders = vec![
Box::new(int_builder) as Box<dyn ArrayBuilder>,
Box::new(bool_builder) as Box<dyn ArrayBuilder>,
];
let mut builder = StructBuilder::new(fields, field_builders);
builder
.field_builder::<Int32Builder>(0)
.unwrap()
.append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
builder
.field_builder::<BooleanBuilder>(1)
.unwrap()
.append_slice(&[
false, true, false, true, false, true, false, true, false, true,
]);
// Append slot values - all are valid.
for _ in 0..10 {
builder.append(true);
}
assert_eq!(10, builder.len());
let arr = builder.finish();
assert_eq!(10, arr.len());
assert_eq!(0, builder.len());
builder
.field_builder::<Int32Builder>(0)
.unwrap()
.append_slice(&[1, 3, 5, 7, 9]);
builder
.field_builder::<BooleanBuilder>(1)
.unwrap()
.append_slice(&[false, true, false, true, false]);
// Append slot values - all are valid.
for _ in 0..5 {
builder.append(true);
}
assert_eq!(5, builder.len());
let arr = builder.finish();
assert_eq!(5, arr.len());
assert_eq!(0, builder.len());
}
#[test]
fn test_build_fixed_size_list() {
const LIST_LENGTH: i32 = 4;
let fixed_size_list_dtype =
DataType::new_fixed_size_list(DataType::Int32, LIST_LENGTH, false);
let mut builder = make_builder(&fixed_size_list_dtype, 10);
let builder = builder
.as_any_mut()
.downcast_mut::<FixedSizeListBuilder<Box<dyn ArrayBuilder>>>();
match builder {
Some(builder) => {
assert_eq!(builder.value_length(), LIST_LENGTH);
assert!(builder
.values()
.as_any_mut()
.downcast_mut::<Int32Builder>()
.is_some());
}
None => panic!("expected FixedSizeListBuilder, got a different builder type"),
}
}
#[test]
fn test_struct_array_builder_finish_cloned() {
let int_builder = Int32Builder::new();
let bool_builder = BooleanBuilder::new();
let fields = vec![
Field::new("f1", DataType::Int32, false),
Field::new("f2", DataType::Boolean, false),
];
let field_builders = vec![
Box::new(int_builder) as Box<dyn ArrayBuilder>,
Box::new(bool_builder) as Box<dyn ArrayBuilder>,
];
let mut builder = StructBuilder::new(fields, field_builders);
builder
.field_builder::<Int32Builder>(0)
.unwrap()
.append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
builder
.field_builder::<BooleanBuilder>(1)
.unwrap()
.append_slice(&[
false, true, false, true, false, true, false, true, false, true,
]);
// Append slot values - all are valid.
for _ in 0..10 {
builder.append(true);
}
assert_eq!(10, builder.len());
let mut arr = builder.finish_cloned();
assert_eq!(10, arr.len());
assert_eq!(10, builder.len());
builder
.field_builder::<Int32Builder>(0)
.unwrap()
.append_slice(&[1, 3, 5, 7, 9]);
builder
.field_builder::<BooleanBuilder>(1)
.unwrap()
.append_slice(&[false, true, false, true, false]);
// Append slot values - all are valid.
for _ in 0..5 {
builder.append(true);
}
assert_eq!(15, builder.len());
arr = builder.finish();
assert_eq!(15, arr.len());
assert_eq!(0, builder.len());
}
#[test]
fn test_struct_array_builder_from_schema() {
let mut fields = vec![
Field::new("f1", DataType::Float32, false),
Field::new("f2", DataType::Utf8, false),
];
let sub_fields = vec![
Field::new("g1", DataType::Int32, false),
Field::new("g2", DataType::Boolean, false),
];
let struct_type = DataType::Struct(sub_fields.into());
fields.push(Field::new("f3", struct_type, false));
let mut builder = StructBuilder::from_fields(fields, 5);
assert_eq!(3, builder.num_fields());
assert!(builder.field_builder::<Float32Builder>(0).is_some());
assert!(builder.field_builder::<StringBuilder>(1).is_some());
assert!(builder.field_builder::<StructBuilder>(2).is_some());
}
#[test]
fn test_datatype_properties() {
let fields = Fields::from(vec![
Field::new("f1", DataType::Decimal128(1, 2), false),
Field::new(
"f2",
DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
false,
),
]);
let mut builder = StructBuilder::from_fields(fields.clone(), 1);
builder
.field_builder::<Decimal128Builder>(0)
.unwrap()
.append_value(1);
builder
.field_builder::<TimestampMillisecondBuilder>(1)
.unwrap()
.append_value(1);
builder.append(true);
let array = builder.finish();
assert_eq!(array.data_type(), &DataType::Struct(fields.clone()));
assert_eq!(array.column(0).data_type(), fields[0].data_type());
assert_eq!(array.column(1).data_type(), fields[1].data_type());
}
#[test]
fn test_struct_array_builder_from_dictionary_type_int8_key() {
test_struct_array_builder_from_dictionary_type_inner::<Int8Type>(DataType::Int8);
}
#[test]
fn test_struct_array_builder_from_dictionary_type_int16_key() {
test_struct_array_builder_from_dictionary_type_inner::<Int16Type>(DataType::Int16);
}
#[test]
fn test_struct_array_builder_from_dictionary_type_int32_key() {
test_struct_array_builder_from_dictionary_type_inner::<Int32Type>(DataType::Int32);
}
#[test]
fn test_struct_array_builder_from_dictionary_type_int64_key() {
test_struct_array_builder_from_dictionary_type_inner::<Int64Type>(DataType::Int64);
}
fn test_struct_array_builder_from_dictionary_type_inner<K: ArrowDictionaryKeyType>(
key_type: DataType,
) {
let dict_field = Field::new(
"f1",
DataType::Dictionary(Box::new(key_type), Box::new(DataType::Utf8)),
false,
);
let fields = vec![dict_field.clone()];
let expected_dtype = DataType::Struct(fields.into());
let cloned_dict_field = dict_field.clone();
let expected_child_dtype = dict_field.data_type();
let mut struct_builder = StructBuilder::from_fields(vec![cloned_dict_field], 5);
let Some(dict_builder) = struct_builder.field_builder::<StringDictionaryBuilder<K>>(0)
else {
panic!(
"Builder should be StringDictionaryBuilder<{}>",
type_name::<K>()
)
};
dict_builder.append_value("dict string");
struct_builder.append(true);
let array = struct_builder.finish();
assert_eq!(array.data_type(), &expected_dtype);
assert_eq!(array.column(0).data_type(), expected_child_dtype);
assert_eq!(array.column(0).len(), 1);
}
#[test]
#[should_panic(
expected = "Data type Dictionary(UInt64, Utf8) with key type UInt64 is not currently supported"
)]
fn test_struct_array_builder_from_schema_unsupported_type() {
let fields = vec![
Field::new("f1", DataType::UInt64, false),
Field::new(
"f2",
DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)),
false,
),
];
let _ = StructBuilder::from_fields(fields, 5);
}
#[test]
#[should_panic(expected = "Dictionary value type Int32 is not currently supported")]
fn test_struct_array_builder_from_dict_with_unsupported_value_type() {
let fields = vec![Field::new(
"f1",
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Int32)),
false,
)];
let _ = StructBuilder::from_fields(fields, 5);
}
#[test]
fn test_struct_array_builder_field_builder_type_mismatch() {
let int_builder = Int32Builder::with_capacity(10);
let fields = vec![Field::new("f1", DataType::Int32, false)];
let field_builders = vec![Box::new(int_builder) as Box<dyn ArrayBuilder>];
let mut builder = StructBuilder::new(fields, field_builders);
assert!(builder.field_builder::<BinaryBuilder>(0).is_none());
}
#[test]
#[should_panic(
expected = "StructBuilder (Schema { fields: [Field { name: \"f1\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"f2\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }) and field_builder with index 1 (Boolean) are of unequal lengths: (2 != 1)."
)]
fn test_struct_array_builder_unequal_field_builders_lengths() {
let mut int_builder = Int32Builder::with_capacity(10);
let mut bool_builder = BooleanBuilder::new();
int_builder.append_value(1);
int_builder.append_value(2);
bool_builder.append_value(true);
let fields = vec![
Field::new("f1", DataType::Int32, false),
Field::new("f2", DataType::Boolean, false),
];
let field_builders = vec![
Box::new(int_builder) as Box<dyn ArrayBuilder>,
Box::new(bool_builder) as Box<dyn ArrayBuilder>,
];
let mut builder = StructBuilder::new(fields, field_builders);
builder.append(true);
builder.append(true);
builder.finish();
}
#[test]
#[should_panic(expected = "Number of fields is not equal to the number of field_builders.")]
fn test_struct_array_builder_unequal_field_field_builders() {
let int_builder = Int32Builder::with_capacity(10);
let fields = vec![
Field::new("f1", DataType::Int32, false),
Field::new("f2", DataType::Boolean, false),
];
let field_builders = vec![Box::new(int_builder) as Box<dyn ArrayBuilder>];
let mut builder = StructBuilder::new(fields, field_builders);
builder.finish();
}
#[test]
#[should_panic(
expected = "Incorrect datatype for StructArray field \\\"timestamp\\\", expected Timestamp(Nanosecond, Some(\\\"UTC\\\")) got Timestamp(Nanosecond, None)"
)]
fn test_struct_array_mismatch_builder() {
let fields = vec![Field::new(
"timestamp",
DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_owned().into())),
false,
)];
let field_builders: Vec<Box<dyn ArrayBuilder>> =
vec![Box::new(TimestampNanosecondBuilder::new())];
let mut sa = StructBuilder::new(fields, field_builders);
sa.finish();
}
#[test]
fn test_empty() {
let mut builder = StructBuilder::new(Fields::empty(), vec![]);
builder.append(true);
builder.append(false);
let a1 = builder.finish_cloned();
let a2 = builder.finish();
assert_eq!(a1, a2);
assert_eq!(a1.len(), 2);
assert_eq!(a1.null_count(), 1);
assert!(a1.is_valid(0));
assert!(a1.is_null(1));
}
}