src/nanoarrow/array.c (941 lines of code) (raw):

// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include <errno.h> #include <stdlib.h> #include <string.h> #include "nanoarrow.h" static void ArrowArrayRelease(struct ArrowArray* array) { // Release buffers held by this array struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; if (private_data != NULL) { ArrowBitmapReset(&private_data->bitmap); ArrowBufferReset(&private_data->buffers[0]); ArrowBufferReset(&private_data->buffers[1]); ArrowFree(private_data); } // This object owns the memory for all the children, but those // children may have been generated elsewhere and might have // their own release() callback. if (array->children != NULL) { for (int64_t i = 0; i < array->n_children; i++) { if (array->children[i] != NULL) { if (array->children[i]->release != NULL) { array->children[i]->release(array->children[i]); } ArrowFree(array->children[i]); } } ArrowFree(array->children); } // This object owns the memory for the dictionary but it // may have been generated somewhere else and have its own // release() callback. if (array->dictionary != NULL) { if (array->dictionary->release != NULL) { array->dictionary->release(array->dictionary); } ArrowFree(array->dictionary); } // Mark released array->release = NULL; } static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, enum ArrowType storage_type) { switch (storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: array->n_buffers = 0; break; case NANOARROW_TYPE_FIXED_SIZE_LIST: case NANOARROW_TYPE_STRUCT: case NANOARROW_TYPE_SPARSE_UNION: array->n_buffers = 1; break; case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_LARGE_LIST: case NANOARROW_TYPE_MAP: case NANOARROW_TYPE_BOOL: case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_HALF_FLOAT: case NANOARROW_TYPE_FLOAT: case NANOARROW_TYPE_DOUBLE: case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_DECIMAL256: case NANOARROW_TYPE_INTERVAL_MONTHS: case NANOARROW_TYPE_INTERVAL_DAY_TIME: case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: case NANOARROW_TYPE_FIXED_SIZE_BINARY: case NANOARROW_TYPE_DENSE_UNION: array->n_buffers = 2; break; case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_BINARY: case NANOARROW_TYPE_LARGE_BINARY: array->n_buffers = 3; break; default: return EINVAL; return NANOARROW_OK; } struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; private_data->storage_type = storage_type; return NANOARROW_OK; } ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, enum ArrowType storage_type) { array->length = 0; array->null_count = 0; array->offset = 0; array->n_buffers = 0; array->n_children = 0; array->buffers = NULL; array->children = NULL; array->dictionary = NULL; array->release = &ArrowArrayRelease; array->private_data = NULL; struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)ArrowMalloc(sizeof(struct ArrowArrayPrivateData)); if (private_data == NULL) { array->release = NULL; return ENOMEM; } ArrowBitmapInit(&private_data->bitmap); ArrowBufferInit(&private_data->buffers[0]); ArrowBufferInit(&private_data->buffers[1]); private_data->buffer_data[0] = NULL; private_data->buffer_data[1] = NULL; private_data->buffer_data[2] = NULL; array->private_data = private_data; array->buffers = (const void**)(&private_data->buffer_data); int result = ArrowArraySetStorageType(array, storage_type); if (result != NANOARROW_OK) { array->release(array); return result; } ArrowLayoutInit(&private_data->layout, storage_type); // We can only know this not to be true when initializing based on a schema // so assume this to be true. private_data->union_type_id_is_child_index = 1; return NANOARROW_OK; } ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, struct ArrowArrayView* array_view, struct ArrowError* error) { NANOARROW_RETURN_NOT_OK_WITH_ERROR( ArrowArrayInitFromType(array, array_view->storage_type), error); int result; struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; private_data->layout = array_view->layout; if (array_view->n_children > 0) { result = ArrowArrayAllocateChildren(array, array_view->n_children); if (result != NANOARROW_OK) { array->release(array); return result; } for (int64_t i = 0; i < array_view->n_children; i++) { result = ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); if (result != NANOARROW_OK) { array->release(array); return result; } } } if (array_view->dictionary != NULL) { result = ArrowArrayAllocateDictionary(array); if (result != NANOARROW_OK) { array->release(array); return result; } result = ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error); if (result != NANOARROW_OK) { array->release(array); return result; } } return NANOARROW_OK; } ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, struct ArrowSchema* schema, struct ArrowError* error) { struct ArrowArrayView array_view; NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error)); NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(array, &array_view, error)); if (array_view.storage_type == NANOARROW_TYPE_DENSE_UNION || array_view.storage_type == NANOARROW_TYPE_SPARSE_UNION) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; // We can still build arrays if this isn't true; however, the append // functions won't work. Instead, we store this value and error only // when StartAppending is called. private_data->union_type_id_is_child_index = _ArrowUnionTypeIdsWillEqualChildIndices(schema->format + 4, schema->n_children); } ArrowArrayViewReset(&array_view); return NANOARROW_OK; } ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children) { if (array->children != NULL) { return EINVAL; } if (n_children == 0) { return NANOARROW_OK; } array->children = (struct ArrowArray**)ArrowMalloc(n_children * sizeof(struct ArrowArray*)); if (array->children == NULL) { return ENOMEM; } memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); for (int64_t i = 0; i < n_children; i++) { array->children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); if (array->children[i] == NULL) { return ENOMEM; } array->children[i]->release = NULL; } array->n_children = n_children; return NANOARROW_OK; } ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array) { if (array->dictionary != NULL) { return EINVAL; } array->dictionary = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); if (array->dictionary == NULL) { return ENOMEM; } array->dictionary->release = NULL; return NANOARROW_OK; } void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; ArrowBufferMove(&bitmap->buffer, &private_data->bitmap.buffer); private_data->bitmap.size_bits = bitmap->size_bits; bitmap->size_bits = 0; private_data->buffer_data[0] = private_data->bitmap.buffer.data; array->null_count = -1; } ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, struct ArrowBuffer* buffer) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; switch (i) { case 0: ArrowBufferMove(buffer, &private_data->bitmap.buffer); private_data->buffer_data[i] = private_data->bitmap.buffer.data; break; case 1: case 2: ArrowBufferMove(buffer, &private_data->buffers[i - 1]); private_data->buffer_data[i] = private_data->buffers[i - 1].data; break; default: return EINVAL; } return NANOARROW_OK; } static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_view, struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; ArrowArrayViewInitFromType(array_view, private_data->storage_type); array_view->layout = private_data->layout; array_view->array = array; array_view->length = array->length; array_view->offset = array->offset; array_view->null_count = array->null_count; array_view->buffer_views[0].data.as_uint8 = private_data->bitmap.buffer.data; array_view->buffer_views[0].size_bytes = private_data->bitmap.buffer.size_bytes; array_view->buffer_views[1].data.as_uint8 = private_data->buffers[0].data; array_view->buffer_views[1].size_bytes = private_data->buffers[0].size_bytes; array_view->buffer_views[2].data.as_uint8 = private_data->buffers[1].data; array_view->buffer_views[2].size_bytes = private_data->buffers[1].size_bytes; int result = ArrowArrayViewAllocateChildren(array_view, array->n_children); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } for (int64_t i = 0; i < array->n_children; i++) { result = ArrowArrayViewInitFromArray(array_view->children[i], array->children[i]); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } } if (array->dictionary != NULL) { result = ArrowArrayViewAllocateDictionary(array_view); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } } return NANOARROW_OK; } static ArrowErrorCode ArrowArrayReserveInternal(struct ArrowArray* array, struct ArrowArrayView* array_view) { // Loop through buffers and reserve the extra space that we know about for (int64_t i = 0; i < array->n_buffers; i++) { // Don't reserve on a validity buffer that hasn't been allocated yet if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && ArrowArrayBuffer(array, i)->data == NULL) { continue; } int64_t additional_size_bytes = array_view->buffer_views[i].size_bytes - ArrowArrayBuffer(array, i)->size_bytes; if (additional_size_bytes > 0) { NANOARROW_RETURN_NOT_OK( ArrowBufferReserve(ArrowArrayBuffer(array, i), additional_size_bytes)); } } // Recursively reserve children for (int64_t i = 0; i < array->n_children; i++) { NANOARROW_RETURN_NOT_OK( ArrowArrayReserveInternal(array->children[i], array_view->children[i])); } return NANOARROW_OK; } ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, int64_t additional_size_elements) { struct ArrowArrayView array_view; NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array)); // Calculate theoretical buffer sizes (recursively) ArrowArrayViewSetLength(&array_view, array->length + additional_size_elements); // Walk the structure (recursively) int result = ArrowArrayReserveInternal(array, &array_view); ArrowArrayViewReset(&array_view); if (result != NANOARROW_OK) { return result; } return NANOARROW_OK; } static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; // The only buffer finalizing this currently does is make sure the data // buffer for (Large)String|Binary is never NULL switch (private_data->storage_type) { case NANOARROW_TYPE_BINARY: case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_BINARY: case NANOARROW_TYPE_LARGE_STRING: if (ArrowArrayBuffer(array, 2)->data == NULL) { ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0); } break; default: break; } for (int64_t i = 0; i < array->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i])); } if (array->dictionary != NULL) { NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary)); } return NANOARROW_OK; } static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; for (int64_t i = 0; i < 3; i++) { private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; } for (int64_t i = 0; i < array->n_children; i++) { ArrowArrayFlushInternalPointers(array->children[i]); } if (array->dictionary != NULL) { ArrowArrayFlushInternalPointers(array->dictionary); } } ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, enum ArrowValidationLevel validation_level, struct ArrowError* error) { // Even if the data buffer is size zero, the pointer value needed to be non-null // in some implementations (at least one version of Arrow C++ at the time this // was added). Only do this fix if we can assume CPU data access. if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) { NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error); } // Make sure the value we get with array->buffers[i] is set to the actual // pointer (which may have changed from the original due to reallocation) ArrowArrayFlushInternalPointers(array); if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) { return NANOARROW_OK; } // For validation, initialize an ArrowArrayView with our known buffer sizes struct ArrowArrayView array_view; NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array), error); int result = ArrowArrayViewValidate(&array_view, validation_level, error); ArrowArrayViewReset(&array_view); return result; } ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, struct ArrowError* error) { return ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_DEFAULT, error); } void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, enum ArrowType storage_type) { memset(array_view, 0, sizeof(struct ArrowArrayView)); array_view->storage_type = storage_type; ArrowLayoutInit(&array_view->layout, storage_type); } ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, int64_t n_children) { if (array_view->children != NULL) { return EINVAL; } array_view->children = (struct ArrowArrayView**)ArrowMalloc(n_children * sizeof(struct ArrowArrayView*)); if (array_view->children == NULL) { return ENOMEM; } for (int64_t i = 0; i < n_children; i++) { array_view->children[i] = NULL; } array_view->n_children = n_children; for (int64_t i = 0; i < n_children; i++) { array_view->children[i] = (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); if (array_view->children[i] == NULL) { return ENOMEM; } ArrowArrayViewInitFromType(array_view->children[i], NANOARROW_TYPE_UNINITIALIZED); } return NANOARROW_OK; } ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) { if (array_view->dictionary != NULL) { return EINVAL; } array_view->dictionary = (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); if (array_view->dictionary == NULL) { return ENOMEM; } ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED); return NANOARROW_OK; } ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, struct ArrowSchema* schema, struct ArrowError* error) { struct ArrowSchemaView schema_view; int result = ArrowSchemaViewInit(&schema_view, schema, error); if (result != NANOARROW_OK) { return result; } ArrowArrayViewInitFromType(array_view, schema_view.storage_type); array_view->layout = schema_view.layout; result = ArrowArrayViewAllocateChildren(array_view, schema->n_children); if (result != NANOARROW_OK) { ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed"); ArrowArrayViewReset(array_view); return result; } for (int64_t i = 0; i < schema->n_children; i++) { result = ArrowArrayViewInitFromSchema(array_view->children[i], schema->children[i], error); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } } if (schema->dictionary != NULL) { result = ArrowArrayViewAllocateDictionary(array_view); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } result = ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } } if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION || array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) { array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t)); if (array_view->union_type_id_map == NULL) { return ENOMEM; } memset(array_view->union_type_id_map, -1, 256); int8_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, array_view->union_type_id_map + 128); for (int8_t child_index = 0; child_index < n_type_ids; child_index++) { int8_t type_id = array_view->union_type_id_map[128 + child_index]; array_view->union_type_id_map[type_id] = child_index; } } return NANOARROW_OK; } void ArrowArrayViewReset(struct ArrowArrayView* array_view) { if (array_view->children != NULL) { for (int64_t i = 0; i < array_view->n_children; i++) { if (array_view->children[i] != NULL) { ArrowArrayViewReset(array_view->children[i]); ArrowFree(array_view->children[i]); } } ArrowFree(array_view->children); } if (array_view->dictionary != NULL) { ArrowArrayViewReset(array_view->dictionary); ArrowFree(array_view->dictionary); } if (array_view->union_type_id_map != NULL) { ArrowFree(array_view->union_type_id_map); } ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED); } void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { for (int i = 0; i < 3; i++) { int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_VALIDITY: array_view->buffer_views[i].size_bytes = _ArrowBytesForBits(length); continue; case NANOARROW_BUFFER_TYPE_DATA_OFFSET: // Probably don't want/need to rely on the producer to have allocated an // offsets buffer of length 1 for a zero-size array array_view->buffer_views[i].size_bytes = (length != 0) * element_size_bytes * (length + 1); continue; case NANOARROW_BUFFER_TYPE_DATA: array_view->buffer_views[i].size_bytes = _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * length) / 8; continue; case NANOARROW_BUFFER_TYPE_TYPE_ID: case NANOARROW_BUFFER_TYPE_UNION_OFFSET: array_view->buffer_views[i].size_bytes = element_size_bytes * length; continue; case NANOARROW_BUFFER_TYPE_NONE: array_view->buffer_views[i].size_bytes = 0; continue; } } switch (array_view->storage_type) { case NANOARROW_TYPE_STRUCT: case NANOARROW_TYPE_SPARSE_UNION: for (int64_t i = 0; i < array_view->n_children; i++) { ArrowArrayViewSetLength(array_view->children[i], length); } break; case NANOARROW_TYPE_FIXED_SIZE_LIST: if (array_view->n_children >= 1) { ArrowArrayViewSetLength(array_view->children[0], length * array_view->layout.child_size_elements); } default: break; } } // This version recursively extracts information from the array and stores it // in the array view, performing any checks that require the original array. static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, struct ArrowArray* array, struct ArrowError* error) { // Check length and offset if (array->offset < 0) { ArrowErrorSet(error, "Expected array offset >= 0 but found array offset of %ld", (long)array->offset); return EINVAL; } if (array->length < 0) { ArrowErrorSet(error, "Expected array length >= 0 but found array length of %ld", (long)array->length); return EINVAL; } array_view->array = array; array_view->offset = array->offset; array_view->length = array->length; array_view->null_count = array->null_count; int64_t buffers_required = 0; for (int i = 0; i < 3; i++) { if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } buffers_required++; // Set buffer pointer array_view->buffer_views[i].data.data = array->buffers[i]; // If non-null, set buffer size to unknown. if (array->buffers[i] == NULL) { array_view->buffer_views[i].size_bytes = 0; } else { array_view->buffer_views[i].size_bytes = -1; } } // Check the number of buffers if (buffers_required != array->n_buffers) { ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)", (int)buffers_required, (int)array->n_buffers); return EINVAL; } // Check number of children if (array_view->n_children != array->n_children) { ArrowErrorSet(error, "Expected %ld children but found %ld children", (long)array_view->n_children, (long)array->n_children); return EINVAL; } // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i], array->children[i], error)); } // Check dictionary if (array->dictionary == NULL && array_view->dictionary != NULL) { ArrowErrorSet(error, "Expected dictionary but found NULL"); return EINVAL; } if (array->dictionary != NULL && array_view->dictionary == NULL) { ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member"); return EINVAL; } if (array->dictionary != NULL) { NANOARROW_RETURN_NOT_OK( ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error)); } return NANOARROW_OK; } static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, struct ArrowError* error) { // Calculate buffer sizes that do not require buffer access. If marked as // unknown, assign the buffer size; otherwise, validate it. int64_t offset_plus_length = array_view->offset + array_view->length; // Only loop over the first two buffers because the size of the third buffer // is always data dependent for all current Arrow types. for (int i = 0; i < 2; i++) { int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; // Initialize with a value that will cause an error if accidentally used uninitialized int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1; switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_VALIDITY: if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) { continue; } min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length); break; case NANOARROW_BUFFER_TYPE_DATA_OFFSET: // Probably don't want/need to rely on the producer to have allocated an // offsets buffer of length 1 for a zero-size array min_buffer_size_bytes = (offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1); break; case NANOARROW_BUFFER_TYPE_DATA: min_buffer_size_bytes = _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * offset_plus_length) / 8; break; case NANOARROW_BUFFER_TYPE_TYPE_ID: case NANOARROW_BUFFER_TYPE_UNION_OFFSET: min_buffer_size_bytes = element_size_bytes * offset_plus_length; break; case NANOARROW_BUFFER_TYPE_NONE: continue; } // Assign or validate buffer size if (array_view->buffer_views[i].size_bytes == -1) { array_view->buffer_views[i].size_bytes = min_buffer_size_bytes; } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) { ArrowErrorSet(error, "Expected %s array buffer %d to have size >= %ld bytes but found " "buffer with %ld bytes", ArrowTypeString(array_view->storage_type), (int)i, (long)min_buffer_size_bytes, (long)array_view->buffer_views[i].size_bytes); return EINVAL; } } // For list, fixed-size list and map views, we can validate the number of children switch (array_view->storage_type) { case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_LARGE_LIST: case NANOARROW_TYPE_FIXED_SIZE_LIST: case NANOARROW_TYPE_MAP: if (array_view->n_children != 1) { ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays", ArrowTypeString(array_view->storage_type), (long)array_view->n_children); return EINVAL; } default: break; } // For struct, the sparse union, and the fixed-size list views, we can validate child // lengths. int64_t child_min_length; switch (array_view->storage_type) { case NANOARROW_TYPE_SPARSE_UNION: case NANOARROW_TYPE_STRUCT: child_min_length = (array_view->offset + array_view->length); for (int64_t i = 0; i < array_view->n_children; i++) { if (array_view->children[i]->length < child_min_length) { ArrowErrorSet( error, "Expected struct child %d to have length >= %ld but found child with " "length %ld", (int)(i + 1), (long)(child_min_length), (long)array_view->children[i]->length); return EINVAL; } } break; case NANOARROW_TYPE_FIXED_SIZE_LIST: child_min_length = (array_view->offset + array_view->length) * array_view->layout.child_size_elements; if (array_view->children[0]->length < child_min_length) { ArrowErrorSet(error, "Expected child of fixed_size_list array to have length >= %ld but " "found array with length %ld", (long)child_min_length, (long)array_view->children[0]->length); return EINVAL; } break; default: break; } // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK( ArrowArrayViewValidateMinimal(array_view->children[i], error)); } // Recurse for dictionary if (array_view->dictionary != NULL) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error)); } return NANOARROW_OK; } static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, struct ArrowError* error) { // Perform minimal validation. This will validate or assign // buffer sizes as long as buffer access is not required. NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); // Calculate buffer sizes or child lengths that require accessing the offsets // buffer. Where appropriate, validate that the first offset is >= 0. // If a buffer size is marked as unknown, assign it; otherwise, validate it. int64_t offset_plus_length = array_view->offset + array_view->length; int64_t first_offset; int64_t last_offset; switch (array_view->storage_type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int32[0]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", (long)first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; // If the data buffer size is unknown, assign it; otherwise, check it if (array_view->buffer_views[2].size_bytes == -1) { array_view->buffer_views[2].size_bytes = last_offset; } else if (array_view->buffer_views[2].size_bytes < last_offset) { ArrowErrorSet(error, "Expected %s array buffer 2 to have size >= %ld bytes but found " "buffer with %ld bytes", ArrowTypeString(array_view->storage_type), (long)last_offset, (long)array_view->buffer_views[2].size_bytes); return EINVAL; } } break; case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_LARGE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int64[0]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", (long)first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; // If the data buffer size is unknown, assign it; otherwise, check it if (array_view->buffer_views[2].size_bytes == -1) { array_view->buffer_views[2].size_bytes = last_offset; } else if (array_view->buffer_views[2].size_bytes < last_offset) { ArrowErrorSet(error, "Expected %s array buffer 2 to have size >= %ld bytes but found " "buffer with %ld bytes", ArrowTypeString(array_view->storage_type), (long)last_offset, (long)array_view->buffer_views[2].size_bytes); return EINVAL; } } break; case NANOARROW_TYPE_STRUCT: for (int64_t i = 0; i < array_view->n_children; i++) { if (array_view->children[i]->length < offset_plus_length) { ArrowErrorSet( error, "Expected struct child %d to have length >= %ld but found child with " "length %ld", (int)(i + 1), (long)offset_plus_length, (long)array_view->children[i]->length); return EINVAL; } } break; case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_MAP: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int32[0]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", (long)first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; if (array_view->children[0]->length < last_offset) { ArrowErrorSet( error, "Expected child of %s array to have length >= %ld but found array with " "length %ld", ArrowTypeString(array_view->storage_type), (long)last_offset, (long)array_view->children[0]->length); return EINVAL; } } break; case NANOARROW_TYPE_LARGE_LIST: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int64[0]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", (long)first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; if (array_view->children[0]->length < last_offset) { ArrowErrorSet( error, "Expected child of large list array to have length >= %ld but found array " "with length %ld", (long)last_offset, (long)array_view->children[0]->length); return EINVAL; } } break; default: break; } // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK( ArrowArrayViewValidateDefault(array_view->children[i], error)); } // Recurse for dictionary if (array_view->dictionary != NULL) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error)); } return NANOARROW_OK; } ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, struct ArrowArray* array, struct ArrowError* error) { // Extract information from the array into the array view NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); // Run default validation. Because we've marked all non-NULL buffers as having unknown // size, validation will also update the buffer sizes as it goes. NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); return NANOARROW_OK; } ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, struct ArrowArray* array, struct ArrowError* error) { // Extract information from the array into the array view NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); // Run default validation. Because we've marked all non-NULL buffers as having unknown // size, validation will also update the buffer sizes as it goes. NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); return NANOARROW_OK; } static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, struct ArrowError* error) { if (view.size_bytes <= (int64_t)sizeof(int32_t)) { return NANOARROW_OK; } for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) { int32_t diff = view.data.as_int32[i] - view.data.as_int32[i - 1]; if (diff < 0) { ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld", (long)i, (long)diff); return EINVAL; } } return NANOARROW_OK; } static int ArrowAssertIncreasingInt64(struct ArrowBufferView view, struct ArrowError* error) { if (view.size_bytes <= (int64_t)sizeof(int64_t)) { return NANOARROW_OK; } for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) { int64_t diff = view.data.as_int64[i] - view.data.as_int64[i - 1]; if (diff < 0) { ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld", (long)i, (long)diff); return EINVAL; } } return NANOARROW_OK; } static int ArrowAssertRangeInt8(struct ArrowBufferView view, int8_t min_value, int8_t max_value, struct ArrowError* error) { for (int64_t i = 0; i < view.size_bytes; i++) { if (view.data.as_int8[i] < min_value || view.data.as_int8[i] > max_value) { ArrowErrorSet(error, "[%ld] Expected buffer value between %d and %d but found value %d", (long)i, (int)min_value, (int)max_value, (int)view.data.as_int8[i]); return EINVAL; } } return NANOARROW_OK; } static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, int64_t n_values, struct ArrowError* error) { for (int64_t i = 0; i < view.size_bytes; i++) { int item_found = 0; for (int64_t j = 0; j < n_values; j++) { if (view.data.as_int8[i] == values[j]) { item_found = 1; break; } } if (!item_found) { ArrowErrorSet(error, "[%ld] Unexpected buffer value %d", (long)i, (int)view.data.as_int8[i]); return EINVAL; } } return NANOARROW_OK; } static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, struct ArrowError* error) { for (int i = 0; i < 3; i++) { switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_DATA_OFFSET: if (array_view->layout.element_size_bits[i] == 32) { NANOARROW_RETURN_NOT_OK( ArrowAssertIncreasingInt32(array_view->buffer_views[i], error)); } else { NANOARROW_RETURN_NOT_OK( ArrowAssertIncreasingInt64(array_view->buffer_views[i], error)); } break; default: break; } } if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION || array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) { if (array_view->union_type_id_map == NULL) { // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() + // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough // information to validate this buffer. ArrowErrorSet(error, "Insufficient information provided for validation of union array"); return EINVAL; } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices( array_view->union_type_id_map, array_view->n_children, array_view->n_children)) { NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8( array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error)); } else { NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0], array_view->union_type_id_map + 128, array_view->n_children, error)); } } if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION && array_view->union_type_id_map != NULL) { // Check that offsets refer to child elements that actually exist for (int64_t i = 0; i < array_view->length; i++) { int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i); int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i); int64_t child_length = array_view->children[child_id]->length; if (offset < 0 || offset > child_length) { ArrowErrorSet( error, "[%ld] Expected union offset for child id %d to be between 0 and %ld but " "found offset value %ld", (long)i, (int)child_id, (long)child_length, offset); return EINVAL; } } } // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); } // Dictionary valiation not implemented if (array_view->dictionary != NULL) { ArrowErrorSet(error, "Validation for dictionary-encoded arrays is not implemented"); return ENOTSUP; } return NANOARROW_OK; } ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, enum ArrowValidationLevel validation_level, struct ArrowError* error) { switch (validation_level) { case NANOARROW_VALIDATION_LEVEL_NONE: return NANOARROW_OK; case NANOARROW_VALIDATION_LEVEL_MINIMAL: return ArrowArrayViewValidateMinimal(array_view, error); case NANOARROW_VALIDATION_LEVEL_DEFAULT: return ArrowArrayViewValidateDefault(array_view, error); case NANOARROW_VALIDATION_LEVEL_FULL: NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); return ArrowArrayViewValidateFull(array_view, error); } ArrowErrorSet(error, "validation_level not recognized"); return EINVAL; }