src/nanoarrow/common/schema.c (1,424 lines of code) (raw):

// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include <errno.h> #include <inttypes.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "nanoarrow/nanoarrow.h" static void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { if (schema->format != NULL) ArrowFree((void*)schema->format); if (schema->name != NULL) ArrowFree((void*)schema->name); if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); // This object owns the memory for all the children, but those // children may have been generated elsewhere and might have // their own release() callback. if (schema->children != NULL) { for (int64_t i = 0; i < schema->n_children; i++) { if (schema->children[i] != NULL) { if (schema->children[i]->release != NULL) { ArrowSchemaRelease(schema->children[i]); } ArrowFree(schema->children[i]); } } ArrowFree(schema->children); } // This object owns the memory for the dictionary but it // may have been generated somewhere else and have its own // release() callback. if (schema->dictionary != NULL) { if (schema->dictionary->release != NULL) { ArrowSchemaRelease(schema->dictionary); } ArrowFree(schema->dictionary); } // private data not currently used if (schema->private_data != NULL) { ArrowFree(schema->private_data); } schema->release = NULL; } static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_UNINITIALIZED: return NULL; case NANOARROW_TYPE_NA: return "n"; case NANOARROW_TYPE_BOOL: return "b"; case NANOARROW_TYPE_UINT8: return "C"; case NANOARROW_TYPE_INT8: return "c"; case NANOARROW_TYPE_UINT16: return "S"; case NANOARROW_TYPE_INT16: return "s"; case NANOARROW_TYPE_UINT32: return "I"; case NANOARROW_TYPE_INT32: return "i"; case NANOARROW_TYPE_UINT64: return "L"; case NANOARROW_TYPE_INT64: return "l"; case NANOARROW_TYPE_HALF_FLOAT: return "e"; case NANOARROW_TYPE_FLOAT: return "f"; case NANOARROW_TYPE_DOUBLE: return "g"; case NANOARROW_TYPE_STRING: return "u"; case NANOARROW_TYPE_LARGE_STRING: return "U"; case NANOARROW_TYPE_STRING_VIEW: return "vu"; case NANOARROW_TYPE_BINARY: return "z"; case NANOARROW_TYPE_BINARY_VIEW: return "vz"; case NANOARROW_TYPE_LARGE_BINARY: return "Z"; case NANOARROW_TYPE_DATE32: return "tdD"; case NANOARROW_TYPE_DATE64: return "tdm"; case NANOARROW_TYPE_INTERVAL_MONTHS: return "tiM"; case NANOARROW_TYPE_INTERVAL_DAY_TIME: return "tiD"; case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: return "tin"; case NANOARROW_TYPE_LIST: return "+l"; case NANOARROW_TYPE_LARGE_LIST: return "+L"; case NANOARROW_TYPE_LIST_VIEW: return "+vl"; case NANOARROW_TYPE_LARGE_LIST_VIEW: return "+vL"; case NANOARROW_TYPE_STRUCT: return "+s"; case NANOARROW_TYPE_MAP: return "+m"; case NANOARROW_TYPE_RUN_END_ENCODED: return "+r"; default: return NULL; } } static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, enum ArrowType type) { switch (type) { case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_LARGE_LIST: case NANOARROW_TYPE_FIXED_SIZE_LIST: case NANOARROW_TYPE_LIST_VIEW: case NANOARROW_TYPE_LARGE_LIST_VIEW: NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); ArrowSchemaInit(schema->children[0]); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "item")); break; case NANOARROW_TYPE_MAP: NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); NANOARROW_RETURN_NOT_OK( ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_STRUCT)); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "entries")); schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE; NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema->children[0], 2)); ArrowSchemaInit(schema->children[0]->children[0]); ArrowSchemaInit(schema->children[0]->children[1]); NANOARROW_RETURN_NOT_OK( ArrowSchemaSetName(schema->children[0]->children[0], "key")); schema->children[0]->children[0]->flags &= ~ARROW_FLAG_NULLABLE; NANOARROW_RETURN_NOT_OK( ArrowSchemaSetName(schema->children[0]->children[1], "value")); break; case NANOARROW_TYPE_RUN_END_ENCODED: NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 2)); ArrowSchemaInit(schema->children[0]); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "run_ends")); schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE; ArrowSchemaInit(schema->children[1]); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[1], "values")); default: break; } return NANOARROW_OK; } void ArrowSchemaInit(struct ArrowSchema* schema) { schema->format = NULL; schema->name = NULL; schema->metadata = NULL; schema->flags = ARROW_FLAG_NULLABLE; schema->n_children = 0; schema->children = NULL; schema->dictionary = NULL; schema->private_data = NULL; schema->release = &ArrowSchemaReleaseInternal; } ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type) { // We don't allocate the dictionary because it has to be nullptr // for non-dictionary-encoded arrays. // Set the format to a valid format string for type const char* template_format = ArrowSchemaFormatTemplate(type); // If type isn't recognized and not explicitly unset if (template_format == NULL && type != NANOARROW_TYPE_UNINITIALIZED) { return EINVAL; } NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, template_format)); // For types with an umabiguous child structure, allocate children return ArrowSchemaInitChildrenIfNeeded(schema, type); } ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children) { NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRUCT)); NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); for (int64_t i = 0; i < n_children; i++) { ArrowSchemaInit(schema->children[i]); } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type) { ArrowSchemaInit(schema); int result = ArrowSchemaSetType(schema, type); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema); return result; } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, enum ArrowType type, int32_t fixed_size) { if (fixed_size <= 0) { return EINVAL; } char buffer[64]; int n_chars; switch (type) { case NANOARROW_TYPE_FIXED_SIZE_BINARY: n_chars = snprintf(buffer, sizeof(buffer), "w:%" PRId32, fixed_size); break; case NANOARROW_TYPE_FIXED_SIZE_LIST: n_chars = snprintf(buffer, sizeof(buffer), "+w:%" PRId32, fixed_size); break; default: return EINVAL; } if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) { return ERANGE; } buffer[n_chars] = '\0'; NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, buffer)); if (type == NANOARROW_TYPE_FIXED_SIZE_LIST) { NANOARROW_RETURN_NOT_OK(ArrowSchemaInitChildrenIfNeeded(schema, type)); } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, int32_t decimal_precision, int32_t decimal_scale) { if (decimal_precision <= 0) { return EINVAL; } char buffer[64]; int n_chars; switch (type) { case NANOARROW_TYPE_DECIMAL32: if (decimal_precision > 9) { return EINVAL; } n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d,32", decimal_precision, decimal_scale); break; case NANOARROW_TYPE_DECIMAL64: if (decimal_precision > 18) { return EINVAL; } n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d,64", decimal_precision, decimal_scale); break; case NANOARROW_TYPE_DECIMAL128: if (decimal_precision > 38) { return EINVAL; } n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d", decimal_precision, decimal_scale); break; case NANOARROW_TYPE_DECIMAL256: if (decimal_precision > 76) { return EINVAL; } n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d,256", decimal_precision, decimal_scale); break; default: return EINVAL; } if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) { return ERANGE; } buffer[n_chars] = '\0'; return ArrowSchemaSetFormat(schema, buffer); } ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema, enum ArrowType run_end_type) { switch (run_end_type) { case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_INT64: break; default: return EINVAL; } NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat( schema, ArrowSchemaFormatTemplate(NANOARROW_TYPE_RUN_END_ENCODED))); NANOARROW_RETURN_NOT_OK( ArrowSchemaInitChildrenIfNeeded(schema, NANOARROW_TYPE_RUN_END_ENCODED)); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema->children[0], run_end_type)); NANOARROW_RETURN_NOT_OK( ArrowSchemaSetType(schema->children[1], NANOARROW_TYPE_UNINITIALIZED)); return NANOARROW_OK; } static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) { switch (time_unit) { case NANOARROW_TIME_UNIT_SECOND: return "s"; case NANOARROW_TIME_UNIT_MILLI: return "m"; case NANOARROW_TIME_UNIT_MICRO: return "u"; case NANOARROW_TIME_UNIT_NANO: return "n"; default: return NULL; } } ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, enum ArrowTimeUnit time_unit, const char* timezone) { const char* time_unit_str = ArrowTimeUnitFormatString(time_unit); if (time_unit_str == NULL) { return EINVAL; } char buffer[128]; int n_chars; switch (type) { case NANOARROW_TYPE_TIME32: if (timezone != NULL) { return EINVAL; } switch (time_unit) { case NANOARROW_TIME_UNIT_MICRO: case NANOARROW_TIME_UNIT_NANO: return EINVAL; default: break; } n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); break; case NANOARROW_TYPE_TIME64: if (timezone != NULL) { return EINVAL; } switch (time_unit) { case NANOARROW_TIME_UNIT_SECOND: case NANOARROW_TIME_UNIT_MILLI: return EINVAL; default: break; } n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); break; case NANOARROW_TYPE_TIMESTAMP: if (timezone == NULL) { timezone = ""; } n_chars = snprintf(buffer, sizeof(buffer), "ts%s:%s", time_unit_str, timezone); break; case NANOARROW_TYPE_DURATION: if (timezone != NULL) { return EINVAL; } n_chars = snprintf(buffer, sizeof(buffer), "tD%s", time_unit_str); break; default: return EINVAL; } if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) { return ERANGE; } buffer[n_chars] = '\0'; return ArrowSchemaSetFormat(schema, buffer); } ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, int64_t n_children) { if (n_children < 0 || n_children > 127) { return EINVAL; } // Max valid size would be +ud:0,1,...126 = 401 characters + null terminator char format_out[512]; int64_t format_out_size = 512; memset(format_out, 0, format_out_size); int n_chars; char* format_cursor = format_out; switch (type) { case NANOARROW_TYPE_SPARSE_UNION: n_chars = snprintf(format_cursor, format_out_size, "+us:"); format_cursor += n_chars; format_out_size -= n_chars; break; case NANOARROW_TYPE_DENSE_UNION: n_chars = snprintf(format_cursor, format_out_size, "+ud:"); format_cursor += n_chars; format_out_size -= n_chars; break; default: return EINVAL; } // Ensure that an encoding error from snprintf() does not result // in an out-of-bounds access. if (n_chars < 0) { return ERANGE; } if (n_children > 0) { n_chars = snprintf(format_cursor, format_out_size, "0"); format_cursor += n_chars; format_out_size -= n_chars; for (int64_t i = 1; i < n_children; i++) { n_chars = snprintf(format_cursor, format_out_size, ",%" PRId64, i); format_cursor += n_chars; format_out_size -= n_chars; } } // Ensure that an encoding error from snprintf() does not result // in an out-of-bounds access. if (n_chars < 0) { return ERANGE; } NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out)); NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); for (int64_t i = 0; i < n_children; i++) { ArrowSchemaInit(schema->children[i]); } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format) { if (schema->format != NULL) { ArrowFree((void*)schema->format); } if (format != NULL) { size_t format_size = strlen(format) + 1; schema->format = (const char*)ArrowMalloc(format_size); if (schema->format == NULL) { return ENOMEM; } memcpy((void*)schema->format, format, format_size); } else { schema->format = NULL; } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name) { if (schema->name != NULL) { ArrowFree((void*)schema->name); } if (name != NULL) { size_t name_size = strlen(name) + 1; schema->name = (const char*)ArrowMalloc(name_size); if (schema->name == NULL) { return ENOMEM; } memcpy((void*)schema->name, name, name_size); } else { schema->name = NULL; } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata) { if (schema->metadata != NULL) { ArrowFree((void*)schema->metadata); } if (metadata != NULL) { size_t metadata_size = ArrowMetadataSizeOf(metadata); schema->metadata = (const char*)ArrowMalloc(metadata_size); if (schema->metadata == NULL) { return ENOMEM; } memcpy((void*)schema->metadata, metadata, metadata_size); } else { schema->metadata = NULL; } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, int64_t n_children) { if (schema->children != NULL) { return EEXIST; } if (n_children > 0) { schema->children = (struct ArrowSchema**)ArrowMalloc(n_children * sizeof(struct ArrowSchema*)); if (schema->children == NULL) { return ENOMEM; } schema->n_children = n_children; memset(schema->children, 0, n_children * sizeof(struct ArrowSchema*)); for (int64_t i = 0; i < n_children; i++) { schema->children[i] = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); if (schema->children[i] == NULL) { return ENOMEM; } schema->children[i]->release = NULL; } } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema) { if (schema->dictionary != NULL) { return EEXIST; } schema->dictionary = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); if (schema->dictionary == NULL) { return ENOMEM; } schema->dictionary->release = NULL; return NANOARROW_OK; } ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, struct ArrowSchema* schema_out) { ArrowSchemaInit(schema_out); int result = ArrowSchemaSetFormat(schema_out, schema->format); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } schema_out->flags = schema->flags; result = ArrowSchemaSetName(schema_out, schema->name); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaSetMetadata(schema_out, schema->metadata); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaAllocateChildren(schema_out, schema->n_children); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } for (int64_t i = 0; i < schema->n_children; i++) { result = ArrowSchemaDeepCopy(schema->children[i], schema_out->children[i]); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } } if (schema->dictionary != NULL) { result = ArrowSchemaAllocateDictionary(schema_out); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaDeepCopy(schema->dictionary, schema_out->dictionary); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } } return NANOARROW_OK; } static void ArrowSchemaViewSetPrimitive(struct ArrowSchemaView* schema_view, enum ArrowType type) { schema_view->type = type; schema_view->storage_type = type; } static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, const char* format, const char** format_end_out, struct ArrowError* error) { *format_end_out = format; // needed for decimal parsing const char* parse_start; char* parse_end; switch (format[0]) { case 'n': schema_view->type = NANOARROW_TYPE_NA; schema_view->storage_type = NANOARROW_TYPE_NA; *format_end_out = format + 1; return NANOARROW_OK; case 'b': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BOOL); *format_end_out = format + 1; return NANOARROW_OK; case 'c': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT8); *format_end_out = format + 1; return NANOARROW_OK; case 'C': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT8); *format_end_out = format + 1; return NANOARROW_OK; case 's': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT16); *format_end_out = format + 1; return NANOARROW_OK; case 'S': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT16); *format_end_out = format + 1; return NANOARROW_OK; case 'i': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); *format_end_out = format + 1; return NANOARROW_OK; case 'I': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT32); *format_end_out = format + 1; return NANOARROW_OK; case 'l': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); *format_end_out = format + 1; return NANOARROW_OK; case 'L': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT64); *format_end_out = format + 1; return NANOARROW_OK; case 'e': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_HALF_FLOAT); *format_end_out = format + 1; return NANOARROW_OK; case 'f': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_FLOAT); *format_end_out = format + 1; return NANOARROW_OK; case 'g': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DOUBLE); *format_end_out = format + 1; return NANOARROW_OK; // decimal case 'd': if (format[1] != ':' || format[2] == '\0') { ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'"); return EINVAL; } parse_start = format + 2; schema_view->decimal_precision = (int32_t)strtol(parse_start, &parse_end, 10); if (parse_end == parse_start || parse_end[0] != ',') { ArrowErrorSet(error, "Expected 'precision,scale[,bitwidth]' following 'd:'"); return EINVAL; } parse_start = parse_end + 1; schema_view->decimal_scale = (int32_t)strtol(parse_start, &parse_end, 10); if (parse_end == parse_start) { ArrowErrorSet(error, "Expected 'scale[,bitwidth]' following 'd:precision,'"); return EINVAL; } else if (parse_end[0] != ',') { schema_view->decimal_bitwidth = 128; } else { parse_start = parse_end + 1; schema_view->decimal_bitwidth = (int32_t)strtol(parse_start, &parse_end, 10); if (parse_start == parse_end) { ArrowErrorSet(error, "Expected precision following 'd:precision,scale,'"); return EINVAL; } } *format_end_out = parse_end; switch (schema_view->decimal_bitwidth) { case 32: ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL32); return NANOARROW_OK; case 64: ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL64); return NANOARROW_OK; case 128: ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL128); return NANOARROW_OK; case 256: ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL256); return NANOARROW_OK; default: ArrowErrorSet(error, "Expected decimal bitwidth of 128 or 256 but found %" PRId32, schema_view->decimal_bitwidth); return EINVAL; } // validity + data case 'w': schema_view->type = NANOARROW_TYPE_FIXED_SIZE_BINARY; schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_BINARY; if (format[1] != ':' || format[2] == '\0') { ArrowErrorSet(error, "Expected ':<width>' following 'w'"); return EINVAL; } schema_view->fixed_size = (int32_t)strtol(format + 2, (char**)format_end_out, 10); return NANOARROW_OK; // validity + offset + data case 'z': schema_view->type = NANOARROW_TYPE_BINARY; schema_view->storage_type = NANOARROW_TYPE_BINARY; *format_end_out = format + 1; return NANOARROW_OK; case 'u': schema_view->type = NANOARROW_TYPE_STRING; schema_view->storage_type = NANOARROW_TYPE_STRING; *format_end_out = format + 1; return NANOARROW_OK; // validity + large_offset + data case 'Z': schema_view->type = NANOARROW_TYPE_LARGE_BINARY; schema_view->storage_type = NANOARROW_TYPE_LARGE_BINARY; *format_end_out = format + 1; return NANOARROW_OK; case 'U': schema_view->type = NANOARROW_TYPE_LARGE_STRING; schema_view->storage_type = NANOARROW_TYPE_LARGE_STRING; *format_end_out = format + 1; return NANOARROW_OK; // nested types case '+': switch (format[1]) { // list has validity + offset or offset case 'l': schema_view->storage_type = NANOARROW_TYPE_LIST; schema_view->type = NANOARROW_TYPE_LIST; *format_end_out = format + 2; return NANOARROW_OK; // large list has validity + large_offset or large_offset case 'L': schema_view->storage_type = NANOARROW_TYPE_LARGE_LIST; schema_view->type = NANOARROW_TYPE_LARGE_LIST; *format_end_out = format + 2; return NANOARROW_OK; // run end encoded has no buffer at all case 'r': schema_view->storage_type = NANOARROW_TYPE_RUN_END_ENCODED; schema_view->type = NANOARROW_TYPE_RUN_END_ENCODED; *format_end_out = format + 2; return NANOARROW_OK; // just validity buffer case 'w': if (format[2] != ':' || format[3] == '\0') { ArrowErrorSet(error, "Expected ':<width>' following '+w'"); return EINVAL; } schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_LIST; schema_view->type = NANOARROW_TYPE_FIXED_SIZE_LIST; schema_view->fixed_size = (int32_t)strtol(format + 3, (char**)format_end_out, 10); return NANOARROW_OK; case 's': schema_view->storage_type = NANOARROW_TYPE_STRUCT; schema_view->type = NANOARROW_TYPE_STRUCT; *format_end_out = format + 2; return NANOARROW_OK; case 'm': schema_view->storage_type = NANOARROW_TYPE_MAP; schema_view->type = NANOARROW_TYPE_MAP; *format_end_out = format + 2; return NANOARROW_OK; // unions case 'u': switch (format[2]) { case 'd': schema_view->storage_type = NANOARROW_TYPE_DENSE_UNION; schema_view->type = NANOARROW_TYPE_DENSE_UNION; break; case 's': schema_view->storage_type = NANOARROW_TYPE_SPARSE_UNION; schema_view->type = NANOARROW_TYPE_SPARSE_UNION; break; default: ArrowErrorSet(error, "Expected union format string +us:<type_ids> or " "+ud:<type_ids> but found '%s'", format); return EINVAL; } if (format[3] == ':') { schema_view->union_type_ids = format + 4; int64_t n_type_ids = _ArrowParseUnionTypeIds(schema_view->union_type_ids, NULL); if (n_type_ids != schema_view->schema->n_children) { ArrowErrorSet(error, "Expected union type_ids parameter to be a comma-separated " "list of %" PRId64 " values between 0 and 127 but found '%s'", schema_view->schema->n_children, schema_view->union_type_ids); return EINVAL; } *format_end_out = format + strlen(format); return NANOARROW_OK; } else { ArrowErrorSet(error, "Expected union format string +us:<type_ids> or +ud:<type_ids> " "but found '%s'", format); return EINVAL; } // views case 'v': switch (format[2]) { case 'l': schema_view->storage_type = NANOARROW_TYPE_LIST_VIEW; schema_view->type = NANOARROW_TYPE_LIST_VIEW; *format_end_out = format + 3; return NANOARROW_OK; case 'L': schema_view->storage_type = NANOARROW_TYPE_LARGE_LIST_VIEW; schema_view->type = NANOARROW_TYPE_LARGE_LIST_VIEW; *format_end_out = format + 3; return NANOARROW_OK; default: ArrowErrorSet( error, "Expected view format string +vl or +vL but found '%s'", format); return EINVAL; } default: ArrowErrorSet(error, "Expected nested type format string but found '%s'", format); return EINVAL; } // date/time types case 't': switch (format[1]) { // date case 'd': switch (format[2]) { case 'D': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); schema_view->type = NANOARROW_TYPE_DATE32; *format_end_out = format + 3; return NANOARROW_OK; case 'm': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_DATE64; *format_end_out = format + 3; return NANOARROW_OK; default: ArrowErrorSet(error, "Expected 'D' or 'm' following 'td' but found '%s'", format + 2); return EINVAL; } // time of day case 't': switch (format[2]) { case 's': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); schema_view->type = NANOARROW_TYPE_TIME32; schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; *format_end_out = format + 3; return NANOARROW_OK; case 'm': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); schema_view->type = NANOARROW_TYPE_TIME32; schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; *format_end_out = format + 3; return NANOARROW_OK; case 'u': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIME64; schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; *format_end_out = format + 3; return NANOARROW_OK; case 'n': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIME64; schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; *format_end_out = format + 3; return NANOARROW_OK; default: ArrowErrorSet( error, "Expected 's', 'm', 'u', or 'n' following 'tt' but found '%s'", format + 2); return EINVAL; } // timestamp case 's': switch (format[2]) { case 's': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIMESTAMP; schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; break; case 'm': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIMESTAMP; schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; break; case 'u': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIMESTAMP; schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; break; case 'n': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIMESTAMP; schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; break; default: ArrowErrorSet( error, "Expected 's', 'm', 'u', or 'n' following 'ts' but found '%s'", format + 2); return EINVAL; } if (format[3] != ':') { ArrowErrorSet(error, "Expected ':' following '%.3s' but found '%s'", format, format + 3); return EINVAL; } schema_view->timezone = format + 4; *format_end_out = format + strlen(format); return NANOARROW_OK; // duration case 'D': switch (format[2]) { case 's': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_DURATION; schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; *format_end_out = format + 3; return NANOARROW_OK; case 'm': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_DURATION; schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; *format_end_out = format + 3; return NANOARROW_OK; case 'u': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_DURATION; schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; *format_end_out = format + 3; return NANOARROW_OK; case 'n': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_DURATION; schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; *format_end_out = format + 3; return NANOARROW_OK; default: ArrowErrorSet(error, "Expected 's', 'm', u', or 'n' following 'tD' but found '%s'", format + 2); return EINVAL; } // interval case 'i': switch (format[2]) { case 'M': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_MONTHS); *format_end_out = format + 3; return NANOARROW_OK; case 'D': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_DAY_TIME); *format_end_out = format + 3; return NANOARROW_OK; case 'n': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO); *format_end_out = format + 3; return NANOARROW_OK; default: ArrowErrorSet(error, "Expected 'M', 'D', or 'n' following 'ti' but found '%s'", format + 2); return EINVAL; } default: ArrowErrorSet( error, "Expected 'd', 't', 's', 'D', or 'i' following 't' but found '%s'", format + 1); return EINVAL; } // view types case 'v': { switch (format[1]) { case 'u': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_STRING_VIEW); *format_end_out = format + 2; return NANOARROW_OK; case 'z': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BINARY_VIEW); *format_end_out = format + 2; return NANOARROW_OK; default: ArrowErrorSet(error, "Expected 'u', or 'z' following 'v' but found '%s'", format + 1); return EINVAL; } } default: ArrowErrorSet(error, "Unknown format: '%s'", format); return EINVAL; } } static ArrowErrorCode ArrowSchemaViewValidateNChildren( struct ArrowSchemaView* schema_view, int64_t n_children, struct ArrowError* error) { if (n_children != -1 && schema_view->schema->n_children != n_children) { ArrowErrorSet( error, "Expected schema with %" PRId64 " children but found %" PRId64 " children", n_children, schema_view->schema->n_children); return EINVAL; } // Don't do a full validation of children but do check that they won't // segfault if inspected struct ArrowSchema* child; for (int64_t i = 0; i < schema_view->schema->n_children; i++) { child = schema_view->schema->children[i]; if (child == NULL) { ArrowErrorSet( error, "Expected valid schema at schema->children[%" PRId64 "] but found NULL", i); return EINVAL; } else if (child->release == NULL) { ArrowErrorSet(error, "Expected valid schema at schema->children[%" PRId64 "] but found a released schema", i); return EINVAL; } } return NANOARROW_OK; } static ArrowErrorCode ArrowSchemaViewValidateUnion(struct ArrowSchemaView* schema_view, struct ArrowError* error) { return ArrowSchemaViewValidateNChildren(schema_view, -1, error); } static ArrowErrorCode ArrowSchemaViewValidateMap(struct ArrowSchemaView* schema_view, struct ArrowError* error) { NANOARROW_RETURN_NOT_OK(ArrowSchemaViewValidateNChildren(schema_view, 1, error)); if (schema_view->schema->children[0]->n_children != 2) { ArrowErrorSet(error, "Expected child of map type to have 2 children but found %" PRId64, schema_view->schema->children[0]->n_children); return EINVAL; } if (strcmp(schema_view->schema->children[0]->format, "+s") != 0) { ArrowErrorSet(error, "Expected format of child of map type to be '+s' but found '%s'", schema_view->schema->children[0]->format); return EINVAL; } if (schema_view->schema->children[0]->flags & ARROW_FLAG_NULLABLE) { ArrowErrorSet(error, "Expected child of map type to be non-nullable but was nullable"); return EINVAL; } if (schema_view->schema->children[0]->children[0]->flags & ARROW_FLAG_NULLABLE) { ArrowErrorSet(error, "Expected key of map type to be non-nullable but was nullable"); return EINVAL; } return NANOARROW_OK; } static ArrowErrorCode ArrowSchemaViewValidateDictionary( struct ArrowSchemaView* schema_view, struct ArrowError* error) { // check for valid index type switch (schema_view->storage_type) { case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_INT64: break; default: ArrowErrorSet( error, "Expected dictionary schema index type to be an integral type but found '%s'", schema_view->schema->format); return EINVAL; } struct ArrowSchemaView dictionary_schema_view; return ArrowSchemaViewInit(&dictionary_schema_view, schema_view->schema->dictionary, error); } static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_view, enum ArrowType type, struct ArrowError* error) { switch (type) { case NANOARROW_TYPE_NA: case NANOARROW_TYPE_BOOL: case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_HALF_FLOAT: case NANOARROW_TYPE_FLOAT: case NANOARROW_TYPE_DOUBLE: case NANOARROW_TYPE_DECIMAL32: case NANOARROW_TYPE_DECIMAL64: case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_DECIMAL256: case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_BINARY: case NANOARROW_TYPE_LARGE_BINARY: case NANOARROW_TYPE_DATE32: case NANOARROW_TYPE_DATE64: case NANOARROW_TYPE_INTERVAL_MONTHS: case NANOARROW_TYPE_INTERVAL_DAY_TIME: case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: case NANOARROW_TYPE_TIMESTAMP: case NANOARROW_TYPE_TIME32: case NANOARROW_TYPE_TIME64: case NANOARROW_TYPE_DURATION: case NANOARROW_TYPE_BINARY_VIEW: case NANOARROW_TYPE_STRING_VIEW: return ArrowSchemaViewValidateNChildren(schema_view, 0, error); case NANOARROW_TYPE_FIXED_SIZE_BINARY: if (schema_view->fixed_size <= 0) { ArrowErrorSet(error, "Expected size > 0 for fixed size binary but found size %d", schema_view->fixed_size); return EINVAL; } return ArrowSchemaViewValidateNChildren(schema_view, 0, error); case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_LIST_VIEW: case NANOARROW_TYPE_LARGE_LIST: case NANOARROW_TYPE_LARGE_LIST_VIEW: case NANOARROW_TYPE_FIXED_SIZE_LIST: return ArrowSchemaViewValidateNChildren(schema_view, 1, error); case NANOARROW_TYPE_RUN_END_ENCODED: return ArrowSchemaViewValidateNChildren(schema_view, 2, error); case NANOARROW_TYPE_STRUCT: return ArrowSchemaViewValidateNChildren(schema_view, -1, error); case NANOARROW_TYPE_SPARSE_UNION: case NANOARROW_TYPE_DENSE_UNION: return ArrowSchemaViewValidateUnion(schema_view, error); case NANOARROW_TYPE_MAP: return ArrowSchemaViewValidateMap(schema_view, error); case NANOARROW_TYPE_DICTIONARY: return ArrowSchemaViewValidateDictionary(schema_view, error); default: ArrowErrorSet(error, "Expected a valid enum ArrowType value but found %d", schema_view->type); return EINVAL; } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, const struct ArrowSchema* schema, struct ArrowError* error) { if (schema == NULL) { ArrowErrorSet(error, "Expected non-NULL schema"); return EINVAL; } if (schema->release == NULL) { ArrowErrorSet(error, "Expected non-released schema"); return EINVAL; } schema_view->schema = schema; const char* format = schema->format; if (format == NULL) { ArrowErrorSet( error, "Error parsing schema->format: Expected a null-terminated string but found NULL"); return EINVAL; } size_t format_len = strlen(format); if (format_len == 0) { ArrowErrorSet(error, "Error parsing schema->format: Expected a string with size > 0"); return EINVAL; } const char* format_end_out; int result = ArrowSchemaViewParse(schema_view, format, &format_end_out, error); if (result != NANOARROW_OK) { if (error != NULL) { char child_error[1024]; memcpy(child_error, ArrowErrorMessage(error), 1024); ArrowErrorSet(error, "Error parsing schema->format: %s", child_error); } return result; } if ((format + format_len) != format_end_out) { ArrowErrorSet(error, "Error parsing schema->format '%s': parsed %d/%zu characters", format, (int)(format_end_out - format), format_len); return EINVAL; } if (schema->dictionary != NULL) { schema_view->type = NANOARROW_TYPE_DICTIONARY; } NANOARROW_RETURN_NOT_OK( ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error)); if (schema_view->storage_type != schema_view->type) { NANOARROW_RETURN_NOT_OK( ArrowSchemaViewValidate(schema_view, schema_view->type, error)); } int64_t unknown_flags = schema->flags & ~NANOARROW_FLAG_ALL_SUPPORTED; if (unknown_flags != 0) { ArrowErrorSet(error, "Unknown ArrowSchema flag"); return EINVAL; } if (schema->flags & ARROW_FLAG_DICTIONARY_ORDERED && schema_view->type != NANOARROW_TYPE_DICTIONARY) { ArrowErrorSet(error, "ARROW_FLAG_DICTIONARY_ORDERED is only relevant for dictionaries"); return EINVAL; } if (schema->flags & ARROW_FLAG_MAP_KEYS_SORTED && schema_view->type != NANOARROW_TYPE_MAP) { ArrowErrorSet(error, "ARROW_FLAG_MAP_KEYS_SORTED is only relevant for a map type"); return EINVAL; } ArrowLayoutInit(&schema_view->layout, schema_view->storage_type); if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_BINARY) { schema_view->layout.element_size_bits[1] = (int64_t)schema_view->fixed_size * 8; } else if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_LIST) { schema_view->layout.child_size_elements = schema_view->fixed_size; } schema_view->extension_name = ArrowCharView(NULL); schema_view->extension_metadata = ArrowCharView(NULL); NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:name"), &schema_view->extension_name)); NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:metadata"), &schema_view->extension_metadata)); return NANOARROW_OK; } static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_view, char* out, int64_t n) { const char* type_string = ArrowTypeString(schema_view->type); switch (schema_view->type) { case NANOARROW_TYPE_DECIMAL32: case NANOARROW_TYPE_DECIMAL64: case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_DECIMAL256: return snprintf(out, n, "%s(%" PRId32 ", %" PRId32 ")", type_string, schema_view->decimal_precision, schema_view->decimal_scale); case NANOARROW_TYPE_TIMESTAMP: return snprintf(out, n, "%s('%s', '%s')", type_string, ArrowTimeUnitString(schema_view->time_unit), schema_view->timezone); case NANOARROW_TYPE_TIME32: case NANOARROW_TYPE_TIME64: case NANOARROW_TYPE_DURATION: return snprintf(out, n, "%s('%s')", type_string, ArrowTimeUnitString(schema_view->time_unit)); case NANOARROW_TYPE_FIXED_SIZE_BINARY: case NANOARROW_TYPE_FIXED_SIZE_LIST: return snprintf(out, n, "%s(%" PRId32 ")", type_string, schema_view->fixed_size); case NANOARROW_TYPE_SPARSE_UNION: case NANOARROW_TYPE_DENSE_UNION: return snprintf(out, n, "%s([%s])", type_string, schema_view->union_type_ids); default: return snprintf(out, n, "%s", type_string); } } // Helper for bookkeeping to emulate sprintf()-like behaviour spread // among multiple sprintf calls. static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, int64_t* n_remaining, int64_t* n_chars) { // In the unlikely snprintf() returning a negative value (encoding error), // ensure the result won't cause an out-of-bounds access. if (n_chars_last < 0) { n_chars_last = 0; } *n_chars += n_chars_last; *n_remaining -= n_chars_last; // n_remaining is never less than 0 if (*n_remaining < 0) { *n_remaining = 0; } // Can't do math on a NULL pointer if (*out != NULL) { *out += n_chars_last; } } int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, char recursive) { if (schema == NULL) { return snprintf(out, n, "[invalid: pointer is null]"); } if (schema->release == NULL) { return snprintf(out, n, "[invalid: schema is released]"); } struct ArrowSchemaView schema_view; struct ArrowError error; if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) { return snprintf(out, n, "[invalid: %s]", ArrowErrorMessage(&error)); } // Extension type and dictionary should include both the top-level type // and the storage type. int is_extension = schema_view.extension_name.size_bytes > 0; int is_dictionary = schema->dictionary != NULL; int64_t n_chars = 0; int64_t n_chars_last = 0; // Uncommon but not technically impossible that both are true if (is_extension && is_dictionary) { n_chars_last = snprintf( out, n, "%.*s{dictionary(%s)<", (int)schema_view.extension_name.size_bytes, schema_view.extension_name.data, ArrowTypeString(schema_view.storage_type)); } else if (is_extension) { n_chars_last = snprintf(out, n, "%.*s{", (int)schema_view.extension_name.size_bytes, schema_view.extension_name.data); } else if (is_dictionary) { n_chars_last = snprintf(out, n, "dictionary(%s)<", ArrowTypeString(schema_view.storage_type)); } ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); if (!is_dictionary) { n_chars_last = ArrowSchemaTypeToStringInternal(&schema_view, out, n); } else { n_chars_last = ArrowSchemaToString(schema->dictionary, out, n, recursive); } ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); if (recursive && schema->format[0] == '+') { n_chars_last = snprintf(out, n, "<"); ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); for (int64_t i = 0; i < schema->n_children; i++) { if (i > 0) { n_chars_last = snprintf(out, n, ", "); ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); } // ArrowSchemaToStringInternal() will validate the child and print the error, // but we need the name first if (schema->children[i] != NULL && schema->children[i]->release != NULL && schema->children[i]->name != NULL) { n_chars_last = snprintf(out, n, "%s: ", schema->children[i]->name); ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); } n_chars_last = ArrowSchemaToString(schema->children[i], out, n, recursive); ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); } n_chars_last = snprintf(out, n, ">"); ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); } if (is_extension && is_dictionary) { n_chars += snprintf(out, n, ">}"); } else if (is_extension) { n_chars += snprintf(out, n, "}"); } else if (is_dictionary) { n_chars += snprintf(out, n, ">"); } // Ensure that we always return a positive result if (n_chars > 0) { return n_chars; } else { return 0; } } ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, const char* metadata) { reader->metadata = metadata; if (reader->metadata == NULL) { reader->offset = 0; reader->remaining_keys = 0; } else { memcpy(&reader->remaining_keys, reader->metadata, sizeof(int32_t)); reader->offset = sizeof(int32_t); } return NANOARROW_OK; } ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, struct ArrowStringView* key_out, struct ArrowStringView* value_out) { if (reader->remaining_keys <= 0) { return EINVAL; } int64_t pos = 0; int32_t key_size; memcpy(&key_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); pos += sizeof(int32_t); key_out->data = reader->metadata + reader->offset + pos; key_out->size_bytes = key_size; pos += key_size; int32_t value_size; memcpy(&value_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); pos += sizeof(int32_t); value_out->data = reader->metadata + reader->offset + pos; value_out->size_bytes = value_size; pos += value_size; reader->offset += pos; reader->remaining_keys--; return NANOARROW_OK; } int64_t ArrowMetadataSizeOf(const char* metadata) { if (metadata == NULL) { return 0; } struct ArrowMetadataReader reader; struct ArrowStringView key; struct ArrowStringView value; if (ArrowMetadataReaderInit(&reader, metadata) != NANOARROW_OK) { return 0; } int64_t size = sizeof(int32_t); while (ArrowMetadataReaderRead(&reader, &key, &value) == NANOARROW_OK) { size += sizeof(int32_t) + key.size_bytes + sizeof(int32_t) + value.size_bytes; } return size; } static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata, struct ArrowStringView* key, struct ArrowStringView* value_out) { struct ArrowMetadataReader reader; struct ArrowStringView existing_key; struct ArrowStringView existing_value; NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata)); while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) == NANOARROW_OK) { int key_equal = key->size_bytes == existing_key.size_bytes && strncmp(key->data, existing_key.data, existing_key.size_bytes) == 0; if (key_equal) { value_out->data = existing_value.data; value_out->size_bytes = existing_value.size_bytes; break; } } return NANOARROW_OK; } ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, struct ArrowStringView* value_out) { if (value_out == NULL) { return EINVAL; } return ArrowMetadataGetValueInternal(metadata, &key, value_out); } char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) { struct ArrowStringView value = ArrowCharView(NULL); if (ArrowMetadataGetValue(metadata, key, &value) != NANOARROW_OK) { return 0; } return value.data != NULL; } ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata) { ArrowBufferInit(buffer); return ArrowBufferAppend(buffer, metadata, ArrowMetadataSizeOf(metadata)); } static ArrowErrorCode ArrowMetadataBuilderAppendInternal(struct ArrowBuffer* buffer, struct ArrowStringView* key, struct ArrowStringView* value) { if (value == NULL) { return NANOARROW_OK; } if (buffer->capacity_bytes == 0) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(buffer, 0)); } if (((size_t)buffer->capacity_bytes) < sizeof(int32_t)) { return EINVAL; } int32_t n_keys; memcpy(&n_keys, buffer->data, sizeof(int32_t)); int32_t key_size = (int32_t)key->size_bytes; int32_t value_size = (int32_t)value->size_bytes; NANOARROW_RETURN_NOT_OK(ArrowBufferReserve( buffer, sizeof(int32_t) + key_size + sizeof(int32_t) + value_size)); ArrowBufferAppendUnsafe(buffer, &key_size, sizeof(int32_t)); ArrowBufferAppendUnsafe(buffer, key->data, key_size); ArrowBufferAppendUnsafe(buffer, &value_size, sizeof(int32_t)); ArrowBufferAppendUnsafe(buffer, value->data, value_size); n_keys++; memcpy(buffer->data, &n_keys, sizeof(int32_t)); return NANOARROW_OK; } static ArrowErrorCode ArrowMetadataBuilderSetInternal(struct ArrowBuffer* buffer, struct ArrowStringView* key, struct ArrowStringView* value) { // Inspect the current value to see if we can avoid copying the buffer struct ArrowStringView current_value = ArrowCharView(NULL); NANOARROW_RETURN_NOT_OK( ArrowMetadataGetValueInternal((const char*)buffer->data, key, &current_value)); // The key should be removed but no key exists if (value == NULL && current_value.data == NULL) { return NANOARROW_OK; } // The key/value can be appended because no key exists if (value != NULL && current_value.data == NULL) { return ArrowMetadataBuilderAppendInternal(buffer, key, value); } struct ArrowMetadataReader reader; struct ArrowStringView existing_key; struct ArrowStringView existing_value; NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, (const char*)buffer->data)); struct ArrowBuffer new_buffer; NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&new_buffer, NULL)); while (reader.remaining_keys > 0) { int result = ArrowMetadataReaderRead(&reader, &existing_key, &existing_value); if (result != NANOARROW_OK) { ArrowBufferReset(&new_buffer); return result; } if (key->size_bytes == existing_key.size_bytes && strncmp((const char*)key->data, (const char*)existing_key.data, existing_key.size_bytes) == 0) { result = ArrowMetadataBuilderAppendInternal(&new_buffer, key, value); value = NULL; } else { result = ArrowMetadataBuilderAppendInternal(&new_buffer, &existing_key, &existing_value); } if (result != NANOARROW_OK) { ArrowBufferReset(&new_buffer); return result; } } ArrowBufferReset(buffer); ArrowBufferMove(&new_buffer, buffer); return NANOARROW_OK; } ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, struct ArrowStringView key, struct ArrowStringView value) { return ArrowMetadataBuilderAppendInternal(buffer, &key, &value); } ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, struct ArrowStringView key, struct ArrowStringView value) { return ArrowMetadataBuilderSetInternal(buffer, &key, &value); } ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, struct ArrowStringView key) { return ArrowMetadataBuilderSetInternal(buffer, &key, NULL); }