r/src/materialize.c (269 lines of code) (raw):

// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include <R.h> #include <Rinternals.h> #include "nanoarrow.h" // Needed for the list_of materializer #include "convert.h" #include "materialize.h" #include "materialize_blob.h" #include "materialize_chr.h" #include "materialize_date.h" #include "materialize_dbl.h" #include "materialize_difftime.h" #include "materialize_int.h" #include "materialize_lgl.h" #include "materialize_posixct.h" #include "materialize_unspecified.h" SEXP nanoarrow_alloc_type(enum VectorType vector_type, R_xlen_t len) { switch (vector_type) { case VECTOR_TYPE_LGL: return Rf_allocVector(LGLSXP, len); case VECTOR_TYPE_INT: return Rf_allocVector(INTSXP, len); case VECTOR_TYPE_DBL: return Rf_allocVector(REALSXP, len); case VECTOR_TYPE_CHR: return Rf_allocVector(STRSXP, len); default: return R_NilValue; } } // A version of Rf_getAttrib(x, sym) != R_NilValue that never // expands the row.names attribute static int has_attrib_safe(SEXP x, SEXP sym) { for (SEXP atts = ATTRIB(x); atts != R_NilValue; atts = CDR(atts)) { if (TAG(atts) == sym) return TRUE; } return FALSE; } R_xlen_t nanoarrow_data_frame_size(SEXP x) { if (Rf_length(x) > 0) { // This both avoids materializing the row.names attribute and // makes this work with struct-style vctrs that don't have a // row.names attribute but that always have one or more element return Rf_xlength(VECTOR_ELT(x, 0)); } else { // Since ALTREP was introduced, materializing the row.names attribute is // usually deferred such that values in the form c(NA, -nrow), 1:nrow, or // as.character(1:nrow) are never actually computed when the length is // taken. return Rf_xlength(Rf_getAttrib(x, R_RowNamesSymbol)); } } void nanoarrow_set_rownames(SEXP x, R_xlen_t len) { // If len fits in the integer range, we can use the c(NA, -nrow) // shortcut for the row.names attribute. R expands this when // the actual value is accessed (even from Rf_getAttrib()). // If len does not fit in the integer range, we need // as.character(seq_len(nrow)) (which returns a deferred ALTREP // string conversion of an ALTREP sequence in recent R). Manipulating // data frames with more than INT_MAX rows is not supported in most // places but column access still works. if (len <= INT_MAX) { SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2)); INTEGER(rownames)[0] = NA_INTEGER; INTEGER(rownames)[1] = -len; Rf_setAttrib(x, R_RowNamesSymbol, rownames); UNPROTECT(1); } else { SEXP length_dbl = PROTECT(Rf_ScalarReal(len)); SEXP seq_len_symbol = PROTECT(Rf_install("seq_len")); SEXP seq_len_call = PROTECT(Rf_lang2(seq_len_symbol, length_dbl)); SEXP rownames_call = PROTECT(Rf_lang2(R_AsCharacterSymbol, seq_len_call)); Rf_setAttrib(x, R_RowNamesSymbol, Rf_eval(rownames_call, R_BaseNamespace)); UNPROTECT(4); } } int nanoarrow_ptype_is_data_frame(SEXP ptype) { return Rf_isObject(ptype) && TYPEOF(ptype) == VECSXP && (Rf_inherits(ptype, "data.frame") || (Rf_xlength(ptype) > 0 && has_attrib_safe(ptype, R_NamesSymbol))); } SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) { SEXP result; if (Rf_isObject(ptype)) { if (nanoarrow_ptype_is_data_frame(ptype)) { R_xlen_t num_cols = Rf_xlength(ptype); result = PROTECT(Rf_allocVector(VECSXP, num_cols)); for (R_xlen_t i = 0; i < num_cols; i++) { SET_VECTOR_ELT(result, i, nanoarrow_materialize_realloc(VECTOR_ELT(ptype, i), len)); } // Set attributes from ptype Rf_setAttrib(result, R_NamesSymbol, Rf_getAttrib(ptype, R_NamesSymbol)); Rf_copyMostAttrib(ptype, result); // ...except rownames if (Rf_inherits(ptype, "data.frame")) { nanoarrow_set_rownames(result, len); } } else { result = PROTECT(Rf_allocVector(TYPEOF(ptype), len)); Rf_copyMostAttrib(ptype, result); } } else { result = PROTECT(Rf_allocVector(TYPEOF(ptype), len)); } UNPROTECT(1); return result; } // Used in union building to pre-set all values to null static void fill_vec_with_nulls(SEXP x, R_xlen_t offset, R_xlen_t len) { if (nanoarrow_ptype_is_data_frame(x)) { for (R_xlen_t i = 0; i < Rf_xlength(x); i++) { fill_vec_with_nulls(VECTOR_ELT(x, i), offset, len); } return; } switch (TYPEOF(x)) { case LGLSXP: case INTSXP: { int* values = INTEGER(x); for (R_xlen_t i = 0; i < len; i++) { values[offset + i] = NA_INTEGER; } return; } case REALSXP: { double* values = REAL(x); for (R_xlen_t i = 0; i < len; i++) { values[offset + i] = NA_REAL; } return; } case STRSXP: for (R_xlen_t i = 0; i < len; i++) { SET_STRING_ELT(x, offset + i, NA_STRING); } return; case VECSXP: for (R_xlen_t i = 0; i < len; i++) { SET_VECTOR_ELT(x, offset + i, R_NilValue); } return; default: Rf_error("Attempt to fill vector with nulls with unsupported type"); } } static int nanoarrow_materialize_data_frame(struct RConverter* converter, SEXP converter_xptr) { if (converter->ptype_view.vector_type != VECTOR_TYPE_DATA_FRAME) { return EINVAL; } SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3); switch (converter->array_view.storage_type) { case NANOARROW_TYPE_STRUCT: for (R_xlen_t i = 0; i < converter->n_children; i++) { converter->children[i]->src.offset = converter->src.offset; converter->children[i]->src.length = converter->src.length; converter->children[i]->dst.offset = converter->dst.offset; converter->children[i]->dst.length = converter->dst.length; SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, i); NANOARROW_RETURN_NOT_OK( nanoarrow_materialize(converter->children[i], child_converter_xptr)); } return NANOARROW_OK; case NANOARROW_TYPE_DENSE_UNION: case NANOARROW_TYPE_SPARSE_UNION: // Pre-fill everything with nulls fill_vec_with_nulls(converter->dst.vec_sexp, converter->dst.offset, converter->dst.length); // Fill in the possibly non-null values one at a time for (R_xlen_t i = 0; i < converter->dst.length; i++) { int64_t child_index = ArrowArrayViewUnionChildIndex(&converter->array_view, converter->src.offset + i); int64_t child_offset = ArrowArrayViewUnionChildOffset(&converter->array_view, converter->src.offset + i); converter->children[child_index]->src.offset = child_offset; converter->children[child_index]->src.length = 1; converter->children[child_index]->dst.offset = converter->dst.offset + i; converter->children[child_index]->dst.length = 1; SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, child_index); NANOARROW_RETURN_NOT_OK(nanoarrow_materialize(converter->children[child_index], child_converter_xptr)); } return NANOARROW_OK; default: return ENOTSUP; } } static int materialize_list_element(struct RConverter* converter, SEXP converter_xptr, int64_t offset, int64_t length) { if (nanoarrow_converter_reserve(converter_xptr, length) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } converter->src.offset = offset; converter->src.length = length; converter->dst.offset = 0; converter->dst.length = length; if (nanoarrow_converter_materialize_n(converter_xptr, length) != length) { return EINVAL; } NANOARROW_RETURN_NOT_OK(nanoarrow_converter_finalize(converter_xptr)); return NANOARROW_OK; } static int nanoarrow_materialize_list_of(struct RConverter* converter, SEXP converter_xptr) { SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3); struct RConverter* child_converter = converter->children[0]; SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, 0); struct ArrayViewSlice* src = &converter->src; struct VectorSlice* dst = &converter->dst; const int32_t* offsets = src->array_view->buffer_views[1].data.as_int32; const int64_t* large_offsets = src->array_view->buffer_views[1].data.as_int64; int64_t raw_src_offset = src->array_view->array->offset + src->offset; int64_t offset; int64_t length; switch (src->array_view->storage_type) { case NANOARROW_TYPE_NA: return NANOARROW_OK; case NANOARROW_TYPE_LIST: for (int64_t i = 0; i < dst->length; i++) { if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) { offset = offsets[raw_src_offset + i]; length = offsets[raw_src_offset + i + 1] - offset; NANOARROW_RETURN_NOT_OK(materialize_list_element( child_converter, child_converter_xptr, offset, length)); SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i, nanoarrow_converter_release_result(child_converter_xptr)); } } break; case NANOARROW_TYPE_LARGE_LIST: for (int64_t i = 0; i < dst->length; i++) { if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) { offset = large_offsets[raw_src_offset + i]; length = large_offsets[raw_src_offset + i + 1] - offset; NANOARROW_RETURN_NOT_OK(materialize_list_element( child_converter, child_converter_xptr, offset, length)); SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i, nanoarrow_converter_release_result(child_converter_xptr)); } } break; case NANOARROW_TYPE_FIXED_SIZE_LIST: length = src->array_view->layout.child_size_elements; for (int64_t i = 0; i < dst->length; i++) { if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) { offset = (raw_src_offset + i) * length; NANOARROW_RETURN_NOT_OK(materialize_list_element( child_converter, child_converter_xptr, offset, length)); SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i, nanoarrow_converter_release_result(child_converter_xptr)); } } break; default: return EINVAL; } return NANOARROW_OK; } int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr) { struct ArrayViewSlice* src = &converter->src; struct VectorSlice* dst = &converter->dst; struct MaterializeOptions* options = converter->options; switch (converter->ptype_view.vector_type) { case VECTOR_TYPE_UNSPECIFIED: return nanoarrow_materialize_unspecified(src, dst, options); case VECTOR_TYPE_LGL: return nanoarrow_materialize_lgl(src, dst, options); case VECTOR_TYPE_INT: return nanoarrow_materialize_int(src, dst, options); case VECTOR_TYPE_DBL: return nanoarrow_materialize_dbl(converter); case VECTOR_TYPE_CHR: return nanoarrow_materialize_chr(src, dst, options); case VECTOR_TYPE_POSIXCT: return nanoarrow_materialize_posixct(converter); case VECTOR_TYPE_DATE: return nanoarrow_materialize_date(converter); case VECTOR_TYPE_DIFFTIME: return nanoarrow_materialize_difftime(converter); case VECTOR_TYPE_BLOB: return nanoarrow_materialize_blob(src, dst, options); case VECTOR_TYPE_LIST_OF: return nanoarrow_materialize_list_of(converter, converter_xptr); case VECTOR_TYPE_DATA_FRAME: return nanoarrow_materialize_data_frame(converter, converter_xptr); default: return ENOTSUP; } }