Status FieldToNode()

in cpp/src/parquet/arrow/schema.cc [302:498]


Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
                   const WriterProperties& properties,
                   const ArrowWriterProperties& arrow_properties, NodePtr* out) {
  std::shared_ptr<const LogicalType> logical_type = LogicalType::None();
  ParquetType::type type;
  Repetition::type repetition = RepetitionFromNullable(field->nullable());
  int field_id = FieldIdFromMetadata(field->metadata());

  int length = -1;
  int precision = -1;
  int scale = -1;

  switch (field->type()->id()) {
    case ArrowTypeId::NA: {
      type = ParquetType::INT32;
      logical_type = LogicalType::Null();
      if (repetition != Repetition::OPTIONAL) {
        return Status::Invalid("NullType Arrow field must be nullable");
      }
    } break;
    case ArrowTypeId::BOOL:
      type = ParquetType::BOOLEAN;
      break;
    case ArrowTypeId::UINT8:
      type = ParquetType::INT32;
      logical_type = LogicalType::Int(8, false);
      break;
    case ArrowTypeId::INT8:
      type = ParquetType::INT32;
      logical_type = LogicalType::Int(8, true);
      break;
    case ArrowTypeId::UINT16:
      type = ParquetType::INT32;
      logical_type = LogicalType::Int(16, false);
      break;
    case ArrowTypeId::INT16:
      type = ParquetType::INT32;
      logical_type = LogicalType::Int(16, true);
      break;
    case ArrowTypeId::UINT32:
      if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
        type = ParquetType::INT64;
      } else {
        type = ParquetType::INT32;
        logical_type = LogicalType::Int(32, false);
      }
      break;
    case ArrowTypeId::INT32:
      type = ParquetType::INT32;
      break;
    case ArrowTypeId::UINT64:
      type = ParquetType::INT64;
      logical_type = LogicalType::Int(64, false);
      break;
    case ArrowTypeId::INT64:
      type = ParquetType::INT64;
      break;
    case ArrowTypeId::FLOAT:
      type = ParquetType::FLOAT;
      break;
    case ArrowTypeId::DOUBLE:
      type = ParquetType::DOUBLE;
      break;
    case ArrowTypeId::LARGE_STRING:
    case ArrowTypeId::STRING:
      type = ParquetType::BYTE_ARRAY;
      logical_type = LogicalType::String();
      break;
    case ArrowTypeId::LARGE_BINARY:
    case ArrowTypeId::BINARY:
      type = ParquetType::BYTE_ARRAY;
      break;
    case ArrowTypeId::FIXED_SIZE_BINARY: {
      type = ParquetType::FIXED_LEN_BYTE_ARRAY;
      const auto& fixed_size_binary_type =
          static_cast<const ::arrow::FixedSizeBinaryType&>(*field->type());
      length = fixed_size_binary_type.byte_width();
    } break;
    case ArrowTypeId::DECIMAL128:
    case ArrowTypeId::DECIMAL256: {
      const auto& decimal_type = static_cast<const ::arrow::DecimalType&>(*field->type());
      precision = decimal_type.precision();
      scale = decimal_type.scale();
      if (properties.store_decimal_as_integer() && 1 <= precision && precision <= 18) {
        type = precision <= 9 ? ParquetType ::INT32 : ParquetType ::INT64;
      } else {
        type = ParquetType::FIXED_LEN_BYTE_ARRAY;
        length = DecimalType::DecimalSize(precision);
      }
      PARQUET_CATCH_NOT_OK(logical_type = LogicalType::Decimal(precision, scale));
    } break;
    case ArrowTypeId::DATE32:
      type = ParquetType::INT32;
      logical_type = LogicalType::Date();
      break;
    case ArrowTypeId::DATE64:
      type = ParquetType::INT32;
      logical_type = LogicalType::Date();
      break;
    case ArrowTypeId::TIMESTAMP:
      RETURN_NOT_OK(
          GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()),
                               properties, arrow_properties, &type, &logical_type));
      break;
    case ArrowTypeId::TIME32:
      type = ParquetType::INT32;
      logical_type =
          LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MILLIS);
      break;
    case ArrowTypeId::TIME64: {
      type = ParquetType::INT64;
      auto time_type = static_cast<::arrow::Time64Type*>(field->type().get());
      if (time_type->unit() == ::arrow::TimeUnit::NANO) {
        logical_type =
            LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::NANOS);
      } else {
        logical_type =
            LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MICROS);
      }
    } break;
    case ArrowTypeId::DURATION:
      type = ParquetType::INT64;
      break;
    case ArrowTypeId::HALF_FLOAT:
      type = ParquetType::FIXED_LEN_BYTE_ARRAY;
      logical_type = LogicalType::Float16();
      length = sizeof(uint16_t);
      break;
    case ArrowTypeId::STRUCT: {
      auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type());
      return StructToNode(struct_type, name, field->nullable(), field_id, properties,
                          arrow_properties, out);
    }
    case ArrowTypeId::FIXED_SIZE_LIST:
    case ArrowTypeId::LARGE_LIST:
    case ArrowTypeId::LIST: {
      auto list_type = std::static_pointer_cast<::arrow::BaseListType>(field->type());
      return ListToNode(list_type, name, field->nullable(), field_id, properties,
                        arrow_properties, out);
    }
    case ArrowTypeId::DICTIONARY: {
      // Parquet has no Dictionary type, dictionary-encoded is handled on
      // the encoding, not the schema level.
      const ::arrow::DictionaryType& dict_type =
          static_cast<const ::arrow::DictionaryType&>(*field->type());
      std::shared_ptr<::arrow::Field> unpacked_field = ::arrow::field(
          name, dict_type.value_type(), field->nullable(), field->metadata());
      return FieldToNode(name, unpacked_field, properties, arrow_properties, out);
    }
    case ArrowTypeId::EXTENSION: {
      auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type());
      // Set physical and logical types and instantiate primitive node
      // for extension types
      if (ext_type->extension_name() == std::string_view("arrow.json")) {
        type = ParquetType::BYTE_ARRAY;
        logical_type = LogicalType::JSON();
        break;
      } else if (ext_type->extension_name() == std::string("arrow.uuid")) {
        type = ParquetType::FIXED_LEN_BYTE_ARRAY;
        logical_type = LogicalType::UUID();
        length = 16;
        break;
      } else if (ext_type->extension_name() == std::string_view("geoarrow.wkb")) {
        type = ParquetType::BYTE_ARRAY;
        ARROW_ASSIGN_OR_RAISE(logical_type,
                              LogicalTypeFromGeoArrowMetadata(ext_type->Serialize()));
        break;
      } else if (ext_type->extension_name() == std::string("parquet.variant")) {
        auto variant_type = std::static_pointer_cast<VariantExtensionType>(field->type());

        return VariantToNode(variant_type, name, field->nullable(), field_id, properties,
                             arrow_properties, out);
      }

      std::shared_ptr<::arrow::Field> storage_field = ::arrow::field(
          name, ext_type->storage_type(), field->nullable(), field->metadata());
      return FieldToNode(name, storage_field, properties, arrow_properties, out);
    }
    case ArrowTypeId::MAP: {
      auto map_type = std::static_pointer_cast<::arrow::MapType>(field->type());
      return MapToNode(map_type, name, field->nullable(), field_id, properties,
                       arrow_properties, out);
    }

    default: {
      // TODO: DENSE_UNION, SPARE_UNION, DECIMAL_TEXT, VARCHAR
      return Status::NotImplemented(
          "Unhandled type for Arrow to Parquet schema conversion: ",
          field->type()->ToString());
    }
  }

  PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(name, repetition, logical_type, type,
                                                  length, field_id));

  return Status::OK();
}