in cpp/src/parquet/arrow/schema.cc [302:498]
// Build the Parquet schema node for one Arrow field and store it in `*out`.
//
// Struct, list-like, map and "parquet.variant" fields are handed to
// StructToNode / ListToNode / MapToNode / VariantToNode. Dictionary fields and
// extension fields without a dedicated mapping are re-dispatched through
// FieldToNode on their value / storage type. Every other handled type becomes
// a PrimitiveNode made from the physical type, logical type and type length
// chosen in the switch below.
//
// \param name name given to the resulting node
// \param field Arrow field supplying the type, nullability and metadata
// \param properties writer properties (version() and
//        store_decimal_as_integer() are read here; also forwarded to helpers)
// \param arrow_properties Arrow writer properties, forwarded to helpers
// \param out receives the resulting node
// \return Status::Invalid for a non-nullable null-typed field,
//         Status::NotImplemented for an unhandled Arrow type, any error
//         reported by the helpers, otherwise Status::OK
Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
                   const WriterProperties& properties,
                   const ArrowWriterProperties& arrow_properties, NodePtr* out) {
  std::shared_ptr<const LogicalType> logical = LogicalType::None();
  ParquetType::type physical;
  Repetition::type repetition = RepetitionFromNullable(field->nullable());
  int field_id = FieldIdFromMetadata(field->metadata());
  // -1 is kept for every type that does not set an explicit length.
  int type_length = -1;

  // Integers that are stored in an INT32 column and carry an
  // Int(bit_width, is_signed) annotation.
  const auto use_annotated_int32 = [&](int bit_width, bool is_signed) {
    physical = ParquetType::INT32;
    logical = LogicalType::Int(bit_width, is_signed);
  };

  switch (field->type()->id()) {
    case ArrowTypeId::NA:
      physical = ParquetType::INT32;
      logical = LogicalType::Null();
      // A null-typed column is only accepted when the field is nullable.
      if (repetition != Repetition::OPTIONAL) {
        return Status::Invalid("NullType Arrow field must be nullable");
      }
      break;

    case ArrowTypeId::BOOL:
      physical = ParquetType::BOOLEAN;
      break;

    case ArrowTypeId::UINT8:
      use_annotated_int32(8, false);
      break;
    case ArrowTypeId::INT8:
      use_annotated_int32(8, true);
      break;
    case ArrowTypeId::UINT16:
      use_annotated_int32(16, false);
      break;
    case ArrowTypeId::INT16:
      use_annotated_int32(16, true);
      break;

    case ArrowTypeId::UINT32:
      // With PARQUET_1_0 the column is a plain INT64 without annotation;
      // any other version uses an unsigned 32-bit annotation on INT32.
      if (properties.version() != ::parquet::ParquetVersion::PARQUET_1_0) {
        use_annotated_int32(32, false);
      } else {
        physical = ParquetType::INT64;
      }
      break;

    case ArrowTypeId::INT32:
      physical = ParquetType::INT32;
      break;

    case ArrowTypeId::UINT64:
      physical = ParquetType::INT64;
      logical = LogicalType::Int(64, false);
      break;

    case ArrowTypeId::INT64:
      physical = ParquetType::INT64;
      break;

    case ArrowTypeId::FLOAT:
      physical = ParquetType::FLOAT;
      break;

    case ArrowTypeId::DOUBLE:
      physical = ParquetType::DOUBLE;
      break;

    case ArrowTypeId::STRING:
    case ArrowTypeId::LARGE_STRING:
      physical = ParquetType::BYTE_ARRAY;
      logical = LogicalType::String();
      break;

    case ArrowTypeId::BINARY:
    case ArrowTypeId::LARGE_BINARY:
      physical = ParquetType::BYTE_ARRAY;
      break;

    case ArrowTypeId::FIXED_SIZE_BINARY: {
      const auto& fsb_type =
          static_cast<const ::arrow::FixedSizeBinaryType&>(*field->type());
      physical = ParquetType::FIXED_LEN_BYTE_ARRAY;
      type_length = fsb_type.byte_width();
    } break;

    case ArrowTypeId::DECIMAL128:
    case ArrowTypeId::DECIMAL256: {
      const auto& decimal = static_cast<const ::arrow::DecimalType&>(*field->type());
      const int precision = decimal.precision();
      const int scale = decimal.scale();
      // Integer storage is used only when the writer asks for it and the
      // precision lies in [1, 18]; precisions up to 9 use INT32, the rest INT64.
      const bool as_integer =
          properties.store_decimal_as_integer() && 1 <= precision && precision <= 18;
      if (!as_integer) {
        physical = ParquetType::FIXED_LEN_BYTE_ARRAY;
        type_length = DecimalType::DecimalSize(precision);
      } else if (precision <= 9) {
        physical = ParquetType::INT32;
      } else {
        physical = ParquetType::INT64;
      }
      PARQUET_CATCH_NOT_OK(logical = LogicalType::Decimal(precision, scale));
    } break;

    // Both Arrow date types map to an INT32 column annotated as Date.
    case ArrowTypeId::DATE32:
    case ArrowTypeId::DATE64:
      physical = ParquetType::INT32;
      logical = LogicalType::Date();
      break;

    case ArrowTypeId::TIMESTAMP:
      // Physical and logical type are both filled in by the helper.
      RETURN_NOT_OK(
          GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()),
                               properties, arrow_properties, &physical, &logical));
      break;

    case ArrowTypeId::TIME32:
      // Always annotated with MILLIS, whatever the Arrow unit is.
      physical = ParquetType::INT32;
      logical =
          LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MILLIS);
      break;

    case ArrowTypeId::TIME64: {
      const auto& time64 = static_cast<const ::arrow::Time64Type&>(*field->type());
      // NANO keeps nanosecond resolution; every other unit is annotated MICROS.
      const auto parquet_unit = (time64.unit() == ::arrow::TimeUnit::NANO)
                                    ? LogicalType::TimeUnit::NANOS
                                    : LogicalType::TimeUnit::MICROS;
      physical = ParquetType::INT64;
      logical = LogicalType::Time(/*is_adjusted_to_utc=*/true, parquet_unit);
    } break;

    case ArrowTypeId::DURATION:
      // Plain INT64, no logical annotation.
      physical = ParquetType::INT64;
      break;

    case ArrowTypeId::HALF_FLOAT:
      physical = ParquetType::FIXED_LEN_BYTE_ARRAY;
      logical = LogicalType::Float16();
      type_length = sizeof(uint16_t);
      break;

    case ArrowTypeId::STRUCT: {
      auto as_struct = std::static_pointer_cast<::arrow::StructType>(field->type());
      return StructToNode(as_struct, name, field->nullable(), field_id, properties,
                          arrow_properties, out);
    }

    case ArrowTypeId::LIST:
    case ArrowTypeId::LARGE_LIST:
    case ArrowTypeId::FIXED_SIZE_LIST: {
      auto as_list = std::static_pointer_cast<::arrow::BaseListType>(field->type());
      return ListToNode(as_list, name, field->nullable(), field_id, properties,
                        arrow_properties, out);
    }

    case ArrowTypeId::DICTIONARY: {
      // Parquet has no dictionary type at the schema level (dictionary encoding
      // is an encoding concern), so convert a field of the value type instead.
      const auto& as_dictionary =
          static_cast<const ::arrow::DictionaryType&>(*field->type());
      std::shared_ptr<::arrow::Field> value_field = ::arrow::field(
          name, as_dictionary.value_type(), field->nullable(), field->metadata());
      return FieldToNode(name, value_field, properties, arrow_properties, out);
    }

    case ArrowTypeId::EXTENSION: {
      auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type());
      const auto ext_name = ext_type->extension_name();

      // Extensions with a dedicated Parquet mapping: the first three leave the
      // switch and end up as primitive nodes, the variant one builds its own node.
      if (ext_name == "arrow.json") {
        physical = ParquetType::BYTE_ARRAY;
        logical = LogicalType::JSON();
        break;
      }
      if (ext_name == "arrow.uuid") {
        physical = ParquetType::FIXED_LEN_BYTE_ARRAY;
        logical = LogicalType::UUID();
        type_length = 16;
        break;
      }
      if (ext_name == "geoarrow.wkb") {
        physical = ParquetType::BYTE_ARRAY;
        ARROW_ASSIGN_OR_RAISE(logical,
                              LogicalTypeFromGeoArrowMetadata(ext_type->Serialize()));
        break;
      }
      if (ext_name == "parquet.variant") {
        auto as_variant = std::static_pointer_cast<VariantExtensionType>(field->type());
        return VariantToNode(as_variant, name, field->nullable(), field_id, properties,
                             arrow_properties, out);
      }

      // Any other extension: convert a field of its storage type.
      std::shared_ptr<::arrow::Field> storage_field = ::arrow::field(
          name, ext_type->storage_type(), field->nullable(), field->metadata());
      return FieldToNode(name, storage_field, properties, arrow_properties, out);
    }

    case ArrowTypeId::MAP: {
      auto as_map = std::static_pointer_cast<::arrow::MapType>(field->type());
      return MapToNode(as_map, name, field->nullable(), field_id, properties,
                       arrow_properties, out);
    }

    default:
      // TODO: DENSE_UNION, SPARSE_UNION, DECIMAL_TEXT, VARCHAR
      return Status::NotImplemented(
          "Unhandled type for Arrow to Parquet schema conversion: ",
          field->type()->ToString());
  }

  // Only the primitive mappings get here; all nested paths returned above.
  PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(name, repetition, logical, physical,
                                                  type_length, field_id));
  return Status::OK();
}