in src/parquet/arrow/schema.cc [466:605]
// Builds the Parquet schema node that corresponds to a single Arrow field.
//
// Primitive Arrow types are translated into a Parquet physical type plus an
// optional logical annotation, and the resulting PrimitiveNode is stored in
// `*out`. Struct and list fields are delegated to StructToNode / ListToNode,
// and dictionary fields are re-dispatched using the type of their dictionary
// values.
//
// \param[in] field the Arrow field to convert
// \param[in] properties writer properties; the Parquet version decides how
//            UINT32 is stored
// \param[in] arrow_properties forwarded to the timestamp, struct, list and
//            dictionary handling
// \param[out] out receives the created node
// \return Status::OK on success; NotImplemented for nanosecond TIME64 fields
//         and for Arrow types without a mapping here; otherwise whatever the
//         delegated helpers report
Status FieldToNode(const std::shared_ptr<Field>& field,
                   const WriterProperties& properties,
                   const ArrowWriterProperties& arrow_properties, NodePtr* out) {
  ParquetType::type physical;
  LogicalType::type annotation = LogicalType::NONE;
  // These stay at -1 unless the Arrow type below supplies a value.
  int byte_length = -1;
  int decimal_precision = -1;
  int decimal_scale = -1;

  // Records the physical type / logical annotation pair for a primitive node.
  const auto assign = [&physical, &annotation](ParquetType::type p,
                                               LogicalType::type a) {
    physical = p;
    annotation = a;
  };

  switch (field->type()->id()) {
    case ArrowType::NA:
      assign(ParquetType::INT32, LogicalType::NA);
      break;
    case ArrowType::BOOL:
      assign(ParquetType::BOOLEAN, LogicalType::NONE);
      break;

    // Integers narrower than 32 bits are stored as annotated INT32.
    case ArrowType::UINT8:
      assign(ParquetType::INT32, LogicalType::UINT_8);
      break;
    case ArrowType::INT8:
      assign(ParquetType::INT32, LogicalType::INT_8);
      break;
    case ArrowType::UINT16:
      assign(ParquetType::INT32, LogicalType::UINT_16);
      break;
    case ArrowType::INT16:
      assign(ParquetType::INT32, LogicalType::INT_16);
      break;
    case ArrowType::UINT32: {
      // For Parquet 1.0 the value is stored as a plain INT64 with no
      // annotation; otherwise as INT32 annotated with UINT_32.
      const bool is_parquet_1_0 =
          properties.version() == ::parquet::ParquetVersion::PARQUET_1_0;
      if (is_parquet_1_0) {
        assign(ParquetType::INT64, LogicalType::NONE);
      } else {
        assign(ParquetType::INT32, LogicalType::UINT_32);
      }
    } break;
    case ArrowType::INT32:
      assign(ParquetType::INT32, LogicalType::NONE);
      break;
    case ArrowType::UINT64:
      assign(ParquetType::INT64, LogicalType::UINT_64);
      break;
    case ArrowType::INT64:
      assign(ParquetType::INT64, LogicalType::NONE);
      break;

    case ArrowType::FLOAT:
      assign(ParquetType::FLOAT, LogicalType::NONE);
      break;
    case ArrowType::DOUBLE:
      assign(ParquetType::DOUBLE, LogicalType::NONE);
      break;

    case ArrowType::STRING:
      assign(ParquetType::BYTE_ARRAY, LogicalType::UTF8);
      break;
    case ArrowType::BINARY:
      assign(ParquetType::BYTE_ARRAY, LogicalType::NONE);
      break;
    case ArrowType::FIXED_SIZE_BINARY: {
      const auto& fsb_type =
          static_cast<const ::arrow::FixedSizeBinaryType&>(*field->type());
      assign(ParquetType::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE);
      byte_length = fsb_type.byte_width();
    } break;
    case ArrowType::DECIMAL: {
      const auto& dec_type =
          static_cast<const ::arrow::Decimal128Type&>(*field->type());
      assign(ParquetType::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL);
      decimal_precision = dec_type.precision();
      decimal_scale = dec_type.scale();
      // The fixed byte width is derived from the precision.
      byte_length = DecimalSize(decimal_precision);
    } break;

    // Both Arrow date types share the same schema-level representation.
    // NOTE(review): DATE64 value conversion presumably happens in the column
    // writer rather than here -- confirm.
    case ArrowType::DATE32:
    case ArrowType::DATE64:
      assign(ParquetType::INT32, LogicalType::DATE);
      break;
    case ArrowType::TIMESTAMP:
      // The helper fills in both the physical type and the annotation.
      RETURN_NOT_OK(
          GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()),
                               arrow_properties, &physical, &annotation));
      break;
    case ArrowType::TIME32:
      // The Time32 unit is not inspected; the annotation is always TIME_MILLIS.
      assign(ParquetType::INT32, LogicalType::TIME_MILLIS);
      break;
    case ArrowType::TIME64: {
      const auto& time64_type =
          static_cast<const ::arrow::Time64Type&>(*field->type());
      if (time64_type.unit() == ::arrow::TimeUnit::NANO) {
        return Status::NotImplemented("Nanosecond time not supported in Parquet.");
      }
      assign(ParquetType::INT64, LogicalType::TIME_MICROS);
    } break;

    // Nested types build their own group nodes and return directly.
    case ArrowType::STRUCT: {
      auto as_struct = std::static_pointer_cast<::arrow::StructType>(field->type());
      return StructToNode(as_struct, field->name(), field->nullable(), properties,
                          arrow_properties, out);
    }
    case ArrowType::LIST: {
      auto as_list = std::static_pointer_cast<::arrow::ListType>(field->type());
      return ListToNode(as_list, field->name(), field->nullable(), properties,
                        arrow_properties, out);
    }
    case ArrowType::DICTIONARY: {
      // Parquet has no dictionary type at the schema level (dictionary
      // encoding is an encoding concern), so convert a field that carries
      // the type of the dictionary values instead.
      const auto& as_dictionary =
          static_cast<const ::arrow::DictionaryType&>(*field->type());
      std::shared_ptr<::arrow::Field> value_field =
          ::arrow::field(field->name(), as_dictionary.dictionary()->type(),
                         field->nullable(), field->metadata());
      return FieldToNode(value_field, properties, arrow_properties, out);
    }

    default: {
      // TODO: DENSE_UNION, SPARSE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR
      std::stringstream message;
      message << "Unhandled type for Arrow to Parquet schema conversion: "
              << field->type()->ToString();
      return Status::NotImplemented(message.str());
    }
  }

  // Only primitive types reach this point. Nullable fields become OPTIONAL,
  // all others REQUIRED.
  const Repetition::type repetition =
      field->nullable() ? Repetition::OPTIONAL : Repetition::REQUIRED;
  PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(field->name(), repetition, physical,
                                                  annotation, byte_length,
                                                  decimal_precision, decimal_scale));
  return Status::OK();
}