Status FieldToNode()

in src/parquet/arrow/schema.cc [466:605]


Status FieldToNode(const std::shared_ptr<Field>& field,
                   const WriterProperties& properties,
                   const ArrowWriterProperties& arrow_properties, NodePtr* out) {
  LogicalType::type logical_type = LogicalType::NONE;
  ParquetType::type type;
  Repetition::type repetition =
      field->nullable() ? Repetition::OPTIONAL : Repetition::REQUIRED;

  int length = -1;
  int precision = -1;
  int scale = -1;

  switch (field->type()->id()) {
    case ArrowType::NA:
      type = ParquetType::INT32;
      logical_type = LogicalType::NA;
      break;
    case ArrowType::BOOL:
      type = ParquetType::BOOLEAN;
      break;
    case ArrowType::UINT8:
      type = ParquetType::INT32;
      logical_type = LogicalType::UINT_8;
      break;
    case ArrowType::INT8:
      type = ParquetType::INT32;
      logical_type = LogicalType::INT_8;
      break;
    case ArrowType::UINT16:
      type = ParquetType::INT32;
      logical_type = LogicalType::UINT_16;
      break;
    case ArrowType::INT16:
      type = ParquetType::INT32;
      logical_type = LogicalType::INT_16;
      break;
    case ArrowType::UINT32:
      if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
        type = ParquetType::INT64;
      } else {
        type = ParquetType::INT32;
        logical_type = LogicalType::UINT_32;
      }
      break;
    case ArrowType::INT32:
      type = ParquetType::INT32;
      break;
    case ArrowType::UINT64:
      type = ParquetType::INT64;
      logical_type = LogicalType::UINT_64;
      break;
    case ArrowType::INT64:
      type = ParquetType::INT64;
      break;
    case ArrowType::FLOAT:
      type = ParquetType::FLOAT;
      break;
    case ArrowType::DOUBLE:
      type = ParquetType::DOUBLE;
      break;
    case ArrowType::STRING:
      type = ParquetType::BYTE_ARRAY;
      logical_type = LogicalType::UTF8;
      break;
    case ArrowType::BINARY:
      type = ParquetType::BYTE_ARRAY;
      break;
    case ArrowType::FIXED_SIZE_BINARY: {
      type = ParquetType::FIXED_LEN_BYTE_ARRAY;
      const auto& fixed_size_binary_type =
          static_cast<const ::arrow::FixedSizeBinaryType&>(*field->type());
      length = fixed_size_binary_type.byte_width();
    } break;
    case ArrowType::DECIMAL: {
      type = ParquetType::FIXED_LEN_BYTE_ARRAY;
      logical_type = LogicalType::DECIMAL;
      const auto& decimal_type =
          static_cast<const ::arrow::Decimal128Type&>(*field->type());
      precision = decimal_type.precision();
      scale = decimal_type.scale();
      length = DecimalSize(precision);
    } break;
    case ArrowType::DATE32:
      type = ParquetType::INT32;
      logical_type = LogicalType::DATE;
      break;
    case ArrowType::DATE64:
      type = ParquetType::INT32;
      logical_type = LogicalType::DATE;
      break;
    case ArrowType::TIMESTAMP:
      RETURN_NOT_OK(
          GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()),
                               arrow_properties, &type, &logical_type));
      break;
    case ArrowType::TIME32:
      type = ParquetType::INT32;
      logical_type = LogicalType::TIME_MILLIS;
      break;
    case ArrowType::TIME64: {
      auto time_type = static_cast<::arrow::Time64Type*>(field->type().get());
      if (time_type->unit() == ::arrow::TimeUnit::NANO) {
        return Status::NotImplemented("Nanosecond time not supported in Parquet.");
      }
      type = ParquetType::INT64;
      logical_type = LogicalType::TIME_MICROS;
    } break;
    case ArrowType::STRUCT: {
      auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type());
      return StructToNode(struct_type, field->name(), field->nullable(), properties,
                          arrow_properties, out);
    }
    case ArrowType::LIST: {
      auto list_type = std::static_pointer_cast<::arrow::ListType>(field->type());
      return ListToNode(list_type, field->name(), field->nullable(), properties,
                        arrow_properties, out);
    }
    case ArrowType::DICTIONARY: {
      // Parquet has no Dictionary type, dictionary-encoded is handled on
      // the encoding, not the schema level.
      const ::arrow::DictionaryType& dict_type =
          static_cast<const ::arrow::DictionaryType&>(*field->type());
      std::shared_ptr<::arrow::Field> unpacked_field =
          ::arrow::field(field->name(), dict_type.dictionary()->type(), field->nullable(),
                         field->metadata());
      return FieldToNode(unpacked_field, properties, arrow_properties, out);
    }
    default: {
      // TODO: DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR
      std::stringstream ss;
      ss << "Unhandled type for Arrow to Parquet schema conversion: ";
      ss << field->type()->ToString();
      return Status::NotImplemented(ss.str());
    }
  }
  PARQUET_CATCH_NOT_OK(*out =
                           PrimitiveNode::Make(field->name(), repetition, type,
                                               logical_type, length, precision, scale));
  return Status::OK();
}