in spark/spark-3.3/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/GeoParquetSchemaConverter.scala [109:200]
/** Returns true when `fieldName` is registered as a geometry column in the GeoParquet file metadata. */
private def isGeometryField(fieldName: String): Boolean = {
  val geometryColumns = geoParquetMetaData.columns
  geometryColumns.contains(fieldName)
}
/**
 * Converts a Parquet primitive field to its corresponding Spark SQL [[DataType]].
 *
 * This mirrors Spark's built-in Parquet schema conversion, with one GeoParquet-specific
 * addition: an untyped BINARY column whose name appears in the GeoParquet metadata is
 * mapped to [[GeometryUDT]] instead of `BinaryType`/`StringType`.
 *
 * @param field the Parquet primitive type to convert
 * @return the Catalyst type for `field`
 * @throws IllegalArgumentException for unsupported, unimplemented, or illegal Parquet types
 */
private def convertPrimitiveField(field: PrimitiveType): DataType = {
val typeName = field.getPrimitiveTypeName
val originalType = field.getOriginalType
// Human-readable type description used in all error messages below.
def typeString =
if (originalType == null) s"$typeName" else s"$typeName ($originalType)"
def typeNotSupported() =
throw new IllegalArgumentException(s"Parquet type not supported: $typeString")
def typeNotImplemented() =
throw new IllegalArgumentException(s"Parquet type not yet supported: $typeString")
def illegalType() =
throw new IllegalArgumentException(s"Illegal Parquet type: $typeString")
// When maxPrecision = -1, we skip precision range check, and always respect the precision
// specified in field.getDecimalMetadata. This is useful when interpreting decimal types stored
// as binaries with variable lengths.
def makeDecimalType(maxPrecision: Int = -1): DecimalType = {
val precision = field.getDecimalMetadata.getPrecision
val scale = field.getDecimalMetadata.getScale
ParquetSchemaConverter.checkConversionRequirement(
maxPrecision == -1 || 1 <= precision && precision <= maxPrecision,
s"Invalid decimal precision: $typeName cannot store $precision digits (max $maxPrecision)")
DecimalType(precision, scale)
}
typeName match {
case BOOLEAN => BooleanType
case FLOAT => FloatType
case DOUBLE => DoubleType
case INT32 =>
originalType match {
case INT_8 => ByteType
case INT_16 => ShortType
// A bare INT32 (no logical annotation) defaults to IntegerType.
case INT_32 | null => IntegerType
case DATE => DateType
// INT32-backed decimals are capped at the max digits a 4-byte int can hold.
case DECIMAL => makeDecimalType(Decimal.MAX_INT_DIGITS)
// Unsigned ints have no Catalyst counterpart here; reject explicitly.
case UINT_8 => typeNotSupported()
case UINT_16 => typeNotSupported()
case UINT_32 => typeNotSupported()
case TIME_MILLIS => typeNotImplemented()
case _ => illegalType()
}
case INT64 =>
originalType match {
case INT_64 | null => LongType
// INT64-backed decimals are capped at the max digits an 8-byte long can hold.
case DECIMAL => makeDecimalType(Decimal.MAX_LONG_DIGITS)
case UINT_64 => typeNotSupported()
// Both micro- and milli-second annotated INT64s surface as Catalyst TimestampType.
case TIMESTAMP_MICROS => TimestampType
case TIMESTAMP_MILLIS => TimestampType
case _ => illegalType()
}
case INT96 =>
// INT96 (legacy Impala/Hive timestamp encoding) is only accepted when the session
// is configured to interpret it as a timestamp.
ParquetSchemaConverter.checkConversionRequirement(
assumeInt96IsTimestamp,
"INT96 is not supported unless it's interpreted as timestamp. " +
s"Please try to set ${SQLConf.PARQUET_INT96_AS_TIMESTAMP.key} to true.")
TimestampType
case BINARY =>
originalType match {
case UTF8 | ENUM | JSON => StringType
// GeoParquet: the guarded null cases below are order-sensitive. An unannotated
// binary column named in the GeoParquet metadata becomes a geometry; otherwise
// fall through to the assumeBinaryIsString setting, then plain binary.
case null if isGeometryField(field.getName) => GeometryUDT
case null if assumeBinaryIsString => StringType
case null => BinaryType
case BSON => BinaryType
// Binary-backed decimals have variable length, so no precision cap (maxPrecision = -1).
case DECIMAL => makeDecimalType()
case _ => illegalType()
}
case FIXED_LEN_BYTE_ARRAY =>
originalType match {
// Precision is bounded by what the fixed byte length can represent.
case DECIMAL => makeDecimalType(Decimal.maxPrecisionForBytes(field.getTypeLength))
case INTERVAL => typeNotImplemented()
case _ => illegalType()
}
case _ => illegalType()
}
}