in common/src/main/java/org/apache/comet/parquet/TypeUtil.java [39:107]
/**
 * Builds the Parquet {@link ColumnDescriptor} that corresponds to a Spark {@link StructField}.
 *
 * <p>The descriptor has a single-element path (the field name), a max repetition level of 0, and
 * a max definition level of 1 for nullable fields or 0 otherwise. The Spark type is mapped to a
 * Parquet physical type plus, where applicable, a logical type annotation:
 *
 * <ul>
 *   <li>boolean / null: BOOLEAN
 *   <li>byte, short, int, year-month interval: INT32 annotated as a signed 8/16/32/32-bit int
 *   <li>date: INT32 annotated as date
 *   <li>long: INT64
 *   <li>timestamp / timestamp-NTZ: INT64 annotated as microsecond timestamp (UTC-adjusted or not)
 *   <li>float, double: FLOAT, DOUBLE
 *   <li>binary: BINARY; string: BINARY annotated as string
 *   <li>decimal: 16-byte FIXED_LEN_BYTE_ARRAY annotated as decimal
 * </ul>
 *
 * @param field the Spark field to convert
 * @return the Parquet column descriptor for {@code field}
 * @throws UnsupportedOperationException if the field's Spark type has no mapping above
 */
public static ColumnDescriptor convertToParquet(StructField field) {
  final boolean nullable = field.nullable();
  final Type.Repetition repetition =
      nullable ? Type.Repetition.OPTIONAL : Type.Repetition.REQUIRED;
  final int maxDefinitionLevel = nullable ? 1 : 0;
  final String columnName = field.name();
  final DataType sparkType = field.dataType();

  // First decide WHAT to build; the builder itself is assembled once, further below.
  PrimitiveType.PrimitiveTypeName physicalType = null; // stays null for unsupported Spark types
  LogicalTypeAnnotation annotation = null; // null means "no logical annotation"
  int fixedLength = 0; // only non-zero for fixed-length byte arrays

  if (sparkType == DataTypes.BooleanType || sparkType == DataTypes.NullType) {
    // `NullType` can only come from a partition column, which is read through
    // `ConstantColumnReader` as a constant vector of nulls. The concrete Parquet type is
    // irrelevant in that case, so boolean is simply reused.
    physicalType = PrimitiveType.PrimitiveTypeName.BOOLEAN;
  } else if (sparkType == DataTypes.IntegerType || sparkType instanceof YearMonthIntervalType) {
    physicalType = PrimitiveType.PrimitiveTypeName.INT32;
    annotation = LogicalTypeAnnotation.intType(32, true);
  } else if (sparkType == DataTypes.DateType) {
    physicalType = PrimitiveType.PrimitiveTypeName.INT32;
    annotation = LogicalTypeAnnotation.dateType();
  } else if (sparkType == DataTypes.ByteType) {
    physicalType = PrimitiveType.PrimitiveTypeName.INT32;
    annotation = LogicalTypeAnnotation.intType(8, true);
  } else if (sparkType == DataTypes.ShortType) {
    physicalType = PrimitiveType.PrimitiveTypeName.INT32;
    annotation = LogicalTypeAnnotation.intType(16, true);
  } else if (sparkType == DataTypes.LongType) {
    physicalType = PrimitiveType.PrimitiveTypeName.INT64;
  } else if (sparkType == DataTypes.BinaryType) {
    physicalType = PrimitiveType.PrimitiveTypeName.BINARY;
  } else if (sparkType == DataTypes.StringType) {
    physicalType = PrimitiveType.PrimitiveTypeName.BINARY;
    annotation = LogicalTypeAnnotation.stringType();
  } else if (sparkType == DataTypes.FloatType) {
    physicalType = PrimitiveType.PrimitiveTypeName.FLOAT;
  } else if (sparkType == DataTypes.DoubleType) {
    physicalType = PrimitiveType.PrimitiveTypeName.DOUBLE;
  } else if (sparkType == DataTypes.TimestampType) {
    physicalType = PrimitiveType.PrimitiveTypeName.INT64;
    annotation = LogicalTypeAnnotation.timestampType(true, TimeUnit.MICROS);
  } else if (sparkType == TimestampNTZType$.MODULE$) {
    physicalType = PrimitiveType.PrimitiveTypeName.INT64;
    annotation = LogicalTypeAnnotation.timestampType(false, TimeUnit.MICROS);
  } else if (sparkType instanceof DecimalType) {
    DecimalType decimal = (DecimalType) sparkType;
    physicalType = PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
    fixedLength = 16; // always store as Decimal128
    annotation = LogicalTypeAnnotation.decimalType(decimal.scale(), decimal.precision());
  }

  if (physicalType == null) {
    throw new UnsupportedOperationException("Unsupported input Spark type: " + sparkType);
  }

  Types.PrimitiveBuilder<PrimitiveType> builder = Types.primitive(physicalType, repetition);
  if (fixedLength > 0) {
    builder = builder.length(fixedLength);
  }
  if (annotation != null) {
    builder = builder.as(annotation);
  }

  return new ColumnDescriptor(
      new String[] {columnName}, builder.named(columnName), 0, maxDefinitionLevel);
}