in server/pxf-hdfs/src/main/java/org/greenplum/pxf/plugins/hdfs/ParquetResolver.java [120:207]
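    /**
     * Fills one column of the given Parquet group with the value of a single
     * Greenplum field, converting it to the physical type declared by the
     * schema. A null value is skipped, which leaves the column unset in the
     * group (Parquet encodes nulls through definition levels, not a sentinel).
     */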
    private void fillGroup(int index, OneField field, Group group, Type type) throws IOException {
        if (field.val == null)
            return;
        switch (type.asPrimitiveType().getPrimitiveTypeName()) {
            case BINARY:
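                // Text-like columns arrive as String (the schema carries a string
                // annotation); other binary data, e.g. BYTEA, arrives as byte[].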
                if (type.getLogicalTypeAnnotation() instanceof StringLogicalTypeAnnotation)
                    group.add(index, (String) field.val);
                else
                    group.add(index, Binary.fromReusedByteArray((byte[]) field.val));
                break;
            case INT32:
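                // INT32 backs three cases: DATE values arrive as strings and are
                // stored as days since the Unix epoch, int(16)-annotated columns
                // (SMALLINT) arrive as Short, and everything else as Integer.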
                if (type.getLogicalTypeAnnotation() instanceof DateLogicalTypeAnnotation) {
                    String dateString = (String) field.val;
                    group.add(index, ParquetTypeConverter.getDaysFromEpochFromDateString(dateString));
                } else if (type.getLogicalTypeAnnotation() instanceof IntLogicalTypeAnnotation &&
                        ((IntLogicalTypeAnnotation) type.getLogicalTypeAnnotation()).getBitWidth() == 16) {
                    group.add(index, (Short) field.val);
                } else {
                    group.add(index, (Integer) field.val);
                }
                break;
            case INT64:
                group.add(index, (Long) field.val);
                break;
            case DOUBLE:
                group.add(index, (Double) field.val);
                break;
            case FLOAT:
                group.add(index, (Float) field.val);
                break;
            case FIXED_LEN_BYTE_ARRAY:
                // From org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter.DecimalDataWriter#decimalToBinary
                String value = (String) field.val;
                DecimalLogicalTypeAnnotation typeAnnotation = (DecimalLogicalTypeAnnotation) type.getLogicalTypeAnnotation();
                int precision = Math.min(HiveDecimal.MAX_PRECISION, typeAnnotation.getPrecision());
                int scale = Math.min(HiveDecimal.MAX_SCALE, typeAnnotation.getScale());
                HiveDecimal hiveDecimal = HiveDecimal.enforcePrecisionScale(
                        HiveDecimal.create(value),
                        precision,
                        scale);
                if (hiveDecimal == null) {
                    // When the value's precision exceeds HiveDecimal.MAX_PRECISION,
                    // enforcePrecisionScale returns null, meaning the value cannot
                    // be stored in Parquet at this precision. To stay consistent
                    // with Hive's behavior when writing to a Parquet-backed table,
                    // we store the value as null.
                    return;
                }
                byte[] decimalBytes = hiveDecimal.bigIntegerBytesScaled(scale);
                // Number of bytes the fixed-length field uses for this precision.
                int precToBytes = ParquetFileAccessor.PRECISION_TO_BYTE_COUNT[precision - 1];
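                // Illustrative example (values assumed, not from the original code):
                // for DECIMAL(5,2) the fixed length is 3 bytes, while "123.45" has
                // the unscaled value 12345, whose two's complement bytes are
                // 0x30 0x39, so one leading 0x00 pad byte is added; for "-123.45"
                // the bytes are 0xCF 0xC7 and the pad byte is 0xFF.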
                if (precToBytes == decimalBytes.length) {
                    // No padding needed.
                    group.add(index, Binary.fromReusedByteArray(decimalBytes));
                } else {
                    byte[] tgt = new byte[precToBytes];
                    if (hiveDecimal.signum() == -1) {
                        // For negative numbers, initialize all bits to 1 so the
                        // leading pad bytes sign-extend the two's complement value.
                        for (int i = 0; i < precToBytes; i++) {
                            tgt[i] |= 0xFF;
                        }
                    }
                    System.arraycopy(decimalBytes, 0, tgt, precToBytes - decimalBytes.length, decimalBytes.length); // Padding leading zeroes/ones.
                    group.add(index, Binary.fromReusedByteArray(tgt));
                }
                // end -- org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter.DecimalDataWriter#decimalToBinary
                break;
            case INT96: // SQL standard timestamp string value with or without time zone literals: https://www.postgresql.org/docs/9.4/datatype-datetime.html
                String timestamp = (String) field.val;
                if (TIMESTAMP_PATTERN.matcher(timestamp).find()) {
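                    // Illustrative values (assumed): "2019-03-14 14:10:28+07" carries
                    // a zone offset and takes this branch; "2019-03-14 14:10:28.123"
                    // does not match and falls through to the plain timestamp path.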
                    // Note: converting a "timestamp with time zone" value preserves the
                    // instant but drops the zone itself, since Parquet has no
                    // timestamp-with-time-zone type.
                    group.add(index, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone(timestamp));
                } else {
                    group.add(index, ParquetTypeConverter.getBinaryFromTimestamp(timestamp));
                }
                break;
            case BOOLEAN:
                group.add(index, (Boolean) field.val);
                break;
            default:
                throw new IOException("Unsupported primitive type " + type.asPrimitiveType().getPrimitiveTypeName());
        }
    }