in hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java [476:603]
public static Object readFromVector(TypeDescription type, ColumnVector colVector, Schema avroSchema, int vectorPos) {
if (colVector.isRepeating) {
vectorPos = 0;
}
if (colVector.isNull[vectorPos]) {
return null;
}
if (avroSchema.getType().equals(Schema.Type.UNION)) {
avroSchema = getActualSchemaType(avroSchema);
}
LogicalType logicalType = avroSchema != null ? avroSchema.getLogicalType() : null;
switch (type.getCategory()) {
case BOOLEAN:
return ((LongColumnVector) colVector).vector[vectorPos] != 0;
case BYTE:
return (byte) ((LongColumnVector) colVector).vector[vectorPos];
case SHORT:
return (short) ((LongColumnVector) colVector).vector[vectorPos];
case INT:
return (int) ((LongColumnVector) colVector).vector[vectorPos];
case LONG:
return ((LongColumnVector) colVector).vector[vectorPos];
case FLOAT:
return (float) ((DoubleColumnVector) colVector).vector[vectorPos];
case DOUBLE:
return ((DoubleColumnVector) colVector).vector[vectorPos];
case VARCHAR:
case CHAR:
int maxLength = type.getMaxLength();
String result = ((BytesColumnVector) colVector).toString(vectorPos);
if (result.length() <= maxLength) {
return result;
} else {
throw new HoodieIOException("CHAR/VARCHAR has length " + result.length() + " greater than Max Length allowed");
}
case STRING:
String stringType = avroSchema.getProp(GenericData.STRING_PROP);
if (stringType == null || !stringType.equals(StringType.String)) {
int stringLength = ((BytesColumnVector) colVector).length[vectorPos];
int stringOffset = ((BytesColumnVector) colVector).start[vectorPos];
byte[] stringBytes = new byte[stringLength];
System.arraycopy(((BytesColumnVector) colVector).vector[vectorPos], stringOffset, stringBytes, 0, stringLength);
return new Utf8(stringBytes);
} else {
return ((BytesColumnVector) colVector).toString(vectorPos);
}
case DATE:
// convert to daysSinceEpoch for LogicalType.Date
return (int) ((LongColumnVector) colVector).vector[vectorPos];
case TIMESTAMP:
// The unit of time in ORC is millis. Convert (time,nanos) to the desired unit per logicalType
long time = ((TimestampColumnVector) colVector).time[vectorPos];
int nanos = ((TimestampColumnVector) colVector).nanos[vectorPos];
if (logicalType instanceof LogicalTypes.TimestampMillis) {
return time;
} else if (logicalType instanceof LogicalTypes.TimestampMicros) {
return time * MICROS_PER_MILLI + nanos / NANOS_PER_MICRO;
} else {
return ((TimestampColumnVector) colVector).getTimestampAsLong(vectorPos);
}
case BINARY:
int binaryLength = ((BytesColumnVector) colVector).length[vectorPos];
int binaryOffset = ((BytesColumnVector) colVector).start[vectorPos];
byte[] binaryBytes = new byte[binaryLength];
System.arraycopy(((BytesColumnVector) colVector).vector[vectorPos], binaryOffset, binaryBytes, 0, binaryLength);
// return a ByteBuffer to be consistent with AvroRecordConverter
return ByteBuffer.wrap(binaryBytes);
case DECIMAL:
// HiveDecimal always ignores trailing zeros, thus modifies the scale implicitly,
// therefore, the scale must be enforced here.
BigDecimal bigDecimal = ((DecimalColumnVector) colVector).vector[vectorPos]
.getHiveDecimal().bigDecimalValue()
.setScale(((LogicalTypes.Decimal) logicalType).getScale());
Schema.Type baseType = avroSchema.getType();
if (baseType.equals(Schema.Type.FIXED)) {
return new Conversions.DecimalConversion().toFixed(bigDecimal, avroSchema, logicalType);
} else if (baseType.equals(Schema.Type.BYTES)) {
return bigDecimal.unscaledValue().toByteArray();
} else {
throw new HoodieIOException(baseType.getName() + "is not a valid type for LogicalTypes.DECIMAL.");
}
case LIST:
ArrayList<Object> list = new ArrayList<>();
ListColumnVector listVector = (ListColumnVector) colVector;
int listLength = (int) listVector.lengths[vectorPos];
int listOffset = (int) listVector.offsets[vectorPos];
list.ensureCapacity(listLength);
TypeDescription childType = type.getChildren().get(0);
for (int i = 0; i < listLength; i++) {
list.add(readFromVector(childType, listVector.child, avroSchema.getElementType(), listOffset + i));
}
return list;
case MAP:
Map<String, Object> map = new HashMap<String, Object>();
MapColumnVector mapVector = (MapColumnVector) colVector;
int mapLength = (int) mapVector.lengths[vectorPos];
int mapOffset = (int) mapVector.offsets[vectorPos];
// keys are always strings for maps in Avro
Schema keySchema = Schema.create(Schema.Type.STRING);
for (int i = 0; i < mapLength; i++) {
map.put(
readFromVector(type.getChildren().get(0), mapVector.keys, keySchema, i + mapOffset).toString(),
readFromVector(type.getChildren().get(1), mapVector.values,
avroSchema.getValueType(), i + mapOffset));
}
return map;
case STRUCT:
StructColumnVector structVector = (StructColumnVector) colVector;
List<TypeDescription> children = type.getChildren();
GenericData.Record record = new GenericData.Record(avroSchema);
for (int i = 0; i < children.size(); i++) {
record.put(i, readFromVector(children.get(i), structVector.fields[i],
avroSchema.getFields().get(i).schema(), vectorPos));
}
return record;
case UNION:
UnionColumnVector unionVector = (UnionColumnVector) colVector;
int tag = unionVector.tags[vectorPos];
ColumnVector fieldVector = unionVector.fields[tag];
return readFromVector(type.getChildren().get(tag), fieldVector, avroSchema.getTypes().get(tag), vectorPos);
default:
throw new HoodieIOException("Unrecognized TypeDescription " + type.toString());
}
}