in java/core/src/java/org/apache/orc/impl/ReaderImpl.java [957:1004]
/**
 * Estimates the raw data size of a single column from its column statistics.
 *
 * <p>The statistics entry is looked up in {@code stats} by the column's id.
 * Binary columns report the byte sum recorded in their statistics; string-like
 * columns are sized from their average length using the Java data model; every
 * other recognized category is sized as value count times a fixed per-value
 * width taken from {@link JavaDataModel}.
 *
 * @param column the column whose size is being estimated
 * @param stats column statistics, indexed by column id
 * @return the estimated size, or 0 when the category is not recognized
 */
private static long getRawDataSizeOfColumn(TypeDescription column,
    List<OrcProto.ColumnStatistics> stats) {
  final OrcProto.ColumnStatistics columnStats = stats.get(column.getId());
  final long valueCount = columnStats.getNumberOfValues();
  final long bytesPerValue;

  switch (column.getCategory()) {
    case BINARY:
      // Files written by the old ORC format carry no binary statistics; no
      // presence check is needed because protocol buffers handles a missing
      // message.
      return columnStats.getBinaryStatistics().getSum();

    case STRING:
    case CHAR:
    case VARCHAR: {
      // Files written by the old ORC format carry no string sum; again protocol
      // buffers handles the missing field, so no presence check is needed.
      // ORC strings are deserialized into Java strings, hence the Java data
      // model's string sizing is used.
      // A zero count is replaced by 1, which both avoids dividing by zero and
      // means an empty column is reported as one string of the average length.
      final long divisor = valueCount == 0 ? 1 : valueCount;
      final int meanLength =
          (int) (columnStats.getStringStatistics().getSum() / divisor);
      return divisor * JavaDataModel.get().lengthForStringOfLength(meanLength);
    }

    case TIMESTAMP:
    case TIMESTAMP_INSTANT:
      bytesPerValue = JavaDataModel.get().lengthOfTimestamp();
      break;

    case DATE:
      bytesPerValue = JavaDataModel.get().lengthOfDate();
      break;

    case DECIMAL:
      bytesPerValue = JavaDataModel.get().lengthOfDecimal();
      break;

    case DOUBLE:
    case LONG:
      bytesPerValue = JavaDataModel.get().primitive2();
      break;

    // The remaining primitives and all compound categories share the
    // primitive1 width per recorded value.
    case FLOAT:
    case INT:
    case SHORT:
    case BOOLEAN:
    case BYTE:
    case STRUCT:
    case UNION:
    case MAP:
    case LIST:
      bytesPerValue = JavaDataModel.get().primitive1();
      break;

    default:
      LOG.debug("Unknown primitive category: {}", column.getCategory());
      return 0;
  }

  return valueCount * bytesPerValue;
}