in orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java [120:210]
private static Metrics buildOrcMetrics(
final long numOfRows,
final TypeDescription orcSchema,
final ColumnStatistics[] colStats,
final Stream<FieldMetrics<?>> fieldMetricsStream,
final MetricsConfig metricsConfig,
final NameMapping mapping) {
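// Resolve Iceberg field IDs on the ORC schema: keep existing IDs if present, otherwise
// apply the name mapping; with neither source of IDs, only the row count can be reported.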
TypeDescription orcSchemaWithIds;
if (ORCSchemaUtil.hasIds(orcSchema)) {
orcSchemaWithIds = orcSchema;
} else if (mapping != null) {
orcSchemaWithIds = ORCSchemaUtil.applyNameMapping(orcSchema, mapping);
} else {
return new Metrics(numOfRows);
}
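// Determine which field IDs are eligible for value/null counts and bounds, and fall back
// to the default metrics config when none is supplied.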
final Set<Integer> statsColumns = statsColumns(orcSchemaWithIds);
final MetricsConfig effectiveMetricsConfig =
Optional.ofNullable(metricsConfig).orElseGet(MetricsConfig::getDefault);
Map<Integer, Long> columnSizes = Maps.newHashMapWithExpectedSize(colStats.length);
Map<Integer, Long> valueCounts = Maps.newHashMapWithExpectedSize(colStats.length);
Map<Integer, Long> nullCounts = Maps.newHashMapWithExpectedSize(colStats.length);
final Schema schema = ORCSchemaUtil.convert(orcSchemaWithIds);
Map<Integer, ByteBuffer> lowerBounds = Maps.newHashMap();
Map<Integer, ByteBuffer> upperBounds = Maps.newHashMap();
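// Index writer-collected field metrics by field ID; they feed NaN value counts and are
// consulted when deriving lower/upper bounds.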
Map<Integer, FieldMetrics<?>> fieldMetricsMap =
Optional.ofNullable(fieldMetricsStream)
.map(stream -> stream.collect(Collectors.toMap(FieldMetrics::id, Function.identity())))
.orElseGet(Maps::newHashMap);
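// Walk the ORC column statistics, mapping each column back to its Iceberg field through
// the embedded field ID; columns without a matching Iceberg field are skipped.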
for (int i = 0; i < colStats.length; i++) {
final ColumnStatistics colStat = colStats[i];
final TypeDescription orcCol = orcSchemaWithIds.findSubtype(i);
final Optional<Types.NestedField> icebergColOpt =
ORCSchemaUtil.icebergID(orcCol).map(schema::findField);
if (icebergColOpt.isPresent()) {
final Types.NestedField icebergCol = icebergColOpt.get();
final int fieldId = icebergCol.fieldId();
final MetricsMode metricsMode =
MetricsUtil.metricsMode(schema, effectiveMetricsConfig, fieldId);
if (metricsMode == MetricsModes.None.get() || inMapOrList(orcCol)) {
continue;
}
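// Record the column's on-disk size; counts and bounds are added only for columns tracked
// in statsColumns.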
columnSizes.put(fieldId, colStat.getBytesOnDisk());
if (statsColumns.contains(fieldId)) {
// ORC does not track null or repeated values, so the value count for columns nested in
// containers (maps, lists) may be larger than the actual count; these counts are not
// used in expressions right now. For such cases, we use the number of values stored
// directly in ORC.
if (colStat.hasNull()) {
nullCounts.put(fieldId, numOfRows - colStat.getNumberOfValues());
} else {
nullCounts.put(fieldId, 0L);
}
valueCounts.put(fieldId, colStat.getNumberOfValues() + nullCounts.get(fieldId));
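// Lower/upper bounds are derived only when the metrics mode requests more than counts
// (i.e. truncate or full).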
if (metricsMode != MetricsModes.Counts.get()) {
Optional<ByteBuffer> orcMin =
(colStat.getNumberOfValues() > 0)
? fromOrcMin(
icebergCol.type(), colStat, metricsMode, fieldMetricsMap.get(fieldId))
: Optional.empty();
orcMin.ifPresent(byteBuffer -> lowerBounds.put(fieldId, byteBuffer));
Optional<ByteBuffer> orcMax =
(colStat.getNumberOfValues() > 0)
? fromOrcMax(
icebergCol.type(), colStat, metricsMode, fieldMetricsMap.get(fieldId))
: Optional.empty();
orcMax.ifPresent(byteBuffer -> upperBounds.put(fieldId, byteBuffer));
}
}
}
}
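// Assemble the final metrics; NaN value counts are derived from the writer-collected
// field metrics rather than from ORC column statistics.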
return new Metrics(
numOfRows,
columnSizes,
valueCounts,
nullCounts,
MetricsUtil.createNanValueCounts(
fieldMetricsMap.values().stream(), effectiveMetricsConfig, schema),
lowerBounds,
upperBounds);
}