private static Metrics buildOrcMetrics()

in orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java [120:210]


  private static Metrics buildOrcMetrics(
      final long numOfRows,
      final TypeDescription orcSchema,
      final ColumnStatistics[] colStats,
      final Stream<FieldMetrics<?>> fieldMetricsStream,
      final MetricsConfig metricsConfig,
      final NameMapping mapping) {
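    // Per-column metrics can only be mapped back to Iceberg fields when the ORC schema carries
    // Iceberg field IDs, either directly or through a name mapping; otherwise only the row count
    // is reported.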
    TypeDescription orcSchemaWithIds;
    if (ORCSchemaUtil.hasIds(orcSchema)) {
      orcSchemaWithIds = orcSchema;
    } else if (mapping != null) {
      orcSchemaWithIds = ORCSchemaUtil.applyNameMapping(orcSchema, mapping);
    } else {
      return new Metrics(numOfRows);
    }

    final Set<Integer> statsColumns = statsColumns(orcSchemaWithIds);
    final MetricsConfig effectiveMetricsConfig =
        Optional.ofNullable(metricsConfig).orElseGet(MetricsConfig::getDefault);
    Map<Integer, Long> columnSizes = Maps.newHashMapWithExpectedSize(colStats.length);
    Map<Integer, Long> valueCounts = Maps.newHashMapWithExpectedSize(colStats.length);
    Map<Integer, Long> nullCounts = Maps.newHashMapWithExpectedSize(colStats.length);

    final Schema schema = ORCSchemaUtil.convert(orcSchemaWithIds);
    Map<Integer, ByteBuffer> lowerBounds = Maps.newHashMap();
    Map<Integer, ByteBuffer> upperBounds = Maps.newHashMap();

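    // Per-field metrics supplied by the writer (e.g. NaN counts), keyed by Iceberg field ID;
    // empty when no field metrics stream is provided.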
    Map<Integer, FieldMetrics<?>> fieldMetricsMap =
        Optional.ofNullable(fieldMetricsStream)
            .map(stream -> stream.collect(Collectors.toMap(FieldMetrics::id, Function.identity())))
            .orElseGet(Maps::newHashMap);

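    // Walk every ORC column statistic and map it back to the corresponding Iceberg field.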
    for (int i = 0; i < colStats.length; i++) {
      final ColumnStatistics colStat = colStats[i];
      final TypeDescription orcCol = orcSchemaWithIds.findSubtype(i);
      final Optional<Types.NestedField> icebergColOpt =
          ORCSchemaUtil.icebergID(orcCol).map(schema::findField);

      if (icebergColOpt.isPresent()) {
        final Types.NestedField icebergCol = icebergColOpt.get();
        final int fieldId = icebergCol.fieldId();

        final MetricsMode metricsMode =
            MetricsUtil.metricsMode(schema, effectiveMetricsConfig, icebergCol.fieldId());

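        // Fields with metrics collection disabled, and fields nested inside maps or lists,
        // contribute no per-column metrics.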
        if (metricsMode == MetricsModes.None.get() || inMapOrList(orcCol)) {
          continue;
        }

        columnSizes.put(fieldId, colStat.getBytesOnDisk());

        if (statsColumns.contains(fieldId)) {
          // Since ORC tracks neither null values nor repeated ones, the value count for columns
          // inside containers (maps, lists) may be larger than it actually is; however, these
          // counts are not used in expressions right now. In such cases we use the number of
          // values stored directly in ORC.
          if (colStat.hasNull()) {
            nullCounts.put(fieldId, numOfRows - colStat.getNumberOfValues());
          } else {
            nullCounts.put(fieldId, 0L);
          }
          valueCounts.put(fieldId, colStat.getNumberOfValues() + nullCounts.get(fieldId));

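          // Lower/upper bounds are only recorded for metrics modes beyond Counts, and only when
          // the column has at least one non-null value.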
          if (metricsMode != MetricsModes.Counts.get()) {
            Optional<ByteBuffer> orcMin =
                (colStat.getNumberOfValues() > 0)
                    ? fromOrcMin(
                        icebergCol.type(), colStat, metricsMode, fieldMetricsMap.get(fieldId))
                    : Optional.empty();
            orcMin.ifPresent(byteBuffer -> lowerBounds.put(icebergCol.fieldId(), byteBuffer));
            Optional<ByteBuffer> orcMax =
                (colStat.getNumberOfValues() > 0)
                    ? fromOrcMax(
                        icebergCol.type(), colStat, metricsMode, fieldMetricsMap.get(fieldId))
                    : Optional.empty();
            orcMax.ifPresent(byteBuffer -> upperBounds.put(icebergCol.fieldId(), byteBuffer));
          }
        }
      }
    }

    return new Metrics(
        numOfRows,
        columnSizes,
        valueCounts,
        nullCounts,
        MetricsUtil.createNanValueCounts(
            fieldMetricsMap.values().stream(), effectiveMetricsConfig, schema),
        lowerBounds,
        upperBounds);
  }
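
For context, a minimal caller sketch, assuming the public fromInputFile(InputFile, MetricsConfig, NameMapping) overloads defined in the same OrcMetrics class; the local file path below is hypothetical.

  // Caller sketch (not part of the excerpt above): read the ORC footer statistics of a data
  // file and build Iceberg Metrics from them. Assumes the public fromInputFile overloads of
  // OrcMetrics; the path is hypothetical.
  InputFile file = Files.localInput("/tmp/data.orc"); // org.apache.iceberg.Files
  MetricsConfig config = MetricsConfig.getDefault();
  NameMapping mapping = null; // only needed when the ORC file lacks Iceberg field IDs
  Metrics metrics = OrcMetrics.fromInputFile(file, config, mapping);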