std::optional<size_t> DwrfRowReaderShared::estimatedRowSizeHelper()

in velox/dwio/dwrf/reader/DwrfReaderShared.cpp [420:508]


// Estimates the total number of bytes occupied by all values of the column
// subtree rooted at 'nodeId', derived from the column statistics in 'stats'
// and the type description in 'footer'.
//
// @param footer File footer; footer.types(nodeId) supplies the type kind and
//               the node ids of its subtypes.
// @param stats  Statistics indexed by node id.
// @param nodeId Node to estimate. Must be less than footer.types_size(),
//               otherwise DWIO_ENSURE_LT fails with "Types missing in footer".
// @return The estimated byte size, 0 when the node has no values, or
//         std::nullopt when the estimate cannot be computed: the value count
//         is missing, a STRING/BINARY node has no usable total length, the
//         type kind is not handled, or a selected child has no estimate.
std::optional<size_t> DwrfRowReaderShared::estimatedRowSizeHelper(
    const proto::Footer& footer,
    const dwio::common::Statistics& stats,
    uint32_t nodeId) const {
  DWIO_ENSURE_LT(nodeId, footer.types_size(), "Types missing in footer");

  const auto& s = stats.getColumnStatistics(nodeId);
  const auto& t = footer.types(nodeId);
  if (!s.getNumberOfValues()) {
    return std::nullopt;
  }
  const auto valueCount = s.getNumberOfValues().value();
  if (valueCount < 1) {
    return 0;
  }
  switch (t.kind()) {
    // Fixed-width kinds: value count times the width used for one value.
    case proto::Type_Kind_BOOLEAN:
    case proto::Type_Kind_BYTE: {
      return valueCount * sizeof(uint8_t);
    }
    case proto::Type_Kind_SHORT: {
      return valueCount * sizeof(uint16_t);
    }
    case proto::Type_Kind_INT: {
      return valueCount * sizeof(uint32_t);
    }
    case proto::Type_Kind_LONG: {
      return valueCount * sizeof(uint64_t);
    }
    case proto::Type_Kind_FLOAT: {
      return valueCount * sizeof(float);
    }
    case proto::Type_Kind_DOUBLE: {
      return valueCount * sizeof(double);
    }
    // Variable-width kinds: rely on the total length recorded in the
    // type-specific statistics; without it no estimate is possible.
    case proto::Type_Kind_STRING: {
      const auto* stringStats =
          dynamic_cast<const dwio::common::StringColumnStatistics*>(&s);
      if (!stringStats) {
        return std::nullopt;
      }
      const auto length = stringStats->getTotalLength();
      if (!length) {
        return std::nullopt;
      }
      return length.value();
    }
    case proto::Type_Kind_BINARY: {
      const auto* binaryStats =
          dynamic_cast<const dwio::common::BinaryColumnStatistics*>(&s);
      if (!binaryStats) {
        return std::nullopt;
      }
      const auto length = binaryStats->getTotalLength();
      if (!length) {
        return std::nullopt;
      }
      return length.value();
    }
    case proto::Type_Kind_TIMESTAMP: {
      // Two 64-bit words per value (presumably seconds and nanoseconds).
      return valueCount * sizeof(uint64_t) * 2;
    }
    case proto::Type_Kind_LIST:
    case proto::Type_Kind_MAP:
    case proto::Type_Kind_STRUCT:
    case proto::Type_Kind_UNION: {
      // start the estimate with the offsets and hasNulls vectors sizes
      size_t totalEstimate = valueCount * (sizeof(uint8_t) + sizeof(uint64_t));
      const bool isUnion = t.kind() == proto::Type_Kind_UNION;
      for (int32_t i = 0; i < t.subtypes_size(); ++i) {
        const auto childId = t.subtypes(i);
        // Skip children that are not selected, but keep scanning: a selected
        // child may follow an unselected one. Using this test as the loop
        // condition would drop every child after the first unselected one.
        if (!columnSelector_->shouldReadNode(childId)) {
          continue;
        }
        const auto subtypeEstimate =
            estimatedRowSizeHelper(footer, stats, childId);
        if (!subtypeEstimate.has_value()) {
          return std::nullopt;
        }
        // NOTE(review): for UNION the running value is the max of the
        // overhead above and each child estimate, not overhead plus the
        // largest child -- confirm this is intended.
        totalEstimate = isUnion
            ? std::max(totalEstimate, subtypeEstimate.value())
            : (totalEstimate + subtypeEstimate.value());
      }
      return totalEstimate;
    }
    default:
      return std::nullopt;
  }
}