in velox/dwio/dwrf/reader/DwrfReaderShared.cpp [420:508]
// Recursively estimates the total size of the values stored under the type
// node `nodeId`, using the per-column statistics in `stats` and the type tree
// in `footer`.
//
// - Fixed-width kinds are estimated as `valueCount * sizeof(<representation>)`.
// - STRING and BINARY use the total length recorded in their statistics.
// - LIST / MAP / STRUCT / UNION start from a per-value overhead and fold in the
//   estimates of the subtypes selected by `columnSelector_`.
//
// Returns std::nullopt when the estimate cannot be computed: the value count
// is missing, the string/binary statistics are not of the expected type or
// lack a total length, a selected subtype cannot be estimated, or the type
// kind is not handled. Returns 0 when the recorded value count is below 1.
//
// DWIO_ENSURE_LT checks that `nodeId` is a valid index into `footer.types()`.
std::optional<size_t> DwrfRowReaderShared::estimatedRowSizeHelper(
    const proto::Footer& footer,
    const dwio::common::Statistics& stats,
    uint32_t nodeId) const {
  DWIO_ENSURE_LT(nodeId, footer.types_size(), "Types missing in footer");
  const auto& s = stats.getColumnStatistics(nodeId);
  const auto& t = footer.types(nodeId);

  // Without a value count there is nothing to base the estimate on.
  if (!s.getNumberOfValues()) {
    return std::nullopt;
  }
  auto valueCount = s.getNumberOfValues().value();
  if (valueCount < 1) {
    return 0;
  }

  switch (t.kind()) {
    case proto::Type_Kind_BOOLEAN: {
      return valueCount * sizeof(uint8_t);
    }
    case proto::Type_Kind_BYTE: {
      return valueCount * sizeof(uint8_t);
    }
    case proto::Type_Kind_SHORT: {
      return valueCount * sizeof(uint16_t);
    }
    case proto::Type_Kind_INT: {
      return valueCount * sizeof(uint32_t);
    }
    case proto::Type_Kind_LONG: {
      return valueCount * sizeof(uint64_t);
    }
    case proto::Type_Kind_FLOAT: {
      return valueCount * sizeof(float);
    }
    case proto::Type_Kind_DOUBLE: {
      return valueCount * sizeof(double);
    }
    case proto::Type_Kind_STRING: {
      // Variable-width: rely on the total length recorded in the statistics.
      auto stringStats =
          dynamic_cast<const dwio::common::StringColumnStatistics*>(&s);
      if (!stringStats) {
        return std::nullopt;
      }
      auto length = stringStats->getTotalLength();
      if (!length) {
        return std::nullopt;
      }
      return length.value();
    }
    case proto::Type_Kind_BINARY: {
      // Variable-width: rely on the total length recorded in the statistics.
      auto binaryStats =
          dynamic_cast<const dwio::common::BinaryColumnStatistics*>(&s);
      if (!binaryStats) {
        return std::nullopt;
      }
      auto length = binaryStats->getTotalLength();
      if (!length) {
        return std::nullopt;
      }
      return length.value();
    }
    case proto::Type_Kind_TIMESTAMP: {
      // Two 64-bit words per value.
      return valueCount * sizeof(uint64_t) * 2;
    }
    case proto::Type_Kind_LIST:
    case proto::Type_Kind_MAP:
    case proto::Type_Kind_STRUCT:
    case proto::Type_Kind_UNION: {
      // start the estimate with the offsets and hasNulls vectors sizes
      size_t totalEstimate = valueCount * (sizeof(uint8_t) + sizeof(uint64_t));
      for (int32_t i = 0; i < t.subtypes_size(); ++i) {
        const auto subtypeId = t.subtypes(i);
        // Skip subtypes that are not selected. The check must not live in the
        // loop condition: that would stop at the first unselected subtype and
        // silently drop every selected subtype that follows it.
        if (!columnSelector_->shouldReadNode(subtypeId)) {
          continue;
        }
        const auto subtypeEstimate =
            estimatedRowSizeHelper(footer, stats, subtypeId);
        if (!subtypeEstimate.has_value()) {
          // One unknown selected subtype makes the whole estimate unknown.
          return std::nullopt;
        }
        // For UNION the running estimate is the max of the overhead and the
        // subtype estimates; for the other kinds subtype estimates are summed
        // on top of the overhead.
        // NOTE(review): for UNION the overhead is not added to the largest
        // subtype estimate — confirm this is intended.
        totalEstimate = t.kind() == proto::Type_Kind_UNION
            ? std::max(totalEstimate, subtypeEstimate.value())
            : (totalEstimate + subtypeEstimate.value());
      }
      return totalEstimate;
    }
    default:
      return std::nullopt;
  }
}