in velox/dwio/dwrf/common/DecoderUtil.cpp [54:156]
bool nonNullRowsFromSparse(
const uint64_t* nulls,
RowSet rows,
raw_vector<int32_t>& innerRows,
raw_vector<int32_t>& outerRows,
uint64_t* resultNulls,
int32_t& tailSkip) {
constexpr int32_t kStep = 8;
bool anyNull = false;
auto numIn = rows.size();
innerRows.resize(numIn);
outerRows.resize(numIn);
auto inner = innerRows.data();
auto outer = outerRows.data();
int32_t numNulls = 0;
int32_t numInner = 0;
int32_t lastNonNull = -1;
auto resultNullBytes = reinterpret_cast<uint8_t*>(resultNulls);
// Returns the index in terms of non-null rows for
// 'rows[i]'. Assumes that i is increasing between calls.
auto innerFor = [&](int32_t i) {
auto row = rows[i];
DCHECK_GT(row, lastNonNull);
auto skip = row - lastNonNull - 1;
if (!skip) {
// Consecutive non-nulls
} else if (skip < 56) {
auto bits = loadUpTo56Bits(nulls, lastNonNull + 1, skip);
numNulls += skip - __builtin_popcountll(bits);
} else {
numNulls += skip -
bits::countBits(nulls, lastNonNull + 1, lastNonNull + skip + 1);
}
lastNonNull = row;
return row - numNulls;
};
for (auto i = 0; i < numIn; i += kStep) {
int32_t width = i + kStep > numIn ? numIn - i : kStep;
int16_t widthMask = bits::lowMask(width);
if (isDense(rows.data() + i, width)) {
uint16_t flags = load8Bits(nulls, rows[i]) & widthMask;
if (outputNulls) {
resultNullBytes[i / 8] = flags;
anyNull |= flags != widthMask;
}
if (!flags) {
continue;
}
auto numNonNull = __builtin_popcount(flags);
auto setBits = V32::compareSetBits(flags);
// contiguous inner rows.
auto innerStart = innerFor(i);
V32::store(inner + numInner, V32::iota() + innerStart);
if (isFilter) {
simd::storePermute(
outer + numInner, V32::load(rows.data() + i), setBits);
} else {
V32::store(outer + numInner, setBits + i);
}
// We processed 'width' consecutive. The inner count is incremented
// by number of non-nulls. Nulls are counted for the 'width' rows,
// so we set lastNonNull to be the last of these and increment
// numNulls by the number of nulls in the 8 rows. This is
// correct even if the last of the 8 is null since the null
// count is correct up to that row.
numInner += numNonNull;
lastNonNull = rows[i + width - 1];
numNulls += width - numNonNull;
} else {
auto next8Rows = V32::load(rows.data() + i);
uint16_t flags = simd::gather8Bits(nulls, next8Rows, width);
if (outputNulls) {
resultNullBytes[i / 8] = flags;
anyNull |= flags != widthMask;
}
if (!flags) {
continue;
}
auto setBits = V32::compareSetBits(flags);
if (isFilter) {
// The non-null indices among 'rows' are possible filter results.
simd::storePermute(outer + numInner, next8Rows, setBits);
} else {
// Make scatter indices so that there are gaps for 'rows' that hit a
// null.
V32::store(outer + numInner, setBits + i);
}
// Calculate the inner row corresponding to each non-null in 'next8Rows'.
while (flags) {
int32_t index = bits::getAndClearLastSetBit(flags);
inner[numInner++] = innerFor(i + index);
}
}
}
innerRows.resize(numInner);
outerRows.resize(numInner);
// If ending with a null, skip the non-nulls between last non-null in
// rows and the last null in 'rows'.
tailSkip = bits::countBits(nulls, lastNonNull + 1, rows.back());
return anyNull;
}