in velox/dwio/dwrf/reader/ColumnVisitors.h [678:818]
// Processes one run of 'numInput' dictionary indices read from 'input'.
//
// 'hasFilter', 'hasHook' and 'scatter' are not declared in the visible
// code; presumably they are template parameters of this method - TODO
// confirm against the declaration.
//
// @param input Indices to process. Must alias 'values + numValues' (see the
// DCHECK below), i.e. the run is processed in place at the tail of 'values'.
// @param numInput Number of elements in 'input'.
// @param scatterRows Row numbers used instead of 'super::rows_' when
// 'scatter' is set. Indexed relative to 'super::rowIndex_'.
// @param filterHits Receives the row numbers of passing rows in the filter
// path, written starting at 'filterHits + numValues'.
// @param values Output buffer for translated values.
// @param numValues In/out count of values produced so far. Left unchanged in
// the hook path.
//
// Every path advances 'super::rowIndex_' by 'numInput' before returning.
void processRun(
const T* input,
int32_t numInput,
const int32_t* scatterRows,
int32_t* filterHits,
T* values,
int32_t& numValues) {
// The input is expected to sit exactly where the next output goes.
DCHECK_EQ(input, values + numValues);
if (!hasFilter) {
if (hasHook) {
// No filter, hook present: translate the whole run, then hand the values
// and their row numbers to the hook. 'numValues' is not advanced here.
translateByDict(input, numInput, values);
super::values_.hook().addValues(
scatter ? scatterRows + super::rowIndex_
: velox::iota(super::numRows_, super::innerNonNullRows()) +
super::rowIndex_,
values,
numInput,
sizeof(T));
super::rowIndex_ += numInput;
return;
}
// No filter, no hook: translate (and optionally scatter) into 'values'.
// The first template argument reflects whether 'inDict_' is set.
if (inDict_) {
translateScatter<true, scatter>(
input, numInput, scatterRows, numValues, values);
} else {
translateScatter<false, scatter>(
input, numInput, scatterRows, numValues, values);
}
super::rowIndex_ += numInput;
// With scatter the new count is one past the last scattered row number;
// otherwise it simply grows by the run length.
numValues = scatter ? scatterRows[super::rowIndex_ - 1] + 1
: numValues + numInput;
return;
}
// The filter path optionally extracts values but always sets
// filterHits. It first loads a vector of indices. It translates
// those indices that refer to the dictionary via the dictionary in
// bulk. It checks the dictionary filter cache 8 values at a
// time. It calls the scalar filter for the indices that were not
// found in the cache. It gets a bitmask of up to 8 filter
// results. It stores these in filterHits. If values are to be
// written, the passing bitmap is used to load a permute mask to
// permute the passing values to the left of a vector register and
// write the whole register to the end of 'values'.
// True when the extraction policy is DropValues, i.e. no values are written.
constexpr bool kFilterOnly =
std::is_same<typename super::Extract, DropValues>::value;
constexpr int32_t kWidth = V32::VSize;
// Offset of the trailing partial batch: 'numInput' rounded down to a
// multiple of kWidth. If 'numInput' is a multiple of kWidth this equals
// 'numInput' and the loop never reaches it, so every batch is full.
int32_t last = numInput & ~(kWidth - 1);
for (auto i = 0; i < numInput; i += kWidth) {
// Number of valid inputs in this batch. The literal 8 presumably equals
// kWidth - TODO confirm.
int8_t width = UNLIKELY(i == last) ? numInput - last : 8;
auto indices = load8Indices(input + i);
// Per-lane mask; judging by the helper names and its use below, it marks
// lanes that are within 'width' and refer to the dictionary - TODO confirm.
__m256si dictMask;
if (inDict_) {
// Pick the dense or sparse mask loader depending on whether the row
// numbers of this batch are dense according to simd::isDense.
if (simd::isDense(super::rows_ + super::rowIndex_ + i, width)) {
dictMask = load8MaskDense(
inDict_, super::rows_[super::rowIndex_ + i], width);
} else {
dictMask = load8MaskSparse(
inDict_, super::rows_ + super::rowIndex_ + i, width);
}
} else {
// No 'inDict_' flags: only the first 'width' lanes are selected.
dictMask = V32::leadingMask(width);
}
// Load 8 filter cache values. Lanes not selected by 'dictMask' take the
// default of 0 supplied as the first argument.
// NOTE(review): the 'filterCache_ - 3' base presumably places each 1-byte
// cache entry in the top byte of its 32-bit lane (little-endian) so that
// the masks below can test it via the sign bit - confirm against
// maskGather32 and compareBitMask.
V32::TV cache = V32::maskGather32<1>(
V32::setAll(0), dictMask, filterCache_ - 3, indices);
// Bitmask of lanes whose cache entry has the kUnknown bit set (the bit is
// shifted up by one before the mask is taken).
auto unknowns = V32::compareBitMask((cache & (kUnknown << 24)) << 1);
// Bitmask of lanes whose cached result counts as a pass.
auto passed = V32::compareBitMask(cache);
if (UNLIKELY(unknowns)) {
uint16_t bits = unknowns;
// Ranges only over inputs that are in the dictionary; those not in the
// dictionary were masked off in 'dictMask'. Each unknown entry is
// evaluated with the scalar filter and the outcome is stored in the cache.
while (bits) {
int index = bits::getAndClearLastSetBit(bits);
auto value = input[i + index];
if (applyFilter(super::filter_, dict_[value])) {
filterCache_[value] = FilterResult::kSuccess;
passed |= 1 << index;
} else {
filterCache_[value] = FilterResult::kFailure;
}
}
}
// Were there values not in the dictionary? If so, apply the filter directly
// to the raw input value of each lane not covered by 'dictMask'.
if (inDict_) {
auto mask = V32::compareBitMask(dictMask);
if (mask != V32::kAllTrue) {
uint16_t bits = (V32::kAllTrue ^ mask) & bits::lowMask(kWidth);
while (bits) {
auto index = bits::getAndClearLastSetBit(bits);
// Lanes past the end of the input are unset in the mask too; stop
// there. NOTE(review): using 'break' rather than 'continue' assumes
// bits are visited in ascending lane order - confirm against
// bits::getAndClearLastSetBit.
if (i + index >= numInput) {
break;
}
if (common::applyFilter(super::filter_, input[i + index])) {
passed |= 1 << index;
}
}
}
}
// We know 8 compare results. If all false, process next batch.
if (!passed) {
continue;
} else if (passed == (1 << V32::VSize) - 1) {
// All passed, no need to shuffle the indices or values, write them to
// 'values' and 'filterHits'.
V32::store(
filterHits + numValues,
V32::load(
(scatter ? scatterRows : super::rows_) + super::rowIndex_ + i));
if (!kFilterOnly) {
storeTranslate(
input, i, indices, dictMask, dict_, values + numValues);
}
numValues += kWidth;
} else {
// Some passed. Permute the passing row numbers and values to the left
// of the SIMD vector and store. Only 'numBits' results are counted.
int8_t numBits = __builtin_popcount(passed);
// Permute selector looked up by the 'passed' bit pattern.
auto setBits = V32::load(&V32::byteSetBits()[passed]);
simd::storePermute(
filterHits + numValues,
V32::load(
(scatter ? scatterRows : super::rows_) + super::rowIndex_ + i),
setBits);
if (!kFilterOnly) {
storeTranslatePermute(
input,
i,
indices,
setBits,
dictMask,
numBits,
dict_,
values + numValues);
}
numValues += numBits;
}
}
super::rowIndex_ += numInput;
}