void processRun()

in velox/dwio/dwrf/reader/ColumnVisitors.h [678:818]


  // Processes one run of 'numInput' entries of 'input' and advances
  // super::rowIndex_ by 'numInput' on every path.
  //
  // 'hasFilter', 'hasHook' and 'scatter' are not declared in this block;
  // presumably they are compile-time template parameters declared directly
  // above this function -- confirm.
  //
  // No filter, hook: the input is translated with translateByDict() into
  // 'values' and handed to the hook together with the matching row numbers.
  // 'numValues' is not modified on this path.
  //
  // No filter, no hook: the input is translated (and scattered when
  // 'scatter' is set) via translateScatter() and 'numValues' is updated.
  //
  // Filter: the input is consumed kWidth entries at a time. The row numbers
  // of the passing entries are appended to 'filterHits' starting at
  // 'numValues'. Unless super::Extract is DropValues, the corresponding
  // values are appended to 'values' at the same offset. 'numValues' grows by
  // the number of passing entries.
  //
  // @param input Entries to process. Must be equal to 'values + numValues' on
  // entry (debug-checked), i.e. the input lives in the not yet written tail
  // of 'values'.
  // @param numInput Number of entries in 'input'.
  // @param scatterRows Row numbers used instead of super::rows_ when
  // 'scatter' is set. Indexed by super::rowIndex_.
  // @param filterHits Receives row numbers of entries that pass the filter.
  // Only written on the filter path.
  // @param values Output values.
  // @param numValues In/out count of entries already in 'values' /
  // 'filterHits'.
  void processRun(
      const T* input,
      int32_t numInput,
      const int32_t* scatterRows,
      int32_t* filterHits,
      T* values,
      int32_t& numValues) {
    // The input is expected to sit exactly where the next output would go.
    DCHECK_EQ(input, values + numValues);
    if (!hasFilter) {
      if (hasHook) {
        // Note that the translated output is written starting at 'values',
        // not 'values + numValues', and 'numValues' stays as is.
        translateByDict(input, numInput, values);
        // Row numbers for the hook come from 'scatterRows' when scattering,
        // otherwise from the pointer returned by velox::iota() (presumably a
        // sequence 0, 1, 2, ... of at least super::numRows_ elements --
        // confirm). Both are offset by the current row index.
        super::values_.hook().addValues(
            scatter ? scatterRows + super::rowIndex_
                    : velox::iota(super::numRows_, super::innerNonNullRows()) +
                    super::rowIndex_,
            values,
            numInput,
            sizeof(T));
        super::rowIndex_ += numInput;
        return;
      }
      // The first template argument of translateScatter() tells whether
      // 'inDict_' is set; the runtime check is turned into a compile-time
      // constant here.
      if (inDict_) {
        translateScatter<true, scatter>(
            input, numInput, scatterRows, numValues, values);
      } else {
        translateScatter<false, scatter>(
            input, numInput, scatterRows, numValues, values);
      }
      super::rowIndex_ += numInput;
      // When scattering, the new count is one past the last 'scatterRows'
      // entry consumed by this run. Otherwise the count grows by 'numInput'.
      numValues = scatter ? scatterRows[super::rowIndex_ - 1] + 1
                          : numValues + numInput;
      return;
    }
    // The filter path optionally extracts values but always sets
    // filterHits. It first loads a vector of indices. It translates
    // those indices that refer to dictionary via the dictionary in
    // bulk. It checks the dictionary filter cache 8 values at a
    // time. It calls the scalar filter for the indices that were not
    // found in the cache. It gets a bitmask of up to 8 filter
    // results. It stores these in filterHits. If values are to be
    // written, the passing bitmap is used to load a permute mask to
    // permute the passing values to the left of a vector register and
    // write the whole register to the end of 'values'.
    constexpr bool kFilterOnly =
        std::is_same<typename super::Extract, DropValues>::value;
    constexpr int32_t kWidth = V32::VSize;
    // 'numInput' rounded down to a multiple of kWidth (kWidth is assumed to
    // be a power of two). This is the start of the trailing partial batch, if
    // there is one.
    int32_t last = numInput & ~(kWidth - 1);
    for (auto i = 0; i < numInput; i += kWidth) {
      // Number of valid entries in this batch: the remainder for the trailing
      // partial batch, 8 otherwise.
      // NOTE(review): the literal 8 (and the load8* helpers below) assume
      // kWidth == 8 -- confirm.
      int8_t width = UNLIKELY(i == last) ? numInput - last : 8;
      auto indices = load8Indices(input + i);
      // Per-lane mask for this batch. When 'inDict_' is set it is loaded from
      // 'inDict_' for the rows of the batch, with a dense variant for when
      // simd::isDense() holds for the rows and a sparse variant otherwise.
      // Without 'inDict_', the first 'width' lanes are selected. Presumably a
      // set lane means "this entry is a dictionary index" -- confirm.
      __m256si dictMask;
      if (inDict_) {
        if (simd::isDense(super::rows_ + super::rowIndex_ + i, width)) {
          dictMask = load8MaskDense(
              inDict_, super::rows_[super::rowIndex_ + i], width);
        } else {
          dictMask = load8MaskSparse(
              inDict_, super::rows_ + super::rowIndex_ + i, width);
        }
      } else {
        dictMask = V32::leadingMask(width);
      }

      // Load 8 filter cache values. Lanes not selected by 'dictMask' default
      // to 0, which covers the extra lanes when loading less than 8.
      // Presumably the '- 3' base offset together with the <1> scale places
      // the one-byte cache entry in the top byte of each 32-bit lane, which
      // matches the '<< 24' below -- confirm against maskGather32.
      V32::TV cache = V32::maskGather32<1>(
          V32::setAll(0), dictMask, filterCache_ - 3, indices);
      // Bitmask of lanes whose cache entry has the kUnknown bit set. The
      // extra '<< 1' presumably moves that bit to the position that
      // compareBitMask() inspects -- confirm.
      auto unknowns = V32::compareBitMask((cache & (kUnknown << 24)) << 1);
      // Bitmask of lanes whose cache entry already says the filter passed.
      auto passed = V32::compareBitMask(cache);
      if (UNLIKELY(unknowns)) {
        uint16_t bits = unknowns;
        // Ranges only over inputs that are in the dictionary; the ones not in
        // the dictionary were masked off by 'dictMask'.
        while (bits) {
          int index = bits::getAndClearLastSetBit(bits);
          auto value = input[i + index];
          // Evaluate the filter on the dictionary value and remember the
          // outcome in the cache for later occurrences of the same index.
          if (applyFilter(super::filter_, dict_[value])) {
            filterCache_[value] = FilterResult::kSuccess;
            passed |= 1 << index;
          } else {
            filterCache_[value] = FilterResult::kFailure;
          }
        }
      }
      // Were there values not in dictionary?
      if (inDict_) {
        auto mask = V32::compareBitMask(dictMask);
        if (mask != V32::kAllTrue) {
          // Lanes that are not set in 'dictMask', limited to the low kWidth
          // bits.
          uint16_t bits = (V32::kAllTrue ^ mask) & bits::lowMask(kWidth);
          while (bits) {
            auto index = bits::getAndClearLastSetBit(bits);
            // Lanes past the end of the input in a partial batch are also
            // unset in 'dictMask'; stop when reaching them.
            if (i + index >= numInput) {
              break;
            }
            // The input is a plain value here, so the filter is applied to it
            // directly, bypassing the dictionary and the filter cache.
            if (common::applyFilter(super::filter_, input[i + index])) {
              passed |= 1 << index;
            }
          }
        }
      }
      // We know 8 compare results. If all false, process next batch.
      if (!passed) {
        continue;
      } else if (passed == (1 << V32::VSize) - 1) {
        // All passed, no need to shuffle the indices or values, write them to
        // 'values' and 'filterHits'.
        V32::store(
            filterHits + numValues,
            V32::load(
                (scatter ? scatterRows : super::rows_) + super::rowIndex_ + i));
        if (!kFilterOnly) {
          storeTranslate(
              input, i, indices, dictMask, dict_, values + numValues);
        }
        numValues += kWidth;
      } else {
        // Some passed. Permute the passing row numbers and values to the left
        // of the SIMD vector and store.
        int8_t numBits = __builtin_popcount(passed);
        // Permute indices for this combination of passing lanes, looked up by
        // the 'passed' bitmask.
        auto setBits = V32::load(&V32::byteSetBits()[passed]);
        simd::storePermute(
            filterHits + numValues,
            V32::load(
                (scatter ? scatterRows : super::rows_) + super::rowIndex_ + i),
            setBits);
        if (!kFilterOnly) {
          storeTranslatePermute(
              input,
              i,
              indices,
              setBits,
              dictMask,
              numBits,
              dict_,
              values + numValues);
        }
        // Only the passing entries count; anything stored beyond them is
        // overwritten by the next batch.
        numValues += numBits;
      }
    }
    super::rowIndex_ += numInput;
  }