void RleEncoderV2::determineEncoding()

in c++/src/RleEncoderV2.cc [295:440]


  void RleEncoderV2::determineEncoding(EncodingOption& option) {
    // We need to compute zigzag values for DIRECT and PATCHED_BASE encodings,
    // but not for SHORT_REPEAT or DELTA. So we only perform the zigzag
    // computation when it's determined to be necessary.

    // not a big win for shorter runs to determine encoding
    if (numLiterals <= MIN_REPEAT) {
      // we need to compute zigzag values for DIRECT encoding if we decide to
      // break early for delta overflows or for shorter runs
      prepareForDirectOrPatchedBase(option);
      option.encoding = DIRECT;
      return;
    }

    // DELTA encoding check

    // for identifying monotonic sequences
    bool isIncreasing = true;
    bool isDecreasing = true;
    option.isFixedDelta = true;

    option.min = literals[0];
    int64_t max = literals[0];
    int64_t initialDelta = literals[1] - literals[0];
    int64_t currDelta = 0;
    int64_t deltaMax = 0;
    adjDeltas_[option.adjDeltasCount++] = initialDelta;

    for (size_t i = 1; i < numLiterals; i++) {
      const int64_t l1 = literals[i];
      const int64_t l0 = literals[i - 1];
      currDelta = l1 - l0;
      option.min = std::min(option.min, l1);
      max = std::max(max, l1);

      isIncreasing &= (l0 <= l1);
      isDecreasing &= (l0 >= l1);

      option.isFixedDelta &= (currDelta == initialDelta);
      if (i > 1) {
        adjDeltas_[option.adjDeltasCount++] = std::abs(currDelta);
        deltaMax = std::max(deltaMax, adjDeltas_[i - 1]);
      }
    }

    // it's faster to exit under delta overflow condition without checking for
    // PATCHED_BASE condition as encoding using DIRECT is faster and has less
    // overhead than PATCHED_BASE
    if (!isSafeSubtract(max, option.min)) {
      prepareForDirectOrPatchedBase(option);
      option.encoding = DIRECT;
      return;
    }

    // invariant - subtracting any number from any other in the literals after
    // option point won't overflow

    // if min is equal to max then the delta is 0, option condition happens for
    // fixed values run >10 which cannot be encoded with SHORT_REPEAT
    if (option.min == max) {
      if (!option.isFixedDelta) {
        throw InvalidArgument(to_string(option.min) + "==" + to_string(max) +
                              ", isFixedDelta cannot be false");
      }

      if (currDelta != 0) {
        throw InvalidArgument(to_string(option.min) + "==" + to_string(max) +
                              ", currDelta should be zero");
      }
      option.fixedDelta = 0;
      option.encoding = DELTA;
      return;
    }

    if (option.isFixedDelta) {
      if (currDelta != initialDelta) {
        throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding");
      }

      option.encoding = DELTA;
      option.fixedDelta = currDelta;
      return;
    }

    // if initialDelta is 0 then we cannot delta encode as we cannot identify
    // the sign of deltas (increasing or decreasing)
    if (initialDelta != 0) {
      // stores the number of bits required for packing delta blob in
      // delta encoding
      option.bitsDeltaMax = findClosestNumBits(deltaMax);

      // monotonic condition
      if (isIncreasing || isDecreasing) {
        option.encoding = DELTA;
        return;
      }
    }

    // PATCHED_BASE encoding check

    // percentile values are computed for the zigzag encoded values. if the
    // number of bit requirement between 90th and 100th percentile varies
    // beyond a threshold then we need to patch the values. if the variation
    // is not significant then we can use direct encoding

    int64_t* currentZigzagLiterals = prepareForDirectOrPatchedBase(option);
    option.zzBits90p = percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9, true);
    uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p;

    // if the difference between 90th percentile and 100th percentile fixed
    // bits is > 1 then we need patch the values
    if (diffBitsLH > 1) {
      // patching is done only on base reduced values.
      // remove base from literals
      for (size_t i = 0; i < numLiterals; i++) {
        baseRedLiterals_[option.baseRedLiteralsCount++] = (literals[i] - option.min);
      }

      // 95th percentile width is used to determine max allowed value
      // after which patching will be done
      option.brBits95p = percentileBits(baseRedLiterals_, 0, numLiterals, 0.95);

      // 100th percentile is used to compute the max patch width
      option.brBits100p = percentileBits(baseRedLiterals_, 0, numLiterals, 1.0, true);

      // after base reducing the values, if the difference in bits between
      // 95th percentile and 100th percentile value is zero then there
      // is no point in patching the values, in which case we will
      // fallback to DIRECT encoding.
      // The decision to use patched base was based on zigzag values, but the
      // actual patching is done on base reduced literals.
      if ((option.brBits100p - option.brBits95p) != 0) {
        option.encoding = PATCHED_BASE;
        preparePatchedBlob(option);
        return;
      } else {
        option.encoding = DIRECT;
        return;
      }
    } else {
      // if difference in bits between 95th percentile and 100th percentile is
      // 0, then patch length will become 0. Hence we will fallback to direct
      option.encoding = DIRECT;
      return;
    }
  }