private void determineEncoding()

in java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java [428:559]


  /**
   * Chooses the encoding for the literals currently buffered in
   * {@code literals[0 .. numLiterals)} and records the choice in
   * {@code encoding}.
   *
   * <p>The checks are made in this order; the first one that matches wins:
   * <ol>
   *   <li>{@code numLiterals <= MIN_REPEAT}: DIRECT.</li>
   *   <li>{@code max - min} is not a safe subtraction: DIRECT.</li>
   *   <li>{@code min == max}: DELTA with {@code fixedDelta = 0}.</li>
   *   <li>every adjacent delta equals the first one: DELTA with that
   *       {@code fixedDelta}.</li>
   *   <li>the first delta is non-zero and the run is monotonic: DELTA.</li>
   *   <li>otherwise PATCHED_BASE if the bit-width percentiles justify
   *       patching and the base ({@code min}) is strictly inside
   *       {@code (-BASE_VALUE_LIMIT, BASE_VALUE_LIMIT)}; else DIRECT.</li>
   * </ol>
   *
   * <p>Besides {@code encoding}, this method writes {@code min},
   * {@code isFixedDelta} and {@code adjDeltas}, and, depending on the path
   * taken, {@code fixedDelta}, {@code bitsDeltaMax}, {@code zzBits90p},
   * {@code baseRedLiterals}, {@code brBits95p} and {@code brBits100p}.
   * {@code zzBits100p} is read but not assigned here; presumably
   * {@code prepareForDirectOrPatchedBase()} sets it (confirm against that
   * helper).
   */
  private void determineEncoding() {
    // we need to compute zigzag values for DIRECT encoding if we decide to
    // break early for delta overflows or for shorter runs

    // not a big win for shorter runs to determine encoding
    if (numLiterals <= MIN_REPEAT) {
      prepareForDirectOrPatchedBase();
      encoding = EncodingType.DIRECT;
      return;
    }

    // DELTA encoding check

    // for identifying monotonic sequences
    boolean isIncreasing = true;
    boolean isDecreasing = true;
    this.isFixedDelta = true;

    this.min = literals[0];
    long max = literals[0];
    final long initialDelta = literals[1] - literals[0];
    long currDelta = 0;
    long deltaMax = 0;
    // slot 0 keeps the signed first delta; the remaining slots hold
    // absolute deltas (filled in by the loop below)
    this.adjDeltas[0] = initialDelta;

    for (int i = 1; i < numLiterals; i++) {
      final long l1 = literals[i];
      final long l0 = literals[i - 1];
      // may overflow for extreme inputs; the isSafeSubtract(max, min) check
      // after the loop routes such runs to DIRECT before any delta is used
      currDelta = l1 - l0;
      min = Math.min(min, l1);
      max = Math.max(max, l1);

      isIncreasing &= (l0 <= l1);
      isDecreasing &= (l0 >= l1);

      isFixedDelta &= (currDelta == initialDelta);
      if (i > 1) {
        adjDeltas[i - 1] = Math.abs(currDelta);
        deltaMax = Math.max(deltaMax, adjDeltas[i - 1]);
      }
    }

    // it's faster to exit under delta overflow condition without checking for
    // PATCHED_BASE condition as encoding using DIRECT is faster and has less
    // overhead than PATCHED_BASE
    if (!utils.isSafeSubtract(max, min)) {
      prepareForDirectOrPatchedBase();
      encoding = EncodingType.DIRECT;
      return;
    }

    // invariant - subtracting any number from any other in the literals after
    // this point won't overflow

    // if min is equal to max then the delta is 0, this condition happens for
    // fixed values run >10 which cannot be encoded with SHORT_REPEAT
    if (min == max) {
      assert isFixedDelta : min + "==" + max +
          ", isFixedDelta cannot be false";
      assert currDelta == 0 : min + "==" + max + ", currDelta should be zero";
      fixedDelta = 0;
      encoding = EncodingType.DELTA;
      return;
    }

    if (isFixedDelta) {
      assert currDelta == initialDelta
          : "currDelta should be equal to initialDelta for fixed delta encoding";
      encoding = EncodingType.DELTA;
      fixedDelta = currDelta;
      return;
    }

    // if initialDelta is 0 then we cannot delta encode as we cannot identify
    // the sign of deltas (increasing or decreasing)
    if (initialDelta != 0) {
      // stores the number of bits required for packing delta blob in
      // delta encoding
      bitsDeltaMax = utils.findClosestNumBits(deltaMax);

      // monotonic condition
      if (isIncreasing || isDecreasing) {
        encoding = EncodingType.DELTA;
        return;
      }
    }

    // PATCHED_BASE encoding check

    // percentile values are computed for the zigzag encoded values. if the
    // number of bit requirement between 90th and 100th percentile varies
    // beyond a threshold then we need to patch the values. if the variation
    // is not significant then we can use direct encoding
    long[] currentZigzagLiterals = prepareForDirectOrPatchedBase();
    zzBits90p = utils.percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9);
    int diffBitsLH = zzBits100p - zzBits90p;

    // if the difference between 90th percentile and 100th percentile fixed
    // bits is > 1 then we need patch the values
    if (diffBitsLH > 1) {

      // patching is done only on base reduced values.
      // remove base from literals
      for (int i = 0; i < numLiterals; i++) {
        baseRedLiterals[i] = literals[i] - min;
      }

      // 95th percentile width is used to determine max allowed value
      // after which patching will be done
      brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95);

      // 100th percentile is used to compute the max patch width
      brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0);

      // The base (min) must lie strictly inside
      // (-BASE_VALUE_LIMIT, BASE_VALUE_LIMIT). This is written as an explicit
      // range test instead of Math.abs(min) < BASE_VALUE_LIMIT because
      // Math.abs(Long.MIN_VALUE) is Long.MIN_VALUE (negative), which would
      // wrongly pass the limit check for the largest-magnitude base.
      final boolean baseWithinLimit =
          min > -BASE_VALUE_LIMIT && min < BASE_VALUE_LIMIT;

      // after base reducing the values, if the difference in bits between
      // 95th percentile and 100th percentile value is zero then there
      // is no point in patching the values, in which case we will
      // fallback to DIRECT encoding.
      // The decision to use patched base was based on zigzag values, but the
      // actual patching is done on base reduced literals.
      if ((brBits100p - brBits95p) != 0 && baseWithinLimit) {
        encoding = EncodingType.PATCHED_BASE;
        preparePatchedBlob();
      } else {
        encoding = EncodingType.DIRECT;
      }
    } else {
      // the zigzag widths at the 90th and 100th percentile differ by at most
      // one bit, so patching would not pay off. Hence we fallback to direct
      encoding = EncodingType.DIRECT;
    }
  }