in java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java [428:559]
/**
 * Chooses the encoding for the literals currently buffered in {@code literals}
 * (the first {@code numLiterals} entries) and records the choice in
 * {@code encoding}.
 *
 * <p>The decision proceeds in this order:
 * <ol>
 *   <li>Runs of at most {@code MIN_REPEAT} values are sent straight to DIRECT.</li>
 *   <li>A single pass collects min/max, monotonicity, whether every adjacent
 *       delta equals the first one, and the absolute adjacent deltas.</li>
 *   <li>If {@code max - min} is not a safe subtraction, DIRECT is used.</li>
 *   <li>Constant runs and fixed-delta runs use DELTA with {@code fixedDelta}
 *       set accordingly.</li>
 *   <li>Monotonic runs whose first delta is non-zero use DELTA with
 *       {@code bitsDeltaMax} describing the delta width.</li>
 *   <li>Otherwise percentile bit widths decide between PATCHED_BASE and
 *       DIRECT.</li>
 * </ol>
 *
 * <p>Besides {@code encoding}, this method writes {@code min},
 * {@code isFixedDelta}, {@code adjDeltas}, and, depending on the path taken,
 * {@code fixedDelta}, {@code bitsDeltaMax}, {@code zzBits90p},
 * {@code baseRedLiterals}, {@code brBits95p} and {@code brBits100p}.
 */
private void determineEncoding() {
  // Analysing a very short run is not worth the cost; DIRECT needs the
  // zigzag values, so prepare them before leaving.
  if (numLiterals <= MIN_REPEAT) {
    prepareForDirectOrPatchedBase();
    encoding = EncodingType.DIRECT;
    return;
  }

  // ---- statistics pass (feeds the DELTA decision) ----
  final long firstDelta = literals[1] - literals[0];
  boolean nonDecreasing = true;
  boolean nonIncreasing = true;
  long largest = literals[0];
  long lastDelta = 0;
  long widestDelta = 0;

  this.isFixedDelta = true;
  this.min = literals[0];
  // slot 0 keeps the signed first delta; later slots hold magnitudes only
  this.adjDeltas[0] = firstDelta;

  for (int idx = 1; idx < numLiterals; idx++) {
    final long prev = literals[idx - 1];
    final long curr = literals[idx];
    lastDelta = curr - prev;

    if (curr < min) {
      min = curr;
    }
    if (curr > largest) {
      largest = curr;
    }

    if (prev > curr) {
      nonDecreasing = false;
    }
    if (prev < curr) {
      nonIncreasing = false;
    }
    if (lastDelta != firstDelta) {
      isFixedDelta = false;
    }

    if (idx > 1) {
      final long magnitude = Math.abs(lastDelta);
      adjDeltas[idx - 1] = magnitude;
      widestDelta = Math.max(widestDelta, magnitude);
    }
  }

  // When max - min would overflow, bail out to DIRECT right away rather
  // than evaluating PATCHED_BASE: DIRECT is faster and carries less overhead.
  if (!utils.isSafeSubtract(largest, min)) {
    prepareForDirectOrPatchedBase();
    encoding = EncodingType.DIRECT;
    return;
  }
  // From here on, the difference of any two buffered literals cannot overflow.

  // All values identical: a DELTA run with a zero step. According to the
  // original note this covers fixed-value runs longer than 10, which
  // SHORT_REPEAT cannot represent.
  if (min == largest) {
    assert isFixedDelta : min + "==" + largest +
        ", isFixedDelta cannot be false";
    assert lastDelta == 0 : min + "==" + largest + ", currDelta should be zero";
    fixedDelta = 0;
    encoding = EncodingType.DELTA;
    return;
  }

  // Every adjacent pair differs by the same amount: DELTA with that step.
  if (isFixedDelta) {
    assert lastDelta == firstDelta
        : "currDelta should be equal to initialDelta for fixed delta encoding";
    encoding = EncodingType.DELTA;
    fixedDelta = lastDelta;
    return;
  }

  // A zero first delta gives no way to tell whether the sequence rises or
  // falls, so variable-delta encoding is only considered when it is non-zero.
  if (firstDelta != 0) {
    // bit width needed to pack the delta blob
    bitsDeltaMax = utils.findClosestNumBits(widestDelta);
    if (nonDecreasing || nonIncreasing) {
      encoding = EncodingType.DELTA;
      return;
    }
  }

  // ---- PATCHED_BASE vs DIRECT ----
  // Compare the bit widths of the zigzag values at the 90th and 100th
  // percentile; only a gap of more than one bit makes patching a candidate.
  final long[] zigzagged = prepareForDirectOrPatchedBase();
  zzBits90p = utils.percentileBits(zigzagged, 0, numLiterals, 0.9);
  final int zigzagSpread = zzBits100p - zzBits90p;
  if (zigzagSpread <= 1) {
    // too little variation for a patch list to pay off
    encoding = EncodingType.DIRECT;
    return;
  }

  // Patching operates on base-reduced values, so subtract min from each literal.
  for (int idx = 0; idx < numLiterals; idx++) {
    baseRedLiterals[idx] = literals[idx] - min;
  }

  // The 95th percentile width sets the threshold above which values are
  // patched; the 100th percentile width bounds the patch width.
  brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
  brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0);

  // The candidate check above used zigzag values, but the patching itself
  // works on the base-reduced ones. If those show no width gap between the
  // 95th and 100th percentile, or the base is outside the allowed range,
  // patching is pointless and DIRECT is used instead.
  final boolean patchable =
      brBits100p != brBits95p && Math.abs(min) < BASE_VALUE_LIMIT;
  if (!patchable) {
    encoding = EncodingType.DIRECT;
    return;
  }

  encoding = EncodingType.PATCHED_BASE;
  preparePatchedBlob();
}