in c++/src/RleEncoderV2.cc [295:440]
void RleEncoderV2::determineEncoding(EncodingOption& option) {
// We need to compute zigzag values for DIRECT and PATCHED_BASE encodings,
// but not for SHORT_REPEAT or DELTA. So we only perform the zigzag
// computation when it's determined to be necessary.
// not a big win for shorter runs to determine encoding
if (numLiterals <= MIN_REPEAT) {
// we need to compute zigzag values for DIRECT encoding if we decide to
// break early for delta overflows or for shorter runs
prepareForDirectOrPatchedBase(option);
option.encoding = DIRECT;
return;
}
// DELTA encoding check
// for identifying monotonic sequences
bool isIncreasing = true;
bool isDecreasing = true;
option.isFixedDelta = true;
option.min = literals[0];
int64_t max = literals[0];
int64_t initialDelta = literals[1] - literals[0];
int64_t currDelta = 0;
int64_t deltaMax = 0;
adjDeltas_[option.adjDeltasCount++] = initialDelta;
for (size_t i = 1; i < numLiterals; i++) {
const int64_t l1 = literals[i];
const int64_t l0 = literals[i - 1];
currDelta = l1 - l0;
option.min = std::min(option.min, l1);
max = std::max(max, l1);
isIncreasing &= (l0 <= l1);
isDecreasing &= (l0 >= l1);
option.isFixedDelta &= (currDelta == initialDelta);
if (i > 1) {
adjDeltas_[option.adjDeltasCount++] = std::abs(currDelta);
deltaMax = std::max(deltaMax, adjDeltas_[i - 1]);
}
}
// it's faster to exit under delta overflow condition without checking for
// PATCHED_BASE condition as encoding using DIRECT is faster and has less
// overhead than PATCHED_BASE
if (!isSafeSubtract(max, option.min)) {
prepareForDirectOrPatchedBase(option);
option.encoding = DIRECT;
return;
}
// invariant - subtracting any number from any other in the literals after
// option point won't overflow
// if min is equal to max then the delta is 0, option condition happens for
// fixed values run >10 which cannot be encoded with SHORT_REPEAT
if (option.min == max) {
if (!option.isFixedDelta) {
throw InvalidArgument(to_string(option.min) + "==" + to_string(max) +
", isFixedDelta cannot be false");
}
if (currDelta != 0) {
throw InvalidArgument(to_string(option.min) + "==" + to_string(max) +
", currDelta should be zero");
}
option.fixedDelta = 0;
option.encoding = DELTA;
return;
}
if (option.isFixedDelta) {
if (currDelta != initialDelta) {
throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding");
}
option.encoding = DELTA;
option.fixedDelta = currDelta;
return;
}
// if initialDelta is 0 then we cannot delta encode as we cannot identify
// the sign of deltas (increasing or decreasing)
if (initialDelta != 0) {
// stores the number of bits required for packing delta blob in
// delta encoding
option.bitsDeltaMax = findClosestNumBits(deltaMax);
// monotonic condition
if (isIncreasing || isDecreasing) {
option.encoding = DELTA;
return;
}
}
// PATCHED_BASE encoding check
// percentile values are computed for the zigzag encoded values. if the
// number of bit requirement between 90th and 100th percentile varies
// beyond a threshold then we need to patch the values. if the variation
// is not significant then we can use direct encoding
int64_t* currentZigzagLiterals = prepareForDirectOrPatchedBase(option);
option.zzBits90p = percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9, true);
uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p;
// if the difference between 90th percentile and 100th percentile fixed
// bits is > 1 then we need patch the values
if (diffBitsLH > 1) {
// patching is done only on base reduced values.
// remove base from literals
for (size_t i = 0; i < numLiterals; i++) {
baseRedLiterals_[option.baseRedLiteralsCount++] = (literals[i] - option.min);
}
// 95th percentile width is used to determine max allowed value
// after which patching will be done
option.brBits95p = percentileBits(baseRedLiterals_, 0, numLiterals, 0.95);
// 100th percentile is used to compute the max patch width
option.brBits100p = percentileBits(baseRedLiterals_, 0, numLiterals, 1.0, true);
// after base reducing the values, if the difference in bits between
// 95th percentile and 100th percentile value is zero then there
// is no point in patching the values, in which case we will
// fallback to DIRECT encoding.
// The decision to use patched base was based on zigzag values, but the
// actual patching is done on base reduced literals.
if ((option.brBits100p - option.brBits95p) != 0) {
option.encoding = PATCHED_BASE;
preparePatchedBlob(option);
return;
} else {
option.encoding = DIRECT;
return;
}
} else {
// if difference in bits between 95th percentile and 100th percentile is
// 0, then patch length will become 0. Hence we will fallback to direct
option.encoding = DIRECT;
return;
}
}