size_t Lz4Immutable::compressCommon()

in mcrouter/lib/Lz4Immutable.cpp [282:475]


size_t Lz4Immutable::compressCommon(
    IovecCursor source,
    uint8_t* output,
    size_t maxOutputSize) const {
  assert(source.totalLength() <= kMaxInputSize && "check max size first");
  assert(
      maxOutputSize >= compressBound(source.totalLength()) &&
      "check available space first");

  // Creates a cursor over the dictionary. The match cursor defined below
  // starts as a copy of it and is used to keep track of matches found in the
  // dictionary.
  struct iovec dicIov = getDictionaryIovec(state_);
  const IovecCursor dicCursor(&dicIov, 1);

  // The difference between the maximum distance we can look back to find a
  // match (64KB) and the dictionary size.
  // It is used to check whether a match is usable: the match has to be, at
  // most, 64KB "behind" the data we are compressing right now.
  const size_t dictionaryDiff = kMaxDictionarySize - dicCursor.totalLength();

  // Upper limit of where we can look for a match.
  const size_t matchFindLimit = source.totalLength() - kMatchFindLimit;
  // Upper limit of where a match can go.
  const size_t matchLimit = source.totalLength() - kLastLiterals;

  // Lower and upper limit to where the output buffer can go.
  const uint8_t* const outputStart = output;
  const uint8_t* const outputLimit = output + maxOutputSize;
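  // outputLimit is only referenced by the asserts below; the (void) cast on
  // the next line silences the unused-variable warning when asserts are
  // compiled out.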
  (void)outputLimit;

  // Controls the compression main loop.
  bool running = true;

  // Next position in source (in the range [0, sourceSize)) that has not yet
  // been written to the destination buffer.
  IovecCursor anchorCursor = source;

  // Cursor that points to current match.
  IovecCursor match = dicCursor;

  if (source.totalLength() < kMinInputSize) {
    // Not enough data to compress. Don't even enter the compression loop;
    // skip directly to the part that encodes the last literals.
    running = false;
  } else {
    // Skip first byte.
    source.advance(1);
  }

  // Main loop
  while (running) {
    // LZ4 token
    uint8_t* token;

    // Find a match
    {
      size_t step = 0;
      size_t searchMatchNumBytes = 1 << kSkipTrigger;
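      // Search acceleration: `step` grows as (attempts >> kSkipTrigger), so
      // after every 2^kSkipTrigger failed candidates the cursor starts
      // skipping one extra byte per iteration, which speeds up scanning of
      // incompressible data. The loop exits once the candidate match lies
      // within the 64KB back-reference window and its first 4 bytes equal
      // the 4 bytes at the current source position.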

      do {
        // Advance cursor and calculate next step.
        source.advance(step);
        step = (searchMatchNumBytes++ >> kSkipTrigger);

        // Hash current position
        uint32_t hash = hashPosition(source);

        // Verify if the current position in the source buffer
        // can still be compressed.
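        // The second check matters because, once more than kMaxDictionarySize
        // (64KB) of source has been consumed, even a match at the very end of
        // the dictionary would be farther back than the 16-bit offset allows,
        // so no further matches can be emitted.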
        if (UNLIKELY(source.tell() + step > matchFindLimit) ||
            UNLIKELY(source.tell() > kMaxDictionarySize)) {
          running = false;
          break;
        }

        uint32_t matchPos = getPositionOnHash(state_.table, hash);
        match.seek(matchPos);
      } while (((match.tell() + dictionaryDiff) <= source.tell()) ||
               (match.peek<uint32_t>() != source.peek<uint32_t>()));

      if (!running) {
        break;
      }
    }

    // Catch up - try to expand the match backwards.
    while (source.tell() > anchorCursor.tell() && match.tell() > 0) {
      source.retreat(1);
      match.retreat(1);
      if (LIKELY(source.peek<uint8_t>() != match.peek<uint8_t>())) {
        source.advance(1);
        match.advance(1);
        break;
      }
    }

    // Write literal
    {
      size_t literalLen = source.tell() - anchorCursor.tell();
      token = output++;
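      // The token is a single byte: its upper 4 bits hold the literal length
      // and its lower 4 bits the match length. A nibble value of 15
      // (kRunMask / kMlMask) means the length continues in extra bytes.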

      // Check output limit
      assert(
          output + literalLen + (2 + 1 + kLastLiterals) + (literalLen / 255) <=
          outputLimit);

      // Encode literal length
      if (literalLen >= kRunMask) {
        int len = static_cast<int>(literalLen - kRunMask);
        *token = (kRunMask << kMlBits);
        for (; len >= 255; len -= 255) {
          *output++ = 255;
        }
        *output++ = static_cast<uint8_t>(len);
      } else {
        *token = static_cast<uint8_t>(literalLen << kMlBits);
      }

      // Copy literals to output buffer.
      wildCopy(output, anchorCursor, literalLen);
      output += literalLen;
    }

    // Encode offset
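    // The offset is the backwards distance from the current source position
    // to the match: the tail of the dictionary after the match plus the
    // source bytes consumed so far. The 64KB window check above guarantees
    // it fits in 16 bits.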
    uint16_t offset = dicCursor.totalLength() - match.tell() + source.tell();
    writeLE(output, static_cast<uint16_t>(offset));
    output += 2;

    // Encode matchLength
    {
      // The match cannot extend past the end of the dictionary...
      size_t posLimit =
          source.tell() + (dicCursor.totalLength() - match.tell());
      // ...nor past the end of the source buffer.
      posLimit = std::min(posLimit, matchLimit);

      source.advance(kMinMatch);
      match.advance(kMinMatch);
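      // Every match implicitly covers kMinMatch (4) bytes, which both cursors
      // just skipped; matchLen below therefore counts only the extra matching
      // bytes, which is the value LZ4 encodes.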
      size_t matchLen = calculateMatchLength(source, match, posLimit);

      assert(output + (1 + kLastLiterals) + (matchLen >> 8) <= outputLimit);

      // Write match length.
      if (matchLen >= kMlMask) {
        *token += kMlMask;
        matchLen -= kMlMask;
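        // Emit 0xFF length-continuation bytes two at a time (510 == 2 * 255);
        // a possible remaining single 0xFF is handled just below.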
        for (; matchLen >= 510; matchLen -= 510) {
          *output++ = 255;
          *output++ = 255;
        }
        if (matchLen >= 255) {
          matchLen -= 255;
          *output++ = 255;
        }
        *output++ = static_cast<uint8_t>(matchLen);
      } else {
        *token += static_cast<uint8_t>(matchLen);
      }
    }

    // Update anchor
    anchorCursor.seek(source.tell());

    // Test end of chunk
    if (source.tell() > matchFindLimit) {
      break;
    }
  }

  // Encode last literals
  {
    size_t lastRun = source.totalLength() - anchorCursor.tell();

    assert(
        (output - outputStart) + lastRun + 1 +
            ((lastRun + 255 - kRunMask) / 255) <=
        maxOutputSize);

    if (lastRun >= kRunMask) {
      size_t accumulator = lastRun - kRunMask;
      *output++ = kRunMask << kMlBits;
      for (; accumulator >= 255; accumulator -= 255) {
        *output++ = 255;
      }
      *output++ = static_cast<uint8_t>(accumulator);
    } else {
      *output++ = static_cast<uint8_t>(lastRun << kMlBits);
    }
    safeCopy(output, anchorCursor, lastRun);
    output += lastRun;
  }

  return output - outputStart;
}
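
For reference, here is a minimal sketch of how a wrapper could drive
compressCommon(). It is illustrative only: the wrapper name compressSketch,
the folly::IOBuf output buffer, and the exception type are assumptions rather
than the class's actual API; only compressCommon(), compressBound(),
kMaxInputSize, and IovecCursor are taken from the listing above.


// Hypothetical wrapper, shown only to illustrate the calling convention of
// compressCommon(); it assumes a matching declaration inside Lz4Immutable.
std::unique_ptr<folly::IOBuf> Lz4Immutable::compressSketch(
    const struct iovec* iov, size_t iovcnt) const {
  IovecCursor source(iov, iovcnt);

  // compressCommon() only asserts its preconditions, so a real caller has to
  // enforce them up front.
  if (source.totalLength() > kMaxInputSize) {
    throw std::invalid_argument("Data too large to compress");
  }

  // Worst-case output size for this input size.
  const size_t maxOutputSize = compressBound(source.totalLength());
  auto destination = folly::IOBuf::create(maxOutputSize);

  // Compress into the flat buffer, then trim it to the bytes actually written.
  const size_t compressedSize =
      compressCommon(source, destination->writableTail(), maxOutputSize);
  destination->append(compressedSize);
  return destination;
}

Note that the two asserts at the top of compressCommon() compile out in
release builds, so size checks like the ones above are the only protection
against overrunning the output buffer.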