uint32_t Extend()

in util/crc32c.cc [276:377]
75 lines of code
13 McCabe index (conditional complexity)

uint32_t Extend(uint32_t crc, const char* data, size_t n) {
  static bool accelerate = CanAccelerateCRC32C();
  if (accelerate) {
    return port::AcceleratedCRC32C(crc, data, n);
  }

  const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
  const uint8_t* e = p + n;
  uint32_t l = crc ^ kCRC32Xor;

// Process one byte at a time.
#define STEP1                              \
  do {                                     \
    int c = (l & 0xff) ^ *p++;             \
    l = kByteExtensionTable[c] ^ (l >> 8); \
  } while (0)

// Process one of the 4 strides of 4-byte data.
#define STEP4(s)                                                               \
  do {                                                                         \
    crc##s = ReadUint32LE(p + s * 4) ^ kStrideExtensionTable3[crc##s & 0xff] ^ \
             kStrideExtensionTable2[(crc##s >> 8) & 0xff] ^                    \
             kStrideExtensionTable1[(crc##s >> 16) & 0xff] ^                   \
             kStrideExtensionTable0[crc##s >> 24];                             \
  } while (0)

// Process a 16-byte swath of 4 strides, each of which has 4 bytes of data.
#define STEP16 \
  do {         \
    STEP4(0);  \
    STEP4(1);  \
    STEP4(2);  \
    STEP4(3);  \
    p += 16;   \
  } while (0)

// Process 4 bytes that were already loaded into a word.
#define STEP4W(w)                                   \
  do {                                              \
    w ^= l;                                         \
    for (size_t i = 0; i < 4; ++i) {                \
      w = (w >> 8) ^ kByteExtensionTable[w & 0xff]; \
    }                                               \
    l = w;                                          \
  } while (0)

  // Point x at first 4-byte aligned byte in the buffer. This might be past the
  // end of the buffer.
  const uint8_t* x = RoundUp<4>(p);
  if (x <= e) {
    // Process bytes p is 4-byte aligned.
    while (p != x) {
      STEP1;
    }
  }

  if ((e - p) >= 16) {
    // Load a 16-byte swath into the stride partial results.
    uint32_t crc0 = ReadUint32LE(p + 0 * 4) ^ l;
    uint32_t crc1 = ReadUint32LE(p + 1 * 4);
    uint32_t crc2 = ReadUint32LE(p + 2 * 4);
    uint32_t crc3 = ReadUint32LE(p + 3 * 4);
    p += 16;

    // It is possible to get better speeds (at least on x86) by interleaving
    // prefetching 256 bytes ahead with processing 64 bytes at a time. See the
    // portable implementation in https://github.com/google/crc32c/.

    // Process one 16-byte swath at a time.
    while ((e - p) >= 16) {
      STEP16;
    }

    // Advance one word at a time as far as possible.
    while ((e - p) >= 4) {
      STEP4(0);
      uint32_t tmp = crc0;
      crc0 = crc1;
      crc1 = crc2;
      crc2 = crc3;
      crc3 = tmp;
      p += 4;
    }

    // Combine the 4 partial stride results.
    l = 0;
    STEP4W(crc0);
    STEP4W(crc1);
    STEP4W(crc2);
    STEP4W(crc3);
  }

  // Process the last few bytes.
  while (p != e) {
    STEP1;
  }
#undef STEP4W
#undef STEP16
#undef STEP4
#undef STEP1
  return l ^ kCRC32Xor;
}