inline float Image::Sum()

in IsometricPatternMatcher/Image.h [404:432]


// Accumulates every float of the w x h image into one 8-lane AVX register and
// returns hsum() of that register (hsum is defined elsewhere; presumably it
// adds the 8 lanes together -- confirm against its definition).
//
// Rows are addressed from RowPtr(0) with a stride of pitch / sizeof(float)
// elements; only the first w floats of each row are read.
inline float Image<float>::Sum<float>() const {
  // Each row is consumed as: blocks of 32 floats, then blocks of 8 floats,
  // then a remainder of 0..7 floats.
  const size_t blocks32 = w / 32;
  const size_t blocks8 = (w - blocks32 * 32) / 8;
  const size_t leftover = w & 7;

  const size_t stride = pitch / sizeof(float);
  const float* const base = RowPtr(0);

  // Scratch vector for the row remainder. Lanes at index >= leftover are never
  // written after this zero-initialization, so they contribute nothing.
  alignas(32) float tail[8] = {};

  __m256 acc = _mm256_setzero_ps();

  for (size_t y = 0; y < h; ++y) {
    const float* p = base + y * stride;

    // Unaligned loads: row starts are not assumed to be 32-byte aligned.
    for (size_t n = blocks32; n != 0; --n) {
      acc = _mm256_add_ps(acc, _mm256_loadu_ps(p));
      acc = _mm256_add_ps(acc, _mm256_loadu_ps(p + 8));
      acc = _mm256_add_ps(acc, _mm256_loadu_ps(p + 16));
      acc = _mm256_add_ps(acc, _mm256_loadu_ps(p + 24));
      p += 32;
    }

    for (size_t n = blocks8; n != 0; --n) {
      acc = _mm256_add_ps(acc, _mm256_loadu_ps(p));
      p += 8;
    }

    // Stage the remaining floats in the zero-padded scratch buffer so they can
    // be added with one full-width (aligned) load without reading past w.
    for (size_t k = 0; k < leftover; ++k) {
      tail[k] = p[k];
    }
    acc = _mm256_add_ps(acc, _mm256_load_ps(tail));
  }

  return hsum(acc);
}