in IsometricPatternMatcher/Image.h [404:432]
inline float Image<float>::Sum<float>() const {
  // Adds up the first `w` floats of each of the `h` rows using 8-lane AVX
  // accumulation. Each row is consumed as full groups of 32 floats, then
  // leftover groups of 8, then a final tail of fewer than 8 floats.
  const size_t groupsOf32 = w / 32;
  const size_t groupsOf8 = (w - groupsOf32 * 32) / 8;
  const size_t tailLen = w & 7;
  // Distance between consecutive row starts, expressed in floats.
  const size_t strideElts = pitch / sizeof(float);

  __m256 acc = _mm256_setzero_ps();

  // 32-byte-aligned staging buffer for the row tail, zeroed once up front.
  // Only its first `tailLen` slots are ever overwritten, so the remaining
  // slots stay 0 and contribute nothing to the accumulator.
  alignas(32) float tail[8];
  _mm256_store_ps(tail, acc);

  const float* rowBase = RowPtr(0);
  for (size_t y = 0; y < h; ++y) {
    const float* p = rowBase;

    // Full 32-float groups: four unaligned 8-wide loads, accumulated in order.
    const float* const end32 = p + groupsOf32 * 32;
    while (p != end32) {
      for (size_t lane = 0; lane < 32; lane += 8) {
        acc = _mm256_add_ps(acc, _mm256_loadu_ps(p + lane));
      }
      p += 32;
    }

    // Remaining full 8-float groups.
    const float* const end8 = p + groupsOf8 * 8;
    while (p != end8) {
      acc = _mm256_add_ps(acc, _mm256_loadu_ps(p));
      p += 8;
    }

    // Tail: copy the last `tailLen` floats into the zero-padded buffer so a
    // whole vector can be added without reading past the row's `w` elements.
    std::copy_n(p, tailLen, tail);
    acc = _mm256_add_ps(acc, _mm256_load_ps(tail));

    rowBase += strideElts;
  }

  // NOTE(review): hsum is defined outside this block; presumably it reduces
  // the 8 lanes of `acc` to a single float — confirm against its definition.
  return hsum(acc);
}