in renderscript-toolkit/src/main/cpp/Convolve5x5.cpp [149:192]
/**
 * Produces the output elements for columns [x1, x2) of one row, treating every buffer as an
 * array of uchar4.
 *
 * The span is processed in up to three phases:
 *   1. Columns below 2 are always computed one element at a time with ConvolveOneU.
 *   2. If a SIMD build flag is defined and mUsesSimd is set, the middle of the span is handed
 *      to rsdIntrinsicConvolve5x5_K; the last 3 columns are never included in that call.
 *   3. Every column still outstanding is computed with ConvolveOneU.
 *
 * @param pout  Destination of the element for column x1; advanced by one uchar4 per column.
 * @param x1    First column to produce.
 * @param x2    One past the last column to produce.
 * @param ppy0  First of the five input row pointers forwarded to the convolution.
 * @param ppy1  Second input row pointer.
 * @param ppy2  Third input row pointer (presumably the row being output - confirm with caller).
 * @param ppy3  Fourth input row pointer.
 * @param ppy4  Fifth input row pointer.
 */
void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0,
                               const uchar* ppy1, const uchar* ppy2, const uchar* ppy3,
                               const uchar* ppy4) {
    // Reinterpret the byte buffers as uchar4 elements.
    uchar4* dst = reinterpret_cast<uchar4*>(pout);
    const uchar4* row0 = reinterpret_cast<const uchar4*>(ppy0);
    const uchar4* row1 = reinterpret_cast<const uchar4*>(ppy1);
    const uchar4* row2 = reinterpret_cast<const uchar4*>(ppy2);
    const uchar4* row3 = reinterpret_cast<const uchar4*>(ppy3);
    const uchar4* row4 = reinterpret_cast<const uchar4*>(ppy4);

    // Scalar prologue: the SIMD calls below are given row pointers offset by (x1 - 2), so x1
    // has to reach 2 before they may be used.
    for (; (x1 < x2) && (x1 < 2); ++x1, ++dst) {
        ConvolveOneU<uchar4, float4>(x1, dst, row0, row1, row2, row3, row4, mFp, mSizeX);
    }

#if defined(ARCH_X86_HAVE_SSSE3)
    // x86 SIMD needs at least 7 remaining elements: 4 for one SIMD step plus 3 held back
    // because x may hit the end boundary there.
    if (mUsesSimd && ((x1 + 6) < x2)) {
        // Hold back 3 elements for the end boundary; the count is expressed in groups of 4.
        const uint32_t groupCount = (x2 - x1 - 3) >> 2;
        rsdIntrinsicConvolve5x5_K(dst, row0 + x1 - 2, row1 + x1 - 2, row2 + x1 - 2,
                                  row3 + x1 - 2, row4 + x1 - 2, mIp, groupCount);
        const uint32_t consumed = groupCount * 4;
        dst += consumed;
        x1 += consumed;
    }
#endif

#if defined(ARCH_ARM_USE_INTRINSICS)
    // ARM SIMD is used when more than 3 elements remain. 3 elements are held back and the
    // count is expressed in pairs of elements.
    // NOTE(review): the count is 0 when exactly 4 elements remain; assumes the kernel
    // tolerates a zero count - TODO confirm.
    if (mUsesSimd && ((x1 + 3) < x2)) {
        const uint32_t pairCount = (x2 - x1 - 3) >> 1;
        rsdIntrinsicConvolve5x5_K(dst, row0 + x1 - 2, row1 + x1 - 2, row2 + x1 - 2,
                                  row3 + x1 - 2, row4 + x1 - 2, mIp, pairCount);
        const uint32_t consumed = pairCount * 2;
        dst += consumed;
        x1 += consumed;
    }
#endif

    // Scalar epilogue: finish whatever the SIMD paths did not cover (everything, when no
    // SIMD path ran).
    for (; x1 < x2; ++x1, ++dst) {
        ConvolveOneU<uchar4, float4>(x1, dst, row0, row1, row2, row3, row4, mFp, mSizeX);
    }
}