in renderscript-toolkit/src/main/cpp/Convolve3x3.cpp [133:166]
/**
 * Convolves the pixels in columns [xstart, xend) of one row of four-channel (uchar4) data.
 *
 * The scalar helper convolveOneU handles column 0 (when it is part of the range) and whatever
 * columns the optional SIMD helper does not consume; the SIMD helper, when compiled in and
 * enabled through mUsesSimd, processes the middle of the range two pixels at a time.
 *
 * @param pout   Destination buffer. The first pixel written corresponds to column xstart and
 *               the following pixels are written contiguously after it.
 * @param xstart First column to process.
 * @param xend   One past the last column to process.
 * @param ppy0   First of the three source rows, indexed by absolute column.
 * @param ppy1   Second of the three source rows, indexed by absolute column.
 * @param ppy2   Third of the three source rows, indexed by absolute column.
 *
 * NOTE(review): when xstart is 0 the first pixel is written before xend is examined, so callers
 * are presumably expected to pass a non-empty range -- confirm against the call sites.
 */
void Convolve3x3Task::kernelU4(uchar* pout, uint32_t xstart, uint32_t xend, const uchar* ppy0,
                               const uchar* ppy1, const uchar* ppy2) {
    uchar4* dst = reinterpret_cast<uchar4*>(pout);
    const uchar4* row0 = reinterpret_cast<const uchar4*>(ppy0);
    const uchar4* row1 = reinterpret_cast<const uchar4*>(ppy1);
    const uchar4* row2 = reinterpret_cast<const uchar4*>(ppy2);

    uint32_t x = xstart;

    // Column 0 always goes through the scalar helper: the SIMD call below is handed row
    // pointers that start one column to the left of x, which does not exist for x == 0.
    if (x == 0) {
        convolveOneU<uchar4, float4>(0, dst, row0, row1, row2, mFp, mSizeX);
        ++x;
        ++dst;
    }

    // Nothing left once the optional first column has been handled.
    if (xend <= x) {
        return;
    }

#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
    if (mUsesSimd) {
        // Number of pixel pairs handed to the SIMD helper. The "- 1" keeps at least the last
        // column of the range out of the SIMD pass so that the scalar loop below handles it.
        const int32_t pairs = (xend - x - 1) >> 1;
        if (pairs > 0) {
            const uint32_t left = x - 1;
            rsdIntrinsicConvolve3x3_K(dst, &row0[left], &row1[left], &row2[left], mIp, pairs);
            x += pairs << 1;
            dst += pairs << 1;
        }
    }
#endif

    // Scalar pass over every column the SIMD path did not cover.
    for (; x != xend; ++x, ++dst) {
        convolveOneU<uchar4, float4>(x, dst, row0, row1, row2, mFp, mSizeX);
    }
}