void Convolve3x3Task::kernelU4()

in renderscript-toolkit/src/main/cpp/Convolve3x3.cpp [133:166]


/**
 * Convolves the columns [xstart, xend) of one row of 4-component uchar pixels.
 *
 * The output pointer is NOT indexed by column: its first element receives the result for
 * column xstart and it advances by one element per column produced. The three input row
 * pointers, in contrast, are indexed by absolute column.
 *
 * Note: when xstart is 0, column 0 is always produced, even if xend is 0.
 *
 * @param pout   Destination bytes, reinterpreted as uchar4 elements.
 * @param xstart First column to produce.
 * @param xend   One past the last column to produce.
 * @param ppy0   First input row, reinterpreted as uchar4 elements.
 * @param ppy1   Second input row, reinterpreted as uchar4 elements.
 * @param ppy2   Third input row, reinterpreted as uchar4 elements.
 */
void Convolve3x3Task::kernelU4(uchar* pout, uint32_t xstart, uint32_t xend, const uchar* ppy0,
                               const uchar* ppy1, const uchar* ppy2) {
    uchar4* dst = reinterpret_cast<uchar4*>(pout);
    const uchar4* row0 = reinterpret_cast<const uchar4*>(ppy0);
    const uchar4* row1 = reinterpret_cast<const uchar4*>(ppy1);
    const uchar4* row2 = reinterpret_cast<const uchar4*>(ppy2);

    uint32_t col = xstart;

    // Column 0 never goes through the SIMD kernel: that kernel is handed pointers starting
    // at column (col - 1), which would not exist for col == 0.
    if (col == 0) {
        convolveOneU<uchar4, float4>(0, dst, row0, row1, row2, mFp, mSizeX);
        ++dst;
        ++col;
    }

    // Nothing left to do once the (possibly advanced) start reaches the end.
    if (xend <= col) {
        return;
    }

#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
    if (mUsesSimd) {
        // The kernel's count is in units of two pixels (we advance by 2 * pairs afterwards).
        // Subtracting one before halving guarantees at least one column is left for the
        // scalar loop below -- presumably so the last column gets the scalar routine's
        // edge handling; confirm against convolveOneU.
        const int32_t pairs = (xend - col - 1) >> 1;
        if (pairs > 0) {
            const uint32_t left = col - 1;
            rsdIntrinsicConvolve3x3_K(dst, row0 + left, row1 + left, row2 + left, mIp, pairs);
            const int32_t done = pairs << 1;
            col += done;
            dst += done;
        }
    }
#endif

    // Scalar path for whatever the SIMD kernel did not cover.
    for (; col != xend; ++col, ++dst) {
        convolveOneU<uchar4, float4>(col, dst, row0, row1, row2, mFp, mSizeX);
    }
}