void Convolve5x5Task::kernelU4()

in renderscript-toolkit/src/main/cpp/Convolve5x5.cpp [149:192]


// Convolves columns [x1, x2) of one row of 4-byte (uchar4) pixels with the 5x5 kernel.
//
// pout        Output cursor; reinterpreted as uchar4 and advanced by one element for every
//             column that is processed.
// x1, x2      Half-open column range to process.
// ppy0..ppy4  The five input rows; reinterpreted as uchar4. They are handed un-offset to the
//             scalar routine and offset by (x1 - 2) to the SIMD kernel.
//
// The row is handled in up to three phases: a scalar lead-in for columns 0 and 1, an optional
// SIMD bulk pass, and a scalar tail for whatever is left. The scalar routine receives mFp and
// mSizeX; the SIMD kernel receives mIp (presumably the float and integer forms of the same
// coefficients -- confirm against the class definition).
void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0,
                               const uchar* ppy1, const uchar* ppy2, const uchar* ppy3,
                               const uchar* ppy4) {
    uchar4* dst = reinterpret_cast<uchar4*>(pout);
    const uchar4* row0 = reinterpret_cast<const uchar4*>(ppy0);
    const uchar4* row1 = reinterpret_cast<const uchar4*>(ppy1);
    const uchar4* row2 = reinterpret_cast<const uchar4*>(ppy2);
    const uchar4* row3 = reinterpret_cast<const uchar4*>(ppy3);
    const uchar4* row4 = reinterpret_cast<const uchar4*>(ppy4);

    // Lead-in: columns 0 and 1 always take the scalar path, so the SIMD kernel below can be
    // handed row pointers that start two pixels to the left of x1.
    for (; (x1 < x2) && (x1 < 2); ++x1, ++dst) {
        ConvolveOneU<uchar4, float4>(x1, dst, row0, row1, row2, row3, row4, mFp, mSizeX);
    }

#ifdef ARCH_X86_HAVE_SSSE3
    // x86 SIMD needs at least 7 remaining pixels: 4 for one SIMD batch plus 3 that are held
    // back because x may hit the end boundary there.
    if (mUsesSimd && (x2 > (x1 + 6))) {
        // Hold back 3 pixels for the end boundary; each batch covers 4 pixels.
        const uint32_t batches = (x2 - x1 - 3) / 4u;
        rsdIntrinsicConvolve5x5_K(dst, row0 + x1 - 2, row1 + x1 - 2, row2 + x1 - 2,
                                  row3 + x1 - 2, row4 + x1 - 2, mIp, batches);
        const uint32_t covered = batches * 4u;
        dst += covered;
        x1 += covered;
    }
#endif

#ifdef ARCH_ARM_USE_INTRINSICS
    // ARM SIMD runs once more than 3 pixels remain; 3 pixels are again held back for the
    // scalar tail and each batch covers 2 pixels.
    // NOTE(review): with exactly 4 or 5 pixels remaining the batch count is 0 -- confirm the
    // kernel accepts a zero count.
    if (mUsesSimd && (x2 > (x1 + 3))) {
        const uint32_t batches = (x2 - x1 - 3) / 2u;
        rsdIntrinsicConvolve5x5_K(dst, row0 + x1 - 2, row1 + x1 - 2, row2 + x1 - 2,
                                  row3 + x1 - 2, row4 + x1 - 2, mIp, batches);
        const uint32_t covered = batches * 2u;
        dst += covered;
        x1 += covered;
    }
#endif

    // Tail: everything the SIMD pass did not cover (or the whole remainder when SIMD is
    // unavailable or disabled) goes through the scalar routine.
    for (; x1 < x2; ++x1, ++dst) {
        ConvolveOneU<uchar4, float4>(x1, dst, row0, row1, row2, row3, row4, mFp, mSizeX);
    }
}