void BlurTask::kernelU4()

in renderscript-toolkit/src/main/cpp/Blur.cpp [370:433]


void BlurTask::kernelU4(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
                        uint32_t threadIndex) {
    float4 stackbuf[2048];
    float4 *buf = &stackbuf[0];
    const uint32_t stride = mSizeX * mVectorSize;

    uchar4 *out = (uchar4 *)outPtr;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

#if defined(ARCH_ARM_USE_INTRINSICS)
    if (mUsesSimd && mSizeX >= 4) {
      rsdIntrinsicBlurU4_K(out, (uchar4 const *)(mIn + stride * currentY),
                 mSizeX, mSizeY,
                 stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
        return;
    }
#endif

    if (mSizeX > 2048) {
        if ((mSizeX > mScratchSize[threadIndex]) || !mScratch[threadIndex]) {
            // Pad the side of the allocation by one unit to allow alignment later
            mScratch[threadIndex] = realloc(mScratch[threadIndex], (mSizeX + 1) * 16);
            mScratchSize[threadIndex] = mSizeX;
        }
        // realloc only aligns to 8 bytes so we manually align to 16.
        buf = (float4 *) ((((intptr_t)mScratch[threadIndex]) + 15) & ~0xf);
    }
    float4 *fout = (float4 *)buf;
    int y = currentY;
    if ((y > mIradius) && (y < ((int)mSizeY - mIradius))) {
        const uchar *pi = mIn + (y - mIradius) * stride;
        OneVFU4(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
    } else {
        x1 = 0;
        while(mSizeX > x1) {
            OneVU4(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
            fout++;
            x1++;
        }
    }

    x1 = xstart;
    while ((x1 < (uint32_t)mIradius) && (x1 < x2)) {
        OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
        out++;
        x1++;
    }
#if defined(ARCH_X86_HAVE_SSSE3)
    if (mUsesSimd) {
        if ((x1 + mIradius) < x2) {
            rsdIntrinsicBlurHFU4_K(out, buf - mIradius, mFp,
                                   mIradius * 2 + 1, x1, x2 - mIradius);
            out += (x2 - mIradius) - x1;
            x1 = x2 - mIradius;
        }
    }
#endif
    while(x2 > x1) {
        OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
        out++;
        x1++;
    }
}