void BlurTask::kernelU1()

in renderscript-toolkit/src/main/cpp/Blur.cpp [443:509]


void BlurTask::kernelU1(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
    float buf[4 * 2048];
    const uint32_t stride = mSizeX * mVectorSize;

    uchar *out = (uchar *)outPtr;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

#if defined(ARCH_ARM_USE_INTRINSICS)
    if (mUsesSimd && mSizeX >= 16) {
        // The specialisation for r<=8 has an awkward prefill case, which is
        // fiddly to resolve, where starting close to the right edge can cause
        // a read beyond the end of input.  So avoid that case here.
        if (mIradius > 8 || (mSizeX - std::max(0, (int32_t)x1 - 8)) >= 16) {
            rsdIntrinsicBlurU1_K(out, mIn + stride * currentY, mSizeX, mSizeY,
                     stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
            return;
        }
    }
#endif

    float *fout = (float *)buf;
    int y = currentY;
    if ((y > mIradius) && (y < ((int)mSizeY - mIradius -1))) {
        const uchar *pi = mIn + (y - mIradius) * stride;
        OneVFU1(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
    } else {
        x1 = 0;
        while(mSizeX > x1) {
            OneVU1(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
            fout++;
            x1++;
        }
    }

    x1 = xstart;
    while ((x1 < x2) &&
           ((x1 < (uint32_t)mIradius) || (((uintptr_t)out) & 0x3))) {
        OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
        out++;
        x1++;
    }
#if defined(ARCH_X86_HAVE_SSSE3)
    if (mUsesSimd) {
        if ((x1 + mIradius) < x2) {
            uint32_t len = x2 - (x1 + mIradius);
            len &= ~3;

            // rsdIntrinsicBlurHFU1_K() processes each four float values in |buf| at once, so it
            // nees to ensure four more values can be accessed in order to avoid accessing
            // uninitialized buffer.
            if (len > 4) {
                len -= 4;
                rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - mIradius, mFp,
                                       mIradius * 2 + 1, x1, x1 + len);
                out += len;
                x1 += len;
            }
        }
    }
#endif
    while(x2 > x1) {
        OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
        out++;
        x1++;
    }
}