void ResizeTask::kernelU1()

in renderscript-toolkit/src/main/cpp/Resize.cpp [493:575]


// Resizes one horizontal span of one output row, writing one uchar per output pixel.
//
// For every output x in [xstart, xend) on output row currentY, the matching source coordinate is
// computed as (coord + 0.5) * scale - 0.5 (pixel-center mapping) and the output value is produced
// from four neighbouring source rows via OneBiCubic (or, on ARM with SIMD enabled, via
// rsdIntrinsicResizeB1_K for the whole span at once).
//
// outPtr:   destination for the first output pixel of the span; (xend - xstart) bytes are written.
// xstart:   first output column to produce (inclusive).
// xend:     last output column to produce (exclusive).
// currentY: output row being produced.
//
// NOTE(review): the row stride is computed as mInputSizeX * mVectorSize; this kernel advances the
// output by one byte per pixel, so mVectorSize is presumably 1 here -- confirm against the caller.
void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
    const uchar* pin = mIn;
    const int srcHeight = mInputSizeY;
    const int srcWidth = mInputSizeX;
    const size_t stride = mInputSizeX * mVectorSize;

    // Source-space Y coordinate of the centre of the output row. The AVX2 variant uses a fused
    // multiply-subtract; it must read the same member scale factor as the portable variant.
#if defined(ARCH_X86_HAVE_AVX2)
    float yf = _mm_cvtss_f32(
            _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
#else
    float yf = (currentY + 0.5f) * mScaleY - 0.5f;
#endif

    // First of the four source rows used for this output row; yf is then reduced to its
    // fractional part, which is what the interpolation helpers below receive.
    const int starty = static_cast<int>(floor(yf - 1));
    yf = yf - floor(yf);

    // Clamp the four row indices to the valid source range [0, srcHeight - 1].
    const int maxy = srcHeight - 1;
    const int ys0 = std::max(0, starty + 0);
    const int ys1 = std::min(maxy, std::max(0, starty + 1));
    const int ys2 = std::min(maxy, starty + 2);
    const int ys3 = std::min(maxy, starty + 3);

    const uchar* const yp0 = pin + stride * ys0;
    const uchar* const yp1 = pin + stride * ys1;
    const uchar* const yp2 = pin + stride * ys2;
    const uchar* const yp3 = pin + stride * ys3;

    uchar* out = outPtr;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

#if defined(ARCH_ARM_USE_INTRINSICS)
    // SIMD path: handles the entire span in one call, so the scalar loop below is skipped
    // (x1 is advanced to x2). Only taken for horizontal scale factors below 4.
    if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
        // Starting source X and per-pixel X increment, both in 16.16 fixed point.
        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
        long xf16 = rint(xf * 0x10000);
        uint32_t xinc16 = rint(mScaleX * 0x10000);

        // xoff is the leftmost source column touched; xclip is how far it fell off the left
        // edge (0 when in range), so that xoff + xclip is never negative.
        int xoff = (xf16 >> 16) - 1;
        int xclip = std::max(0, xoff) - xoff;
        int len = x2 - x1;

        int32_t yr[4];
        uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
        mkYCoeff(yr, yf);

        xoff += xclip;

        rsdIntrinsicResizeB1_K(
                out, len,
                xf16 & 0xffff, xinc16,
                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
                xclip, srcWidth - xoff + xclip,
                osc_ctl, yr);
        out += len;
        x1 += len;
    }
#endif

    // Scalar path: one output pixel per iteration.
    while (x1 < x2) {
        // Source-space X coordinate of the centre of this output pixel.
#if defined(ARCH_X86_HAVE_AVX2)
        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f), _mm_set1_ps(mScaleX),
                                              _mm_set1_ps(0.5f)));
#else
        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
#endif

        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
        out++;
        x1++;
    }
}