in renderscript-toolkit/src/main/cpp/Resize.cpp [493:575]
/**
 * Produces the output pixels [xstart, xend) of output row currentY, writing one
 * uchar per pixel, using the bicubic helpers (rsdIntrinsicResizeB1_K on the ARM
 * SIMD path, OneBiCubic otherwise).
 *
 * Output coordinates are mapped to source coordinates with
 *     src = (dst + 0.5) * scale - 0.5
 * on both axes, using mScaleX and mScaleY.
 *
 * @param outPtr   Destination of the pixel at column xstart; subsequent pixels
 *                 are written contiguously after it.
 * @param xstart   First output column to produce (inclusive).
 * @param xend     End of the output column range (exclusive).
 * @param currentY Output row being produced.
 */
void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
    const uchar* pin = mIn;
    const int srcHeight = mInputSizeY;
    const int srcWidth = mInputSizeX;
    const size_t stride = mInputSizeX * mVectorSize;

    // Vertical source coordinate of this output row.
#if defined(ARCH_X86_HAVE_AVX2)
    // Fused multiply-subtract: (currentY + 0.5) * mScaleY - 0.5.
    float yf = _mm_cvtss_f32(
            _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
#else
    float yf = (currentY + 0.5f) * mScaleY - 0.5f;
#endif

    // The four source rows used are starty .. starty + 3, where starty is one row
    // above floor(yf). After this, yf holds only the fractional part.
    const int starty = static_cast<int>(floor(yf - 1));
    yf = yf - floor(yf);

    // Keep the row indices inside the image: ys0 is clamped at 0, ys2 and ys3 at
    // the last row, and ys1 on both sides.
    const int maxy = srcHeight - 1;
    const int ys0 = std::max(0, starty + 0);
    const int ys1 = std::min(maxy, std::max(0, starty + 1));
    const int ys2 = std::min(maxy, starty + 2);
    const int ys3 = std::min(maxy, starty + 3);

    const uchar* yp0 = pin + stride * ys0;
    const uchar* yp1 = pin + stride * ys1;
    const uchar* yp2 = pin + stride * ys2;
    const uchar* yp3 = pin + stride * ys3;

    uchar* out = outPtr;
    uint32_t x1 = xstart;
    const uint32_t x2 = xend;

#if defined(ARCH_ARM_USE_INTRINSICS)
    // NOTE(review): the SIMD kernel is only used for mScaleX < 4; presumably a
    // limit of rsdIntrinsicResizeB1_K -- confirm against the kernel source.
    if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
        // Horizontal start position and per-pixel increment, scaled by 0x10000
        // (16 fractional bits).
        const float xf = (x1 + 0.5f) * mScaleX - 0.5f;
        const long xf16 = rint(xf * 0x10000);
        const uint32_t xinc16 = rint(mScaleX * 0x10000);

        // Column one to the left of the integer start position, and how far that
        // column lies left of column 0 (zero when it is inside the image).
        int xoff = (xf16 >> 16) - 1;
        const int xclip = std::max(0, xoff) - xoff;
        const int len = x2 - x1;

        int32_t yr[4];
        const uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
        mkYCoeff(yr, yf);

        // Move the starting column into the image; the amount skipped is passed
        // to the kernel separately as xclip.
        xoff += xclip;
        rsdIntrinsicResizeB1_K(
                out, len,
                xf16 & 0xffff, xinc16,
                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
                xclip, srcWidth - xoff + xclip,
                osc_ctl, yr);

        // The kernel was asked for the whole range, so the scalar loop below
        // has nothing left to do.
        out += len;
        x1 += len;
    }
#endif

    // Scalar path: one OneBiCubic call per remaining output pixel.
    while (x1 < x2) {
#if defined(ARCH_X86_HAVE_AVX2)
        // Fused multiply-subtract: (x1 + 0.5) * mScaleX - 0.5.
        const float xf = _mm_cvtss_f32(
                _mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f), _mm_set1_ps(mScaleX), _mm_set1_ps(0.5f)));
#else
        const float xf = (x1 + 0.5f) * mScaleX - 0.5f;
#endif
        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
        out++;
        x1++;
    }
}