in renderscript-toolkit/src/main/cpp/Blur.cpp [443:509]
/**
 * Blurs the columns [xstart, xend) of row currentY of a single-channel (uchar) image.
 *
 * The work is done in two passes:
 *   1. A vertical pass that fills a scratch row of floats, one per column of the image.
 *   2. A horizontal pass that reads the scratch row and writes one uchar per output column.
 *
 * When ARM intrinsics are compiled in and usable, the whole row segment is handed to
 * rsdIntrinsicBlurU1_K() instead and the two passes below are skipped.
 *
 * @param outPtr   Destination for the output of column xstart; it is advanced by one uchar
 *                 for every column produced.
 * @param xstart   First column to produce (inclusive).
 * @param xend     Last column to produce (exclusive).
 * @param currentY Row being produced.
 */
void BlurTask::kernelU1(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
    // Number of floats kept on the stack for the scratch row. Rows wider than this use a
    // heap buffer instead: the vertical pass writes mSizeX floats, so a fixed-size stack
    // array alone would be overrun by wide images.
    const uint32_t kStackFloats = 4 * 2048;
    // Conservative slack appended to the heap buffer, because the SIMD horizontal helper
    // reads the scratch row four floats at a time (see the comment in the SSSE3 section).
    const uint32_t kSimdSlack = 8;
    float stackBuf[kStackFloats];
    const uint32_t stride = mSizeX * mVectorSize;

    uchar *out = (uchar *)outPtr;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

#if defined(ARCH_ARM_USE_INTRINSICS)
    if (mUsesSimd && mSizeX >= 16) {
        // The specialisation for r<=8 has an awkward prefill case, which is
        // fiddly to resolve, where starting close to the right edge can cause
        // a read beyond the end of input. So avoid that case here.
        if (mIradius > 8 || (mSizeX - std::max(0, (int32_t)x1 - 8)) >= 16) {
            rsdIntrinsicBlurU1_K(out, mIn + stride * currentY, mSizeX, mSizeY,
                                 stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
            return;
        }
    }
#endif

    // Pick the scratch row. The heap path only triggers for rows that do not fit in the
    // stack buffer; it is released at the single exit point at the bottom of the function.
    // (An allocation failure is reported by operator new rather than corrupting the stack.)
    float *heapBuf = nullptr;
    float *buf = stackBuf;
    if (mSizeX > kStackFloats) {
        heapBuf = new float[mSizeX + kSimdSlack];
        buf = heapBuf;
    }

    // ---- Vertical pass: fill buf[0 .. mSizeX) ----
    float *fout = buf;
    int y = currentY;
    if ((y > mIradius) && (y < ((int)mSizeY - mIradius - 1))) {
        // The row is far enough from the top and bottom edges that the whole kernel
        // window lies inside the image, so the full row can be processed in one call.
        const uchar *pi = mIn + (y - mIradius) * stride;
        OneVFU1(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
    } else {
        // Near the top or bottom edge: process every column individually.
        x1 = 0;
        while (mSizeX > x1) {
            OneVU1(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
            fout++;
            x1++;
        }
    }

    // ---- Horizontal pass: produce columns [xstart, xend) ----
    // Scalar prologue: run until we are at least mIradius columns from the left edge
    // and the output pointer is 4-byte aligned.
    x1 = xstart;
    while ((x1 < x2) &&
           ((x1 < (uint32_t)mIradius) || (((uintptr_t)out) & 0x3))) {
        OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
        out++;
        x1++;
    }
#if defined(ARCH_X86_HAVE_SSSE3)
    if (mUsesSimd) {
        if ((x1 + mIradius) < x2) {
            uint32_t len = x2 - (x1 + mIradius);
            len &= ~3;

            // rsdIntrinsicBlurHFU1_K() processes each four float values in |buf| at once, so it
            // needs to ensure four more values can be accessed in order to avoid accessing
            // uninitialized buffer.
            if (len > 4) {
                len -= 4;
                rsdIntrinsicBlurHFU1_K(out, buf - mIradius, mFp,
                                       mIradius * 2 + 1, x1, x1 + len);
                out += len;
                x1 += len;
            }
        }
    }
#endif
    // Scalar epilogue: whatever columns remain.
    while (x2 > x1) {
        OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
        out++;
        x1++;
    }

    // No-op when the stack buffer was used (heapBuf is null).
    delete[] heapBuf;
}