in renderscript-toolkit/src/main/cpp/Blur.cpp [370:433]
/**
 * Produces the output pixels [xstart, xend) of row currentY for uchar4 data.
 *
 * The work is done in two passes: first a full-width row of float4
 * intermediates is computed into a scratch buffer (OneVFU4 / OneVU4), then
 * each requested output pixel is produced from that buffer (OneHU4, or the
 * SSSE3 helper when available).
 * NOTE(review): the V/H helper names suggest a vertical pass followed by a
 * horizontal pass of a separable blur -- confirm against the helpers.
 *
 * @param outPtr      Destination for the first produced pixel; it is advanced
 *                    by one uchar4 per pixel written.
 * @param xstart      First x coordinate to produce.
 * @param xend        One past the last x coordinate to produce.
 * @param currentY    Row being processed.
 * @param threadIndex Selects this call's slot in mScratch / mScratchSize.
 *
 * If the row is wider than the on-stack buffer and the per-thread scratch
 * buffer cannot be (re)allocated, the function returns without writing any
 * output instead of writing through a null pointer.
 */
void BlurTask::kernelU4(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
                        uint32_t threadIndex) {
    // Rows of up to 2048 pixels use this stack buffer; wider rows use mScratch.
    float4 stackbuf[2048];
    float4 *buf = &stackbuf[0];
    const uint32_t stride = mSizeX * mVectorSize;

    uchar4 *out = (uchar4 *)outPtr;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

#if defined(ARCH_ARM_USE_INTRINSICS)
    // The ARM intrinsic handles the whole request itself, so nothing below runs.
    if (mUsesSimd && mSizeX >= 4) {
        rsdIntrinsicBlurU4_K(out, (uchar4 const *)(mIn + stride * currentY),
                             mSizeX, mSizeY,
                             stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
        return;
    }
#endif

    if (mSizeX > 2048) {
        if ((mSizeX > mScratchSize[threadIndex]) || !mScratch[threadIndex]) {
            // Pad the side of the allocation by one unit to allow alignment later
            void *grown = realloc(mScratch[threadIndex], (mSizeX + 1) * 16);
            if (!grown) {
                // realloc leaves the previous block untouched on failure, so keep
                // the old pointer and size (no leak, bookkeeping stays accurate)
                // and give up on this row rather than dereferencing null.
                return;
            }
            mScratch[threadIndex] = grown;
            mScratchSize[threadIndex] = mSizeX;
        }
        // realloc only aligns to 8 bytes so we manually align to 16.
        buf = (float4 *) ((((intptr_t)mScratch[threadIndex]) + 15) & ~0xf);
    }

    // First pass: fill buf with one float4 per column, for the full row width.
    float4 *fout = (float4 *)buf;
    int y = currentY;
    if ((y > mIradius) && (y < ((int)mSizeY - mIradius))) {
        // Rows y - mIradius .. y + mIradius are all inside the image, so the
        // whole row can be handed to OneVFU4 starting at the topmost input row.
        const uchar *pi = mIn + (y - mIradius) * stride;
        OneVFU4(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
    } else {
        // Near the top or bottom of the image: go column by column through
        // OneVU4, which is given mSizeY (presumably to handle the row bounds).
        x1 = 0;
        while (mSizeX > x1) {
            OneVU4(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
            fout++;
            x1++;
        }
    }

    // Second pass: produce the requested output pixels from buf.
    // Columns below mIradius always go through OneHU4.
    x1 = xstart;
    while ((x1 < (uint32_t)mIradius) && (x1 < x2)) {
        OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
        out++;
        x1++;
    }
#if defined(ARCH_X86_HAVE_SSSE3)
    if (mUsesSimd) {
        // Let the SSSE3 helper cover columns [x1, x2 - mIradius); the last
        // mIradius columns are left for the loop below.
        if ((x1 + mIradius) < x2) {
            rsdIntrinsicBlurHFU4_K(out, buf - mIradius, mFp,
                                   mIradius * 2 + 1, x1, x2 - mIradius);
            out += (x2 - mIradius) - x1;
            x1 = x2 - mIradius;
        }
    }
#endif
    // Whatever columns remain up to xend.
    while (x2 > x1) {
        OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
        out++;
        x1++;
    }
}