in renderscript-toolkit/src/main/cpp/ColorMatrix.cpp [906:957]
/**
 * Runs the color matrix over the pixels in [xstart, xend).
 *
 * When SIMD is enabled, as much of the span as possible is handed to an
 * accelerated routine; whatever is left over is processed one pixel at a
 * time by One().
 *
 * @param out    Destination pointer for the first pixel of the span; advanced
 *               by mOutstep per pixel.
 * @param in     Source pointer for the first pixel of the span; advanced by
 *               mInstep per pixel.
 * @param xstart Index of the first pixel to process.
 * @param xend   Index one past the last pixel to process. Nothing is done
 *               unless xend > xstart.
 */
void ColorMatrixTask::kernel(uchar *out, uchar *in, uint32_t xstart, uint32_t xend) {
    // An empty or inverted span leaves both buffers untouched.
    if (xend <= xstart) {
        return;
    }

    // Format description taken from the current key; forwarded to One().
    uint32_t inVecSize = mLastKey.u.inVecSize;
    uint32_t outVecSize = mLastKey.u.outVecSize;
    bool inIsFloat = mLastKey.u.inType != 0;
    bool outIsFloat = mLastKey.u.outType != 0;

    uint32_t x = xstart;
    int32_t count = xend - xstart;

    if (mUsesSimd) {
        if (mOptKernel != nullptr && count >= 4) {
            // The optimized kernel works on groups of 4 pixels and needs at
            // least one full group, so it is given the number of groups.
            mOptKernel(out, in, mIp, count >> 2);

            // Skip past the pixels covered by whole groups; the remainder
            // (at most 3 pixels) is left for the per-pixel loop below.
            count &= ~3;
            x += count;
            out += mOutstep * count;
            in += mInstep * count;
        }
#if defined(ARCH_ARM64_USE_INTRINSICS)
        else if (mLastKey.u.inType != RS_TYPE_FLOAT_32 &&
                 mLastKey.u.outType != RS_TYPE_FLOAT_32) {
            // Neither side is float: the integer intrinsic handles the whole
            // span, so nothing remains for the per-pixel loop.
            rsdIntrinsicColorMatrix_int_K(out, in, count, &mFnTab, mIp, mIpa);
            x += count;
            out += mOutstep * count;
            in += mInstep * count;
        }
        // Otherwise float data is involved. The float intrinsic
        // (rsdIntrinsicColorMatrix_float_K) is deliberately not called because
        // it currently generates off by one errors; the per-pixel loop below
        // handles the entire span instead.
#endif
    }

    // Generic path: finish any pixels the accelerated code did not handle.
    for (; x != xend; ++x) {
        One(out, in, mTmpFp, mTmpFpa, inVecSize, outVecSize, inIsFloat, outIsFloat);
        out += mOutstep;
        in += mInstep;
    }
}