in renderscript-toolkit/src/main/cpp/ColorMatrix.cpp [520:781]
/**
 * Assembles a kernel specialised for `key` into a freshly mmap'ed buffer
 * (mBuf, mBufSize bytes) and then flips that buffer from read/write to
 * read/execute.
 *
 * The body is only compiled when ARCH_ARM_USE_INTRINSICS is defined and
 * ARCH_ARM64_USE_INTRINSICS is not; in every other configuration the function
 * does nothing and returns false.
 *
 * The emitted sequence is: prefix, load (and unpack), multiply/accumulate,
 * add, pack/store, postfix1, a branch back to just after the prefix, postfix2.
 *
 * @param key  Taken by value. Its fields (coeffMask, addMask, dot, copyAlpha,
 *             inType, outType, inVecSize, outVecSize) select which pieces are
 *             emitted. Writes to `key` below affect only this local copy.
 * @return true when the buffer was mapped, filled and made executable;
 *         false when mmap or mprotect failed, or on builds without this path.
 *
 * NOTE(review): nothing visible in this function checks that the emitted code
 * fits in mBufSize (4096) - confirm whether ADD_CHUNK / the add* helpers do.
 */
bool ColorMatrixTask::build(Key_t key) {
#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
mBufSize = 4096;
//StopWatch build_time("rs cm: build time");
// Anonymous private mapping, writable for now; it is made executable at the end.
mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANON, -1, 0);
if (mBuf == MAP_FAILED) {
// Leave mBuf as NULL rather than MAP_FAILED.
mBuf = NULL;
return false;
}
// buf is the write cursor advanced by the add* helpers (and presumably by
// ADD_CHUNK, which takes no buffer argument - confirm against the macro).
uint8_t *buf = mBuf;
// buf2 will hold the cursor position right after the prefix; it is handed to
// addBranch() near the end of the function.
uint8_t *buf2 = nullptr;
// ops[i][j] describes the term for coeffMask bit (j + i*4). Only rows 0..3 are
// written or read in this function. opInit[j] records whether a term has
// already been scheduled for column j.
int ops[5][4]; // 0 = unused, 2 = first term of column j (multiply), 3 = later term (multiply-accumulate)
int opInit[4] = {0, 0, 0, 0};
memset(ops, 0, sizeof(ops));
for (int i=0; i < 4; i++) {
// 0x2 | opInit[j] yields 2 the first time column j is hit and 3 afterwards.
if (key.u.coeffMask & (1 << (i*4))) {
ops[i][0] = 0x2 | opInit[0];
opInit[0] = 1;
}
// With dot set, columns 1 and 2 are never scheduled.
if (!key.u.dot) {
if (key.u.coeffMask & (1 << (1 + i*4))) {
ops[i][1] = 0x2 | opInit[1];
opInit[1] = 1;
}
if (key.u.coeffMask & (1 << (2 + i*4))) {
ops[i][2] = 0x2 | opInit[2];
opInit[2] = 1;
}
}
// With copyAlpha set, column 3 is never scheduled.
if (!key.u.copyAlpha) {
if (key.u.coeffMask & (1 << (3 + i*4))) {
ops[i][3] = 0x2 | opInit[3];
opInit[3] = 1;
}
}
}
// Two code paths: the "_f" path whenever inType or outType is non-zero
// (presumably meaning float data - confirm against Key_t), otherwise the
// "_i" path further down.
if (key.u.inType || key.u.outType) {
// Note: ops[][] above was already computed from the incoming copyAlpha value;
// this write does not change it, and copyAlpha is not read again in this branch.
key.u.copyAlpha = 0;
ADD_CHUNK(prefix_f);
// Position right after the prefix; passed to addBranch() below.
buf2 = buf;
// Load the incoming r,g,b,a as needed
// inVecSize 3/2/1/0 picks the *_4/*_3/*_2/*_1 chunk; inType picks the
// load_f32_* family when non-zero and load_u8f_* otherwise.
if (key.u.inType) {
switch(key.u.inVecSize) {
case 3:
ADD_CHUNK(load_f32_4);
break;
case 2:
ADD_CHUNK(load_f32_3);
break;
case 1:
ADD_CHUNK(load_f32_2);
break;
case 0:
ADD_CHUNK(load_f32_1);
break;
}
} else {
switch(key.u.inVecSize) {
case 3:
ADD_CHUNK(load_u8f_4);
break;
case 2:
ADD_CHUNK(load_u8f_3);
break;
case 1:
ADD_CHUNK(load_u8f_2);
break;
case 0:
ADD_CHUNK(load_u8f_1);
break;
}
}
// Emit one multiply (ops == 2) or multiply-accumulate (ops == 3) per scheduled
// term. The first register argument is 12+j for column j.
for (int i=0; i < 4; i++) {
for (int j=0; j < 4; j++) {
switch(ops[i][j]) {
case 0:
break;
case 2:
buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
break;
case 3:
buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
break;
}
}
}
// Produce register j for every column, covering all four combinations of
// "column has product terms" (opInit[j]) and "column has an add term"
// (addMask bit j). Register roles (12+j = products, 8+j = add vector) are
// presumed from the calls here - confirm against the helper definitions.
for (int j=0; j < 4; j++) {
if (opInit[j]) {
if (key.u.addMask & (1 << j)) {
buf = addVADD_F32(buf, j, 12+j, 8+j);
} else {
// ORR with both sources equal: used here with 12+j as the only source.
buf = addVORR_32(buf, j, 12+j, 12+j);
}
} else {
if (key.u.addMask & (1 << j)) {
buf = addVORR_32(buf, j, 8+j, 8+j);
} else {
// Neither products nor add term for this column.
buf = addVMOV_32(buf, j, 0);
}
}
}
// Store: outType non-zero picks store_f32_*, otherwise store_f32u_*.
if (key.u.outType) {
switch(key.u.outVecSize) {
case 3:
ADD_CHUNK(store_f32_4);
break;
case 2:
ADD_CHUNK(store_f32_3);
break;
case 1:
ADD_CHUNK(store_f32_2);
break;
case 0:
ADD_CHUNK(store_f32_1);
break;
}
} else {
switch(key.u.outVecSize) {
// outVecSize 3 and 2 deliberately share the _4 store chunk.
case 3:
case 2:
ADD_CHUNK(store_f32u_4);
break;
case 1:
ADD_CHUNK(store_f32u_2);
break;
case 0:
ADD_CHUNK(store_f32u_1);
break;
}
}
} else {
// Add the function prefix
// Store the address for the loop return
ADD_CHUNK(prefix_i);
buf2 = buf;
// Load the incoming r,g,b,a as needed
// With four input components and copyAlpha set, the 3-component unpack is
// used instead of the 4-component one.
switch(key.u.inVecSize) {
case 3:
ADD_CHUNK(load_u8_4);
if (key.u.copyAlpha) {
ADD_CHUNK(unpack_u8_3);
} else {
ADD_CHUNK(unpack_u8_4);
}
break;
case 2:
ADD_CHUNK(load_u8_3);
ADD_CHUNK(unpack_u8_3);
break;
case 1:
ADD_CHUNK(load_u8_2);
ADD_CHUNK(unpack_u8_2);
break;
case 0:
ADD_CHUNK(load_u8_1);
ADD_CHUNK(unpack_u8_1);
break;
}
// Add multiply and accumulate
// use MULL to init the output register,
// use MLAL from there
// (ops == 2 is the MULL case, ops == 3 the MLAL case; first register
// argument is 8+j for column j.)
for (int i=0; i < 4; i++) {
for (int j=0; j < 4; j++) {
switch(ops[i][j]) {
case 0:
break;
case 2:
buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
break;
case 3:
buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
break;
}
}
}
// Apply the add term per column when addMask bit j is set. Unlike the float
// path above, nothing is emitted for a column that has no add term, whether
// or not it has product terms.
// NOTE(review): a column with neither products nor add term gets no write to
// register 8+j here - confirm an earlier chunk leaves it in a defined state.
for (int j=0; j < 4; j++) {
if (opInit[j]) {
if (key.u.addMask & (1 << j)) {
buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
}
} else {
if (key.u.addMask & (1 << j)) {
buf = addVORR_32(buf, 8+j, 4+j, 4+j);
}
}
}
// If we have a dot product, perform the special pack.
if (key.u.dot) {
ADD_CHUNK(pack_u8_1);
ADD_CHUNK(dot);
} else {
// Mirrors the unpack choice: copyAlpha with four components packs only three.
switch(key.u.outVecSize) {
case 3:
if (key.u.copyAlpha) {
ADD_CHUNK(pack_u8_3);
} else {
ADD_CHUNK(pack_u8_4);
}
break;
case 2:
ADD_CHUNK(pack_u8_3);
break;
case 1:
ADD_CHUNK(pack_u8_2);
break;
case 0:
ADD_CHUNK(pack_u8_1);
break;
}
}
// Write out result
// outVecSize 3 and 2 share the _4 store chunk.
switch(key.u.outVecSize) {
case 3:
case 2:
ADD_CHUNK(store_u8_4);
break;
case 1:
ADD_CHUNK(store_u8_2);
break;
case 0:
ADD_CHUNK(store_u8_1);
break;
}
}
// NOTE(review): `key` is a by-value parameter and no later statement in this
// function reads copyAlpha or dot, so these writes appear to have no effect -
// confirm that ADD_CHUNK does not reference `key`.
if (key.u.inType != key.u.outType) {
key.u.copyAlpha = 0;
key.u.dot = 0;
}
// Loop, branch, and cleanup
// The branch is given buf2 (the position just after the prefix) as its target.
ADD_CHUNK(postfix1);
buf = addBranch(buf, buf2, 0x01);
ADD_CHUNK(postfix2);
// Drop write permission and add execute permission on the whole mapping.
int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
if (ret == -1) {
// ret is always -1 on this path, so the logged value carries no detail.
// NOTE(review): the mapping is not released here and mBuf stays non-NULL;
// presumably the owner unmaps it elsewhere - confirm.
ALOGE("mprotect error %i", ret);
return false;
}
// Flush the instruction cache for the buffer so the newly written code is
// what gets executed.
__builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
return true;
#else
(void) key; // Avoid unused parameter warning.
return false;
#endif
}