bool ColorMatrixTask::build()

in renderscript-toolkit/src/main/cpp/ColorMatrix.cpp [520:781]


bool ColorMatrixTask::build(Key_t key) {
#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
    mBufSize = 4096;
    //StopWatch build_time("rs cm: build time");
    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANON, -1, 0);
    if (mBuf == MAP_FAILED) {
        mBuf = NULL;
        return false;
    }

    uint8_t *buf = mBuf;
    uint8_t *buf2 = nullptr;

    int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
    int opInit[4] = {0, 0, 0, 0};

    memset(ops, 0, sizeof(ops));
    for (int i=0; i < 4; i++) {
        if (key.u.coeffMask & (1 << (i*4))) {
            ops[i][0] = 0x2 | opInit[0];
            opInit[0] = 1;
        }
        if (!key.u.dot) {
            if (key.u.coeffMask & (1 << (1 + i*4))) {
                ops[i][1] = 0x2 | opInit[1];
                opInit[1] = 1;
            }
            if (key.u.coeffMask & (1 << (2 + i*4))) {
                ops[i][2] = 0x2 | opInit[2];
                opInit[2] = 1;
            }
        }
        if (!key.u.copyAlpha) {
            if (key.u.coeffMask & (1 << (3 + i*4))) {
                ops[i][3] = 0x2 | opInit[3];
                opInit[3] = 1;
            }
        }
    }

    if (key.u.inType || key.u.outType) {
        key.u.copyAlpha = 0;
        ADD_CHUNK(prefix_f);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        if (key.u.inType) {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_f32_4);
                break;
            case 2:
                ADD_CHUNK(load_f32_3);
                break;
            case 1:
                ADD_CHUNK(load_f32_2);
                break;
            case 0:
                ADD_CHUNK(load_f32_1);
                break;
            }
        } else {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_u8f_4);
                break;
            case 2:
                ADD_CHUNK(load_u8f_3);
                break;
            case 1:
                ADD_CHUNK(load_u8f_2);
                break;
            case 0:
                ADD_CHUNK(load_u8f_1);
                break;
            }
        }

        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                case 3:
                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                }
            }
        }
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVADD_F32(buf, j, 12+j, 8+j);
                } else {
                    buf = addVORR_32(buf, j, 12+j, 12+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, j, 8+j, 8+j);
                } else {
                    buf = addVMOV_32(buf, j, 0);
                }
            }
        }

        if (key.u.outType) {
            switch(key.u.outVecSize) {
            case 3:
                ADD_CHUNK(store_f32_4);
                break;
            case 2:
                ADD_CHUNK(store_f32_3);
                break;
            case 1:
                ADD_CHUNK(store_f32_2);
                break;
            case 0:
                ADD_CHUNK(store_f32_1);
                break;
            }
        } else {
            switch(key.u.outVecSize) {
            case 3:
            case 2:
                ADD_CHUNK(store_f32u_4);
                break;
            case 1:
                ADD_CHUNK(store_f32u_2);
                break;
            case 0:
                ADD_CHUNK(store_f32u_1);
                break;
            }
        }


    } else {
        // Add the function prefix
        // Store the address for the loop return
        ADD_CHUNK(prefix_i);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        switch(key.u.inVecSize) {
        case 3:
            ADD_CHUNK(load_u8_4);
            if (key.u.copyAlpha) {
                ADD_CHUNK(unpack_u8_3);
            } else {
                ADD_CHUNK(unpack_u8_4);
            }
            break;
        case 2:
            ADD_CHUNK(load_u8_3);
            ADD_CHUNK(unpack_u8_3);
            break;
        case 1:
            ADD_CHUNK(load_u8_2);
            ADD_CHUNK(unpack_u8_2);
            break;
        case 0:
            ADD_CHUNK(load_u8_1);
            ADD_CHUNK(unpack_u8_1);
            break;
        }

        // Add multiply and accumulate
        // use MULL to init the output register,
        // use MLAL from there
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                case 3:
                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                }
            }
        }
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
                }
            }
        }

        // If we have a dot product, perform the special pack.
        if (key.u.dot) {
            ADD_CHUNK(pack_u8_1);
            ADD_CHUNK(dot);
        } else {
            switch(key.u.outVecSize) {
            case 3:
                if (key.u.copyAlpha) {
                    ADD_CHUNK(pack_u8_3);
                } else {
                    ADD_CHUNK(pack_u8_4);
                }
                break;
            case 2:
                ADD_CHUNK(pack_u8_3);
                break;
            case 1:
                ADD_CHUNK(pack_u8_2);
                break;
            case 0:
                ADD_CHUNK(pack_u8_1);
                break;
            }
        }

        // Write out result
        switch(key.u.outVecSize) {
        case 3:
        case 2:
            ADD_CHUNK(store_u8_4);
            break;
        case 1:
            ADD_CHUNK(store_u8_2);
            break;
        case 0:
            ADD_CHUNK(store_u8_1);
            break;
        }
    }

    if (key.u.inType != key.u.outType) {
        key.u.copyAlpha = 0;
        key.u.dot = 0;
    }

    // Loop, branch, and cleanup
    ADD_CHUNK(postfix1);
    buf = addBranch(buf, buf2, 0x01);
    ADD_CHUNK(postfix2);

    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
    if (ret == -1) {
        ALOGE("mprotect error %i", ret);
        return false;
    }

    __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
    return true;
#else
    (void) key; // Avoid unused parameter warning.
    return false;
#endif
}