static int filter_slice()

in Transform_V1/vf_transform_v1.c [824:922]


/**
 * Slice worker: produces the output samples for the tiles assigned to one job.
 *
 * The output plane is processed in 16x16 tiles numbered in row-major order
 * (num_tiles_col tiles per tile row). Job jobnr handles the tile indices in
 * [num_tiles * jobnr / nb_jobs, num_tiles * (jobnr + 1) / nb_jobs).
 *
 * Tiles that are not completely inside the p->w x p->h map are delegated to
 * filter_slice_boundcheck(). For every pixel of a complete tile the output
 * byte is either a copy of a single input sample (ws->n == 1) or the sum of
 * ws->n input samples, each multiplied by its packed count, divided by subs
 * with rounding to nearest.
 *
 * NOTE(review): the signature matches FFmpeg's slice-threading callback
 * type; confirm at the call site how this function is dispatched.
 *
 * @param ctx     filter context (not used by this function)
 * @param arg     pointer to a ThreadData describing the plane map, the
 *                input/output buffers, the line size and the tile counts
 * @param jobnr   index of this job
 * @param nb_jobs total number of jobs the tiles are split across
 * @return always 0
 */
static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    const ThreadData *td = arg;
    const TransformPlaneMap *p = td->p;
    const int linesize = td->linesize;
    const int subs = td->subs;
    const int num_tiles = td->num_tiles;
    const int num_tiles_col = td->num_tiles_col;
    const uint8_t *in_data = td->in_data;
    uint8_t *out_data = td->out_data;

    // Contiguous range of tile indices owned by this job.
    const int tile_start = (num_tiles * jobnr) / nb_jobs;
    const int tile_end = (num_tiles * (jobnr+1)) / nb_jobs;

    for (int tile = tile_start ; tile < tile_end ; ++tile) {
        // Top-left pixel (row, column) of the current 16x16 tile.
        const int tile_i = (tile / num_tiles_col) * 16;
        const int tile_j = (tile % num_tiles_col) * 16;

        // Tiles crossing the bottom or right edge of the map take the
        // bounds-checked path. This test must come before any access to
        // p->weights below: for a partial tile the prefetch priming would
        // otherwise index (and read the pairs member of) weight entries past
        // the end of the map.
        if ((tile_i + 15) >= p->h || (tile_j + 15) >= p->w) {
            filter_slice_boundcheck(tile_i, tile_j, linesize, subs, p, in_data, out_data);
            continue;
        }

#ifdef SOFTWARE_PREFETCH_OPT
        TransformPixelWeights *ws_prefetch;
        const uint8_t prefetch_lookahead = 8;
        int id_prefetch = p->w * tile_i + tile_j;

        // Prime the cache for the first row of the tile: prefetch the first
        // prefetch_lookahead weight entries and the pairs they reference, so
        // they are ready when the inner loop starts consuming them. The tile
        // processing loop below issues the prefetches for everything past
        // this initial lookahead (rest of the current row and the next row).
        // The tile is known to be complete here, so all of these entries lie
        // inside the map.
        for (int k = 0; k < prefetch_lookahead; ++k) {
            ws_prefetch = &p->weights[id_prefetch+k];
            __builtin_prefetch (ws_prefetch, 0, 0);
            __builtin_prefetch (ws_prefetch->pairs, 0, 0);
        }
        // Prefetch a cacheline of out_data for writes (row tile_i + 2 of the
        // tile, as in the original tuning).
        int out_sample_prefetch = linesize * (tile_i + 2) + tile_j;
        __builtin_prefetch (&out_data[out_sample_prefetch], 1, 0);
#endif

        for (int i = 0; i < 16; ++i) {
            int out_line = linesize * (tile_i + i);
            int map_line = p->w * (tile_i + i);

#ifdef SOFTWARE_PREFETCH_OPT
            // Prefetch the cacheline for out_data for writes
            __builtin_prefetch (&out_data[out_line+tile_j], 1, 0);
#endif

            for (int j = 0; j < 16; ++j) {
                int out_sample = out_line + tile_j + j;
                int id = map_line + tile_j + j;
                TransformPixelWeights *ws = &p->weights[id];

#ifdef SOFTWARE_PREFETCH_OPT
                // Both the weight entries and the pairs they reference are
                // prefetched, as data cache (L1 and LLC) and DTLB misses
                // were observed for both.
                if (j <  prefetch_lookahead) {
                    // First half of the row: prefetch the pairs of the entry
                    // prefetch_lookahead pixels ahead in this row, and the
                    // weight entry a further prefetch_lookahead ahead. That
                    // second address is only handed to the prefetch hint and
                    // is never dereferenced.
                    ws_prefetch = &p->weights[id+prefetch_lookahead];
                    __builtin_prefetch (ws_prefetch->pairs, 0, 0);
                    __builtin_prefetch (ws_prefetch+prefetch_lookahead, 0, 0);
                }
                else if (i < 15) {
                    // Second half of the row: prefetch for the next
                    // iteration of the outer loop (the next tile row) so the
                    // data is available when that row starts. The pairs come
                    // from the entry prefetch_lookahead columns back in the
                    // next row, the weight entry from the same column.
                    id_prefetch = p->w + id - prefetch_lookahead;
                    ws_prefetch = &p->weights[id_prefetch];
                    __builtin_prefetch (ws_prefetch->pairs, 0, 0);
                    __builtin_prefetch (ws_prefetch+prefetch_lookahead, 0, 0);
                }
                // Prefetch the cacheline for out_data for writes
                __builtin_prefetch (&out_data[out_sample], 1, 0);
#endif

                if (ws->n == 1) {
                    // Single contributing sample: plain copy, no division.
                    out_data[out_sample] = in_data[UNPACK_ID(ws->pairs[0])];
                } else {
                    // Weighted sum of the contributing input samples.
                    // NOTE(review): presumably the packed counts of one
                    // pixel add up to subs, so the quotient fits in a byte;
                    // confirm where the map is generated.
                    int color_sum = 0;
                    for (int k = 0; k < ws->n; ++k) {
                        color_sum += ((int) in_data[UNPACK_ID(ws->pairs[k])]) *
                            UNPACK_COUNT(ws->pairs[k]);
                    }
                    // Round to nearest
                    out_data[out_sample] = (uint8_t) ((color_sum + (subs >> 1)) / subs);
                }
            }
        }
    }

    return 0;
}