in Transform_V1/vf_transform_v1.c [824:922]
/**
 * Slice worker: remaps the tiles assigned to one job.
 *
 * The work is divided into td->num_tiles tiles of 16x16 samples, numbered
 * row-major with td->num_tiles_col tiles per row. Job jobnr processes the
 * contiguous tile range
 *   [num_tiles * jobnr / nb_jobs, num_tiles * (jobnr + 1) / nb_jobs).
 *
 * A tile that is not entirely inside the p->w x p->h map is handed to
 * filter_slice_boundcheck(). For every other tile each output sample is
 * produced from its TransformPixelWeights entry: a plain copy when the entry
 * has a single pair, otherwise the weighted sum of the referenced input
 * samples divided by subs, rounded to nearest.
 *
 * @param ctx     filter context (not used by this function)
 * @param arg     pointer to the ThreadData describing this pass
 * @param jobnr   index of this job
 * @param nb_jobs total number of jobs
 * @return always 0
 */
static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    const ThreadData *td = arg;
    const TransformPlaneMap *p = td->p;
    const int linesize = td->linesize;
    const int subs = td->subs;
    const int num_tiles = td->num_tiles;
    const int num_tiles_col = td->num_tiles_col;
    const uint8_t *in_data = td->in_data;
    uint8_t *out_data = td->out_data;
    const int tile_start = (num_tiles * jobnr) / nb_jobs;
    const int tile_end = (num_tiles * (jobnr+1)) / nb_jobs;

    for (int tile = tile_start ; tile < tile_end ; ++tile) {
        // Top-left sample of the tile: tile_i is the row, tile_j the column.
        int tile_i = (tile / num_tiles_col) * 16;
        int tile_j = (tile % num_tiles_col) * 16;

        // Tiles crossing the bottom or right edge of the map take the
        // bounds-checked path.
        if ((tile_i + 15) >= p->h || (tile_j + 15) >= p->w) {
            filter_slice_boundcheck(tile_i, tile_j, linesize, subs, p, in_data, out_data);
            continue;
        }

#ifdef SOFTWARE_PREFETCH_OPT
        // The tile-entry prefetch is issued only after the bounds check above:
        // it evaluates ws_prefetch->pairs for the first prefetch_lookahead
        // entries of the tile's first row, and for an edge tile those entries
        // are not guaranteed to lie inside the map. From here on the whole
        // 16x16 tile is known to be in range.
        TransformPixelWeights *ws_prefetch;
        const uint8_t prefetch_lookahead = 8;
        int id_prefetch = p->w * tile_i + tile_j;
        // The loop below prefetches the weights from array "p" and the
        // associated pairs array. The prefetch is only done for the initial
        // lookahead of the first iteration of the tile processing loop below
        // so that they are ready to be consumed in the inner loop. In the
        // tile processing loop, we prefetch addresses that are after the
        // lookahead (i.e. in the same iteration and also the next
        // iteration of the loop).
        for (int k = 0; k < prefetch_lookahead; ++k) {
            ws_prefetch = &p->weights[id_prefetch+k];
            __builtin_prefetch (ws_prefetch, 0, 0);
            __builtin_prefetch (ws_prefetch->pairs, 0, 0);
        }
        // Prefetch the cacheline for out_data for writes
        int out_sample_prefetch = linesize * (tile_i + 2) + tile_j;
        __builtin_prefetch (&out_data[out_sample_prefetch], 1, 0);
#endif

        for (int i = 0; i < 16; ++i) {
            int out_line = linesize * (tile_i + i);
            int map_line = p->w * (tile_i + i);
#ifdef SOFTWARE_PREFETCH_OPT
            // Prefetch the cacheline for out_data for writes
            __builtin_prefetch (&out_data[out_line+tile_j], 1, 0);
#endif
            for (int j = 0; j < 16; ++j) {
                int out_sample = out_line + tile_j + j;
                int id = map_line + tile_j + j;
                const TransformPixelWeights *ws = &p->weights[id];
#ifdef SOFTWARE_PREFETCH_OPT
                // In this inner loop, we prefetch the weight from array "p"
                // after the prefetch_lookahead iteration. We also prefetch the
                // weight pairs along with the weight address as we found that
                // we were getting datacache (L1 and LLC) and DTLB misses for
                // both the address and the pair.
                // NOTE(review): the second prefetch in each branch targets the
                // entry prefetch_lookahead past ws_prefetch, not ws_prefetch
                // itself - confirm this is the intended address.
                if (j < prefetch_lookahead) {
                    ws_prefetch = &p->weights[id+prefetch_lookahead];
                    __builtin_prefetch (ws_prefetch->pairs, 0, 0);
                    __builtin_prefetch (ws_prefetch+prefetch_lookahead, 0, 0);
                }
                else if (i < 15) {
                    // Here we are prefetching the address for the next
                    // iteration of the outer loop so that the data is
                    // available when that iteration starts.
                    id_prefetch = p->w + id - prefetch_lookahead;
                    ws_prefetch = &p->weights[id_prefetch];
                    __builtin_prefetch (ws_prefetch->pairs, 0, 0);
                    __builtin_prefetch (ws_prefetch+prefetch_lookahead, 0, 0);
                }
                // Prefetch the cacheline for out_data for writes
                __builtin_prefetch (&out_data[out_sample], 1, 0);
#endif
                if (ws->n == 1) {
                    // Single contributor: copy the input sample unchanged.
                    out_data[out_sample] = in_data[UNPACK_ID(ws->pairs[0])];
                } else {
                    // Weighted sum of all contributing input samples.
                    int color_sum = 0;
                    for (int k = 0; k < ws->n; ++k) {
                        color_sum += ((int) in_data[UNPACK_ID(ws->pairs[k])]) *
                                     UNPACK_COUNT(ws->pairs[k]);
                    }
                    // Round to nearest
                    out_data[out_sample] = (uint8_t) ((color_sum + (subs >> 1)) / subs);
                }
            }
        }
    }
    return 0;
}