in benchmarks/trivial_compute_benchmark.cpp [157:189]
// Benchmark: sums calculate(input, i, foo) over i in [0, num_elements) by
// splitting the index range into contiguous chunks, running each chunk as a
// std::async task, and adding up the per-task partial sums.
//
// state.range(0) is the requested number of chunks (num_threads) and
// state.range(1) is the number of elements. `foo` is incremented once per
// benchmark iteration and forwarded to calculate(). After the last iteration
// the final sum is handed to checkResults().
//
// NOTE(review): num_threads is used as a divisor, so the registered benchmark
// arguments are assumed to be >= 1 — confirm at the BENCHMARK() registration.
void BM_async(benchmark::State& state) {
  const int num_threads = static_cast<int>(state.range(0));
  const int num_elements = static_cast<int>(state.range(1));
  // Ceiling division so that at most num_threads chunks cover all elements.
  // It is loop-invariant, so it is computed once instead of on every iteration.
  const int chunk_size = (num_elements + num_threads - 1) / num_threads;
  uint64_t sum = 0;
  int foo = 0;
  auto input = getInputs(num_elements);
  for (auto UNUSED_VAR : state) {
    ++foo;
    std::vector<std::future<uint64_t>> futures;
    for (int begin = 0; begin < num_elements;) {
      // Clamp the last chunk to num_elements without risking int overflow in
      // begin + chunk_size.
      const int end = begin + std::min(chunk_size, num_elements - begin);
      // std::launch::async is requested explicitly: with the default policy
      // the implementation is allowed to defer the task and run it on the
      // thread that calls get(), which would serialize the work.
      //
      // `input` is captured by reference so it is not copied into every task.
      // This is safe because every future is drained with get() before the
      // iteration ends and `input` outlives the whole benchmark loop.
      // NOTE(review): assumes calculate() only reads `input` — confirm.
      futures.push_back(std::async(
          std::launch::async, [&input, foo, i = begin, end]() mutable {
            uint64_t lsum = 0;
            for (; i != end; ++i) {
              lsum += calculate(input, i, foo);
            }
            return lsum;
          }));
      begin = end;
    }
    // Reset per iteration so checkResults() sees the sum of a single pass.
    sum = 0;
    for (auto& partial : futures) {
      sum += partial.get();
    }
  }
  checkResults(input, sum, foo, num_elements);
}