benchmarks/future_benchmark.cpp (313 lines of code) (raw):
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE.md file in the root directory of this source tree.
#include <cmath>
#include <future>
#include <iostream>
#include <random>
#include <dispenso/future.h>
#if !defined(BENCHMARK_WITHOUT_FOLLY)
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/futures/Future.h>
#endif // !BENCHMARK_WITHOUT_FOLLY
#include "thread_benchmark_common.h"
constexpr size_t kSmallSize = 13;
constexpr size_t kMediumSize = 16;
constexpr size_t kLargeSize = 19;
// Note that there are many optimizations that could be made for these tree build routines. The
// goal was to make these as apples-to-apples as possible.
struct Node {
Node* left;
Node* right;
uint32_t value;
void setValue(uint32_t unique_bitset, uint32_t modulo) {
value = 0;
for (uint32_t i = 0; i < 32; ++i) {
value += unique_bitset % modulo;
unique_bitset /= modulo;
}
}
};
class Allocator {
public:
void reset(size_t depth) {
nodes_.resize(std::pow(2, depth) - 1);
next_.store(0, std::memory_order_release);
}
Node* alloc() {
size_t cur = next_.fetch_add(1, std::memory_order_relaxed);
return &nodes_[cur];
}
private:
std::vector<Node> nodes_;
std::atomic<size_t> next_{0};
};
const std::vector<uint32_t>& getModulos() {
static const std::vector<uint32_t> modulos = []() {
std::mt19937 mt;
std::uniform_int_distribution<> dis(2, 55);
std::vector<uint32_t> m;
for (size_t i = 0; i < 64; ++i) {
m.emplace_back(dis(mt));
}
return m;
}();
return modulos;
}
uint64_t sumTree(Node* root) {
if (!root) {
return 0;
}
return root->value + sumTree(root->left) + sumTree(root->right);
}
void checkTree(Node* root, uint32_t depth, uint32_t modulo) {
uint64_t expectedSum = 0;
uint32_t num = std::pow(2, depth);
for (uint32_t i = 0; i < num; ++i) {
auto bitset = i;
while (bitset) {
expectedSum += bitset % modulo;
bitset /= modulo;
}
}
uint64_t actual = sumTree(root);
if (actual != expectedSum) {
std::cerr << "Mismatch! " << expectedSum << " vs " << actual << std::endl;
std::abort();
}
}
Node* serialTree(Allocator& allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) {
--depth;
Node* node = allocator.alloc();
node->setValue(bitset, modulo);
if (!depth) {
node->left = nullptr;
node->right = nullptr;
return node;
}
node->left = serialTree(allocator, depth, (bitset << 1), modulo);
node->right = serialTree(allocator, depth, (bitset << 1) | 1, modulo);
return node;
}
template <size_t depth>
void BM_serial_tree(benchmark::State& state) {
Allocator alloc;
alloc.reset(depth);
getModulos();
uint32_t modulo;
Node* root;
size_t m = 0;
for (auto UNUSED_VAR : state) {
alloc.reset(depth);
modulo = getModulos()[m];
root = serialTree(alloc, depth, 1, modulo);
m = (m + 1 == getModulos().size()) ? 0 : m + 1;
}
checkTree(root, depth, modulo);
}
Node* stdTree(Allocator& allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) {
--depth;
Node* node = allocator.alloc();
node->setValue(bitset, modulo);
if (!depth) {
node->left = nullptr;
node->right = nullptr;
return node;
}
auto left = std::async([&]() { return stdTree(allocator, depth, (bitset << 1), modulo); });
auto right = std::async([&]() { return stdTree(allocator, depth, (bitset << 1) | 1, modulo); });
node->left = left.get();
node->right = right.get();
return node;
}
template <size_t depth>
void BM_std_tree(benchmark::State& state) {
Allocator alloc;
alloc.reset(depth);
getModulos();
uint32_t modulo;
Node* root;
size_t m = 0;
for (auto UNUSED_VAR : state) {
alloc.reset(depth);
modulo = getModulos()[m];
root = stdTree(alloc, depth, 1, modulo);
m = (m + 1 == getModulos().size()) ? 0 : m + 1;
}
checkTree(root, depth, modulo);
}
Node* dispensoTree(Allocator& allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) {
--depth;
Node* node = allocator.alloc();
node->setValue(bitset, modulo);
if (!depth) {
node->left = nullptr;
node->right = nullptr;
return node;
}
auto left = dispenso::async(
[=, &allocator]() { return dispensoTree(allocator, depth, (bitset << 1), modulo); });
auto right = dispenso::async(
[=, &allocator]() { return dispensoTree(allocator, depth, (bitset << 1) | 1, modulo); });
node->left = left.get();
node->right = right.get();
return node;
}
template <size_t depth>
void BM_dispenso_tree(benchmark::State& state) {
Allocator alloc;
alloc.reset(depth);
getModulos();
dispenso::globalThreadPool();
uint32_t modulo;
Node* root;
size_t m = 0;
for (auto UNUSED_VAR : state) {
alloc.reset(depth);
modulo = getModulos()[m];
root = dispensoTree(alloc, depth, 1, modulo);
m = (m + 1 == getModulos().size()) ? 0 : m + 1;
}
checkTree(root, depth, modulo);
}
#if !defined(BENCHMARK_WITHOUT_FOLLY)
folly::SemiFuture<folly::Unit> follyTree(
folly::Executor* exec,
Node* node,
Allocator* allocator,
uint32_t depth,
uint32_t bitset,
uint32_t modulo) {
--depth;
node->setValue(bitset, modulo);
if (!depth) {
node->left = nullptr;
node->right = nullptr;
return folly::Unit{};
}
node->left = allocator->alloc();
node->right = allocator->alloc();
return folly::via(
exec,
[=]() {
return folly::collectAll(
follyTree(exec, node->left, allocator, depth, bitset << 1, modulo),
follyTree(exec, node->right, allocator, depth, bitset << 1 | 1, modulo))
.unit();
})
.semi();
}
template <size_t depth>
void BM_folly_tree(benchmark::State& state) {
folly::CPUThreadPoolExecutor follyExec{std::thread::hardware_concurrency()};
Allocator alloc;
alloc.reset(depth);
uint32_t modulo;
Node root;
size_t m = 0;
for (auto UNUSED_VAR : state) {
alloc.reset(depth);
modulo = getModulos()[m];
follyTree(&follyExec, &root, &alloc, depth, 1, modulo).via(&follyExec).get();
m = (m + 1 == getModulos().size()) ? 0 : m + 1;
}
checkTree(&root, depth, modulo);
}
#endif // !BENCHMARK_WITHOUT_FOLLY
void dispensoTaskSetTree(
dispenso::ConcurrentTaskSet& tasks,
Node* node,
Allocator& allocator,
uint32_t depth,
uint32_t bitset,
uint32_t modulo) {
node->setValue(bitset, modulo);
--depth;
if (!depth) {
node->left = nullptr;
node->right = nullptr;
return;
}
tasks.schedule([&tasks, &allocator, node, depth, bitset, modulo]() {
node->left = allocator.alloc();
dispensoTaskSetTree(tasks, node->left, allocator, depth, (bitset << 1), modulo);
});
tasks.schedule([&tasks, &allocator, node, depth, bitset, modulo]() {
node->right = allocator.alloc();
dispensoTaskSetTree(tasks, node->right, allocator, depth, (bitset << 1) | 1, modulo);
});
}
template <size_t depth>
void BM_taskset_tree(benchmark::State& state) {
Allocator alloc;
alloc.reset(depth);
getModulos();
uint32_t modulo;
Node root;
dispenso::ConcurrentTaskSet tasks(dispenso::globalThreadPool());
size_t m = 0;
for (auto UNUSED_VAR : state) {
alloc.reset(depth);
modulo = getModulos()[m];
dispensoTaskSetTree(tasks, &root, alloc, depth, 1, modulo);
tasks.wait();
m = (m + 1 == getModulos().size()) ? 0 : m + 1;
}
checkTree(&root, depth, modulo);
}
dispenso::Future<Node*>
dispensoTreeWhenAll(Allocator& allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) {
--depth;
Node* node = allocator.alloc();
node->setValue(bitset, modulo);
if (!depth) {
node->left = nullptr;
node->right = nullptr;
return dispenso::make_ready_future(node);
}
auto left = dispenso::async([depth, bitset, modulo, &allocator]() {
return dispensoTreeWhenAll(allocator, depth, (bitset << 1), modulo);
});
auto right = dispenso::async([depth, bitset, modulo, &allocator]() {
return dispensoTreeWhenAll(allocator, depth, (bitset << 1) | 1, modulo);
});
return dispenso::when_all(left, right).then([node](auto&& both) {
auto& tuple = both.get();
node->left = std::get<0>(tuple).get().get();
node->right = std::get<1>(tuple).get().get();
return node;
});
}
template <size_t depth>
void BM_dispenso_tree_when_all(benchmark::State& state) {
Allocator alloc;
alloc.reset(depth);
getModulos();
dispenso::globalThreadPool();
uint32_t modulo;
Node* root;
size_t m = 0;
for (auto UNUSED_VAR : state) {
alloc.reset(depth);
modulo = getModulos()[m];
root = dispensoTreeWhenAll(alloc, depth, 1, modulo).get();
m = (m + 1 == getModulos().size()) ? 0 : m + 1;
}
checkTree(root, depth, modulo);
}
BENCHMARK_TEMPLATE(BM_serial_tree, kSmallSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_serial_tree, kMediumSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_serial_tree, kLargeSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_std_tree, kSmallSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_std_tree, kMediumSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_std_tree, kLargeSize)->UseRealTime();
#if !defined(BENCHMARK_WITHOUT_FOLLY)
BENCHMARK_TEMPLATE(BM_folly_tree, kSmallSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_folly_tree, kMediumSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_folly_tree, kLargeSize)->UseRealTime();
#endif // !BENCHMARK_WITHOUT_FOLLY
BENCHMARK_TEMPLATE(BM_dispenso_tree, kSmallSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_dispenso_tree, kMediumSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_dispenso_tree, kLargeSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_taskset_tree, kSmallSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_taskset_tree, kMediumSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_taskset_tree, kLargeSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_dispenso_tree_when_all, kSmallSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_dispenso_tree_when_all, kMediumSize)->UseRealTime();
BENCHMARK_TEMPLATE(BM_dispenso_tree_when_all, kLargeSize)->UseRealTime();
BENCHMARK_MAIN();