// cachelib/benchmarks/MutexBench.cpp

/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/Benchmark.h>
#include <folly/Format.h>
#include <folly/Random.h>
#include <folly/SpinLock.h>
#include <folly/init/Init.h>
#include <folly/portability/Asm.h>
#include <gflags/gflags.h>
#include <sys/resource.h>

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>
#include <vector>

#include "cachelib/allocator/datastruct/DList.h"
#include "cachelib/common/Mutex.h"
#include "cachelib/common/Time.h"

using namespace facebook::cachelib;

DEFINE_uint64(size, 1 << 16, "Number of nodes to populate the list with");
// NOTE: the original help string contained a raw newline inside the literal,
// which is ill-formed C++. Re-joined as adjacent string literals.
DEFINE_uint64(num_threads,
              0,
              "Number of threads to be run concurrently. 0 means "
              "hardware_concurrency on the platform");
DEFINE_uint64(num_thread_ops,
              1 << 19,
              "Number of operations to be performed in each thread");
DEFINE_uint64(sleep_nano,
              0,
              "Number of nano seconds to sleep between each operation");
DEFINE_uint64(spin_nano,
              100,
              "Number of nano secs to spin after each operation");
DEFINE_double(updates, 0.9, "Weight for lru updates");
DEFINE_double(evicts, 0.5, "Weight for lru evictions");
DEFINE_double(deletes, 0.01, "Weight for deletes");
DEFINE_uint64(max_search_len, 50, "Maximum search depth for evicts");
DEFINE_bool(print_stats, true, "Print the stats to stdout");

namespace {

// Normalized weights for the three operation types. The raw weights passed
// in are divided by their sum so the ratios can be compared against a
// uniform [0, 1) random draw.
struct LoadInfo {
 private:
  // Declared (and therefore initialized) before the public ratios below --
  // the member-init-list reads totalWeight, so declaration order matters.
  double totalWeight{};

 public:
  LoadInfo() {}
  LoadInfo(double update, double evict, double dels)
      : totalWeight(update + evict + dels),
        updateRatio(update / totalWeight),
        deleteRatio(dels / totalWeight),
        evictRatio(evict / totalWeight) {}

  double updateRatio{0.5};
  double deleteRatio{0.2};
  double evictRatio{0.1};
};

// Per-thread (and aggregated) operation counters for one benchmark run.
struct Stats {
  uint64_t numUpdates{0};
  uint64_t numDeletes{0};
  uint64_t numEvicts{0};
  uint64_t numInvCsw{0};
  uint64_t numVCsw{0};
  uint64_t elapsedNSecs{0};

  // Aggregate another thread's stats into this one. Elapsed time is the max
  // across threads (threads run concurrently); context-switch counters are
  // filled in at process level from getrusage, so they are not summed here.
  Stats& operator+=(const Stats& other) {
    numUpdates += other.numUpdates;
    numDeletes += other.numDeletes;
    numEvicts += other.numEvicts;
    elapsedNSecs = std::max(elapsedNSecs, other.elapsedNSecs);
    return *this;
  }

  double elapsedSecs() const noexcept {
    return static_cast<double>(elapsedNSecs) / 1e9;
  }

  // Pretty-print the per-second rates for each operation type.
  void render() const {
    std::cout << folly::sformat("{:30}: {:16.4f}", "Elapsed time",
                                elapsedSecs())
              << std::endl;
    std::cout << folly::sformat("{:30}: {:16,}", "Vol Context Switches",
                                numVCsw)
              << std::endl;
    std::cout << folly::sformat("{:30}: {:16,}", "Inv Context Switches",
                                numInvCsw)
              << std::endl;

    auto rate = [&](uint64_t num) {
      return static_cast<uint64_t>(num / elapsedSecs());
    };

    std::cout << folly::sformat("{:30}: {:16,}/sec", "Updates",
                                rate(numUpdates))
              << std::endl;
    std::cout << folly::sformat("{:30}: {:16,}/sec", "Deletes",
                                rate(numDeletes))
              << std::endl;
    std::cout << folly::sformat("{:30}: {:16,}/sec", "Evictions",
                                rate(numEvicts))
              << std::endl;

    const auto total = numUpdates + numDeletes + numEvicts;
    std::cout << folly::sformat("{:30}: {:16,}/sec", "Throughput", rate(total))
              << std::endl;
  }
};

class Lru;

// our own implementation of node that will be put inside the container.
struct Node {
 public:
  Node(const Node&) = delete;
  Node& operator=(const Node&) = delete;
  Node(Node&&) = default;
  Node& operator=(Node&&) = default;

  // Node does not perform pointer compression, but it needs to supply a dummy
  // PtrCompressor
  using CompressedPtr = Node*;
  struct PtrCompressor {
    constexpr CompressedPtr compress(Node* uncompressed) const noexcept {
      return uncompressed;
    }

    constexpr Node* unCompress(CompressedPtr compressed) const noexcept {
      return compressed;
    }
  };

  explicit Node() noexcept {
    XDCHECK(hook_.getNext(PtrCompressor()) == nullptr);
    XDCHECK(hook_.getPrev(PtrCompressor()) == nullptr);
  }

 protected:
  bool isInMMContainer() const noexcept { return inContainer_; }
  void markInMMContainer() noexcept { inContainer_ = true; }
  void unmarkInMMContainer() noexcept { inContainer_ = false; }
  void setUpdateTime(uint32_t time) noexcept { hook_.setUpdateTime(time); }
  uint32_t getUpdateTime() const noexcept { return hook_.getUpdateTime(); }

 public:
  // Intrusive DList hook; public so &Node::hook_ can parameterize DList.
  DListHook<Node> hook_{};

 private:
  bool inContainer_{false};
  friend Lru;
};

// A minimal LRU built on the intrusive DList, holding a fixed pool of nodes.
// Not internally synchronized -- callers serialize access with the mutex
// under benchmark.
class Lru {
 public:
  explicit Lru(unsigned int numNodes)
      : numNodes_(numNodes), lru_{Node::PtrCompressor{}} {
    for (size_t i = 0; i < numNodes_; i++) {
      nodes_.emplace_back(Node{});
    }

    // add all the nodes once the vector is sized appropriately.
    for (auto& node : nodes_) {
      add(node, 0);
    }
    XDCHECK_EQ(nodes_.size(), numNodes_);
  }

  unsigned int getRandomNodeIdx(folly::ThreadLocalPRNG& rng) const noexcept {
    return folly::Random::rand32(0, numNodes_, rng);
  }

  // Promote the node at nodeIdx to the head and stamp its update time.
  void doUpdate(unsigned int nodeIdx, uint32_t time) {
    XDCHECK_LT(nodeIdx, numNodes_);
    XDCHECK(!nodes_.empty());
    auto& node = nodes_[nodeIdx];
    lru_.moveToHead(node);
    node.setUpdateTime(time);
  }

  // Walk up to iterationLength nodes from the tail and "evict" (recycle) the
  // last one by moving it back to the head.
  void doEvict(unsigned int iterationLength, uint32_t time) {
    if (iterationLength == 0) {
      // BUGFIX: previously a zero search length scanned the entire list
      // under the lock without ever matching (++i == 0 is never true).
      return;
    }
    unsigned int i = 0;
    for (auto it = lru_.rbegin(); it != lru_.rend(); ++it) {
      auto& node = *it;
      if (++i == iterationLength) {
        remove(node);
        add(node, time);
        break;
      }
    }
  }

 private:
  void add(Node& node, uint32_t time) {
    lru_.linkAtHead(node);
    node.markInMMContainer();
    node.setUpdateTime(time);
  }

  void remove(Node& node) {
    lru_.remove(node);
    node.unmarkInMMContainer();
  }

  using LruImpl = DList<Node, &Node::hook_>;

  const unsigned int numNodes_;
  std::vector<Node> nodes_;
  LruImpl lru_;
};

// global lru that every thread will access.
std::unique_ptr<Lru> gLru;
LoadInfo gLoadInfo{};

// Run the benchmark workload with FLAGS_num_threads threads all contending
// on a single Mutex guarding gLru, accumulating results into `global`.
template <typename Mutex>
void runBench(Stats& global) {
  // mutex for our lru
  Mutex mutex;
  std::vector<std::thread> threads;

  // thread local stats.
  std::vector<Stats> stats(FLAGS_num_threads);

  // count of number of threads completed
  std::atomic<unsigned int> nCompleted{0};

  // main thread will wait on this to figure out when the benchmark is
  // complete
  std::mutex benchFinishMutex;
  bool benchFinished = false;
  std::condition_variable benchFinishCV;

  // all benchmark threads will wait on this after completing the benchmark
  // and before doing the thread cleanup.
  bool cleanUpStart = false;
  std::condition_variable cleanupCV;
  std::mutex cleanupMutex;

  auto runInThread = [&](unsigned int index) {
    auto rng = folly::ThreadLocalPRNG();
    std::mt19937 gen(folly::Random::rand32(rng));
    std::uniform_real_distribution<> opDis(0, 1);
    auto& s = stats[index];
    const auto now = util::getCurrentTimeNs();

    using LockHolder = std::unique_lock<Mutex>;

    for (size_t i = 0; i < FLAGS_num_thread_ops; i++) {
      // A single draw selects which operations run; the ratio checks below
      // deliberately overlap (a small r can trigger update AND evict AND
      // delete in the same iteration).
      const auto r = opDis(gen);
      const uint32_t currTime = util::getCurrentTimeSec();
      if (r < gLoadInfo.updateRatio) {
        const auto idx = gLru->getRandomNodeIdx(rng);
        ++s.numUpdates;
        LockHolder l(mutex);
        gLru->doUpdate(idx, currTime);
      }

      if (r < gLoadInfo.evictRatio) {
        const auto searchLen = folly::Random::rand32(FLAGS_max_search_len, rng);
        ++s.numEvicts;
        LockHolder l(mutex);
        gLru->doEvict(searchLen, currTime);
      }

      if (r < gLoadInfo.deleteRatio) {
        // BUGFIX: this previously incremented numEvicts, leaving numDeletes
        // permanently 0 and the rendered "Deletes" rate wrong.
        ++s.numDeletes;
        LockHolder l(mutex);
        // for now, deletes also do evicts
        gLru->doEvict(1, currTime);
      }

      if (FLAGS_spin_nano) {
        // this pause takes up to 40 clock cycles on intel and the lock
        // cmpxchgl above should take about 100 clock cycles. we pause once
        // every 400 cycles or so if we are extremely unlucky.
        //
        // BUGFIX: the old `while (--spins)` underflowed to UINT_MAX when
        // FLAGS_spin_nano was in [1, 39] (spins started at 0), spinning for
        // billions of iterations.
        for (uint64_t spins = FLAGS_spin_nano / 40; spins > 0; --spins) {
          folly::asm_volatile_pause();
        }
      }

      if (FLAGS_sleep_nano) {
        /* sleep override */
        std::this_thread::sleep_for(
            std::chrono::nanoseconds(FLAGS_sleep_nano));
      }
    }

    s.elapsedNSecs += (util::getCurrentTimeNs() - now);

    // Last thread to finish wakes the main thread so rusage can be sampled
    // while all workers are still alive (before they tear down).
    if (++nCompleted == FLAGS_num_threads) {
      {
        std::unique_lock<std::mutex> l(benchFinishMutex);
        benchFinished = true;
      }
      benchFinishCV.notify_one();
    }

    // Park until the main thread has captured rusage, then exit.
    std::unique_lock<std::mutex> l(cleanupMutex);
    cleanupCV.wait(l, [&] { return cleanUpStart; });
  };

  struct rusage rUsageBefore = {};
  struct rusage rUsageAfter = {};
  BENCHMARK_SUSPEND {
    getrusage(RUSAGE_SELF, &rUsageBefore);
    for (size_t i = 0; i < FLAGS_num_threads; i++) {
      threads.push_back(std::thread{runInThread, i});
    }
  }

  {
    // wait for benchmark to finish.
    std::unique_lock<std::mutex> l(benchFinishMutex);
    benchFinishCV.wait(l, [&] { return benchFinished; });
  }

  BENCHMARK_SUSPEND {
    {
      std::unique_lock<std::mutex> l(cleanupMutex);
      cleanUpStart = true;
    }
    cleanupCV.notify_all();

    for (auto& thread : threads) {
      thread.join();
    }

    getrusage(RUSAGE_SELF, &rUsageAfter);
    for (auto& stat : stats) {
      global += stat;
    }
    global.numVCsw += rUsageAfter.ru_nvcsw - rUsageBefore.ru_nvcsw;
    global.numInvCsw += rUsageAfter.ru_nivcsw - rUsageBefore.ru_nivcsw;
  }
}

enum class MutexType : int {
  kStdMutex,
  kFollySpin,
  kPThreadMutex,
  kPThreadAdaptiveMutex,
  kPThreadSpinMutex,
  kPThreadSpinLock
};

struct EnumHash {
  std::size_t operator()(MutexType t) const noexcept {
    return static_cast<size_t>(t);
  }
};

// Accumulated stats plus a display name for each mutex variant benchmarked.
std::unordered_map<MutexType, std::pair<Stats, std::string>, EnumHash> stats = {
    {MutexType::kStdMutex, {{}, "std::mutex"}},
    {MutexType::kFollySpin, {{}, "folly SpinLock"}},
    {MutexType::kPThreadMutex, {{}, "PThreadMutex"}},
    {MutexType::kPThreadAdaptiveMutex, {{}, "PThreadAdaptiveMutex"}},
    {MutexType::kPThreadSpinMutex, {{}, "PThreadSpinMutex"}},
    {MutexType::kPThreadSpinLock, {{}, "PThreadSpinLock"}},
};

BENCHMARK(StandardMutex) {
  runBench<std::mutex>(stats[MutexType::kStdMutex].first);
}

BENCHMARK_RELATIVE(SpinLock) {
  runBench<folly::SpinLock>(stats[MutexType::kFollySpin].first);
}

BENCHMARK_RELATIVE(PThreadMutex) {
  runBench<facebook::cachelib::PThreadMutex>(
      stats[MutexType::kPThreadMutex].first);
}

BENCHMARK_RELATIVE(PThreadAdaptiveMutex) {
  runBench<facebook::cachelib::PThreadAdaptiveMutex>(
      stats[MutexType::kPThreadAdaptiveMutex].first);
}

BENCHMARK_RELATIVE(PThreadSpinMutex) {
  runBench<facebook::cachelib::PThreadSpinMutex>(
      stats[MutexType::kPThreadSpinMutex].first);
}

BENCHMARK_RELATIVE(PThreadSpinLock) {
  runBench<facebook::cachelib::PThreadSpinLock>(
      stats[MutexType::kPThreadSpinLock].first);
}

} // namespace

int main(int argc, char** argv) {
  folly::init(&argc, &argv);
  gLru = std::make_unique<Lru>(FLAGS_size);
  gLoadInfo = {FLAGS_updates, FLAGS_evicts, FLAGS_deletes};

  if (FLAGS_num_threads == 0) {
    FLAGS_num_threads = std::thread::hardware_concurrency();
    if (FLAGS_num_threads == 0) {
      // hardware_concurrency() may return 0 when it cannot be determined.
      FLAGS_num_threads = 32;
    }
  }

  folly::runBenchmarks();
  if (FLAGS_print_stats) {
    auto printInt = [](folly::StringPiece key, size_t val) {
      std::cout << folly::sformat("{:20}: {:16,} ", key, val) << std::endl;
    };

    auto printDouble = [](folly::StringPiece key, double val) {
      std::cout << folly::sformat("{:20}: {:16.2f} ", key, val) << std::endl;
    };

    printInt("num_threads", FLAGS_num_threads);
    printInt("num_thread_ops", FLAGS_num_thread_ops);
    printInt("size", FLAGS_size);
    printInt("sleep_nano", FLAGS_sleep_nano);
    printInt("spin_nano", FLAGS_spin_nano);
    printDouble("evict ratio", gLoadInfo.evictRatio);
    printDouble("delete ratio", gLoadInfo.deleteRatio);
    printDouble("update ratio", gLoadInfo.updateRatio);
    std::cout << std::endl;

    for (auto& stat : stats) {
      std::cout << folly::sformat("==={:16}===", stat.second.second)
                << std::endl;
      stat.second.first.render();
      std::cout << std::endl;
    }
  }
  return 0;
}