cachelib/benchmarks/CacheAllocatorOpsMicroBench.cpp (243 lines of code) (raw):
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
clang-format off
Microbenchmarks for various CacheAllocator operations
Results are at the bottom of this file
Various latency numbers circa 2012
----------------------------------
L1 cache reference 0.5 ns
Branch mispredict 5 ns
L2 cache reference 7 ns 14x L1 cache
Mutex lock/unlock 25 ns
Main memory reference 100 ns 20x L2 cache, 200x L1 cache
Compress 1K bytes with Zippy 3,000 ns 3 us
Send 1K bytes over 1 Gbps network 10,000 ns 10 us
Read 4K randomly from SSD* 150,000 ns 150 us ~1GB/sec SSD
Read 1 MB sequentially from memory 250,000 ns 250 us
Round trip within same datacenter 500,000 ns 500 us
Read 1 MB sequentially from SSD* 1,000,000 ns 1,000 us 1 ms ~1GB/sec SSD, 4X memory
Disk seek 10,000,000 ns 10,000 us 10 ms 20x datacenter roundtrip
Read 1 MB sequentially from disk 20,000,000 ns 20,000 us 20 ms 80x memory, 20X SSD
Send packet CA->Netherlands->CA 150,000,000 ns 150,000 us 150 ms
clang-format on
*/
#include <folly/Benchmark.h>
#include <folly/BenchmarkUtil.h>
#include <folly/init/Init.h>

#include <chrono>
#include <cstdint>
#include <iostream>
#include <memory>
#include <random>
#include <set>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include "cachelib/allocator/CacheAllocator.h"
#include "cachelib/benchmarks/BenchmarkUtils.h"
#include "cachelib/common/BytesEqual.h"
#include "cachelib/common/PercentileStats.h"
#include "cachelib/navy/testing/SeqPoints.h"
namespace facebook {
namespace cachelib {
namespace {
std::unique_ptr<LruAllocator> getCache(unsigned int htPower = 20) {
LruAllocator::Config config;
config.setCacheSize(1024 * 1024 * 1024);
// Hashtable: 1024 ht locks, 1M buckets
config.setAccessConfig(LruAllocator::AccessConfig{htPower, 10});
// Allocation Sizes: Min: 64 bytes. Max: 1 MB. Growth factor 108%.
config.setDefaultAllocSizes(1.08, 1024 * 1024, 64, false);
// Disable background workers
config.enablePoolRebalancing({}, std::chrono::seconds{0});
config.enableItemReaperInBackground(std::chrono::seconds{0});
auto cache = std::make_unique<LruAllocator>(config);
cache->addPool("default", cache->getCacheMemoryStats().cacheSize);
return cache;
}
} // namespace
// Measures peek() latency as a function of hashtable size and object count.
//
// htPower:    hashtable bucket-count power under test (2^htPower buckets).
// numObjects: number of 100-byte objects inserted before measurement.
//
// 16 reader threads each perform kLoops uniformly-random peeks; the Timer
// scope reports per-op latency over kLoops once all readers have joined.
void runDifferentHTSizes(int htPower, uint64_t numObjects) {
  constexpr int kNumThreads = 16;
  constexpr uint64_t kLoops = 10'000'000;
  // Bug fix: forward htPower so the hashtable size actually varies. The
  // original called getCache() with no argument, so every run used the
  // default power of 20 and the "HT Power" result label was inaccurate.
  auto cache = getCache(static_cast<unsigned int>(htPower));
  std::vector<std::string> keys;
  keys.reserve(numObjects);
  for (uint64_t i = 0; i < numObjects; i++) {
    // Length of key should be 10 bytes
    auto key = folly::sformat("k_{: <8}", i);
    auto hdl = cache->allocate(0, key, 100);
    XCHECK(hdl);
    cache->insertOrReplace(hdl);
    keys.push_back(key);
  }
  navy::SeqPoints sp;
  auto readOps = [&] {
    sp.wait(0); // block here so all readers start together with the timer
    std::mt19937 gen; // default-seeded: identical key sequence every run
    std::uniform_int_distribution<uint64_t> dist(0, numObjects - 1);
    for (uint64_t loop = 0; loop < kLoops; loop++) {
      const auto& key = keys[dist(gen)];
      auto hdl = cache->peek(key);
      folly::doNotOptimizeAway(hdl);
    }
  };
  // Create readers
  std::vector<std::thread> rs;
  for (int i = 0; i < kNumThreads; i++) {
    rs.push_back(std::thread{readOps});
  }
  {
    Timer t{folly::sformat("Peek - {: <2} HT Power, {: <8} Objects", htPower,
                           numObjects),
            kLoops};
    sp.reached(0); // Start the operations
    for (auto& r : rs) {
      r.join();
    }
  }
}
// Benchmarks find()/peek() latency on a cache of 100K objects, optionally
// with a concurrent overwrite load.
//
// numThreads: reader threads; each performs kLoops random lookups.
// isPeek:     true -> cache->peek() (no LRU promotion), false -> find().
// objSize:    payload size in bytes of every cached object.
// writePct:   percent of the key space that 4 writer threads repeatedly
//             re-allocate and re-insert; 0 disables the write load.
void runFindMultiThreads(int numThreads,
                         bool isPeek,
                         uint64_t objSize,
                         uint64_t writePct) {
  constexpr uint64_t kObjects = 100'000;
  constexpr uint64_t kLoops = 10'000'000;
  auto cache = getCache();
  std::vector<std::string> keys;
  for (uint64_t i = 0; i < kObjects; i++) {
    // Length of key should be 10 bytes
    auto key = folly::sformat("k_{: <8}", i);
    auto hdl = cache->allocate(0, key, objSize);
    XCHECK(hdl);
    cache->insertOrReplace(hdl);
    keys.push_back(key);
  }
  navy::SeqPoints sp;
  auto readOps = [&] {
    sp.wait(0); // block so all threads start together when the timer begins
    std::mt19937 gen; // default-seeded: deterministic key sequence
    std::uniform_int_distribution<uint64_t> dist(0, kObjects - 1);
    for (uint64_t loop = 0; loop < kLoops; loop++) {
      const auto& key = keys[dist(gen)];
      auto hdl = isPeek ? cache->peek(key) : cache->find(key);
      folly::doNotOptimizeAway(hdl);
    }
  };
  auto writeOps = [&] {
    sp.wait(0);
    if (writePct == 0) {
      return; // no write load requested; writer thread exits immediately
    }
    std::mt19937 gen;
    // Writers overwrite only the first writePct% of the key space, so that
    // fraction of the reads contends with in-flight replacements.
    std::uniform_int_distribution<uint64_t> dist(0,
                                                 kObjects * writePct / 100 - 1);
    // Each writer does 1/10th as many ops as a reader.
    for (uint64_t loop = 0; loop < kLoops / 10; loop++) {
      const auto& key = keys[dist(gen)];
      auto hdl = cache->allocate(0, key, objSize);
      XCHECK(hdl);
      cache->insertOrReplace(hdl);
      folly::doNotOptimizeAway(hdl);
    }
  };
  // Create readers
  std::vector<std::thread> rs;
  for (int i = 0; i < numThreads; i++) {
    rs.push_back(std::thread{readOps});
  }
  // Create writers
  std::vector<std::thread> ws;
  for (int i = 0; i < 4; i++) {
    ws.push_back(std::thread{writeOps});
  }
  {
    // The Timer measures from here until all READERS join; per-op latency
    // is reported over kLoops reader iterations.
    Timer t{folly::sformat("{} - {: <2} Threads, {: <4} Bytes, {: <2}% Write",
                           isPeek ? "Peek" : "Find", numThreads, objSize,
                           writePct),
            kLoops};
    sp.reached(0); // Start the operations
    for (auto& r : rs) {
      r.join();
    }
  }
  // Writers are deliberately joined outside the timer scope: they may still
  // be running when measurement ends and are not part of the reported rate.
  for (auto& w : ws) {
    w.join();
  }
}
// Benchmarks find()/peek() latency when every lookup is a miss.
//
// numThreads: reader threads, each issuing kLoops random lookups.
// isPeek:     true -> cache->peek(), false -> cache->find().
void runFindMissMultiThreads(int numThreads, bool isPeek) {
  // All lookups in this test are misses
  constexpr uint64_t kObjects = 100'000;
  constexpr uint64_t kLoops = 10'000'000;
  auto cache = getCache();

  // Keys in [kObjects, 2*kObjects) are looked up but never inserted, while
  // keys in [0, kObjects) are inserted but never looked up.
  std::vector<std::string> missKeys;
  missKeys.reserve(kObjects);
  for (uint64_t id = kObjects; id < 2 * kObjects; id++) {
    // Length of key should be 10 bytes
    missKeys.push_back(folly::sformat("k_{: <8}", id));
  }
  for (uint64_t id = 0; id < kObjects; id++) {
    // Length of key should be 10 bytes
    auto key = folly::sformat("k_{: <8}", id);
    auto hdl = cache->allocate(0, key, 100);
    XCHECK(hdl);
    cache->insertOrReplace(hdl);
  }

  navy::SeqPoints sp;
  auto lookupLoop = [&] {
    sp.wait(0); // all readers start together when the timer opens
    std::mt19937 gen;
    std::uniform_int_distribution<uint64_t> dist(0, kObjects - 1);
    for (uint64_t n = 0; n < kLoops; n++) {
      const auto& key = missKeys[dist(gen)];
      auto hdl = isPeek ? cache->peek(key) : cache->find(key);
      folly::doNotOptimizeAway(hdl);
    }
  };

  std::vector<std::thread> readers;
  for (int i = 0; i < numThreads; i++) {
    readers.emplace_back(lookupLoop);
  }
  {
    Timer t{folly::sformat("{} - All Misses - {: <2} Threads",
                           isPeek ? "Peek" : "Find", numThreads),
            kLoops};
    sp.reached(0); // Start the operations
    for (auto& r : readers) {
      r.join();
    }
  }
}
// Benchmarks allocate() + insertOrReplace() throughput.
//
// numThreads:     writer threads; the kObjects keys are split into equal
//                 contiguous chunks (any kObjects % numThreads remainder
//                 keys are simply unused).
// preFillupCache: true  -> cache is filled to capacity first, so each timed
//                          allocation must evict ("Eviction" mode);
//                 false -> timed allocations go into free memory ("New").
// payloadSizes:   object sizes, cycled round-robin by key index.
void runAllocateMultiThreads(int numThreads,
                             bool preFillupCache,
                             std::vector<uint32_t> payloadSizes) {
  constexpr uint64_t kLoops = 1;
  constexpr uint64_t kObjects = 100'000;
  auto cache = getCache();
  std::vector<std::string> keys;
  for (uint64_t i = 0; i < kObjects; i++) {
    // Length of key should be 10 bytes
    auto key = folly::sformat("k_{: <8}", i);
    keys.push_back(key);
  }
  if (preFillupCache) {
    // Fill keys start at keys.size() so they are disjoint from the keys
    // allocated during the timed phase.
    uint64_t i = keys.size();
    // NOTE(review): holding the handles appears intended to keep the fill
    // items referenced so allocate() cannot reclaim them mid-fill and the
    // loop terminates only when the cache is genuinely full — confirm
    // against ItemHandle/eviction semantics. The handles are released when
    // this block exits, so eviction works normally during measurement.
    std::vector<LruAllocator::ItemHandle> handles;
    while (true) {
      // Length of key should be 10 bytes
      auto key = folly::sformat("k_{: <8}", i);
      auto hdl = cache->allocate(0, key, payloadSizes[i % payloadSizes.size()]);
      if (!hdl) {
        // Cache is full. Stop prefill.
        break;
      }
      cache->insertOrReplace(hdl);
      handles.push_back(std::move(hdl));
      i++;
    }
  }
  navy::SeqPoints sp;
  // Each writer allocates and inserts its own contiguous slice [start, end)
  // of the key space, kLoops times over.
  auto writeOps = [&](uint64_t start, uint64_t end) {
    sp.wait(0); // all writers start together when the timer opens
    for (uint64_t loops = 0; loops < kLoops; loops++) {
      for (uint64_t i = start; i < end; i++) {
        auto& key = keys[i];
        auto hdl =
            cache->allocate(0, key, payloadSizes[i % payloadSizes.size()]);
        XCHECK(hdl);
        cache->insertOrReplace(hdl);
      }
    }
  };
  // Create writers
  std::vector<std::thread> ws;
  uint64_t startIndex = 0;
  uint64_t chunkSize = kObjects / numThreads;
  uint64_t totalItemsPerThread = chunkSize * kLoops;
  for (int i = 0; i < numThreads; i++) {
    ws.push_back(std::thread{writeOps, startIndex, startIndex + chunkSize});
    startIndex += chunkSize;
  }
  {
    // Per-op latency is reported over totalItemsPerThread operations; the
    // timer runs until every writer has joined.
    Timer t{folly::sformat("Allocate - {} - {: <2} Threads, {: <2} Sizes",
                           preFillupCache ? "Eviction" : "New ", numThreads,
                           payloadSizes.size()),
            totalItemsPerThread};
    sp.reached(0); // Start the operations
    for (auto& w : ws) {
      w.join();
    }
  }
}
} // namespace cachelib
} // namespace facebook
using namespace facebook::cachelib;
// Drives every benchmark suite in sequence. A transcript of one run is kept
// in the comment block at the bottom of this file.
int main(int argc, char** argv) {
  folly::init(&argc, &argv);
  printMsg("Benchmark Starting Now");
  // (Typo fix: section headers previously read "Becnhmarks".)
  printMsg("Benchmarks (Different HT Sizes)");
  // Warm-up run before the measured hashtable-size sweep below.
  runDifferentHTSizes(12, 1000);
  std::set<int> htPowers{16, 20, 24, 28};
  std::set<uint64_t> numObjects{10'000, 100'000, 1'000'000, 10'000'000};
  for (auto h : htPowers) {
    std::cout << "---------\n";
    for (auto o : numObjects) {
      runDifferentHTSizes(h, o);
    }
  }
  printMsg("Benchmarks (100K Objects)");
  std::set<uint64_t> threads{1, 4, 8, 16, 32, 64};
  std::set<bool> findOrPeek{true, false};
  // All-miss lookups across thread counts, for both peek and find.
  for (auto t : threads) {
    std::cout << "---------\n";
    for (auto f : findOrPeek) {
      runFindMissMultiThreads(t, f);
    }
  }
  std::cout << "---------\n";
  // Hit lookups across object sizes at a fixed 32 threads, no write load.
  std::set<uint64_t> objSizes{0, 100, 1000, 10000};
  for (auto o : objSizes) {
    runFindMultiThreads(32, false, o, 0);
  }
  // Hit lookups under increasing background write load.
  std::set<uint64_t> writePcts{0, 5, 10, 20};
  for (auto t : threads) {
    for (auto f : findOrPeek) {
      std::cout << "---------\n";
      for (auto w : writePcts) {
        runFindMultiThreads(t, f, 100, w);
      }
    }
  }
  // Allocation throughput, into a fresh cache and into a full one.
  std::set<bool> preFillupCache{true, false};
  std::set<std::vector<uint32_t>> setOfPayloadSizes{
      {5000},
      {1000, 5000, 10000},
      {0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000}};
  for (auto t : threads) {
    for (auto p : preFillupCache) {
      std::cout << "---------\n";
      // Iterate by const reference: the original copied each payload-size
      // vector on every iteration for no benefit.
      for (const auto& s : setOfPayloadSizes) {
        runAllocateMultiThreads(t, p, s);
      }
    }
  }
  printMsg("Benchmarks have completed");
  return 0;
}
/*
clang-format off
Hardware Spec: T1 Skylake
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 36
On-line CPU(s) list: 0-35
Thread(s) per core: 2
Core(s) per socket: 18
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 85
Model name: Intel(R) Xeon(R) D-2191A CPU @ 1.60GHz
Stepping: 4
CPU MHz: 1527.578
CPU max MHz: 1601.0000
CPU min MHz: 800.0000
BogoMIPS: 3200.00
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 1024K
L3 cache: 25344K
NUMA node0 CPU(s): 0-35
-------- Benchmark Starting Now --------------------------------------------------------------------
-------- Becnhmarks (Different HT Sizes) -----------------------------------------------------------
[Peek - 12 HT Power, 1000 Objects ] Per-Op: 199 ns, 317 cycles
---------
[Peek - 16 HT Power, 10000 Objects ] Per-Op: 202 ns, 323 cycles
[Peek - 16 HT Power, 100000 Objects ] Per-Op: 216 ns, 345 cycles
[Peek - 16 HT Power, 1000000 Objects ] Per-Op: 414 ns, 660 cycles
[Peek - 16 HT Power, 10000000 Objects ] Per-Op: 1120 ns, 1788 cycles
---------
[Peek - 20 HT Power, 10000 Objects ] Per-Op: 202 ns, 322 cycles
[Peek - 20 HT Power, 100000 Objects ] Per-Op: 216 ns, 345 cycles
[Peek - 20 HT Power, 1000000 Objects ] Per-Op: 412 ns, 657 cycles
[Peek - 20 HT Power, 10000000 Objects ] Per-Op: 1131 ns, 1806 cycles
---------
[Peek - 24 HT Power, 10000 Objects ] Per-Op: 203 ns, 324 cycles
[Peek - 24 HT Power, 100000 Objects ] Per-Op: 219 ns, 350 cycles
[Peek - 24 HT Power, 1000000 Objects ] Per-Op: 412 ns, 657 cycles
[Peek - 24 HT Power, 10000000 Objects ] Per-Op: 1137 ns, 1815 cycles
---------
[Peek - 28 HT Power, 10000 Objects ] Per-Op: 216 ns, 345 cycles
[Peek - 28 HT Power, 100000 Objects ] Per-Op: 323 ns, 515 cycles
[Peek - 28 HT Power, 1000000 Objects ] Per-Op: 417 ns, 666 cycles
[Peek - 28 HT Power, 10000000 Objects ] Per-Op: 1123 ns, 1794 cycles
-------- Becnhmarks (100K Objects) -----------------------------------------------------------------
---------
[Find - All Misses - 1 Threads ] Per-Op: 101 ns, 162 cycles
[Peek - All Misses - 1 Threads ] Per-Op: 86 ns, 138 cycles
---------
[Find - All Misses - 4 Threads ] Per-Op: 103 ns, 165 cycles
[Peek - All Misses - 4 Threads ] Per-Op: 90 ns, 144 cycles
---------
[Find - All Misses - 8 Threads ] Per-Op: 110 ns, 176 cycles
[Peek - All Misses - 8 Threads ] Per-Op: 91 ns, 146 cycles
---------
[Find - All Misses - 16 Threads ] Per-Op: 114 ns, 182 cycles
[Peek - All Misses - 16 Threads ] Per-Op: 94 ns, 151 cycles
---------
[Find - All Misses - 32 Threads ] Per-Op: 265 ns, 424 cycles
[Peek - All Misses - 32 Threads ] Per-Op: 252 ns, 403 cycles
---------
[Find - All Misses - 64 Threads ] Per-Op: 376 ns, 600 cycles
[Peek - All Misses - 64 Threads ] Per-Op: 308 ns, 492 cycles
---------
[Find - 32 Threads, 0 Bytes, 0 % Write ] Per-Op: 340 ns, 542 cycles
[Find - 32 Threads, 100 Bytes, 0 % Write ] Per-Op: 337 ns, 539 cycles
[Find - 32 Threads, 1000 Bytes, 0 % Write ] Per-Op: 340 ns, 543 cycles
[Find - 32 Threads, 10000 Bytes, 0 % Write ] Per-Op: 359 ns, 574 cycles
---------
[Find - 1 Threads, 100 Bytes, 0 % Write ] Per-Op: 207 ns, 331 cycles
[Find - 1 Threads, 100 Bytes, 5 % Write ] Per-Op: 295 ns, 471 cycles
[Find - 1 Threads, 100 Bytes, 10% Write ] Per-Op: 337 ns, 538 cycles
[Find - 1 Threads, 100 Bytes, 20% Write ] Per-Op: 360 ns, 576 cycles
---------
[Peek - 1 Threads, 100 Bytes, 0 % Write ] Per-Op: 180 ns, 288 cycles
[Peek - 1 Threads, 100 Bytes, 5 % Write ] Per-Op: 221 ns, 353 cycles
[Peek - 1 Threads, 100 Bytes, 10% Write ] Per-Op: 208 ns, 333 cycles
[Peek - 1 Threads, 100 Bytes, 20% Write ] Per-Op: 207 ns, 330 cycles
---------
[Find - 4 Threads, 100 Bytes, 0 % Write ] Per-Op: 217 ns, 347 cycles
[Find - 4 Threads, 100 Bytes, 5 % Write ] Per-Op: 363 ns, 579 cycles
[Find - 4 Threads, 100 Bytes, 10% Write ] Per-Op: 375 ns, 598 cycles
[Find - 4 Threads, 100 Bytes, 20% Write ] Per-Op: 380 ns, 607 cycles
---------
[Peek - 4 Threads, 100 Bytes, 0 % Write ] Per-Op: 184 ns, 293 cycles
[Peek - 4 Threads, 100 Bytes, 5 % Write ] Per-Op: 280 ns, 448 cycles
[Peek - 4 Threads, 100 Bytes, 10% Write ] Per-Op: 293 ns, 467 cycles
[Peek - 4 Threads, 100 Bytes, 20% Write ] Per-Op: 316 ns, 505 cycles
---------
[Find - 8 Threads, 100 Bytes, 0 % Write ] Per-Op: 255 ns, 407 cycles
[Find - 8 Threads, 100 Bytes, 5 % Write ] Per-Op: 402 ns, 642 cycles
[Find - 8 Threads, 100 Bytes, 10% Write ] Per-Op: 392 ns, 626 cycles
[Find - 8 Threads, 100 Bytes, 20% Write ] Per-Op: 384 ns, 614 cycles
---------
[Peek - 8 Threads, 100 Bytes, 0 % Write ] Per-Op: 198 ns, 316 cycles
[Peek - 8 Threads, 100 Bytes, 5 % Write ] Per-Op: 325 ns, 519 cycles
[Peek - 8 Threads, 100 Bytes, 10% Write ] Per-Op: 317 ns, 506 cycles
[Peek - 8 Threads, 100 Bytes, 20% Write ] Per-Op: 323 ns, 516 cycles
---------
[Find - 16 Threads, 100 Bytes, 0 % Write ] Per-Op: 256 ns, 410 cycles
[Find - 16 Threads, 100 Bytes, 5 % Write ] Per-Op: 453 ns, 723 cycles
[Find - 16 Threads, 100 Bytes, 10% Write ] Per-Op: 459 ns, 733 cycles
[Find - 16 Threads, 100 Bytes, 20% Write ] Per-Op: 425 ns, 678 cycles
---------
[Peek - 16 Threads, 100 Bytes, 0 % Write ] Per-Op: 236 ns, 377 cycles
[Peek - 16 Threads, 100 Bytes, 5 % Write ] Per-Op: 367 ns, 586 cycles
[Peek - 16 Threads, 100 Bytes, 10% Write ] Per-Op: 382 ns, 610 cycles
[Peek - 16 Threads, 100 Bytes, 20% Write ] Per-Op: 393 ns, 628 cycles
---------
[Find - 32 Threads, 100 Bytes, 0 % Write ] Per-Op: 401 ns, 641 cycles
[Find - 32 Threads, 100 Bytes, 5 % Write ] Per-Op: 516 ns, 824 cycles
[Find - 32 Threads, 100 Bytes, 10% Write ] Per-Op: 468 ns, 747 cycles
[Find - 32 Threads, 100 Bytes, 20% Write ] Per-Op: 456 ns, 728 cycles
---------
[Peek - 32 Threads, 100 Bytes, 0 % Write ] Per-Op: 298 ns, 476 cycles
[Peek - 32 Threads, 100 Bytes, 5 % Write ] Per-Op: 369 ns, 590 cycles
[Peek - 32 Threads, 100 Bytes, 10% Write ] Per-Op: 411 ns, 657 cycles
[Peek - 32 Threads, 100 Bytes, 20% Write ] Per-Op: 396 ns, 632 cycles
---------
[Find - 64 Threads, 100 Bytes, 0 % Write ] Per-Op: 642 ns, 1026 cycles
[Find - 64 Threads, 100 Bytes, 5 % Write ] Per-Op: 1087 ns, 1736 cycles
[Find - 64 Threads, 100 Bytes, 10% Write ] Per-Op: 1092 ns, 1744 cycles
[Find - 64 Threads, 100 Bytes, 20% Write ] Per-Op: 1080 ns, 1724 cycles
---------
[Peek - 64 Threads, 100 Bytes, 0 % Write ] Per-Op: 549 ns, 876 cycles
[Peek - 64 Threads, 100 Bytes, 5 % Write ] Per-Op: 1027 ns, 1639 cycles
[Peek - 64 Threads, 100 Bytes, 10% Write ] Per-Op: 1008 ns, 1610 cycles
[Peek - 64 Threads, 100 Bytes, 20% Write ] Per-Op: 1018 ns, 1626 cycles
---------
[Allocate - New - 1 Threads, 11 Sizes ] Per-Op: 1772 ns, 2830 cycles
[Allocate - New - 1 Threads, 3 Sizes ] Per-Op: 1677 ns, 2677 cycles
[Allocate - New - 1 Threads, 1 Sizes ] Per-Op: 2062 ns, 3291 cycles
---------
[Allocate - Eviction - 1 Threads, 11 Sizes ] Per-Op: 982 ns, 1569 cycles
[Allocate - Eviction - 1 Threads, 3 Sizes ] Per-Op: 1068 ns, 1705 cycles
[Allocate - Eviction - 1 Threads, 1 Sizes ] Per-Op: 1039 ns, 1659 cycles
---------
[Allocate - New - 4 Threads, 11 Sizes ] Per-Op: 2680 ns, 4279 cycles
[Allocate - New - 4 Threads, 3 Sizes ] Per-Op: 2845 ns, 4542 cycles
[Allocate - New - 4 Threads, 1 Sizes ] Per-Op: 3376 ns, 5389 cycles
---------
[Allocate - Eviction - 4 Threads, 11 Sizes ] Per-Op: 1989 ns, 3176 cycles
[Allocate - Eviction - 4 Threads, 3 Sizes ] Per-Op: 2564 ns, 4093 cycles
[Allocate - Eviction - 4 Threads, 1 Sizes ] Per-Op: 4412 ns, 7043 cycles
---------
[Allocate - New - 8 Threads, 11 Sizes ] Per-Op: 3032 ns, 4840 cycles
[Allocate - New - 8 Threads, 3 Sizes ] Per-Op: 3339 ns, 5330 cycles
[Allocate - New - 8 Threads, 1 Sizes ] Per-Op: 4523 ns, 7220 cycles
---------
[Allocate - Eviction - 8 Threads, 11 Sizes ] Per-Op: 2524 ns, 4028 cycles
[Allocate - Eviction - 8 Threads, 3 Sizes ] Per-Op: 3836 ns, 6124 cycles
[Allocate - Eviction - 8 Threads, 1 Sizes ] Per-Op: 8916 ns, 14232 cycles
---------
[Allocate - New - 16 Threads, 11 Sizes ] Per-Op: 4106 ns, 6554 cycles
[Allocate - New - 16 Threads, 3 Sizes ] Per-Op: 4643 ns, 7412 cycles
[Allocate - New - 16 Threads, 1 Sizes ] Per-Op: 7628 ns, 12176 cycles
---------
[Allocate - Eviction - 16 Threads, 11 Sizes ] Per-Op: 3686 ns, 5884 cycles
[Allocate - Eviction - 16 Threads, 3 Sizes ] Per-Op: 7184 ns, 11467 cycles
[Allocate - Eviction - 16 Threads, 1 Sizes ] Per-Op: 18559 ns, 29625 cycles
---------
[Allocate - New - 32 Threads, 11 Sizes ] Per-Op: 8358 ns, 13341 cycles
[Allocate - New - 32 Threads, 3 Sizes ] Per-Op: 9577 ns, 15287 cycles
[Allocate - New - 32 Threads, 1 Sizes ] Per-Op: 16754 ns, 26743 cycles
---------
[Allocate - Eviction - 32 Threads, 11 Sizes ] Per-Op: 6055 ns, 9666 cycles
[Allocate - Eviction - 32 Threads, 3 Sizes ] Per-Op: 14372 ns, 22942 cycles
[Allocate - Eviction - 32 Threads, 1 Sizes ] Per-Op: 37691 ns, 60163 cycles
---------
[Allocate - New - 64 Threads, 11 Sizes ] Per-Op: 17692 ns, 28242 cycles
[Allocate - New - 64 Threads, 3 Sizes ] Per-Op: 20727 ns, 33086 cycles
[Allocate - New - 64 Threads, 1 Sizes ] Per-Op: 35300 ns, 56348 cycles
---------
[Allocate - Eviction - 64 Threads, 11 Sizes ] Per-Op: 26014 ns, 41524 cycles
[Allocate - Eviction - 64 Threads, 3 Sizes ] Per-Op: 34119 ns, 54463 cycles
[Allocate - Eviction - 64 Threads, 1 Sizes ] Per-Op: 76896 ns, 122744 cycles
-------- Becnhmarks have completed -----------------------------------------------------------------
clang-format on
*/