/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
clang-format off
Microbenchmarks to explore strategies for speeding up the existence check
Results are at the bottom of this file
Various latency numbers circa 2012
----------------------------------
L1 cache reference 0.5 ns
Branch mispredict 5 ns
L2 cache reference 7 ns 14x L1 cache
Mutex lock/unlock 25 ns
Main memory reference 100 ns 20x L2 cache, 200x L1 cache
Compress 1K bytes with Zippy 3,000 ns 3 us
Send 1K bytes over 1 Gbps network 10,000 ns 10 us
Read 4K randomly from SSD* 150,000 ns 150 us ~1GB/sec SSD
Read 1 MB sequentially from memory 250,000 ns 250 us
Round trip within same datacenter 500,000 ns 500 us
Read 1 MB sequentially from SSD* 1,000,000 ns 1,000 us 1 ms ~1GB/sec SSD, 4X memory
Disk seek 10,000,000 ns 10,000 us 10 ms 20x datacenter roundtrip
Read 1 MB sequentially from disk 20,000,000 ns 20,000 us 20 ms 80x memory, 20X SSD
Send packet CA->Netherlands->CA 150,000,000 ns 150,000 us 150 ms
clang-format on
*/
#include <folly/Benchmark.h>
#include <folly/BenchmarkUtil.h>
#include <folly/init/Init.h>
#include <chrono>
#include <random>
#include <string>
#include <thread>
#include "cachelib/allocator/CacheAllocator.h"
#include "cachelib/benchmarks/BenchmarkUtils.h"
#include "cachelib/common/BytesEqual.h"
#include "cachelib/common/PercentileStats.h"
#include "cachelib/navy/testing/SeqPoints.h"
namespace facebook {
namespace cachelib {
namespace {
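// Cache object keyed by a string, carrying an opaque payload of PayloadSize
// bytes to emulate real item metadata.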
template <size_t PayloadSize>
struct ObjectImpl {
explicit ObjectImpl(std::string k) : key(std::move(k)) {}
std::string key;
std::array<uint8_t, PayloadSize> payload;
};
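// Chained hash bucket that stores only a pointer to the object. Reading the
// key during lookup dereferences the object, which typically costs an extra
// cache miss.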
struct Bucket {
using Object = ObjectImpl<100>;
void setObject(Object* o) { obj = o; }
const std::string& getKey() const { return obj->key; }
Object* obj{};
Bucket* next{};
};
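// Chained hash bucket that additionally embeds a copy of the key, so an
// existence check can compare keys without dereferencing the object.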
struct BucketWithKey {
using Object = ObjectImpl<100>;
void setObject(Object* o) {
obj = o;
key = o->key;
}
const std::string& getKey() const { return key; }
std::string key;
Object* obj{};
BucketWithKey* next{};
};
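// Lock wrappers exposing a uniform getReadLock()/getWriteLock() interface.
// The *Aligned variants are padded to a destructive-interference boundary so
// adjacent locks in the lock array do not share a cache line.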
struct SharedMutex {
auto getReadLock() { return folly::SharedMutex::ReadHolder{l}; }
auto getWriteLock() { return folly::SharedMutex::WriteHolder{l}; }
folly::SharedMutex l{};
};
struct alignas(folly::hardware_destructive_interference_size)
SharedMutexAligned {
auto getReadLock() { return folly::SharedMutex::ReadHolder{l}; }
auto getWriteLock() { return folly::SharedMutex::WriteHolder{l}; }
folly::SharedMutex l{};
};
struct SpinLock {
auto getReadLock() { return std::lock_guard<folly::MicroSpinLock>(l); }
auto getWriteLock() { return std::lock_guard<folly::MicroSpinLock>(l); }
folly::MicroSpinLock l{};
};
struct alignas(folly::hardware_destructive_interference_size) SpinLockAligned {
auto getReadLock() { return std::lock_guard<folly::MicroSpinLock>(l); }
auto getWriteLock() { return std::lock_guard<folly::MicroSpinLock>(l); }
folly::MicroSpinLock l{};
};
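// Fixed-size chained hash table sharded by an array of locks. Bucket and lock
// counts are powers of two given by bucketPower/lockPower; collisions are
// resolved by chaining heap-allocated bucket nodes.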
template <typename BucketT, typename LockT>
class HashTableImpl {
public:
using BucketType = BucketT;
using Object = typename BucketType::Object;
using Lock = LockT;
HashTableImpl(int bucketPower, int lockPower)
: locks_((1ULL << lockPower)), buckets_((1ULL << bucketPower)) {}
~HashTableImpl() {
for (auto b : buckets_) {
auto* curr = b.next;
while (curr != nullptr) {
auto* next = curr->next;
delete curr;
curr = next;
}
}
}
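  // Inserts an object into its hash bucket under the shard's write lock,
  // appending a new chained node if the head slot is occupied. The caller
  // keeps ownership of the object.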
void insert(Object* obj) {
auto& bucket = getBucket(obj->key);
auto w = getLock(obj->key).getWriteLock();
auto* curBucket = &bucket;
while (curBucket->obj) {
if (!curBucket->next) {
curBucket->next = new BucketType;
}
curBucket = curBucket->next;
}
curBucket->setObject(obj);
}
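  // Looks up a single key under the shard's read lock, walking the bucket
  // chain until the key matches. Returns nullptr if the key is not found.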
Object* lookup(const std::string& key) {
auto& bucket = getBucket(key);
auto r = getLock(key).getReadLock();
auto* curBucket = &bucket;
while (curBucket) {
if (curBucket->getKey() == key) {
return curBucket->obj;
}
curBucket = curBucket->next;
}
return nullptr;
}
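  // Baseline batched lookup: BATCH_SIZE independent lookup() calls with no
  // prefetching.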
template <size_t BATCH_SIZE>
void multiLookup(const std::array<std::string, BATCH_SIZE>& keys,
std::array<Object*, BATCH_SIZE>& objects) {
for (size_t i = 0; i < keys.size(); i++) {
objects[i] = lookup(keys[i]);
}
}
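  // Batched lookup with software prefetching: first hashes every key and
  // issues prefetches for the bucket heads (and optionally the objects) so the
  // cache lines are in flight before any lock is taken, then resolves each key
  // under its shard's read lock.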
template <size_t BATCH_SIZE>
void multiLookup(const std::array<std::string, BATCH_SIZE>& keys,
std::array<Object*, BATCH_SIZE>& objects,
bool prefetchObject) {
std::array<BucketType*, BATCH_SIZE> buckets;
std::array<Lock*, BATCH_SIZE> locks;
for (size_t i = 0; i < keys.size(); i++) {
MurmurHash2 hasher;
uint32_t hash = hasher(keys[i].data(), keys[i].size());
buckets[i] = &buckets_[static_cast<size_t>(hash) % buckets_.size()];
prefetchRead(buckets[i]);
locks[i] = &locks_[static_cast<size_t>(hash % locks_.size())];
}
if (prefetchObject) {
for (size_t i = 0; i < keys.size(); i++) {
prefetchRead(buckets[i]->obj);
}
}
    for (size_t i = 0; i < keys.size(); i++) {
      auto r = locks[i]->getReadLock();
      auto* curBucket = buckets[i];
      // Default to "not found"; a match in the chain below overwrites this.
      objects[i] = nullptr;
      while (curBucket) {
        if (curBucket->getKey() == keys[i]) {
          objects[i] = curBucket->obj;
          break;
        }
        curBucket = curBucket->next;
      }
    }
}
private:
static FOLLY_ALWAYS_INLINE void prefetchRead(void* ptr) {
__builtin_prefetch(ptr, /* read or write */ 0, /* locality hint */ 3);
}
BucketType& getBucket(const std::string& key) {
return buckets_[getHash(key) % buckets_.size()];
}
Lock& getLock(const std::string& key) {
return locks_[getHash(key) % locks_.size()];
}
uint32_t getHash(const std::string& key) {
MurmurHash2 hasher;
return hasher(key.data(), key.size());
}
std::vector<Lock> locks_;
std::vector<BucketType> buckets_;
};
} // namespace
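// Benchmarks one-at-a-time lookups: numThreads reader threads each issue
// kLoops random lookups against a table pre-populated with numObjects
// entries; the Timer reports per-op latency.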
template <typename BucketT, typename LockT>
void testSequential(int numThreads,
int htBucketPower,
int htLockPower,
uint64_t numObjects,
const char* msg = "reg") {
using HashTable = HashTableImpl<BucketT, LockT>;
using Object = typename HashTable::Object;
constexpr uint64_t kLoops = 10'000'000;
std::vector<std::string> keys;
std::vector<std::unique_ptr<Object>> objects;
std::unique_ptr<HashTable> ht;
BENCHMARK_SUSPEND {
ht = std::make_unique<HashTable>(htBucketPower, htLockPower);
for (uint64_t i = 0; i < numObjects; i++) {
auto key = folly::sformat("k_{:<8}", i);
keys.push_back(key);
objects.push_back(std::make_unique<Object>(key));
ht->insert(objects.back().get());
}
}
navy::SeqPoints sp;
auto readOps = [&] {
sp.wait(0);
std::mt19937 gen;
std::uniform_int_distribution<uint64_t> dist(0, numObjects - 1);
for (uint64_t loop = 0; loop < kLoops; loop++) {
const auto& key = keys[dist(gen)];
auto object = ht->lookup(key);
folly::doNotOptimizeAway(object);
}
};
std::vector<std::thread> rs;
for (int i = 0; i < numThreads; i++) {
rs.push_back(std::thread{readOps});
}
{
Timer t{
folly::sformat(
"Sequential_{} - {: <2} T, {: <2} HB, {: <2} HL, {: <8} Objects",
msg, numThreads, htBucketPower, htLockPower, numObjects),
kLoops};
sp.reached(0); // Start the operations
for (auto& r : rs) {
r.join();
}
}
}
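// Benchmarks batched lookups of BATCH_SIZE keys via multiLookup(), optionally
// prefetching the objects themselves in addition to the bucket heads.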
template <typename BucketT, typename LockT, size_t BATCH_SIZE>
void testBatch(int numThreads,
int htBucketPower,
int htLockPower,
uint64_t numObjects,
bool doesPrefetchObject,
const char* msg = "reg") {
using HashTable = HashTableImpl<BucketT, LockT>;
using Object = typename HashTable::Object;
constexpr uint64_t kLoops = 10'000'000;
std::vector<std::string> keys;
std::vector<std::unique_ptr<Object>> objects;
std::unique_ptr<HashTable> ht;
BENCHMARK_SUSPEND {
ht = std::make_unique<HashTable>(htBucketPower, htLockPower);
for (uint64_t i = 0; i < numObjects; i++) {
auto key = folly::sformat("k_{:<8}", i);
keys.push_back(key);
objects.push_back(std::make_unique<Object>(key));
ht->insert(objects.back().get());
}
}
navy::SeqPoints sp;
auto readOps = [&] {
sp.wait(0);
std::mt19937 gen;
std::uniform_int_distribution<uint64_t> dist(0, numObjects - 1);
for (uint64_t loop = 0; loop < kLoops / BATCH_SIZE; loop++) {
std::array<Object*, BATCH_SIZE> objects;
std::array<std::string, BATCH_SIZE> batchedKeys;
BENCHMARK_SUSPEND {
for (auto& key : batchedKeys) {
key = keys[dist(gen)];
}
}
ht->template multiLookup<BATCH_SIZE>(batchedKeys, objects,
doesPrefetchObject);
folly::doNotOptimizeAway(objects);
}
};
std::vector<std::thread> rs;
for (int i = 0; i < numThreads; i++) {
rs.push_back(std::thread{readOps});
}
{
Timer t{folly::sformat("Prefetch{} - {: <4} B, {: <2} T, {: <2} HB, {: <2} "
"HL, {: <8} Objects",
msg, BATCH_SIZE, numThreads, htBucketPower,
htLockPower, numObjects),
kLoops};
sp.reached(0); // Start the operations
for (auto& r : rs) {
r.join();
}
}
}
} // namespace cachelib
} // namespace facebook
using namespace facebook::cachelib;
int main(int argc, char** argv) {
static_assert(sizeof(SharedMutex) < sizeof(SharedMutexAligned), "alignment");
static_assert(sizeof(SpinLock) < sizeof(SpinLockAligned), "alignment");
folly::init(&argc, &argv);
// clang-format off
printMsg("Benchmark Starting Now");
  // These benchmarks compare the performance of different lock
  // implementations, as well as different alignments for the locks
printMsg("Bucket + SharedMutex");
testSequential<Bucket, SharedMutex>(16, 24, 10, 1'000'000);
testBatch<Bucket, SharedMutex, 16>(16, 24, 10, 1'000'000, true);
printMsg("Bucket + SharedMutexAligned");
testSequential<Bucket, SharedMutexAligned>(16, 24, 10, 1'000'000);
testBatch<Bucket, SharedMutexAligned, 16>(16, 24, 10, 1'000'000, true);
printMsg("Bucket + SpinLock");
testSequential<Bucket, SpinLock>(16, 24, 10, 1'000'000);
testSequential<Bucket, SpinLock>(16, 24, 16, 1'000'000);
testSequential<Bucket, SpinLock>(16, 24, 20, 1'000'000);
testBatch<Bucket, SpinLock, 16>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SpinLock, 16>(16, 24, 16, 1'000'000, true);
testBatch<Bucket, SpinLock, 16>(16, 24, 20, 1'000'000, true);
printMsg("Bucket + SpinLockAligned");
testSequential<Bucket, SpinLockAligned>(16, 24, 10, 1'000'000);
testSequential<Bucket, SpinLockAligned>(16, 24, 16, 1'000'000);
testSequential<Bucket, SpinLockAligned>(16, 24, 20, 1'000'000);
testBatch<Bucket, SpinLockAligned, 16>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SpinLockAligned, 16>(16, 24, 16, 1'000'000, true);
testBatch<Bucket, SpinLockAligned, 16>(16, 24, 20, 1'000'000, true);
  // These benchmarks compare how sequential mode performs with different
  // numbers of objects
printMsg("Different Object Sizes");
testSequential<Bucket, SharedMutex>(1, 14, 10, 1000);
testSequential<Bucket, SharedMutex>(1, 14, 10, 10'000);
testSequential<Bucket, SharedMutex>(1, 16, 10, 10'000);
testSequential<Bucket, SharedMutex>(1, 16, 10, 100'000);
testSequential<Bucket, SharedMutex>(1, 20, 10, 100'000);
testSequential<Bucket, SharedMutex>(1, 20, 10, 1'000'000);
testSequential<Bucket, SharedMutex>(1, 24, 10, 1'000'000);
testSequential<Bucket, SharedMutex>(1, 24, 10, 10'000'000);
testSequential<Bucket, SharedMutex>(1, 26, 10, 10'000'000);
  // These benchmarks compare different prefetching batch sizes
printMsg("Different Prefetching Batch Sizes");
testBatch<Bucket, SharedMutex, 1>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 2>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 4>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 8>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 16>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 32>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 64>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 128>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 256>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 1024>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 4096>(16, 24, 10, 1'000'000, true);
testBatch<Bucket, SharedMutex, 8192>(16, 24, 10, 1'000'000, true);
  // These benchmarks compare how sequential and batch modes perform with
  // different numbers of objects and different hashtable sizes. In addition,
  // we also compare against a hashtable with the key embedded in the bucket
printMsg("Sequential vs. Prefetching");
for (auto t : {1, 4, 16}) {
for (auto b : {24, 26}) {
for (auto o : {100'000, 1'000'000, 10'000'000}) {
std::cout << "--------\n";
testSequential<Bucket, SharedMutex>(t, b, 10, o);
testSequential<BucketWithKey, SharedMutex>(t, b, 10, o, "key");
testBatch<Bucket, SharedMutex, 16>(t, b, 10, o, true);
testBatch<BucketWithKey, SharedMutex, 16>(t, b, 10, o, false, "key");
}
}
}
printMsg("Becnhmarks have completed");
// clang-format on
}
/*
clang-format off
Hardware Spec: T1 Skylake
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 36
On-line CPU(s) list: 0-35
Thread(s) per core: 2
Core(s) per socket: 18
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 85
Model name: Intel(R) Xeon(R) D-2191A CPU @ 1.60GHz
Stepping: 4
CPU MHz: 1855.037
CPU max MHz: 1601.0000
CPU min MHz: 800.0000
BogoMIPS: 3200.00
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 1024K
L3 cache: 25344K
NUMA node0 CPU(s): 0-35
-------- Benchmark Starting Now --------------------------------------------------------------------
-------- Bucket + SharedMutex ----------------------------------------------------------------------
[Sequential_reg - 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 310 ns, 495 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 191 ns, 305 cycles
-------- Bucket + SharedMutexAligned ---------------------------------------------------------------
[Sequential_reg - 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 308 ns, 491 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 192 ns, 306 cycles
-------- Bucket + SpinLock -------------------------------------------------------------------------
[Sequential_reg - 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 431 ns, 689 cycles
[Sequential_reg - 16 T, 24 HB, 16 HL, 1000000 Objects ] Per-Op: 367 ns, 586 cycles
[Sequential_reg - 16 T, 24 HB, 20 HL, 1000000 Objects ] Per-Op: 361 ns, 576 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 280 ns, 447 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 16 HL, 1000000 Objects ] Per-Op: 204 ns, 325 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 20 HL, 1000000 Objects ] Per-Op: 209 ns, 334 cycles
-------- Bucket + SpinLockAligned ------------------------------------------------------------------
[Sequential_reg - 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 368 ns, 588 cycles
[Sequential_reg - 16 T, 24 HB, 16 HL, 1000000 Objects ] Per-Op: 403 ns, 644 cycles
[Sequential_reg - 16 T, 24 HB, 20 HL, 1000000 Objects ] Per-Op: 424 ns, 677 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 205 ns, 327 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 16 HL, 1000000 Objects ] Per-Op: 217 ns, 346 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 20 HL, 1000000 Objects ] Per-Op: 222 ns, 355 cycles
-------- Different Object Sizes --------------------------------------------------------------------
[Sequential_reg - 1 T, 14 HB, 10 HL, 1000 Objects ] Per-Op: 82 ns, 132 cycles
[Sequential_reg - 1 T, 14 HB, 10 HL, 10000 Objects ] Per-Op: 100 ns, 160 cycles
[Sequential_reg - 1 T, 16 HB, 10 HL, 10000 Objects ] Per-Op: 90 ns, 144 cycles
[Sequential_reg - 1 T, 16 HB, 10 HL, 100000 Objects ] Per-Op: 195 ns, 311 cycles
[Sequential_reg - 1 T, 20 HB, 10 HL, 100000 Objects ] Per-Op: 183 ns, 292 cycles
[Sequential_reg - 1 T, 20 HB, 10 HL, 1000000 Objects ] Per-Op: 367 ns, 586 cycles
[Sequential_reg - 1 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 304 ns, 485 cycles
[Sequential_reg - 1 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 398 ns, 636 cycles
[Sequential_reg - 1 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 359 ns, 573 cycles
-------- Different Prefetching Batch Sizes ---------------------------------------------------------
[Prefetchreg - 1 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 1456 ns, 2325 cycles
[Prefetchreg - 2 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 743 ns, 1187 cycles
[Prefetchreg - 4 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 375 ns, 599 cycles
[Prefetchreg - 8 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 214 ns, 342 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 196 ns, 313 cycles
[Prefetchreg - 32 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 190 ns, 304 cycles
[Prefetchreg - 64 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 186 ns, 298 cycles
[Prefetchreg - 128 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 187 ns, 298 cycles
[Prefetchreg - 256 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 186 ns, 297 cycles
[Prefetchreg - 1024 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 203 ns, 324 cycles
[Prefetchreg - 4096 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 228 ns, 365 cycles
[Prefetchreg - 8192 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 243 ns, 389 cycles
-------- Sequential vs. Prefetching ----------------------------------------------------------------
--------
[Sequential_reg - 1 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 174 ns, 278 cycles
[Sequential_key - 1 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 116 ns, 185 cycles
[Prefetchreg - 16 B, 1 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 106 ns, 169 cycles
[Prefetchkey - 16 B, 1 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 98 ns, 157 cycles
--------
[Sequential_reg - 1 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 304 ns, 485 cycles
[Sequential_key - 1 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 194 ns, 309 cycles
[Prefetchreg - 16 B, 1 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 168 ns, 268 cycles
[Prefetchkey - 16 B, 1 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 168 ns, 269 cycles
--------
[Sequential_reg - 1 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 397 ns, 635 cycles
[Sequential_key - 1 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 246 ns, 393 cycles
[Prefetchreg - 16 B, 1 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 307 ns, 490 cycles
[Prefetchkey - 16 B, 1 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 249 ns, 398 cycles
--------
[Sequential_reg - 1 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 175 ns, 279 cycles
[Sequential_key - 1 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 126 ns, 201 cycles
[Prefetchreg - 16 B, 1 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 100 ns, 160 cycles
[Prefetchkey - 16 B, 1 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 103 ns, 165 cycles
--------
[Sequential_reg - 1 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 315 ns, 503 cycles
[Sequential_key - 1 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 215 ns, 343 cycles
[Prefetchreg - 16 B, 1 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 186 ns, 297 cycles
[Prefetchkey - 16 B, 1 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 192 ns, 307 cycles
--------
[Sequential_reg - 1 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 373 ns, 595 cycles
[Sequential_key - 1 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 233 ns, 372 cycles
[Prefetchreg - 16 B, 1 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 294 ns, 470 cycles
[Prefetchkey - 16 B, 1 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 258 ns, 411 cycles
--------
[Sequential_reg - 4 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 156 ns, 250 cycles
[Sequential_key - 4 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 113 ns, 181 cycles
[Prefetchreg - 16 B, 4 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 106 ns, 170 cycles
[Prefetchkey - 16 B, 4 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 122 ns, 196 cycles
--------
[Sequential_reg - 4 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 303 ns, 484 cycles
[Sequential_key - 4 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 196 ns, 313 cycles
[Prefetchreg - 16 B, 4 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 188 ns, 301 cycles
[Prefetchkey - 16 B, 4 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 196 ns, 314 cycles
--------
[Sequential_reg - 4 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 432 ns, 690 cycles
[Sequential_key - 4 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 270 ns, 431 cycles
[Prefetchreg - 16 B, 4 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 310 ns, 495 cycles
[Prefetchkey - 16 B, 4 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 275 ns, 439 cycles
--------
[Sequential_reg - 4 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 171 ns, 273 cycles
[Sequential_key - 4 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 189 ns, 303 cycles
[Prefetchreg - 16 B, 4 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 108 ns, 173 cycles
[Prefetchkey - 16 B, 4 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 110 ns, 176 cycles
--------
[Sequential_reg - 4 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 311 ns, 496 cycles
[Sequential_key - 4 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 222 ns, 355 cycles
[Prefetchreg - 16 B, 4 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 185 ns, 296 cycles
[Prefetchkey - 16 B, 4 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 284 ns, 454 cycles
--------
[Sequential_reg - 4 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 369 ns, 590 cycles
[Sequential_key - 4 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 247 ns, 394 cycles
[Prefetchreg - 16 B, 4 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 273 ns, 437 cycles
[Prefetchkey - 16 B, 4 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 257 ns, 411 cycles
--------
[Sequential_reg - 16 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 166 ns, 265 cycles
[Sequential_key - 16 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 221 ns, 354 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 119 ns, 190 cycles
[Prefetchkey - 16 B, 16 T, 24 HB, 10 HL, 100000 Objects ] Per-Op: 157 ns, 250 cycles
--------
[Sequential_reg - 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 306 ns, 489 cycles
[Sequential_key - 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 204 ns, 327 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 269 ns, 429 cycles
[Prefetchkey - 16 B, 16 T, 24 HB, 10 HL, 1000000 Objects ] Per-Op: 193 ns, 308 cycles
--------
[Sequential_reg - 16 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 419 ns, 668 cycles
[Sequential_key - 16 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 270 ns, 432 cycles
[Prefetchreg - 16 B, 16 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 311 ns, 496 cycles
[Prefetchkey - 16 B, 16 T, 24 HB, 10 HL, 10000000 Objects ] Per-Op: 284 ns, 453 cycles
--------
[Sequential_reg - 16 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 239 ns, 381 cycles
[Sequential_key - 16 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 187 ns, 299 cycles
[Prefetchreg - 16 B, 16 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 120 ns, 192 cycles
[Prefetchkey - 16 B, 16 T, 26 HB, 10 HL, 100000 Objects ] Per-Op: 119 ns, 191 cycles
--------
[Sequential_reg - 16 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 320 ns, 511 cycles
[Sequential_key - 16 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 231 ns, 368 cycles
[Prefetchreg - 16 B, 16 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 195 ns, 312 cycles
[Prefetchkey - 16 B, 16 T, 26 HB, 10 HL, 1000000 Objects ] Per-Op: 213 ns, 340 cycles
--------
[Sequential_reg - 16 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 382 ns, 610 cycles
[Sequential_key - 16 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 300 ns, 479 cycles
[Prefetchreg - 16 B, 16 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 285 ns, 454 cycles
[Prefetchkey - 16 B, 16 T, 26 HB, 10 HL, 10000000 Objects ] Per-Op: 268 ns, 428 cycles
-------- Benchmarks have completed -----------------------------------------------------------------
clang-format on
*/