// cachelib/navy/block_cache/BlockCache.h
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <atomic>
#include <chrono>
#include <memory>
#include <stdexcept>
#include <vector>
#include "cachelib/allocator/nvmcache/NavyConfig.h"
#include "cachelib/common/AtomicCounter.h"
#include "cachelib/common/CompilerUtils.h"
#include "cachelib/navy/block_cache/Allocator.h"
#include "cachelib/navy/block_cache/EvictionPolicy.h"
#include "cachelib/navy/block_cache/HitsReinsertionPolicy.h"
#include "cachelib/navy/block_cache/Index.h"
#include "cachelib/navy/block_cache/PercentageReinsertionPolicy.h"
#include "cachelib/navy/block_cache/RegionManager.h"
#include "cachelib/navy/common/Device.h"
#include "cachelib/navy/common/SizeDistribution.h"
#include "cachelib/navy/engine/Engine.h"
#include "cachelib/navy/serialization/Serialization.h"
namespace facebook {
namespace cachelib {
namespace navy {
class JobScheduler;
// Constructor can throw, but the system remains in a valid state. The caller
// can fix the parameters and re-run.
class BlockCache final : public Engine {
public:
// See CacheProto for details
struct Config {
Device* device{};
DestructorCallback destructorCb;
// Checksum data read/written
bool checksum{};
// Base offset and size (in bytes) of cache on the device
uint64_t cacheBaseOffset{};
uint64_t cacheSize{};
// Eviction policy
std::unique_ptr<EvictionPolicy> evictionPolicy;
BlockCacheReinsertionConfig reinsertionConfig{};
// Region size, bytes
uint64_t regionSize{16 * 1024 * 1024};
// See AbstractCacheProto::setReadBufferSize
uint32_t readBufferSize{};
// Job scheduler for background tasks
JobScheduler* scheduler{};
// Clean region pool size
uint32_t cleanRegionsPool{1};
// Number of in-memory buffers where writes are buffered before being
// flushed onto the device
uint32_t numInMemBuffers{1};
// whether ItemDestructor is enabled
bool itemDestructorEnabled{false};
// Maximum number of retry times for in-mem buffer flushing.
// When exceeding the limit, we will not reschedule any flushing job but
// directly fail it.
uint16_t inMemBufFlushRetryLimit{10};
// Number of priorities. Items of the same priority will be put into
// the same region. The effect of priorities is up to the particular
// eviction policy. There must be at least one priority.
uint16_t numPriorities{1};
// whether to remove an item by checking the full key.
bool preciseRemove{false};
// Calculates the total number of regions.
uint32_t getNumRegions() const { return cacheSize / regionSize; }
// Checks invariants. Throws exception if failed.
Config& validate();
};
// Constructor can throw std::exception if the config is invalid.
//
// @param config config that was validated with Config::validate
//
// @throw std::invalid_argument on bad config
explicit BlockCache(Config&& config);
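// A minimal setup sketch, assuming hypothetical Device, JobScheduler, and
// eviction policy instances (the LruPolicy constructor argument below is an
// assumption; production wiring goes through CacheProto):
//
//   BlockCache::Config config;
//   config.device = &device;        // Device with at least cacheSize bytes
//   config.scheduler = &scheduler;  // JobScheduler for background jobs
//   config.cacheBaseOffset = 0;
//   config.cacheSize = 64 * config.regionSize; // 64 regions x 16 MiB = 1 GiB
//   config.evictionPolicy =
//       std::make_unique<LruPolicy>(config.getNumRegions());
//   BlockCache cache{std::move(config.validate())};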
BlockCache(const BlockCache&) = delete;
BlockCache& operator=(const BlockCache&) = delete;
~BlockCache() override = default;
// Checks if the key could exist in block cache. This can be used as a
// pre-check to optimize cache lookups by avoiding an unnecessary async
// lookup for keys that cannot be present.
//
// @param hk key to be checked
//
// @return false if the key definitely does not exist and true if it could.
bool couldExist(HashedKey hk) override;
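// For example, an async caller can use couldExist() as a cheap synchronous
// filter and only schedule the (potentially blocking) lookup when it returns
// true (a sketch; how the job is scheduled depends on the caller):
//
//   if (cache.couldExist(hk)) {
//     // schedule an async job that calls cache.lookup(hk, value)
//   }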
// Inserts a key-value pair into BlockCache.
//
// @param hk key to be inserted
// @param value value to be inserted
//
// @return Status::Ok on success,
// Status::Rejected on error,
// Status::Retry on no space available for now.
Status insert(HashedKey hk, BufferView value) override;
// Looks up a key in BlockCache.
//
// @param hk key to be looked up
// @param value value to be populated with the data read
//
// @return Status::Ok on success,
// Status::NotFound if the key is not found,
// Status::Retry if the read cannot be served right now and must be retried,
// Status::DeviceError otherwise.
Status lookup(HashedKey hk, Buffer& value) override;
// Removes a key from BlockCache.
//
// @param hk key to be removed
//
// @return Status::Ok if the key is found and Status::NotFound otherwise.
Status remove(HashedKey hk) override;
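// A minimal round-trip sketch against the three methods above (hypothetical
// key and payload; Status::Retry handling omitted):
//
//   HashedKey hk{"my-key"};
//   std::string payload{"my-value"};
//   BufferView view{payload.size(),
//                   reinterpret_cast<const uint8_t*>(payload.data())};
//   if (cache.insert(hk, view) == Status::Ok) {
//     Buffer out;
//     if (cache.lookup(hk, out) == Status::Ok) {
//       // out.view() now holds the bytes written above
//     }
//     cache.remove(hk);
//   }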
// Flushes all buffered (in flight) operations in BlockCache.
void flush() override;
// Resets BlockCache to its initial state.
void reset() override;
// Serializes BlockCache state to a RecordWriter.
//
// @param rw RecordWriter to serialize state
void persist(RecordWriter& rw) override;
// Deserializes BlockCache state from a RecordReader.
//
// @param rr RecordReader to deserialize state
//
// @return true if recovery succeeds, false otherwise.
bool recover(RecordReader& rr) override;
// Exports BlockCache stats via CounterVisitor.
//
// @param visitor CounterVisitor to export stats
void getCounters(const CounterVisitor& visitor) const override;
// Gets the maximum item size that can be inserted into BlockCache.
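// For example, with the default 16 MiB region and the 24-byte EntryDesc
// footer, the largest insertable item is 16777216 - 24 = 16777192 bytes.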
uint64_t getMaxItemSize() const override {
return regionSize_ - sizeof(EntryDesc);
}
// Gets the alloc alignment size (must be an integral power of two).
uint32_t getAllocAlignSize() const {
XDCHECK(folly::isPowTwo(allocAlignSize_));
return allocAlignSize_;
}
// The minimum alloc alignment size can be as small as 1. Since the test
// cases use very small device sizes, they would end up with an alloc
// alignment size of 1 (if computed as device_size >> 32) and we would not
// be able to test the "alloc alignment size" feature correctly. Real
// devices are not going to be that small, so this is a reasonable minimum.
// It can be lowered to 256 or 128 in the future, if needed.
static constexpr uint32_t kMinAllocAlignSize = 512;
// This is the maximum size of an item that can be cached in block cache.
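// That is, 512 * 65535 = 33553920 bytes (just under 32 MiB): the largest
// size representable by a 16-bit size hint at the minimum alloc alignment.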
static constexpr uint32_t kMaxItemSize =
kMinAllocAlignSize *
static_cast<uint32_t>(std::numeric_limits<uint16_t>::max());
private:
// Serialization format version. Never 0. Versions < 10 reserved for testing.
static constexpr uint32_t kFormatVersion = 12;
// This should be at least nextTwoPow(sizeof(EntryDesc)).
static constexpr uint32_t kDefReadBufferSize = 4096;
// Default priority for an item inserted into block cache
static constexpr uint16_t kDefaultItemPriority = 0;
// When modifying the @EntryDesc layout, don't forget to bump @kFormatVersion!
struct EntryDesc {
uint32_t keySize{};
uint32_t valueSize{};
uint64_t keyHash{};
uint32_t csSelf{};
uint32_t cs{};
EntryDesc() = default;
EntryDesc(uint32_t ks, uint32_t vs, uint64_t kh)
: keySize{ks}, valueSize{vs}, keyHash{kh} {
csSelf = computeChecksum();
}
uint32_t computeChecksum() const {
return checksum(BufferView{offsetof(EntryDesc, csSelf),
reinterpret_cast<const uint8_t*>(this)});
}
};
// Instead of unportable packing, we make sure that the struct size is equal
// to the sum of the sizes of its members.
static_assert(sizeof(EntryDesc) == 24, "packed struct required");
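// Note that csSelf covers only the fields declared before it, i.e. the first
// offsetof(EntryDesc, csSelf) == 16 bytes (keySize, valueSize, keyHash), so
// a corrupted header can be detected independently of the value checksum
// stored in cs.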
struct ValidConfigTag {};
BlockCache(Config&& config, ValidConfigTag);
// Entry disk size (with aux data and aligned)
uint32_t serializedSize(uint32_t keySize, uint32_t valueSize);
// Reads and writes are time consuming. It isn't worth inlining them from
// a performance point of view, but it makes sense to keep them out-of-line
// for profiling: especially the portion of CPU time spent in std::memcpy.
// @param addr Address to write this entry into
// @param slotSize Number of bytes this entry will take up on the device
// @param hk Key of the entry
// @param value Payload of the entry
Status writeEntry(RelAddress addr,
uint32_t slotSize,
HashedKey hk,
BufferView value);
// @param readDesc Descriptor for reading. This must be valid
// @param addrEnd End of the entry since the item layout is backward
// @param approxSize Approximate size since we got this size from index
// @param expected We expect the entry's key to match this key
// @param value We will write the payload into this buffer
Status readEntry(const RegionDescriptor& readDesc,
RelAddress addrEnd,
uint32_t approxSize,
HashedKey expected,
Buffer& value);
// Allocator reclaim callback
// Returns number of slots that were successfully evicted
uint32_t onRegionReclaim(RegionId rid, BufferView buffer);
// Allocator cleanup callback
void onRegionCleanup(RegionId rid, BufferView buffer);
// Returns true if @config matches this cache's config_
bool isValidRecoveryData(const serialization::BlockCacheConfig& config) const;
static serialization::BlockCacheConfig serializeConfig(const Config& config);
// Tries to recover cache. Throws std::exception on failure.
void tryRecover(RecordReader& rr);
// The alloc alignment indicates the granularity of reads and writes. This
// granularity is smaller than the device IO alignment size because we
// buffer writes in memory until we fill up a region.
uint32_t calcAllocAlignSize() const;
// The size hint is computed by aligning the size up to kMinAllocAlignSize
// and then dividing by it. This is a lossy compression of the size: we may
// decode it into a larger size later.
uint16_t encodeSizeHint(uint32_t size) const {
auto alignedSize = powTwoAlign(size, kMinAllocAlignSize);
return static_cast<uint16_t>(alignedSize / kMinAllocAlignSize);
}
uint32_t decodeSizeHint(uint16_t sizeHint) const {
return sizeHint * kMinAllocAlignSize;
}
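// For example, encodeSizeHint(1000) aligns 1000 up to 1024 and stores the
// hint 1024 / 512 == 2; decodeSizeHint(2) then yields 1024, a size no
// smaller than the original.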
uint32_t encodeRelAddress(RelAddress addr) const {
XDCHECK_NE(addr.offset(), 0u); // See @decodeRelAddress
return regionManager_.toAbsolute(addr).offset() / allocAlignSize_;
}
AbsAddress decodeAbsAddress(uint32_t code) const {
return AbsAddress{static_cast<uint64_t>(code) * allocAlignSize_};
}
RelAddress decodeRelAddress(uint32_t code) const {
// Remember, we store the slot's end address, whose offset can be equal to
// the region size.
return regionManager_.toRelative(decodeAbsAddress(code).sub(1)).add(1);
}
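// For example, with allocAlignSize_ == 512 and a 16 MiB region size, a slot
// ending exactly at the end of region 0 has absolute offset 16777216 and
// encodes as 16777216 / 512 == 32768. Decoding first maps byte 16777215 (the
// last byte of region 0) back to a relative address and then adds 1, giving
// {region 0, offset 16 MiB} rather than {region 1, offset 0}.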
enum class ReinsertionRes {
// Item was reinserted back into the cache
kReinserted,
// Item was removed by the user earlier
kRemoved,
// Item wasn't eligible for re-insertion and was evicted
kEvicted,
};
ReinsertionRes reinsertOrRemoveItem(HashedKey hk,
BufferView value,
uint32_t entrySize,
RelAddress currAddr);
// Removes an entry key from the index and the item size from size
// distribution.
// @return true if the item is successfully removed; false if the item cannot
// be found or was removed earlier.
bool removeItem(HashedKey hk, uint32_t entrySize, RelAddress currAddr);
void validate(Config& config) const;
// Creates the reinsertion policy from the config.
// This function may need a reference to the index, so it should be called
// last in the initialization order.
std::shared_ptr<BlockCacheReinsertionPolicy> makeReinsertionPolicy(
const BlockCacheReinsertionConfig& reinsertionConfig);
const serialization::BlockCacheConfig config_;
const uint16_t numPriorities_{};
const DestructorCallback destructorCb_;
const bool checksumData_{};
// Reference to the underlying device.
const Device& device_;
// Alloc alignment size indicates the granularity of entry sizes on the
// device. This is at least kMinAllocAlignSize and is determined by the size
// of the device and the size of the address (which is 32 bits).
const uint32_t allocAlignSize_{};
const uint32_t readBufferSize_{};
// number of bytes in a region
const uint64_t regionSize_{};
// whether ItemDestructor is enabled
const bool itemDestructorEnabled_{false};
// whether preciseRemove is enabled
const bool preciseRemove_{false};
// Index stores the offset of the slot *end*. This enables the efficient
// "buffer pointer is value pointer" paradigm: the value has to be at offset
// 0 of the slot and the header (really a footer) at the end.
//
// -------------------------------------------
// | Value | Footer |
// -------------------------------------------
// ^ ^
// | |
// Buffer* Index points here
Index index_;
RegionManager regionManager_;
Allocator allocator_;
// It is vital that the reinsertion policy is initialized after index_.
// Make sure that this class member is defined after index_.
std::shared_ptr<BlockCacheReinsertionPolicy> reinsertionPolicy_;
// Thread-local counters used on the synchronized/critical path
mutable TLCounter lookupCount_;
mutable TLCounter succLookupCount_;
// Atomic counters used on the asynchronous path
mutable AtomicCounter insertCount_;
mutable AtomicCounter insertHashCollisionCount_;
mutable AtomicCounter succInsertCount_;
mutable AtomicCounter lookupFalsePositiveCount_;
mutable AtomicCounter lookupEntryHeaderChecksumErrorCount_;
mutable AtomicCounter lookupValueChecksumErrorCount_;
mutable AtomicCounter removeCount_;
mutable AtomicCounter succRemoveCount_;
mutable AtomicCounter evictionLookupMissCounter_;
mutable AtomicCounter allocErrorCount_;
mutable AtomicCounter logicalWrittenCount_;
mutable AtomicCounter holeCount_;
mutable AtomicCounter holeSizeTotal_;
mutable AtomicCounter reinsertionErrorCount_;
mutable AtomicCounter reinsertionCount_;
mutable AtomicCounter reinsertionBytes_;
mutable AtomicCounter reclaimEntryHeaderChecksumErrorCount_;
mutable AtomicCounter reclaimValueChecksumErrorCount_;
mutable AtomicCounter removeAttemptCollisions_;
mutable AtomicCounter cleanupEntryHeaderChecksumErrorCount_;
mutable AtomicCounter cleanupValueChecksumErrorCount_;
mutable SizeDistribution sizeDist_;
mutable AtomicCounter lookupForItemDestructorErrorCount_;
};
} // namespace navy
} // namespace cachelib
} // namespace facebook