cachelib/allocator/CacheStats.h (299 lines of code) (raw):
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <algorithm>
#include <numeric>
#include "cachelib/allocator/Util.h"
#include "cachelib/allocator/memory/MemoryAllocator.h"
#include "cachelib/allocator/memory/MemoryAllocatorStats.h"
#include "cachelib/allocator/memory/Slab.h"
#include "cachelib/common/FastStats.h"
#include "cachelib/common/PercentileStats.h"
#include "cachelib/common/Time.h"
namespace facebook {
namespace cachelib {
// stats class for a single eviction queue
struct EvictionStatPerType {
// the age of the oldest element in seconds
uint64_t oldestElementAge = 0ULL;
// number of elements in the eviction queue
uint64_t size = 0ULL;
};
// stats class for one MM container (a.k.a one allocation class) related to
// evictions
struct EvictionAgeStat {
EvictionStatPerType warmQueueStat;
EvictionStatPerType hotQueueStat;
EvictionStatPerType coldQueueStat;
// this is the estimated age after removing a slab worth of elements
uint64_t projectedAge;
};
// stats related to evictions for a pool
struct PoolEvictionAgeStats {
// Map from allocation class id to the eviction age stats
std::unordered_map<ClassId, EvictionAgeStat> classEvictionAgeStats;
uint64_t getOldestElementAge(ClassId cid) const {
return classEvictionAgeStats.at(cid).warmQueueStat.oldestElementAge;
}
const EvictionStatPerType& getWarmEvictionStat(ClassId cid) const {
return classEvictionAgeStats.at(cid).warmQueueStat;
}
const EvictionStatPerType& getHotEvictionStat(ClassId cid) const {
return classEvictionAgeStats.at(cid).hotQueueStat;
}
const EvictionStatPerType& getColdEvictionStat(ClassId cid) const {
return classEvictionAgeStats.at(cid).coldQueueStat;
}
uint64_t getProjectedAge(ClassId cid) const {
return classEvictionAgeStats.at(cid).projectedAge;
}
};
// Stats for MM container
struct MMContainerStat {
// number of elements in the container.
size_t size;
// what is the unix timestamp in seconds of the oldest element existing in
// the container.
uint64_t oldestTimeSec;
// refresh time for LRU
uint64_t lruRefreshTime;
// TODO: Make the MMContainerStat generic by moving the Lru/2Q specific
// stats inside MMType and exporting them through a generic stats interface.
// number of hits in each lru.
uint64_t numHotAccesses;
uint64_t numColdAccesses;
uint64_t numWarmAccesses;
uint64_t numTailAccesses;
};
// cache related stats for a given allocation class.
struct CacheStat {
// allocation size for this container.
uint32_t allocSize;
// number of attempts to allocate memory
uint64_t allocAttempts{0};
// number of failed attempts
uint64_t allocFailures{0};
// total fragmented memory size in bytes
uint64_t fragmentationSize{0};
// number of hits for this container.
uint64_t numHits;
// number of evictions from this class id that was of a chained item
uint64_t chainedItemEvictions;
// number of regular items that were evicted from this classId
uint64_t regularItemEvictions;
// the stats from the mm container
MMContainerStat containerStat;
uint64_t numItems() const noexcept { return numEvictableItems(); }
// number of elements in this MMContainer
size_t numEvictableItems() const noexcept { return containerStat.size; }
// total number of evictions.
uint64_t numEvictions() const noexcept {
return chainedItemEvictions + regularItemEvictions;
}
// the current oldest item in the container in seconds.
uint64_t getEvictionAge() const noexcept {
return containerStat.oldestTimeSec != 0
? util::getCurrentTimeSec() - containerStat.oldestTimeSec
: 0;
}
};
// Stats for a pool
struct PoolStats {
// pool name given by users of this pool.
std::string poolName;
// true if the pool is a compact cache pool.
bool isCompactCache;
// total pool size assigned by users when adding pool.
uint64_t poolSize;
// total size of the pool that is actively usable, taking advising into
// account
uint64_t poolUsableSize;
// total size of the pool that is set to be advised away.
uint64_t poolAdvisedSize;
// container stats that provide evictions etc.
std::unordered_map<ClassId, CacheStat> cacheStats;
// stats from the memory allocator perspective. this is a map of MPStat
// for each allocation class that this pool has.
MPStats mpStats;
// number of get hits for this pool.
uint64_t numPoolGetHits;
// estimates for eviction age for items in this pool
util::PercentileStats::Estimates evictionAgeSecs{};
const std::set<ClassId>& getClassIds() const noexcept {
return mpStats.classIds;
}
// number of attempts to allocate
uint64_t numAllocAttempts() const;
// number of attempts that failed
uint64_t numAllocFailures() const;
// toal memory fragmentation size of this pool.
uint64_t totalFragmentation() const;
// total number of free allocs for this pool
uint64_t numFreeAllocs() const noexcept;
// amount of cache memory that is not allocated.
size_t freeMemoryBytes() const noexcept;
// number of evictions for this pool
uint64_t numEvictions() const noexcept;
// number of all items in this pool
uint64_t numItems() const noexcept;
// number of evictable items
uint64_t numEvictableItems() const noexcept;
// total number of allocations currently in this pool
uint64_t numActiveAllocs() const noexcept;
// number of hits for an alloc class in this pool
uint64_t numHitsForClass(ClassId cid) const {
return cacheStats.at(cid).numHits;
}
// number of slabs in this class id
uint64_t numSlabsForClass(ClassId cid) const {
return mpStats.acStats.at(cid).totalSlabs();
}
// alloc size corresponding to the class id
uint32_t allocSizeForClass(ClassId cid) const {
return cacheStats.at(cid).allocSize;
}
// mm container eviction age for the class
uint64_t evictionAgeForClass(ClassId cid) const {
return cacheStats.at(cid).getEvictionAge();
}
// total free allocs for the class
uint64_t numFreeAllocsForClass(ClassId cid) const {
return mpStats.acStats.at(cid).freeAllocs;
}
// This is the real eviction age of this pool as this number
// guarantees the time any item inserted into this pool will live
// ignores the classIds that are not used.
uint64_t minEvictionAge() const;
// computes the maximum eviction age across all class Ids
uint64_t maxEvictionAge() const;
// aggregate this pool stats with another that is compatible. To be
// compatible, they need to have the same number of classIds
//
// throws when the operation is not compatible.
PoolStats& operator+=(const PoolStats& other);
};
// Stats for slab release events
struct SlabReleaseStats {
uint64_t numActiveSlabReleases;
uint64_t numSlabReleaseForRebalance;
uint64_t numSlabReleaseForResize;
uint64_t numSlabReleaseForAdvise;
uint64_t numSlabReleaseForRebalanceAttempts;
uint64_t numSlabReleaseForResizeAttempts;
uint64_t numSlabReleaseForAdviseAttempts;
uint64_t numMoveAttempts;
uint64_t numMoveSuccesses;
uint64_t numEvictionAttempts;
uint64_t numEvictionSuccesses;
};
// Stats for reaper
struct ReaperStats {
// the total number of items the reaper has visited.
uint64_t numVisitedItems{0};
// the number of items reaped.
uint64_t numReapedItems{0};
uint64_t numVisitErrs{0};
// number of times we went through the whole cache
uint64_t numTraversals{0};
// indicates the time in ms for the last iteration across the entire cache
uint64_t lastTraversalTimeMs{0};
// indicates the maximum of all traversals
uint64_t minTraversalTimeMs{0};
// indicates the minimum of all traversals
uint64_t maxTraversalTimeMs{0};
// indicates the average of all traversals
uint64_t avgTraversalTimeMs{0};
};
// CacheMetadata type to export
struct CacheMetadata {
// allocator_version
int allocatorVersion;
// ram_format_version
int ramFormatVersion;
// nvm_format_version
int nvmFormatVersion;
// cache_total_size
size_t cacheSize;
};
// forward declaration
namespace detail {
struct Stats;
}
// Stats that apply globally in cache and
// the ones that are aggregated over all pools
struct GlobalCacheStats {
// number of calls to CacheAllocator::find
uint64_t numCacheGets{0};
// number of such calls being a miss in the cache.
uint64_t numCacheGetMiss{0};
// number of such calls being an expiry in the cache. This is also included
// in the numCacheGetMiss stats above.
uint64_t numCacheGetExpiries{0};
// number of remove calls to CacheAllocator::remove that requires
// a lookup first and then remove the item
uint64_t numCacheRemoves{0};
// number of remove calls that resulted in a ram hit
uint64_t numCacheRemoveRamHits{0};
// number of item destructor calls from ram
uint64_t numRamDestructorCalls{0};
// number of nvm gets
uint64_t numNvmGets{0};
// number of nvm misses
uint64_t numNvmGetMiss{0};
// number of nvm isses due to internal errors
uint64_t numNvmGetMissErrs{0};
// number of nvm misses due to inflight remove on the same key
uint64_t numNvmGetMissDueToInflightRemove{0};
// number of nvm misses that happened synchronously
uint64_t numNvmGetMissFast{0};
// number of nvm gets that are expired
uint64_t numNvmGetMissExpired{0};
// number of gets that joined a concurrent fill for same item
uint64_t numNvmGetCoalesced{0};
// number of deletes issues to nvm
uint64_t numNvmDeletes{0};
// number of deletes skipped and not issued to nvm
uint64_t numNvmSkippedDeletes{0};
// number of writes to nvm
uint64_t numNvmPuts{0};
// number of put errors;
uint64_t numNvmPutErrs{0};
// number of put failures due to encode call back
uint64_t numNvmPutEncodeFailure{0};
// number of puts that observed an inflight delete and aborted
uint64_t numNvmAbortedPutOnTombstone{0};
// number of items that are filtered by compaction
uint64_t numNvmCompactionFiltered{0};
// number of puts that observed an inflight get and aborted
uint64_t numNvmAbortedPutOnInflightGet{0};
// number of evictions from NvmCache
uint64_t numNvmEvictions{0};
// number of evictions from nvm that found an inconsistent state in RAM
uint64_t numNvmUncleanEvict{0};
// number of evictions that were issued for an item that was in RAM in clean
// state
uint64_t numNvmCleanEvict{0};
// number of evictions that were issued more than once on an unclean item.
uint64_t numNvmCleanDoubleEvict{0};
// number of evictions that were already expired
uint64_t numNvmExpiredEvict{0};
// number of item destructor calls from nvm
uint64_t numNvmDestructorCalls{0};
// number of RefcountOverflow happens causing item destructor
// being skipped in nvm
uint64_t numNvmDestructorRefcountOverflow{0};
// number of puts to nvm of a clean item in RAM due to nvm eviction.
uint64_t numNvmPutFromClean{0};
// attempts made from nvm cache to allocate an item for promotion
uint64_t numNvmAllocAttempts{0};
// attempts made from nvm cache to allocate an item for its destructor
uint64_t numNvmAllocForItemDestructor{0};
// heap allocate errors for item destrutor
uint64_t numNvmItemDestructorAllocErrors{0};
// size of itemRemoved_ hash set in nvm
uint64_t numNvmItemRemovedSetSize{0};
// number of attempts to allocate an item
uint64_t allocAttempts{0};
// number of failures to allocate an item due to internal error
uint64_t allocFailures{0};
// number of evictions across all the pools in the cache.
uint64_t numEvictions{0};
// number of allocation attempts with invalid input params.
uint64_t invalidAllocs{0};
// total number of items
uint64_t numItems{0};
// number of refcount overflows
uint64_t numRefcountOverflow{0};
// number of exception occurred inside item destructor
uint64_t numDestructorExceptions{0};
// number of allocated and CHAINED items that are parents (i.e.,
// consisting of at least one chained child)
uint64_t numChainedChildItems{0};
// number of allocated and CHAINED items that are children (i.e.,
// allocated with a parent handle that it's chained to)
uint64_t numChainedParentItems{0};
// number of eviction failures
uint64_t numEvictionFailureFromAccessContainer{0};
uint64_t numEvictionFailureFromConcurrentFill{0};
uint64_t numEvictionFailureFromParentAccessContainer{0};
uint64_t numEvictionFailureFromMoving{0};
uint64_t numEvictionFailureFromParentMoving{0};
// latency and percentile stats of various cachelib operations
util::PercentileStats::Estimates allocateLatencyNs{};
util::PercentileStats::Estimates moveChainedLatencyNs{};
util::PercentileStats::Estimates moveRegularLatencyNs{};
util::PercentileStats::Estimates nvmLookupLatencyNs{};
util::PercentileStats::Estimates nvmInsertLatencyNs{};
util::PercentileStats::Estimates nvmRemoveLatencyNs{};
util::PercentileStats::Estimates ramEvictionAgeSecs{};
util::PercentileStats::Estimates ramItemLifeTimeSecs{};
util::PercentileStats::Estimates nvmSmallLifetimeSecs{};
util::PercentileStats::Estimates nvmLargeLifetimeSecs{};
util::PercentileStats::Estimates nvmEvictionSecondsPastExpiry{};
util::PercentileStats::Estimates nvmEvictionSecondsToExpiry{};
util::PercentileStats::Estimates nvmPutSize{};
// time since the ram cache was created in seconds
uint64_t ramUpTime{0};
// time since the nvm cache was created in seconds
uint64_t nvmUpTime{0};
// if nvmcache is currently active and serving gets
bool nvmCacheEnabled;
// stats related to the reaper
ReaperStats reaperStats;
uint64_t numNvmRejectsByExpiry{};
uint64_t numNvmRejectsByClean{};
uint64_t numNvmRejectsByAP{};
// Decryption and Encryption errors
uint64_t numNvmEncryptionErrors{0};
uint64_t numNvmDecryptionErrors{0};
// Number of times slab release was aborted due to shutdown
uint64_t numAbortedSlabReleases{0};
// current active handles outstanding. This stat should
// not go to negative. If it's negative, it means we have
// leaked handles (or some sort of accounting bug internally)
int64_t numActiveHandles;
};
struct CacheMemoryStats {
// current memory used for cache in bytes. This excludes the memory used for
// headers. This can change as memory is advised and reclaimed.
size_t cacheSize{0};
// regular pool memory size in bytes
size_t regularCacheSize{0};
// compact cache pool memory size in bytes
size_t compactCacheSize{0};
// current advised away memory size in bytes.
size_t advisedSize{0};
// maximum advised pct of regular cache.
size_t maxAdvisedPct{0};
// amount of memory that is not assigned for any pool in bytes
size_t unReservedSize{0};
// size of the nvm cache in addition to the ram cache.
size_t nvmCacheSize{0};
// returns the advised memory in the unit of slabs.
size_t numAdvisedSlabs() const { return advisedSize / Slab::kSize; }
// returne usable portion of the cache size
size_t usableCacheSize() const { return cacheSize - advisedSize; }
// amount of memory available on the host
size_t memAvailableSize{0};
// rss size of the process
size_t memRssSize{0};
};
// Stats for compact cache
struct CCacheStats {
uint64_t get;
uint64_t getHit;
uint64_t getMiss;
uint64_t getErr;
uint64_t tailHits;
uint64_t set;
uint64_t setHit;
uint64_t setMiss;
uint64_t setErr;
uint64_t evictions;
uint64_t del;
uint64_t delHit;
uint64_t delMiss;
uint64_t delErr;
uint64_t purgeSuccess;
uint64_t purgeErr;
uint64_t lockTimeout;
uint64_t promoteTimeout;
double hitRatio() const;
CCacheStats& operator+=(const CCacheStats& other) {
get += other.get;
getHit += other.getHit;
getMiss += other.getMiss;
getErr += other.getErr;
tailHits += other.tailHits;
set += other.set;
setHit += other.setHit;
setMiss += other.setMiss;
setErr += other.setErr;
evictions += other.evictions;
del += other.del;
delHit += other.delHit;
delMiss += other.delMiss;
delErr += other.delErr;
purgeSuccess += other.purgeSuccess;
purgeErr += other.purgeErr;
lockTimeout += other.lockTimeout;
promoteTimeout += other.promoteTimeout;
return *this;
}
};
// Types of background workers
enum PoolWorkerType {
POOL_REBALANCER = 0,
POOL_RESIZER,
MEMORY_MONITOR,
MAX_POOL_WORKER
};
/* Slab release event data */
struct SlabReleaseData {
// Time when release occured.
std::chrono::system_clock::time_point timeOfRelease;
// The class where the slab was released from.
ClassId from;
// The receiver of the released slab.
ClassId to;
// The sequence of this event, with respect to other release events logged by
// this process.
uint64_t sequenceNum;
// Time release took.
uint64_t durationMs;
// PoolId of the pool where the rebalance occurred.
PoolId pid;
// Number of slabs in the victim class after rebalancing.
unsigned int numSlabsInVictim;
// Number of slabs in the receiver class after rebalancing.
unsigned int numSlabsInReceiver;
// Allocation size of the victim class.
uint32_t victimAllocSize;
// Allocation size of the receiver class.
uint32_t receiverAllocSize;
// Eviction age of the victim class.
uint64_t victimEvictionAge;
// Eviction age of the receiver class.
uint64_t receiverEvictionAge;
// Number of free allocs in the victim class
uint64_t numFreeAllocsInVictim;
};
using SlabReleaseEvents = std::vector<SlabReleaseData>;
// Slab release events organized by their type
struct AllSlabReleaseEvents {
SlabReleaseEvents rebalancerEvents;
SlabReleaseEvents resizerEvents;
SlabReleaseEvents monitorEvents;
};
} // namespace cachelib
} // namespace facebook