cachelib/allocator/CacheAllocator-inl.h (2,576 lines of code)

/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

namespace facebook {
namespace cachelib {

template <typename CacheTrait>
CacheAllocator<CacheTrait>::CacheAllocator(Config config)
    : isOnShm_{config.memMonitoringEnabled()},
      config_(config.validate()),
      tempShm_(isOnShm_ ? std::make_unique<TempShmMapping>(config_.size)
                        : nullptr),
      allocator_(isOnShm_ ? std::make_unique<MemoryAllocator>(
                                getAllocatorConfig(config_),
                                tempShm_->getAddr(),
                                config_.size)
                          : std::make_unique<MemoryAllocator>(
                                getAllocatorConfig(config_), config_.size)),
      compactCacheManager_(std::make_unique<CCacheManager>(*allocator_)),
      compressor_(createPtrCompressor()),
      accessContainer_(std::make_unique<AccessContainer>(
          config_.accessConfig,
          compressor_,
          [this](Item* it) -> ItemHandle { return acquire(it); })),
      chainedItemAccessContainer_(std::make_unique<AccessContainer>(
          config_.chainedItemAccessConfig,
          compressor_,
          [this](Item* it) -> ItemHandle { return acquire(it); })),
      chainedItemLocks_(config_.chainedItemsLockPower,
                        std::make_shared<MurmurHash2>()),
      cacheCreationTime_{util::getCurrentTimeSec()},
      nvmCacheState_{config_.cacheDir, config_.isNvmCacheEncryptionEnabled(),
                     config_.isNvmCacheTruncateAllocSizeEnabled()} {
  initCommon(false);
}

template <typename CacheTrait>
CacheAllocator<CacheTrait>::CacheAllocator(SharedMemNewT, Config config)
    : isOnShm_{true},
      config_(config.validate()),
      shmManager_(
          std::make_unique<ShmManager>(config_.cacheDir, config_.usePosixShm)),
      allocator_(createNewMemoryAllocator()),
      compactCacheManager_(std::make_unique<CCacheManager>(*allocator_)),
      compressor_(createPtrCompressor()),
      accessContainer_(std::make_unique<AccessContainer>(
          config_.accessConfig,
          shmManager_
              ->createShm(detail::kShmHashTableName,
                          AccessContainer::getRequiredSize(
                              config_.accessConfig.getNumBuckets()),
                          nullptr,
                          ShmSegmentOpts(config_.accessConfig.getPageSize()))
              .addr,
          compressor_,
          [this](Item* it) -> ItemHandle { return acquire(it); })),
      chainedItemAccessContainer_(std::make_unique<AccessContainer>(
          config_.chainedItemAccessConfig,
          shmManager_
              ->createShm(detail::kShmChainedItemHashTableName,
                          AccessContainer::getRequiredSize(
                              config_.chainedItemAccessConfig.getNumBuckets()),
                          nullptr,
                          ShmSegmentOpts(config_.accessConfig.getPageSize()))
              .addr,
          compressor_,
          [this](Item* it) -> ItemHandle { return acquire(it); })),
      chainedItemLocks_(config_.chainedItemsLockPower,
                        std::make_shared<MurmurHash2>()),
      cacheCreationTime_{util::getCurrentTimeSec()},
      nvmCacheState_{config_.cacheDir, config_.isNvmCacheEncryptionEnabled(),
                     config_.isNvmCacheTruncateAllocSizeEnabled()} {
  initCommon(false);
  shmManager_->removeShm(detail::kShmInfoName);
}
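// Illustrative usage (not part of this file): the three constructors above
// map to anonymous-memory, fresh-shared-memory, and attach-to-existing
// deployments. A minimal sketch; `LruAllocator` (a common alias for
// CacheAllocator<LruCacheTrait>) and the config values are assumptions for
// the sketch, not requirements.
//
//   #include "cachelib/allocator/CacheAllocator.h"
//   using Cache = facebook::cachelib::LruAllocator;
//
//   Cache::Config cfg;
//   cfg.setCacheSize(1024 * 1024 * 1024) // 1 GB
//       .setCacheName("example-cache")
//       .setAccessConfig({25 /* bucketsPower */, 10 /* locksPower */});
//
//   Cache onHeap{cfg};                                // first constructor
//   // Cache fresh{Cache::SharedMemNew, cfg};         // new shm segments
//   // Cache attached{Cache::SharedMemAttach, cfg};   // re-attach on restart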
template <typename CacheTrait>
CacheAllocator<CacheTrait>::CacheAllocator(SharedMemAttachT, Config config)
    : isOnShm_{true},
      config_(config.validate()),
      shmManager_(
          std::make_unique<ShmManager>(config_.cacheDir, config_.usePosixShm)),
      deserializer_(createDeserializer()),
      metadata_{deserializeCacheAllocatorMetadata(*deserializer_)},
      allocator_(restoreMemoryAllocator()),
      compactCacheManager_(restoreCCacheManager()),
      compressor_(createPtrCompressor()),
      mmContainers_(deserializeMMContainers(*deserializer_, compressor_)),
      accessContainer_(std::make_unique<AccessContainer>(
          deserializer_->deserialize<AccessSerializationType>(),
          config_.accessConfig,
          shmManager_->attachShm(detail::kShmHashTableName),
          compressor_,
          [this](Item* it) -> ItemHandle { return acquire(it); })),
      chainedItemAccessContainer_(std::make_unique<AccessContainer>(
          deserializer_->deserialize<AccessSerializationType>(),
          config_.chainedItemAccessConfig,
          shmManager_->attachShm(detail::kShmChainedItemHashTableName),
          compressor_,
          [this](Item* it) -> ItemHandle { return acquire(it); })),
      chainedItemLocks_(config_.chainedItemsLockPower,
                        std::make_shared<MurmurHash2>()),
      cacheCreationTime_{*metadata_.cacheCreationTime_ref()},
      nvmCacheState_{config_.cacheDir, config_.isNvmCacheEncryptionEnabled(),
                     config_.isNvmCacheTruncateAllocSizeEnabled()} {
  for (auto pid : *metadata_.compactCachePools_ref()) {
    isCompactCachePool_[pid] = true;
  }

  initCommon(true);

  // We will create a new info shm segment on shutDown(). If we don't remove
  // this info shm segment here and the new info shm segment's size is larger
  // than this one, creating the new one will fail.
  shmManager_->removeShm(detail::kShmInfoName);
}

template <typename CacheTrait>
CacheAllocator<CacheTrait>::~CacheAllocator() {
  XLOG(DBG, "destructing CacheAllocator");
  // Stop all workers. In case the user didn't call shutDown, we want to
  // terminate all background workers and nvmCache before member variables
  // go out of scope.
  stopWorkers();
  nvmCache_.reset();
}

template <typename CacheTrait>
std::unique_ptr<MemoryAllocator>
CacheAllocator<CacheTrait>::createNewMemoryAllocator() {
  ShmSegmentOpts opts;
  opts.alignment = sizeof(Slab);
  return std::make_unique<MemoryAllocator>(
      getAllocatorConfig(config_),
      shmManager_
          ->createShm(detail::kShmCacheName, config_.size,
                      config_.slabMemoryBaseAddr, opts)
          .addr,
      config_.size);
}

template <typename CacheTrait>
std::unique_ptr<MemoryAllocator>
CacheAllocator<CacheTrait>::restoreMemoryAllocator() {
  ShmSegmentOpts opts;
  opts.alignment = sizeof(Slab);
  return std::make_unique<MemoryAllocator>(
      deserializer_->deserialize<MemoryAllocator::SerializationType>(),
      shmManager_
          ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, opts)
          .addr,
      config_.size,
      config_.disableFullCoredump);
}

template <typename CacheTrait>
std::unique_ptr<CCacheManager>
CacheAllocator<CacheTrait>::restoreCCacheManager() {
  return std::make_unique<CCacheManager>(
      deserializer_->deserialize<CCacheManager::SerializationType>(),
      *allocator_);
}

template <typename CacheTrait>
void CacheAllocator<CacheTrait>::initCommon(bool dramCacheAttached) {
  if (config_.nvmConfig.has_value()) {
    if (config_.nvmCacheAP) {
      nvmAdmissionPolicy_ = config_.nvmCacheAP;
    } else if (config_.rejectFirstAPNumEntries) {
      nvmAdmissionPolicy_ = std::make_shared<RejectFirstAP<CacheT>>(
          config_.rejectFirstAPNumEntries, config_.rejectFirstAPNumSplits,
          config_.rejectFirstSuffixIgnoreLength,
          config_.rejectFirstUseDramHitSignal);
    }
    if (config_.nvmAdmissionMinTTL > 0) {
      if (!nvmAdmissionPolicy_) {
        nvmAdmissionPolicy_ = std::make_shared<NvmAdmissionPolicy<CacheT>>();
      }
      nvmAdmissionPolicy_->initMinTTL(config_.nvmAdmissionMinTTL);
    }
  }
  initStats();
  initNvmCache(dramCacheAttached);
  initWorkers();
}
template <typename CacheTrait>
void CacheAllocator<CacheTrait>::initNvmCache(bool dramCacheAttached) {
  if (!config_.nvmConfig.has_value()) {
    return;
  }

  // for some use cases that create pools, restoring nvmcache when the dram
  // cache is not persisted is not supported.
  const bool shouldDrop = config_.dropNvmCacheOnShmNew && !dramCacheAttached;

  // if we are dealing with persistency, the cache directory should be enabled
  const bool truncate = config_.cacheDir.empty() ||
                        nvmCacheState_.shouldStartFresh() || shouldDrop;
  if (truncate) {
    nvmCacheState_.markTruncated();
  }

  nvmCache_ = std::make_unique<NvmCacheT>(*this, *config_.nvmConfig, truncate,
                                          config_.itemDestructor);
  if (!config_.cacheDir.empty()) {
    nvmCacheState_.clearPrevState();
  }
}

template <typename CacheTrait>
void CacheAllocator<CacheTrait>::initWorkers() {
  if (config_.poolResizingEnabled()) {
    startNewPoolResizer(config_.poolResizeInterval,
                        config_.poolResizeSlabsPerIter,
                        config_.poolResizeStrategy);
  }

  if (config_.poolRebalancingEnabled()) {
    startNewPoolRebalancer(config_.poolRebalanceInterval,
                           config_.defaultPoolRebalanceStrategy,
                           config_.poolRebalancerFreeAllocThreshold);
  }

  if (config_.memMonitoringEnabled()) {
    if (!isOnShm_) {
      throw std::invalid_argument(
          "Memory monitoring is not supported for cache on heap. It is "
          "supported for cache on a shared memory segment only.");
    }
    startNewMemMonitor(config_.memMonitorInterval,
                       config_.memMonitorConfig,
                       config_.poolAdviseStrategy);
  }

  if (config_.itemsReaperEnabled()) {
    startNewReaper(config_.reaperInterval, config_.reaperConfig);
  }

  if (config_.poolOptimizerEnabled()) {
    startNewPoolOptimizer(config_.regularPoolOptimizeInterval,
                          config_.compactCacheOptimizeInterval,
                          config_.poolOptimizeStrategy,
                          config_.ccacheOptimizeStepSizePercent);
  }
}

template <typename CacheTrait>
std::unique_ptr<Deserializer> CacheAllocator<CacheTrait>::createDeserializer() {
  auto infoAddr = shmManager_->attachShm(detail::kShmInfoName);
  return std::make_unique<Deserializer>(
      reinterpret_cast<uint8_t*>(infoAddr.addr),
      reinterpret_cast<uint8_t*>(infoAddr.addr) + infoAddr.size);
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
CacheAllocator<CacheTrait>::allocate(PoolId poolId,
                                     typename Item::Key key,
                                     uint32_t size,
                                     uint32_t ttlSecs,
                                     uint32_t creationTime) {
  if (creationTime == 0) {
    creationTime = util::getCurrentTimeSec();
  }
  return allocateInternal(poolId, key, size, creationTime,
                          ttlSecs == 0 ? 0 : creationTime + ttlSecs);
}
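// Illustrative usage (not part of this file): allocate() converts a relative
// TTL into an absolute expiry time (creationTime + ttlSecs), with 0 meaning
// "no expiry". A minimal sketch, assuming `cache` is a configured allocator
// and `pool` was returned by addPool():
//
//   std::string value = "hello";
//   auto handle = cache.allocate(pool, "my-key", value.size(),
//                                600 /* ttlSecs: expires in 10 minutes */);
//   if (handle) {
//     std::memcpy(handle->getMemory(), value.data(), value.size());
//   }
//   // handle may be empty if both allocation and eviction failed.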
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
                                             typename Item::Key key,
                                             uint32_t size,
                                             uint32_t creationTime,
                                             uint32_t expiryTime) {
  util::LatencyTracker tracker{stats().allocateLatency_};

  SCOPE_FAIL { stats_.invalidAllocs.inc(); };

  // number of bytes required for this item
  const auto requiredSize = Item::getRequiredSize(key, size);

  // the allocation class in our memory allocator.
  const auto cid = allocator_->getAllocationClassId(pid, requiredSize);

  (*stats_.allocAttempts)[pid][cid].inc();

  void* memory = allocator_->allocate(pid, requiredSize);
  if (memory == nullptr && !config_.isEvictionDisabled()) {
    memory = findEviction(pid, cid);
  }

  WriteHandle handle;
  if (memory != nullptr) {
    // At this point, we have a valid memory allocation that is ready for use.
    // Ensure that when we abort from here under any circumstances, we free up
    // the memory. Item's handle could throw because the key size was invalid,
    // for example.
    SCOPE_FAIL {
      // free back the memory to the allocator since we failed.
      allocator_->free(memory);
    };

    handle = acquire(new (memory) Item(key, size, creationTime, expiryTime));
    if (handle) {
      handle.markNascent();
      (*stats_.fragmentationSize)[pid][cid].add(
          util::getFragmentation(*this, *handle));
    }
  } else { // failed to allocate memory.
    (*stats_.allocFailures)[pid][cid].inc();
    // wake up rebalancer
    if (poolRebalancer_) {
      poolRebalancer_->wakeUp();
    }
  }

  if (auto eventTracker = getEventTracker()) {
    const auto result =
        handle ? AllocatorApiResult::ALLOCATED : AllocatorApiResult::FAILED;
    eventTracker->record(AllocatorApiEvent::ALLOCATE, key, result, size,
                         expiryTime ? expiryTime - creationTime : 0);
  }

  return handle;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
CacheAllocator<CacheTrait>::allocateChainedItem(const ReadHandle& parent,
                                                uint32_t size) {
  if (!parent) {
    throw std::invalid_argument(
        "Cannot call allocate chained item with an empty parent handle!");
  }

  auto it = allocateChainedItemInternal(parent, size);
  if (auto eventTracker = getEventTracker()) {
    const auto result =
        it ? AllocatorApiResult::ALLOCATED : AllocatorApiResult::FAILED;
    eventTracker->record(AllocatorApiEvent::ALLOCATE_CHAINED, parent->getKey(),
                         result, size, parent->getConfiguredTTL().count());
  }
  return it;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
CacheAllocator<CacheTrait>::allocateChainedItemInternal(
    const ReadHandle& parent, uint32_t size) {
  util::LatencyTracker tracker{stats().allocateLatency_};

  SCOPE_FAIL { stats_.invalidAllocs.inc(); };

  // number of bytes required for this item
  const auto requiredSize = ChainedItem::getRequiredSize(size);

  const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId;
  const auto cid = allocator_->getAllocationClassId(pid, requiredSize);

  (*stats_.allocAttempts)[pid][cid].inc();

  void* memory = allocator_->allocate(pid, requiredSize);
  if (memory == nullptr) {
    memory = findEviction(pid, cid);
  }
  if (memory == nullptr) {
    (*stats_.allocFailures)[pid][cid].inc();
    return ItemHandle{};
  }

  SCOPE_FAIL { allocator_->free(memory); };

  auto child = acquire(
      new (memory) ChainedItem(compressor_.compress(parent.getInternal()),
                               size, util::getCurrentTimeSec()));

  if (child) {
    child.markNascent();
    (*stats_.fragmentationSize)[pid][cid].add(
        util::getFragmentation(*this, *child));
  }

  return child;
}
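// Illustrative usage (not part of this file): chained items extend a parent
// allocation with additional buffers. A minimal sketch, reusing the `cache`
// and `pool` assumptions from the earlier examples:
//
//   auto parent = cache.allocate(pool, "chained-key", 100);
//   if (parent) {
//     cache.insertOrReplace(parent);
//     auto child = cache.allocateChainedItem(parent, 200);
//     if (child) {
//       // fill child->getMemory() ...
//       cache.addChainedItem(parent, std::move(child));
//     }
//   }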
template <typename CacheTrait>
void CacheAllocator<CacheTrait>::addChainedItem(ItemHandle& parent,
                                                ItemHandle child) {
  if (!parent || !child || !child->isChainedItem()) {
    throw std::invalid_argument(
        folly::sformat("Invalid parent or child. parent: {}, child: {}",
                       parent ? parent->toString() : "nullptr",
                       child ? child->toString() : "nullptr"));
  }

  auto l = chainedItemLocks_.lockExclusive(parent->getKey());

  // Insert into secondary lookup table for chained allocation
  auto oldHead = chainedItemAccessContainer_->insertOrReplace(*child);
  if (oldHead) {
    child->asChainedItem().appendChain(oldHead->asChainedItem(), compressor_);
  }

  // Count an item that just became a new parent
  if (!parent->hasChainedItem()) {
    stats_.numChainedParentItems.inc();
  }
  // Parent needs to be marked before inserting child into MM container
  // so the parent-child relationship is established before an eviction
  // can be triggered from the child
  parent->markHasChainedItem();
  // Count a new child
  stats_.numChainedChildItems.inc();

  insertInMMContainer(*child);

  // Increment refcount since this chained item is now owned by the parent.
  // Parent will decrement the refcount upon release. Since this is an
  // internal refcount, we don't include it in active handle tracking.
  child->incRef();
  XDCHECK_EQ(2u, child->getRefCount());

  invalidateNvm(*parent);
  if (auto eventTracker = getEventTracker()) {
    eventTracker->record(AllocatorApiEvent::ADD_CHAINED, parent->getKey(),
                         AllocatorApiResult::INSERTED, child->getSize(),
                         child->getConfiguredTTL().count());
  }
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
CacheAllocator<CacheTrait>::popChainedItem(ItemHandle& parent) {
  if (!parent || !parent->hasChainedItem()) {
    throw std::invalid_argument(folly::sformat(
        "Invalid parent {}", parent ? parent->toString() : "nullptr"));
  }

  ItemHandle head;
  { // scope of chained item lock.
    auto l = chainedItemLocks_.lockExclusive(parent->getKey());

    head = findChainedItem(*parent);
    if (head->asChainedItem().getNext(compressor_) != nullptr) {
      chainedItemAccessContainer_->insertOrReplace(
          *head->asChainedItem().getNext(compressor_));
    } else {
      chainedItemAccessContainer_->remove(*head);
      parent->unmarkHasChainedItem();
      stats_.numChainedParentItems.dec();
    }
    head->asChainedItem().setNext(nullptr, compressor_);

    invalidateNvm(*parent);
  }
  const auto res = removeFromMMContainer(*head);
  XDCHECK(res == true);

  // decrement the refcount to indicate this item is unlinked from its parent
  head->decRef();
  stats_.numChainedChildItems.dec();

  if (auto eventTracker = getEventTracker()) {
    eventTracker->record(AllocatorApiEvent::POP_CHAINED, parent->getKey(),
                         AllocatorApiResult::REMOVED, head->getSize(),
                         head->getConfiguredTTL().count());
  }

  return head;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::Key
CacheAllocator<CacheTrait>::getParentKey(const Item& chainedItem) {
  XDCHECK(chainedItem.isChainedItem());
  if (!chainedItem.isChainedItem()) {
    throw std::invalid_argument(folly::sformat(
        "Item must be chained item! Item: {}", chainedItem.toString()));
  }
  return reinterpret_cast<const ChainedItem&>(chainedItem)
      .getParentItem(compressor_)
      .getKey();
}
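// Illustrative note (not part of this file): addChainedItem() prepends the
// new child ahead of the existing chain head, so popChainedItem() returns
// children LIFO relative to insertion order. A minimal sketch, reusing the
// `cache` / `parent` assumptions from above:
//
//   // after addChainedItem(parent, A) then addChainedItem(parent, B):
//   auto head = cache.popChainedItem(parent); // yields B; A on the next pop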
template <typename CacheTrait>
void CacheAllocator<CacheTrait>::transferChainLocked(ItemHandle& parent,
                                                     ItemHandle& newParent) {
  // parent must be in a state to not have concurrent readers. Eviction code
  // paths rely on holding the last item handle. Since we hold on to an item
  // handle here, the chain will not be touched by any eviction code path.
  XDCHECK(parent);
  XDCHECK(newParent);
  XDCHECK_EQ(parent->getKey(), newParent->getKey());
  XDCHECK(parent->hasChainedItem());

  if (newParent->hasChainedItem()) {
    throw std::invalid_argument(folly::sformat(
        "New Parent {} has invalid state", newParent->toString()));
  }

  auto headHandle = findChainedItem(*parent);
  XDCHECK(headHandle);

  // remove from the access container since we are changing the key
  chainedItemAccessContainer_->remove(*headHandle);

  // change the key of the chain to have them belong to the new parent.
  ChainedItem* curr = &headHandle->asChainedItem();
  const auto newParentPtr = compressor_.compress(newParent.get());
  while (curr) {
    XDCHECK_EQ(curr == headHandle.get() ? 2u : 1u, curr->getRefCount());
    XDCHECK(curr->isInMMContainer());
    curr->changeKey(newParentPtr);
    curr = curr->getNext(compressor_);
  }

  newParent->markHasChainedItem();
  auto oldHead = chainedItemAccessContainer_->insertOrReplace(*headHandle);
  if (oldHead) {
    throw std::logic_error(folly::sformat(
        "Did not expect to find an existing chain for {}. Existing head: {}",
        newParent->toString(), oldHead->toString()));
  }
  parent->unmarkHasChainedItem();
}

template <typename CacheTrait>
void CacheAllocator<CacheTrait>::transferChainAndReplace(
    ItemHandle& parent, ItemHandle& newParent) {
  if (!parent || !newParent) {
    throw std::invalid_argument("invalid parent or new parent");
  }
  { // scope for chained item lock
    auto l = chainedItemLocks_.lockExclusive(parent->getKey());
    transferChainLocked(parent, newParent);
  }

  if (replaceIfAccessible(*parent, *newParent)) {
    newParent.unmarkNascent();
  }
  invalidateNvm(*parent);
}

template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::replaceIfAccessible(Item& oldItem,
                                                     Item& newItem) {
  XDCHECK(!newItem.isAccessible());

  // Inside the access container's lock, this checks if the old item is
  // accessible, and only in that case replaces it. If the old item is not
  // accessible anymore, it may have been replaced or removed earlier and
  // there is no point in proceeding with a move.
  if (!accessContainer_->replaceIfAccessible(oldItem, newItem)) {
    return false;
  }

  // Inside the MM container's lock, this checks if the old item exists to
  // make sure that no other thread removed it, and only then replaces it.
  if (!replaceInMMContainer(oldItem, newItem)) {
    accessContainer_->remove(newItem);
    return false;
  }

  // Replacing into the MM container was successful, but someone could have
  // called insertOrReplace() or remove() before or after the
  // replaceInMMContainer() operation, which would invalidate newItem.
  if (!newItem.isAccessible()) {
    removeFromMMContainer(newItem);
    return false;
  }
  return true;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
CacheAllocator<CacheTrait>::replaceChainedItem(Item& oldItem,
                                               ItemHandle newItemHandle,
                                               Item& parent) {
  if (!newItemHandle) {
    throw std::invalid_argument("Empty handle for newItem");
  }
  auto l = chainedItemLocks_.lockExclusive(parent.getKey());

  if (!oldItem.isChainedItem() || !newItemHandle->isChainedItem() ||
      &oldItem.asChainedItem().getParentItem(compressor_) !=
          &newItemHandle->asChainedItem().getParentItem(compressor_) ||
      &oldItem.asChainedItem().getParentItem(compressor_) != &parent ||
      newItemHandle->isInMMContainer() || !oldItem.isInMMContainer()) {
    throw std::invalid_argument(folly::sformat(
        "Invalid args for replaceChainedItem. oldItem={}, newItem={}, "
        "parent={}",
        oldItem.toString(), newItemHandle->toString(), parent.toString()));
  }

  auto oldItemHdl =
      replaceChainedItemLocked(oldItem, std::move(newItemHandle), parent);
  invalidateNvm(parent);
  return oldItemHdl;
}
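// Illustrative usage (not part of this file): replaceChainedItem() swaps one
// link of a chain in place. A minimal sketch, assuming `cache`, a `parent`
// handle with chained items, and a reference `oldChild` to one of them:
//
//   auto bigger = cache.allocateChainedItem(parent, 2 * oldChild.getSize());
//   if (bigger) {
//     // copy/transform oldChild's payload into bigger->getMemory() ...
//     auto retired =
//         cache.replaceChainedItem(oldChild, std::move(bigger), *parent);
//     // `retired` keeps the old allocation alive until the caller drops it.
//   }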
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
CacheAllocator<CacheTrait>::replaceChainedItemLocked(Item& oldItem,
                                                     ItemHandle newItemHdl,
                                                     const Item& parent) {
  XDCHECK(newItemHdl != nullptr);
  XDCHECK_GE(1u, oldItem.getRefCount());

  // grab the handle to the old item so that we can return this. Also, we need
  // to drop the refcount the parent holds on oldItem by manually calling
  // decRef. To do that safely we need to have a proper outstanding handle.
  auto oldItemHdl = acquire(&oldItem);

  // Replace the old chained item with the new item in the MMContainer before
  // we actually replace the old item in the chain
  if (!replaceChainedItemInMMContainer(oldItem, *newItemHdl)) {
    // This should never happen since we currently hold a valid
    // parent handle. None of its chained items can be removed
    throw std::runtime_error(folly::sformat(
        "chained item cannot be replaced in MM container, oldItem={}, "
        "newItem={}, parent={}",
        oldItem.toString(), newItemHdl->toString(), parent.toString()));
  }

  XDCHECK(!oldItem.isInMMContainer());
  XDCHECK(newItemHdl->isInMMContainer());

  auto head = findChainedItem(parent);
  XDCHECK(head != nullptr);
  XDCHECK_EQ(reinterpret_cast<uintptr_t>(
                 &head->asChainedItem().getParentItem(compressor_)),
             reinterpret_cast<uintptr_t>(&parent));

  // if the old item is the head, replace the head in the chain, insert into
  // the access container and append its chain.
  if (head.get() == &oldItem) {
    chainedItemAccessContainer_->insertOrReplace(*newItemHdl);
  } else {
    // oldItem is in the middle of the chain, find its previous and fix the
    // links
    auto* prev = &head->asChainedItem();
    auto* curr = prev->getNext(compressor_);
    while (curr != nullptr && curr != &oldItem) {
      prev = curr;
      curr = curr->getNext(compressor_);
    }
    XDCHECK(curr != nullptr);
    prev->setNext(&newItemHdl->asChainedItem(), compressor_);
  }

  newItemHdl->asChainedItem().setNext(
      oldItem.asChainedItem().getNext(compressor_), compressor_);
  oldItem.asChainedItem().setNext(nullptr, compressor_);

  // this should not result in 0 refcount. We are bumping down the internal
  // refcount. If it did, we would leak an item.
  oldItem.decRef();
  XDCHECK_LT(0u, oldItem.getRefCount()) << oldItem.toString();

  // increment refcount to indicate the parent owns this, similar to
  // addChainedItem. Since this is an internal refcount, we don't include it
  // in active handle tracking.
  newItemHdl->incRef();
  return oldItemHdl;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ReleaseRes
CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
                                                   RemoveContext ctx,
                                                   bool nascent,
                                                   const Item* toRecycle) {
  if (!it.isDrained()) {
    throw std::runtime_error(
        folly::sformat("cannot release this item: {}", it.toString()));
  }

  const auto allocInfo = allocator_->getAllocInfo(it.getMemory());

  if (ctx == RemoveContext::kEviction) {
    const auto timeNow = util::getCurrentTimeSec();
    const auto refreshTime = timeNow - it.getLastAccessTime();
    const auto lifeTime = timeNow - it.getCreationTime();
    stats_.ramEvictionAgeSecs_.trackValue(refreshTime);
    stats_.ramItemLifeTimeSecs_.trackValue(lifeTime);
    stats_.perPoolEvictionAgeSecs_[allocInfo.poolId].trackValue(refreshTime);
  }

  (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub(
      util::getFragmentation(*this, it));

  // Chained items can only end up in this place if the user has allocated
  // memory for a chained item but has decided not to insert the chained item
  // to a parent item and instead drop the chained item handle. In this case,
  // we free the chained item directly without calling the remove callback.
  if (it.isChainedItem()) {
    if (toRecycle) {
      throw std::runtime_error(folly::sformat(
          "Cannot recycle a chained item {}, toRecycle: {}", it.toString(),
          toRecycle->toString()));
    }
    allocator_->free(&it);
    return ReleaseRes::kReleased;
  }

  // nascent items represent items that were allocated but never inserted into
  // the cache. We should not be executing removeCB for them since they were
  // not initialized from the user perspective and never part of the cache.
  if (!nascent && config_.removeCb) {
    config_.removeCb(RemoveCbData{ctx, it, viewAsChainedAllocsRange(it)});
  }

  // only skip the destructor for evicted items that are either in the queue
  // to be put into nvm or already in nvm
  if (!nascent && config_.itemDestructor &&
      (ctx != RemoveContext::kEviction || !it.isNvmClean() ||
       it.isNvmEvicted())) {
    try {
      config_.itemDestructor(DestructorData{
          ctx, it, viewAsChainedAllocsRange(it), allocInfo.poolId});
      stats().numRamDestructorCalls.inc();
    } catch (const std::exception& e) {
      stats().numDestructorExceptions.inc();
      XLOG_EVERY_N(INFO, 100)
          << "Caught exception from user's item destructor: " << e.what();
    }
  }

  // If no `toRecycle` is set, then the result is kReleased,
  // because this function cannot fail to release "it"
  ReleaseRes res =
      toRecycle == nullptr ? ReleaseRes::kReleased : ReleaseRes::kNotRecycled;

  // Free chained allocs if there are any
  if (it.hasChainedItem()) {
    // At this point, the parent is only accessible within this thread
    // and thus no one else can add or remove any chained items associated
    // with this parent. So we're free to go through the list and free
    // chained items one by one.
    auto headHandle = findChainedItem(it);
    ChainedItem* head = &headHandle.get()->asChainedItem();
    headHandle.reset();

    if (head == nullptr || &head->getParentItem(compressor_) != &it) {
      throw std::runtime_error(folly::sformat(
          "Mismatched parent pointer. This should not happen. Key: {}",
          it.getKey()));
    }

    if (!chainedItemAccessContainer_->remove(*head)) {
      throw std::runtime_error(folly::sformat(
          "Chained item associated with {} cannot be removed from the hash "
          "table. This should not happen here.",
          it.getKey()));
    }

    while (head) {
      auto next = head->getNext(compressor_);

      const auto childInfo =
          allocator_->getAllocInfo(static_cast<const void*>(head));
      (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub(
          util::getFragmentation(*this, *head));

      removeFromMMContainer(*head);

      // If this chained item is marked as moving, we will not free it.
      // We must capture the moving state before we do the decRef when
      // we know the item must still be valid
      const bool wasMoving = head->isMoving();

      // Decref and check if we were the last reference. Now if the item
      // was marked moving, after decRef, it will be free to be released
      // by the slab release thread
      const auto childRef = head->decRef();

      // If the item is already moving and we already decremented the
      // refcount, we don't need to free this item. We'll let the slab
      // release thread take care of that
      if (!wasMoving) {
        if (childRef != 0) {
          throw std::runtime_error(folly::sformat(
              "chained item refcount is not zero. We cannot proceed! "
              "Ref: {}, Chained Item: {}",
              childRef, head->toString()));
        }

        // Item is not moving and refcount is 0, we can proceed to
        // free it or recycle the memory
        if (head == toRecycle) {
          XDCHECK(ReleaseRes::kReleased != res);
          res = ReleaseRes::kRecycled;
        } else {
          allocator_->free(head);
        }
      }

      stats_.numChainedChildItems.dec();
      head = next;
    }
    stats_.numChainedParentItems.dec();
  }

  if (&it == toRecycle) {
    XDCHECK(ReleaseRes::kReleased != res);
    res = ReleaseRes::kRecycled;
  } else {
    XDCHECK(it.isDrained());
    allocator_->free(&it);
  }

  return res;
}

template <typename CacheTrait>
void CacheAllocator<CacheTrait>::incRef(Item& it) {
  it.incRef();
  ++handleCount_.tlStats();
}

template <typename CacheTrait>
RefcountWithFlags::Value CacheAllocator<CacheTrait>::decRef(Item& it) {
  const auto ret = it.decRef();
  // do this after we ensured that we incremented a reference.
  --handleCount_.tlStats();
  return ret;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
CacheAllocator<CacheTrait>::acquire(Item* it) {
  if (UNLIKELY(!it)) {
    return ItemHandle{};
  }

  SCOPE_FAIL { stats_.numRefcountOverflow.inc(); };

  incRef(*it);
  return ItemHandle{it, *this};
}

template <typename CacheTrait>
void CacheAllocator<CacheTrait>::release(Item* it, bool isNascent) {
  // decrement the reference and if it drops to 0, release it back to the
  // memory allocator, and invoke the removal callback if there is one.
  if (UNLIKELY(!it)) {
    return;
  }

  const auto ref = decRef(*it);

  if (UNLIKELY(ref == 0)) {
    const auto res =
        releaseBackToAllocator(*it, RemoveContext::kNormal, isNascent);
    XDCHECK(res == ReleaseRes::kReleased);
  }
}

template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::removeFromMMContainer(Item& item) {
  // remove it from the mm container.
  if (item.isInMMContainer()) {
    auto& mmContainer = getMMContainer(item);
    return mmContainer.remove(item);
  }
  return false;
}

template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::replaceInMMContainer(Item& oldItem,
                                                      Item& newItem) {
  auto& oldContainer = getMMContainer(oldItem);
  auto& newContainer = getMMContainer(newItem);
  if (&oldContainer == &newContainer) {
    return oldContainer.replace(oldItem, newItem);
  } else {
    return oldContainer.remove(oldItem) && newContainer.add(newItem);
  }
}

template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::replaceChainedItemInMMContainer(
    Item& oldItem, Item& newItem) {
  auto& oldMMContainer = getMMContainer(oldItem);
  auto& newMMContainer = getMMContainer(newItem);
  if (&oldMMContainer == &newMMContainer) {
    return oldMMContainer.replace(oldItem, newItem);
  } else {
    if (!oldMMContainer.remove(oldItem)) {
      return false;
    }

    // This cannot fail because a new item should not have been inserted
    const auto newRes = newMMContainer.add(newItem);
    XDCHECK(newRes);
    return true;
  }
}

template <typename CacheTrait>
void CacheAllocator<CacheTrait>::insertInMMContainer(Item& item) {
  XDCHECK(!item.isInMMContainer());
  auto& mmContainer = getMMContainer(item);
  if (!mmContainer.add(item)) {
    throw std::runtime_error(folly::sformat(
        "Invalid state. Node {} was already in the container.", &item));
  }
}
/**
 * There is a potential race between inserts and removes: while T1 inserts
 * a key, T2 may remove it, and their operations on the MM and Access
 * Containers can interleave. It does not matter what the outcome of this
 * race is (i.e. the key can be present or not present). However, if the key
 * is not accessible, it should also not be in the MMContainer. To ensure
 * that, we always add to the MMContainer on inserts before adding to the
 * AccessContainer. Both the code paths on success/failure preserve the
 * appropriate state in the MMContainer. Note that this insert will also race
 * with the removes we do in SlabRebalancing code paths.
 */
template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::insert(const WriteHandle& handle) {
  return insertImpl(handle, AllocatorApiEvent::INSERT);
}

template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::insertImpl(const WriteHandle& handle,
                                            AllocatorApiEvent event) {
  XDCHECK(handle);
  XDCHECK(event == AllocatorApiEvent::INSERT ||
          event == AllocatorApiEvent::INSERT_FROM_NVM);
  if (handle->isAccessible()) {
    throw std::invalid_argument("Handle is already accessible");
  }

  if (nvmCache_ != nullptr && !handle->isNvmClean()) {
    throw std::invalid_argument("Can't use insert API with nvmCache enabled");
  }

  // insert into the MM container before we make it accessible. Find will
  // return this item as soon as it is accessible.
  insertInMMContainer(*(handle.getInternal()));

  AllocatorApiResult result;
  if (!accessContainer_->insert(*(handle.getInternal()))) {
    // this should destroy the handle and release it back to the allocator.
    removeFromMMContainer(*(handle.getInternal()));
    result = AllocatorApiResult::FAILED;
  } else {
    handle.unmarkNascent();
    result = AllocatorApiResult::INSERTED;
  }

  if (auto eventTracker = getEventTracker()) {
    eventTracker->record(event, handle->getKey(), result, handle->getSize(),
                         handle->getConfiguredTTL().count());
  }

  return result == AllocatorApiResult::INSERTED;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
CacheAllocator<CacheTrait>::insertOrReplace(const WriteHandle& handle) {
  XDCHECK(handle);
  if (handle->isAccessible()) {
    throw std::invalid_argument("Handle is already accessible");
  }

  insertInMMContainer(*(handle.getInternal()));
  WriteHandle replaced;
  try {
    auto lock = nvmCache_ ? nvmCache_->getItemDestructorLock(handle->getKey())
                          : std::unique_lock<std::mutex>();

    replaced = accessContainer_->insertOrReplace(*(handle.getInternal()));

    if (replaced && replaced->isNvmClean() && !replaced->isNvmEvicted()) {
      // item is to be replaced and the destructor will be executed
      // upon memory release; mark it in nvm to avoid the destructor
      // being executed from nvm
      nvmCache_->markNvmItemRemovedLocked(handle->getKey());
    }
  } catch (const std::exception&) {
    removeFromMMContainer(*(handle.getInternal()));
    if (auto eventTracker = getEventTracker()) {
      eventTracker->record(AllocatorApiEvent::INSERT_OR_REPLACE,
                           handle->getKey(), AllocatorApiResult::FAILED,
                           handle->getSize(),
                           handle->getConfiguredTTL().count());
    }
    throw;
  }

  // Remove from LRU as well if we do have a handle of the old item
  if (replaced) {
    removeFromMMContainer(*replaced);
  }

  if (UNLIKELY(nvmCache_ != nullptr)) {
    // We can avoid the nvm delete only if we have a non nvm-clean item in
    // cache. In all other cases we must enqueue the delete.
    if (!replaced || replaced->isNvmClean()) {
      nvmCache_->remove(handle->getKey(),
                        nvmCache_->createDeleteTombStone(handle->getKey()));
    }
  }

  handle.unmarkNascent();

  if (auto eventTracker = getEventTracker()) {
    XDCHECK(handle);
    const auto result = replaced ? AllocatorApiResult::REPLACED
                                 : AllocatorApiResult::INSERTED;
    eventTracker->record(AllocatorApiEvent::INSERT_OR_REPLACE,
                         handle->getKey(), result, handle->getSize(),
                         handle->getConfiguredTTL().count());
  }

  return replaced;
}
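// Illustrative usage (not part of this file): insert() fails if the key
// already exists, while insertOrReplace() displaces the old item and hands
// back a handle to it. A minimal sketch with the same `cache` assumption:
//
//   auto handle = cache.allocate(pool, "k", 10);
//   if (handle && !cache.insert(handle)) {
//     // key already present; replace it instead:
//     auto old = cache.insertOrReplace(handle); // `old` may be empty
//   }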
template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::moveRegularItem(Item& oldItem,
                                                 ItemHandle& newItemHdl) {
  XDCHECK(config_.moveCb);
  util::LatencyTracker tracker{stats_.moveRegularLatency_};

  if (!oldItem.isAccessible() || oldItem.isExpired()) {
    return false;
  }

  XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize());
  XDCHECK_EQ(reinterpret_cast<uintptr_t>(&getMMContainer(oldItem)),
             reinterpret_cast<uintptr_t>(&getMMContainer(*newItemHdl)));

  // take care of the flags before we expose the item to be accessed. this
  // will ensure that when another thread removes the item from RAM, we issue
  // a delete accordingly. See D7859775 for an example
  if (oldItem.isNvmClean()) {
    newItemHdl->markNvmClean();
  }

  // Execute the move callback. We cannot make any guarantees about the
  // consistency of the old item beyond this point, because the callback can
  // do more than a simple memcpy(), e.g. update external references. If there
  // are any remaining handles to the old item, it is the caller's
  // responsibility to invalidate them. The move can only fail after this
  // statement if the old item has been removed or replaced, in which case it
  // should be fine for it to be left in an inconsistent state.
  config_.moveCb(oldItem, *newItemHdl, nullptr);

  // Inside the access container's lock, this checks if the old item is
  // accessible and its refcount is zero. If the item is not accessible,
  // there is no point to replace it since it had already been removed
  // or is in the process of being removed. If the item is in cache but the
  // refcount is non-zero, it means the user could be attempting to remove
  // this item through an API such as remove(ItemHandle). In this case,
  // it is unsafe to replace the old item with a new one, so we should
  // also abort.
  if (!accessContainer_->replaceIf(oldItem, *newItemHdl,
                                   itemMovingPredicate)) {
    return false;
  }

  // Inside the MM container's lock, this checks if the old item exists to
  // make sure that no other thread removed it, and only then replaces it.
  if (!replaceInMMContainer(oldItem, *newItemHdl)) {
    accessContainer_->remove(*newItemHdl);
    return false;
  }

  // Replacing into the MM container was successful, but someone could have
  // called insertOrReplace() or remove() before or after the
  // replaceInMMContainer() operation, which would invalidate newItemHdl.
  if (!newItemHdl->isAccessible()) {
    removeFromMMContainer(*newItemHdl);
    return false;
  }

  // no one can add or remove chained items at this point
  if (oldItem.hasChainedItem()) {
    // safe to acquire handle for a moving Item
    auto oldHandle = acquire(&oldItem);
    XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString();
    XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString();
    try {
      auto l = chainedItemLocks_.lockExclusive(oldItem.getKey());
      transferChainLocked(oldHandle, newItemHdl);
    } catch (const std::exception& e) {
      // this should never happen because we drained all the handles.
      XLOGF(DFATAL, "{}", e.what());
      throw;
    }

    XDCHECK(!oldItem.hasChainedItem());
    XDCHECK(newItemHdl->hasChainedItem());
  }
  newItemHdl.unmarkNascent();
  return true;
}
XLOGF(DFATAL, "{}", e.what()); throw; } XDCHECK(!oldItem.hasChainedItem()); XDCHECK(newItemHdl->hasChainedItem()); } newItemHdl.unmarkNascent(); return true; } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::moveChainedItem(ChainedItem& oldItem, ItemHandle& newItemHdl) { XDCHECK(config_.moveCb); util::LatencyTracker tracker{stats_.moveChainedLatency_}; // This item has been unlinked from its parent and we're the only // owner of it, so we're done here if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { return false; } const auto parentKey = oldItem.getParentItem(compressor_).getKey(); // Grab lock to prevent anyone else from modifying the chain auto l = chainedItemLocks_.lockExclusive(parentKey); auto parentHandle = validateAndGetParentHandleForChainedMoveLocked(oldItem, parentKey); if (!parentHandle) { return false; } // once we have the moving sync and valid parent for the old item, check if // the original allocation was made correctly. If not, we destroy the // allocation to indicate a retry to moving logic above. if (reinterpret_cast<uintptr_t>( &newItemHdl->asChainedItem().getParentItem(compressor_)) != reinterpret_cast<uintptr_t>(&parentHandle->asChainedItem())) { newItemHdl.reset(); return false; } XDCHECK_EQ(reinterpret_cast<uintptr_t>( &newItemHdl->asChainedItem().getParentItem(compressor_)), reinterpret_cast<uintptr_t>(&parentHandle->asChainedItem())); // In case someone else had removed this chained item from its parent by now // So we check again to see if the it has been unlinked from its parent if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { return false; } auto parentPtr = parentHandle.get(); XDCHECK_EQ(reinterpret_cast<uintptr_t>(parentPtr), reinterpret_cast<uintptr_t>(&oldItem.getParentItem(compressor_))); // Invoke the move callback to fix up any user data related to the chain config_.moveCb(oldItem, *newItemHdl, parentPtr); // Replace the new item in the position of the old one before both in the // parent's chain and the MMContainer. auto oldItemHandle = replaceChainedItemLocked(oldItem, std::move(newItemHdl), *parentHandle); XDCHECK(oldItemHandle->isMoving()); XDCHECK(!oldItemHandle->isInMMContainer()); return true; } template <typename CacheTrait> typename CacheAllocator<CacheTrait>::Item* CacheAllocator<CacheTrait>::findEviction(PoolId pid, ClassId cid) { auto& mmContainer = getMMContainer(pid, cid); // Keep searching for a candidate until we were able to evict it // or until the search limit has been exhausted unsigned int searchTries = 0; auto itr = mmContainer.getEvictionIterator(); while ((config_.evictionSearchTries == 0 || config_.evictionSearchTries > searchTries) && itr) { ++searchTries; Item* candidate = itr.get(); // for chained items, the ownership of the parent can change. We try to // evict what we think as parent and see if the eviction of parent // recycles the child we intend to. auto toReleaseHandle = itr->isChainedItem() ? advanceIteratorAndTryEvictChainedItem(itr) : advanceIteratorAndTryEvictRegularItem(mmContainer, itr); if (toReleaseHandle) { if (toReleaseHandle->hasChainedItem()) { (*stats_.chainedItemEvictions)[pid][cid].inc(); } else { (*stats_.regularItemEvictions)[pid][cid].inc(); } // Invalidate iterator since later on we may use this mmContainer // again, which cannot be done unless we drop this iterator itr.destroy(); // we must be the last handle and for chained items, this will be // the parent. 
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::Item*
CacheAllocator<CacheTrait>::findEviction(PoolId pid, ClassId cid) {
  auto& mmContainer = getMMContainer(pid, cid);

  // Keep searching for a candidate until we were able to evict it
  // or until the search limit has been exhausted
  unsigned int searchTries = 0;
  auto itr = mmContainer.getEvictionIterator();
  while ((config_.evictionSearchTries == 0 ||
          config_.evictionSearchTries > searchTries) &&
         itr) {
    ++searchTries;

    Item* candidate = itr.get();
    // for chained items, the ownership of the parent can change. We try to
    // evict what we think is the parent and see if the eviction of the
    // parent recycles the child we intend to.
    auto toReleaseHandle =
        itr->isChainedItem()
            ? advanceIteratorAndTryEvictChainedItem(itr)
            : advanceIteratorAndTryEvictRegularItem(mmContainer, itr);

    if (toReleaseHandle) {
      if (toReleaseHandle->hasChainedItem()) {
        (*stats_.chainedItemEvictions)[pid][cid].inc();
      } else {
        (*stats_.regularItemEvictions)[pid][cid].inc();
      }

      // Invalidate iterator since later on we may use this mmContainer
      // again, which cannot be done unless we drop this iterator
      itr.destroy();

      // we must be the last handle and for chained items, this will be
      // the parent.
      XDCHECK(toReleaseHandle.get() == candidate ||
              candidate->isChainedItem());
      XDCHECK_EQ(1u, toReleaseHandle->getRefCount());

      // We manually release the item here because we don't want to
      // invoke the Item Handle's destructor which will be decrementing
      // an already zero refcount, which will throw an exception
      auto& itemToRelease = *toReleaseHandle.release();

      // Decrementing the refcount because we want to recycle the item
      const auto ref = decRef(itemToRelease);
      XDCHECK_EQ(0u, ref);

      // check if by releasing the item we intend to, we actually
      // recycle the candidate.
      if (ReleaseRes::kRecycled ==
          releaseBackToAllocator(itemToRelease, RemoveContext::kEviction,
                                 /* isNascent */ false, candidate)) {
        return candidate;
      }
    }

    // If we destroyed the itr to possibly evict and failed, we restart
    // from the beginning again
    if (!itr) {
      itr.resetToBegin();
    }
  }
  return nullptr;
}

template <typename CacheTrait>
folly::Range<typename CacheAllocator<CacheTrait>::ChainedItemIter>
CacheAllocator<CacheTrait>::viewAsChainedAllocsRange(const Item& parent) const {
  return parent.hasChainedItem()
             ? folly::Range<ChainedItemIter>{ChainedItemIter{
                                                 findChainedItem(parent).get(),
                                                 compressor_},
                                             ChainedItemIter{}}
             : folly::Range<ChainedItemIter>{};
}

template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::shouldWriteToNvmCache(const Item& item) {
  // write to nvmcache when it is enabled and the item says that it is not
  // nvmclean or was evicted by nvm while present in DRAM.
  bool doWrite = nvmCache_ && nvmCache_->isEnabled();
  if (!doWrite) {
    return false;
  }

  doWrite = !item.isExpired();
  if (!doWrite) {
    stats_.numNvmRejectsByExpiry.inc();
    return false;
  }

  doWrite = (!item.isNvmClean() || item.isNvmEvicted());
  if (!doWrite) {
    stats_.numNvmRejectsByClean.inc();
    return false;
  }
  return true;
}

template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::shouldWriteToNvmCacheExclusive(
    const Item& item) {
  auto chainedItemRange = viewAsChainedAllocsRange(item);

  if (nvmAdmissionPolicy_ &&
      !nvmAdmissionPolicy_->accept(item, chainedItemRange)) {
    stats_.numNvmRejectsByAP.inc();
    return false;
  }

  return true;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
CacheAllocator<CacheTrait>::advanceIteratorAndTryEvictRegularItem(
    MMContainer& mmContainer, EvictionIterator& itr) {
  // we should flush this to nvmcache if it is not already present in
  // nvmcache and the item is not expired.
  Item& item = *itr;
  const bool evictToNvmCache = shouldWriteToNvmCache(item);

  auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey())
                               : typename NvmCacheT::PutToken{};
  // record the in-flight eviction. If we cannot, we move on to the next item
  // to avoid stalling eviction.
  if (evictToNvmCache && !token.isValid()) {
    ++itr;
    stats_.evictFailConcurrentFill.inc();
    return ItemHandle{};
  }

  // If there are other accessors, we should abort. Acquire a handle here
  // since if we remove the item from both access containers and mm
  // containers below, we will need a handle to ensure proper cleanup in
  // case we end up not evicting this item
  auto evictHandle = accessContainer_->removeIf(item, &itemEvictionPredicate);

  if (!evictHandle) {
    ++itr;
    stats_.evictFailAC.inc();
    return evictHandle;
  }

  mmContainer.remove(itr);
  XDCHECK_EQ(reinterpret_cast<uintptr_t>(evictHandle.get()),
             reinterpret_cast<uintptr_t>(&item));
  XDCHECK(!evictHandle->isInMMContainer());
  XDCHECK(!evictHandle->isAccessible());

  // If the item is now marked as moving, that means its corresponding slab
  // is being released right now. So, we look for the next item that is
  // eligible for eviction. It is safe to destroy the handle here since the
  // moving bit is set. The iterator was already advanced by the remove call
  // above.
  if (evictHandle->isMoving()) {
    stats_.evictFailMove.inc();
    return ItemHandle{};
  }

  // Invalidate iterator since later on if we are not evicting this
  // item, we may need to rely on the handle we created above to ensure
  // proper cleanup if the item's raw refcount has dropped to 0.
  // And since this item may be a parent item that has some child items
  // in this very same mmContainer, we need to make sure we drop this
  // exclusive iterator so we can gain access to it when we're cleaning
  // up the child items
  itr.destroy();

  // Ensure that there are no accessors after removing from the access
  // container
  XDCHECK(evictHandle->getRefCount() == 1);

  if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) {
    XDCHECK(token.isValid());
    nvmCache_->put(evictHandle, std::move(token));
  }
  return evictHandle;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
CacheAllocator<CacheTrait>::advanceIteratorAndTryEvictChainedItem(
    EvictionIterator& itr) {
  XDCHECK(itr->isChainedItem());

  ChainedItem* candidate = &itr->asChainedItem();
  ++itr;

  // The parent could change at any point through transferChain. However, if
  // that happens, we would realize that releaseBackToAllocator returns
  // kNotRecycled and we would try another chained item, leading to transient
  // failure.
  auto& parent = candidate->getParentItem(compressor_);

  const bool evictToNvmCache = shouldWriteToNvmCache(parent);

  auto token = evictToNvmCache ? nvmCache_->createPutToken(parent.getKey())
                               : typename NvmCacheT::PutToken{};

  // if the token is invalid, return. The iterator is already advanced.
  if (evictToNvmCache && !token.isValid()) {
    stats_.evictFailConcurrentFill.inc();
    return ItemHandle{};
  }

  // check if the parent exists in the hashtable and its refcount is drained.
  auto parentHandle =
      accessContainer_->removeIf(parent, &itemEvictionPredicate);
  if (!parentHandle) {
    stats_.evictFailParentAC.inc();
    return parentHandle;
  }

  // Invalidate iterator since later on we may use the mmContainer
  // associated with this iterator which cannot be done unless we
  // drop this iterator
  //
  // This must be done once we know the parent is not nullptr.
  // Since we can very well be the last holder of this parent item,
  // which may have a chained item that is linked in this MM container.
  itr.destroy();

  // Ensure we have the correct parent and we're the only user of the
  // parent, then free it from the access container. Otherwise, we abort
  XDCHECK_EQ(reinterpret_cast<uintptr_t>(&parent),
             reinterpret_cast<uintptr_t>(parentHandle.get()));
  XDCHECK_EQ(1u, parent.getRefCount());

  removeFromMMContainer(*parentHandle);

  XDCHECK(!parent.isInMMContainer());
  XDCHECK(!parent.isAccessible());

  // We need to make sure the parent is not marked as moving
  // and we're the only holder of the parent item. Safe to destroy the
  // handle here since the moving bit is set.
  if (parentHandle->isMoving()) {
    stats_.evictFailParentMove.inc();
    return ItemHandle{};
  }

  if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) {
    XDCHECK(token.isValid());
    XDCHECK(parentHandle->hasChainedItem());
    nvmCache_->put(parentHandle, std::move(token));
  }

  return parentHandle;
}
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::RemoveRes
CacheAllocator<CacheTrait>::remove(typename Item::Key key) {
  // While we issue this delete, there can be potential races that change the
  // state of the cache between ram and nvm. If we find the item in RAM and
  // obtain a handle, the situation is simpler. The complicated ones are the
  // following scenarios where, when the delete checks RAM, we don't find
  // anything in RAM. The end scenario is that in the absence of any
  // concurrent inserts, after delete, there should be nothing in nvm and ram.
  //
  // == Racing async fill from nvm with delete ==
  // 1. T1 finds nothing in ram and issues an async nvmcache lookup. We
  //    enqueue the get holding the fill lock and drop it.
  // 2. T2 finds nothing in ram, enqueues delete to nvmcache.
  // 3. T1's async fetch finishes and fills the item in cache, but right
  //    before the delete is enqueued above
  //
  // To deal with this race, we first enqueue the nvmcache delete tombstone
  // and when we finish the async fetch, we check if a tombstone was enqueued
  // meanwhile and cancel the fill.
  //
  // == Racing async fill from nvm with delete ==
  // there is a key in nvmcache and nothing in RAM.
  // 1. T1 issues delete while nothing is in RAM and enqueues nvm cache
  //    remove
  // 2. before the nvmcache remove gets enqueued, T2 does a find() that
  //    fetches from nvm.
  // 3. T2 inserts in cache from nvmcache and T1 observes that item and tries
  //    to remove it only from RAM.
  //
  // to fix this, we always do the nvmcache remove as the last thing and
  // enqueue a tombstone to avoid concurrent fills while we are in the
  // process of doing the nvmcache remove.
  //
  // == Racing eviction with delete ==
  // 1. T1 is evicting an item, trying to remove from the hashtable and is in
  //    the process of enqueueing the put to nvmcache.
  // 2. T2 is removing and finds nothing in ram, enqueues the nvmcache
  //    delete. The delete to nvmcache gets enqueued after T1 fills in ram.
  //
  // If T2 finds the item in ram, eviction can not proceed and the race does
  // not exist. If T2 does not find anything in RAM, it is likely that T1 is
  // in the process of issuing an nvmcache put. In this case, T1's nvmcache
  // put will check if there was a delete enqueued while the eviction was in
  // flight after removing from the hashtable.
  //
  stats_.numCacheRemoves.inc();
  using Guard = typename NvmCacheT::DeleteTombStoneGuard;
  auto tombStone = nvmCache_ ? nvmCache_->createDeleteTombStone(key) : Guard{};

  auto handle = findInternal(key);
  if (!handle) {
    if (nvmCache_) {
      nvmCache_->remove(key, std::move(tombStone));
    }
    if (auto eventTracker = getEventTracker()) {
      eventTracker->record(AllocatorApiEvent::REMOVE, key,
                           AllocatorApiResult::NOT_FOUND);
    }
    return RemoveRes::kNotFoundInRam;
  }

  return removeImpl(*handle, std::move(tombStone));
}

template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::removeFromRamForTesting(
    typename Item::Key key) {
  return removeImpl(*findInternal(key), DeleteTombStoneGuard{},
                    false /* removeFromNvm */) == RemoveRes::kSuccess;
}

template <typename CacheTrait>
void CacheAllocator<CacheTrait>::removeFromNvmForTesting(
    typename Item::Key key) {
  if (nvmCache_) {
    nvmCache_->remove(key, nvmCache_->createDeleteTombStone(key));
  }
}

template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::pushToNvmCacheFromRamForTesting(
    typename Item::Key key) {
  auto handle = findInternal(key);

  if (handle && nvmCache_ && shouldWriteToNvmCache(*handle) &&
      shouldWriteToNvmCacheExclusive(*handle)) {
    nvmCache_->put(handle, nvmCache_->createPutToken(handle->getKey()));
    return true;
  }
  return false;
}

template <typename CacheTrait>
void CacheAllocator<CacheTrait>::flushNvmCache() {
  if (nvmCache_) {
    nvmCache_->flushPendingOps();
  }
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::RemoveRes
CacheAllocator<CacheTrait>::remove(AccessIterator& it) {
  stats_.numCacheRemoves.inc();
  if (auto eventTracker = getEventTracker()) {
    eventTracker->record(AllocatorApiEvent::REMOVE, it->getKey(),
                         AllocatorApiResult::REMOVED, it->getSize(),
                         it->getConfiguredTTL().count());
  }
  auto tombstone = nvmCache_ ? nvmCache_->createDeleteTombStone(it->getKey())
                             : DeleteTombStoneGuard{};
  return removeImpl(*it, std::move(tombstone));
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::RemoveRes
CacheAllocator<CacheTrait>::remove(const ReadHandle& it) {
  stats_.numCacheRemoves.inc();
  if (!it) {
    throw std::invalid_argument("Trying to remove a null item handle");
  }
  auto tombstone = nvmCache_ ? nvmCache_->createDeleteTombStone(it->getKey())
                             : DeleteTombStoneGuard{};
  return removeImpl(*(it.getInternal()), std::move(tombstone));
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::RemoveRes
CacheAllocator<CacheTrait>::removeImpl(Item& item,
                                       DeleteTombStoneGuard tombstone,
                                       bool removeFromNvm,
                                       bool recordApiEvent) {
  bool success = false;
  {
    auto lock = nvmCache_ ? nvmCache_->getItemDestructorLock(item.getKey())
                          : std::unique_lock<std::mutex>();

    success = accessContainer_->remove(item);

    if (removeFromNvm && success && item.isNvmClean() &&
        !item.isNvmEvicted()) {
      // item is to be removed and the destructor will be executed
      // upon memory release; mark it in nvm to avoid the destructor
      // being executed from nvm
      nvmCache_->markNvmItemRemovedLocked(item.getKey());
    }
  }
  XDCHECK(!item.isAccessible());

  // remove it from the mm container. this will be a no-op if it is already
  // removed.
  removeFromMMContainer(item);

  // Enqueue delete to nvmCache if we know from the item that it was pulled
  // in from NVM. If the item was not pulled in from NVM, it is not possible
  // to have it be written to NVM.
  if (removeFromNvm && item.isNvmClean()) {
    XDCHECK(tombstone);
    nvmCache_->remove(item.getKey(), std::move(tombstone));
  }

  auto eventTracker = getEventTracker();
  if (recordApiEvent && eventTracker) {
    const auto result = success ? AllocatorApiResult::REMOVED
                                : AllocatorApiResult::NOT_FOUND;
    eventTracker->record(AllocatorApiEvent::REMOVE, item.getKey(), result,
                         item.getSize(), item.getConfiguredTTL().count());
  }

  // the last holder of a reference to the item will release it back to the
  // allocator.
  if (success) {
    stats_.numCacheRemoveRamHits.inc();
    return RemoveRes::kSuccess;
  }
  return RemoveRes::kNotFoundInRam;
}
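// Illustrative usage (not part of this file): remove() deletes by key across
// both RAM and NVM (via the tombstone machinery above). A minimal sketch:
//
//   if (cache.remove("my-key") == Cache::RemoveRes::kSuccess) {
//     // the key was present in RAM; any NVM copy is also being invalidated
//   }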
template <typename CacheTrait>
void CacheAllocator<CacheTrait>::invalidateNvm(Item& item) {
  if (nvmCache_ != nullptr && item.isAccessible() && item.isNvmClean()) {
    {
      auto lock = nvmCache_->getItemDestructorLock(item.getKey());
      if (!item.isNvmEvicted() && item.isNvmClean() && item.isAccessible()) {
        // item is being updated and invalidated in nvm. Mark the item to
        // avoid the destructor being executed from nvm
        nvmCache_->markNvmItemRemovedLocked(item.getKey());
      }
      item.unmarkNvmClean();
    }
    nvmCache_->remove(item.getKey(),
                      nvmCache_->createDeleteTombStone(item.getKey()));
  }
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::MMContainer&
CacheAllocator<CacheTrait>::getMMContainer(const Item& item) const noexcept {
  const auto allocInfo =
      allocator_->getAllocInfo(static_cast<const void*>(&item));
  return getMMContainer(allocInfo.poolId, allocInfo.classId);
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::MMContainer&
CacheAllocator<CacheTrait>::getMMContainer(PoolId pid,
                                           ClassId cid) const noexcept {
  XDCHECK_LT(static_cast<size_t>(pid), mmContainers_.size());
  XDCHECK_LT(static_cast<size_t>(cid), mmContainers_[pid].size());
  return *mmContainers_[pid][cid];
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
CacheAllocator<CacheTrait>::peek(typename Item::Key key) {
  auto handle = findInternal(key);
  return handle;
}

template <typename CacheTrait>
std::pair<typename CacheAllocator<CacheTrait>::ItemHandle,
          typename CacheAllocator<CacheTrait>::ItemHandle>
CacheAllocator<CacheTrait>::inspectCache(typename Item::Key key) {
  std::pair<ItemHandle, ItemHandle> res;
  res.first = findInternal(key);
  res.second = nvmCache_ ? nvmCache_->peek(key) : nullptr;
  return res;
}
// findFast() and find() are the most performance-critical parts of
// CacheAllocator. Hence the sprinkling of UNLIKELY/LIKELY to tell the
// compiler which paths we don't want it to optimize for.
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
CacheAllocator<CacheTrait>::findFastInternal(typename Item::Key key,
                                             AccessMode mode) {
  auto handle = findInternal(key);

  stats_.numCacheGets.inc();
  if (UNLIKELY(!handle)) {
    stats_.numCacheGetMiss.inc();
    return handle;
  }

  markUseful(handle, mode);
  return handle;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
CacheAllocator<CacheTrait>::findFastImpl(typename Item::Key key,
                                         AccessMode mode) {
  auto handle = findFastInternal(key, mode);

  auto eventTracker = getEventTracker();
  if (UNLIKELY(eventTracker != nullptr)) {
    if (handle) {
      eventTracker->record(AllocatorApiEvent::FIND_FAST, key,
                           AllocatorApiResult::FOUND,
                           folly::Optional<uint32_t>(handle->getSize()),
                           handle->getConfiguredTTL().count());
    } else {
      eventTracker->record(AllocatorApiEvent::FIND_FAST, key,
                           AllocatorApiResult::NOT_FOUND);
    }
  }
  return handle;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ReadHandle
CacheAllocator<CacheTrait>::findFast(typename Item::Key key) {
  return findFastImpl(key, AccessMode::kRead);
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
CacheAllocator<CacheTrait>::findFastToWrite(typename Item::Key key,
                                            bool doNvmInvalidation) {
  auto handle = findFastImpl(key, AccessMode::kWrite);
  if (handle == nullptr) {
    return nullptr;
  }
  if (doNvmInvalidation) {
    invalidateNvm(*handle);
  }
  return handle;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
CacheAllocator<CacheTrait>::findImpl(typename Item::Key key, AccessMode mode) {
  auto handle = findFastInternal(key, mode);

  if (handle) {
    if (UNLIKELY(handle->isExpired())) {
      // update cache miss stats if the item has already expired.
      stats_.numCacheGetMiss.inc();
      stats_.numCacheGetExpiries.inc();
      auto eventTracker = getEventTracker();
      if (UNLIKELY(eventTracker != nullptr)) {
        eventTracker->record(AllocatorApiEvent::FIND, key,
                             AllocatorApiResult::NOT_FOUND);
      }
      ItemHandle ret;
      ret.markExpired();
      return ret;
    }

    auto eventTracker = getEventTracker();
    if (UNLIKELY(eventTracker != nullptr)) {
      eventTracker->record(AllocatorApiEvent::FIND, key,
                           AllocatorApiResult::FOUND, handle->getSize(),
                           handle->getConfiguredTTL().count());
    }
    return handle;
  }

  auto eventResult = AllocatorApiResult::NOT_FOUND;

  if (nvmCache_) {
    handle = nvmCache_->find(key);
    eventResult = AllocatorApiResult::NOT_FOUND_IN_MEMORY;
  }

  auto eventTracker = getEventTracker();
  if (UNLIKELY(eventTracker != nullptr)) {
    eventTracker->record(AllocatorApiEvent::FIND, key, eventResult);
  }

  return handle;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
CacheAllocator<CacheTrait>::findToWrite(typename Item::Key key,
                                        bool doNvmInvalidation) {
  auto handle = findImpl(key, AccessMode::kWrite);
  if (handle == nullptr) {
    return nullptr;
  }
  if (doNvmInvalidation) {
    invalidateNvm(*handle);
  }
  return handle;
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ReadHandle
CacheAllocator<CacheTrait>::find(typename Item::Key key) {
  return findImpl(key, AccessMode::kRead);
}
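// Illustrative usage (not part of this file): find() may fill from NVM,
// while findFast() only consults DRAM; findToWrite() additionally
// invalidates any NVM copy. A minimal sketch:
//
//   auto ro = cache.find("my-key");        // ReadHandle; may come from NVM
//   if (ro) {
//     const char* data = reinterpret_cast<const char*>(ro->getMemory());
//   }
//   auto rw = cache.findToWrite("my-key"); // WriteHandle
//   if (rw) {
//     std::memset(rw->getMemory(), 0, rw->getSize());
//   }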
forEachChainedItem(item, [this, mode](ChainedItem& chainedItem) { recordAccessInMMContainer(chainedItem, mode); }); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::recordAccessInMMContainer(Item& item, AccessMode mode) { const auto allocInfo = allocator_->getAllocInfo(static_cast<const void*>(&item)); (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc(); // track recently accessed items if needed if (UNLIKELY(config_.trackRecentItemsForDump)) { ring_->trackItem(reinterpret_cast<uintptr_t>(&item), item.getSize()); } auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId); return mmContainer.recordAccess(item, mode); } template <typename CacheTrait> uint32_t CacheAllocator<CacheTrait>::getUsableSize(const Item& item) const { const auto allocSize = allocator_->getAllocInfo(static_cast<const void*>(&item)).allocSize; return item.isChainedItem() ? allocSize - ChainedItem::getRequiredSize(0) : allocSize - Item::getRequiredSize(item.getKey(), 0); } template <typename CacheTrait> typename CacheAllocator<CacheTrait>::ItemHandle CacheAllocator<CacheTrait>::getSampleItem() { const auto* item = reinterpret_cast<const Item*>(allocator_->getRandomAlloc()); if (!item) { return ItemHandle{}; } auto handle = findInternal(item->getKey()); // Check that the item returned is the same one that was sampled if (handle.get() == item) { return handle; } return ItemHandle{}; } template <typename CacheTrait> std::vector<std::string> CacheAllocator<CacheTrait>::dumpEvictionIterator( PoolId pid, ClassId cid, size_t numItems) { if (numItems == 0) { return {}; } if (static_cast<size_t>(pid) >= mmContainers_.size() || static_cast<size_t>(cid) >= mmContainers_[pid].size()) { throw std::invalid_argument( folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid)); } std::vector<std::string> content; auto& mm = *mmContainers_[pid][cid]; auto evictItr = mm.getEvictionIterator(); size_t i = 0; while (evictItr && i < numItems) { content.push_back(evictItr->toString()); ++evictItr; ++i; } return content; } template <typename CacheTrait> template <typename Handle> folly::IOBuf CacheAllocator<CacheTrait>::convertToIOBufT(Handle& handle) { if (!handle) { throw std::invalid_argument("null item handle for converting to IOBuf"); } Item* item = handle.getInternal(); const uint32_t dataOffset = item->getOffsetForMemory(); using ConvertChainedItem = std::function<std::unique_ptr<folly::IOBuf>( Item * item, ChainedItem & chainedItem)>; folly::IOBuf iobuf; ConvertChainedItem converter; // based on the current refcount and the threshold from the config, // determine whether to use a new ItemHandle for each chained item // or a shared ItemHandle for all chained items if (item->getRefCount() > config_.thresholdForConvertingToIOBuf) { auto sharedHdl = std::make_shared<Handle>(std::move(handle)); iobuf = folly::IOBuf{ folly::IOBuf::TAKE_OWNERSHIP, item, // Since we'll be moving the IOBuf data pointer forward // by dataOffset, we need to adjust the IOBuf length // accordingly dataOffset + item->getSize(), [](void*, void* userData) { auto* hdl = reinterpret_cast<std::shared_ptr<Handle>*>(userData); delete hdl; } /* freeFunc */, new std::shared_ptr<Handle>{sharedHdl} /* userData for freeFunc */}; if (item->hasChainedItem()) { converter = [sharedHdl](Item*, ChainedItem& chainedItem) { const uint32_t chainedItemDataOffset = chainedItem.getOffsetForMemory(); return folly::IOBuf::takeOwnership( &chainedItem, // Since we'll be moving the IOBuf data pointer forward by // dataOffset, // we need to adjust the IOBuf length accordingly
chainedItemDataOffset + chainedItem.getSize(), [](void*, void* userData) { auto* hdl = reinterpret_cast<std::shared_ptr<Handle>*>(userData); delete hdl; } /* freeFunc */, new std::shared_ptr<Handle>{sharedHdl} /* userData for freeFunc */); }; } } else { // following IOBuf will take the item's ownership and trigger freeFunc to // release the reference count. handle.release(); iobuf = folly::IOBuf{folly::IOBuf::TAKE_OWNERSHIP, item, // Since we'll be moving the IOBuf data pointer forward // by dataOffset, we need to adjust the IOBuf length // accordingly dataOffset + item->getSize(), [](void* buf, void* userData) { Handle{reinterpret_cast<Item*>(buf), *reinterpret_cast<decltype(this)>(userData)} .reset(); } /* freeFunc */, this /* userData for freeFunc */}; if (item->hasChainedItem()) { converter = [this](Item* parentItem, ChainedItem& chainedItem) { const uint32_t chainedItemDataOffset = chainedItem.getOffsetForMemory(); // Each IOBuf converted from a child item will hold one additional // refcount on the parent item. This ensures that as long as the user // holds any IOBuf pointing anywhere in the chain, the whole chain // will not be evicted from cache. // // We can safely bump the refcount on the parent here only because // we already have an item handle on the parent (which has just been // moved into the IOBuf above). Normally, the only place we can // bump an item handle safely is through the AccessContainer. acquire(parentItem).release(); return folly::IOBuf::takeOwnership( &chainedItem, // Since we'll be moving the IOBuf data pointer forward by // dataOffset, // we need to adjust the IOBuf length accordingly chainedItemDataOffset + chainedItem.getSize(), [](void* buf, void* userData) { auto* cache = reinterpret_cast<decltype(this)>(userData); auto* child = reinterpret_cast<ChainedItem*>(buf); auto* parent = &child->getParentItem(cache->compressor_); Handle{parent, *cache}.reset(); } /* freeFunc */, this /* userData for freeFunc */); }; } } iobuf.trimStart(dataOffset); iobuf.markExternallySharedOne(); if (item->hasChainedItem()) { auto appendHelper = [&](ChainedItem& chainedItem) { const uint32_t chainedItemDataOffset = chainedItem.getOffsetForMemory(); auto nextChain = converter(item, chainedItem); nextChain->trimStart(chainedItemDataOffset); nextChain->markExternallySharedOne(); // Append immediately after the parent, IOBuf will present the data // in the original insertion order. // // i.e. 1. Allocate parent // 2. add A, add B, add C // // In memory: parent -> C -> B -> A // In IOBuf: parent -> A -> B -> C iobuf.appendChain(std::move(nextChain)); }; forEachChainedItem(*item, std::move(appendHelper)); } return iobuf; } template <typename CacheTrait> folly::IOBuf CacheAllocator<CacheTrait>::wrapAsIOBuf(const Item& item) { folly::IOBuf ioBuf{folly::IOBuf::WRAP_BUFFER, item.getMemory(), item.getSize()}; if (item.hasChainedItem()) { auto appendHelper = [&](ChainedItem& chainedItem) { auto nextChain = folly::IOBuf::wrapBuffer(chainedItem.getMemory(), chainedItem.getSize()); // Append immediately after the parent, IOBuf will present the data // in the original insertion order. // // i.e. 1. Allocate parent // 2. 
add A, add B, add C // // In memory: parent -> C -> B -> A // In IOBuf: parent -> A -> B -> C ioBuf.appendChain(std::move(nextChain)); }; forEachChainedItem(item, std::move(appendHelper)); } return ioBuf; } template <typename CacheTrait> PoolId CacheAllocator<CacheTrait>::addPool( folly::StringPiece name, size_t size, const std::set<uint32_t>& allocSizes, MMConfig config, std::shared_ptr<RebalanceStrategy> rebalanceStrategy, std::shared_ptr<RebalanceStrategy> resizeStrategy, bool ensureProvisionable) { folly::SharedMutex::WriteHolder w(poolsResizeAndRebalanceLock_); auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable); createMMContainers(pid, std::move(config)); setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); return pid; } template <typename CacheTrait> void CacheAllocator<CacheTrait>::overridePoolRebalanceStrategy( PoolId pid, std::shared_ptr<RebalanceStrategy> rebalanceStrategy) { if (static_cast<size_t>(pid) >= mmContainers_.size()) { throw std::invalid_argument(folly::sformat( "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); } setRebalanceStrategy(pid, std::move(rebalanceStrategy)); } template <typename CacheTrait> void CacheAllocator<CacheTrait>::overridePoolResizeStrategy( PoolId pid, std::shared_ptr<RebalanceStrategy> resizeStrategy) { if (static_cast<size_t>(pid) >= mmContainers_.size()) { throw std::invalid_argument(folly::sformat( "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); } setResizeStrategy(pid, std::move(resizeStrategy)); } template <typename CacheTrait> void CacheAllocator<CacheTrait>::overridePoolOptimizeStrategy( std::shared_ptr<PoolOptimizeStrategy> optimizeStrategy) { setPoolOptimizeStrategy(std::move(optimizeStrategy)); } template <typename CacheTrait> void CacheAllocator<CacheTrait>::overridePoolConfig(PoolId pid, const MMConfig& config) { if (static_cast<size_t>(pid) >= mmContainers_.size()) { throw std::invalid_argument(folly::sformat( "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); } auto& pool = allocator_->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { MMConfig mmConfig = config; mmConfig.addExtraConfig( config_.trackTailHits ? pool.getAllocationClass(static_cast<ClassId>(cid)) .getAllocsPerSlab() : 0); DCHECK_NOTNULL(mmContainers_[pid][cid].get()); mmContainers_[pid][cid]->setConfig(mmConfig); } } template <typename CacheTrait> void CacheAllocator<CacheTrait>::createMMContainers(const PoolId pid, MMConfig config) { auto& pool = allocator_->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { config.addExtraConfig( config_.trackTailHits ? pool.getAllocationClass(static_cast<ClassId>(cid)) .getAllocsPerSlab() : 0); mmContainers_[pid][cid].reset(new MMContainer(config, compressor_)); } } template <typename CacheTrait> PoolId CacheAllocator<CacheTrait>::getPoolId( folly::StringPiece name) const noexcept { return allocator_->getPoolId(name.str()); } // Returns a consolidated vector of slab release events from the pool // workers (pool rebalancer, pool resizer, and memory monitor).
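// A hypothetical caller could consume the result like this (assuming the
// three event containers expose size()):
//
//   auto ev = cache.getAllSlabReleaseEvents(pid);
//   const auto total = ev.rebalancerEvents.size() + ev.resizerEvents.size() +
//                      ev.monitorEvents.size();
//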
template <typename CacheTrait> AllSlabReleaseEvents CacheAllocator<CacheTrait>::getAllSlabReleaseEvents( PoolId poolId) const { AllSlabReleaseEvents res; // lock protects against workers being restarted { std::lock_guard<std::mutex> l(workersMutex_); if (poolRebalancer_) { res.rebalancerEvents = poolRebalancer_->getSlabReleaseEvents(poolId); } if (poolResizer_) { res.resizerEvents = poolResizer_->getSlabReleaseEvents(poolId); } if (memMonitor_) { res.monitorEvents = memMonitor_->getSlabReleaseEvents(poolId); } } return res; } template <typename CacheTrait> std::set<PoolId> CacheAllocator<CacheTrait>::filterCompactCachePools( const PoolIds& poolIds) const { PoolIds ret; folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); for (auto poolId : poolIds) { if (!isCompactCachePool_[poolId]) { // filter out slab pools backing the compact caches. ret.insert(poolId); } } return ret; } template <typename CacheTrait> std::set<PoolId> CacheAllocator<CacheTrait>::getRegularPoolIds() const { folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); return filterCompactCachePools(allocator_->getPoolIds()); } template <typename CacheTrait> std::set<PoolId> CacheAllocator<CacheTrait>::getCCachePoolIds() const { PoolIds ret; folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); for (PoolId id = 0; id < static_cast<PoolId>(MemoryPoolManager::kMaxPools); id++) { if (isCompactCachePool_[id]) { // collect the pools backing the compact caches. ret.insert(id); } } return ret; } template <typename CacheTrait> std::set<PoolId> CacheAllocator<CacheTrait>::getRegularPoolIdsForResize() const { folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); // If Slabs are getting advised away - as indicated by non-zero // getAdvisedMemorySize - then pools may be overLimit even when // all slabs are not allocated. Otherwise, pools may be overLimit // only after all slabs are allocated. // return (allocator_->allSlabsAllocated()) || (allocator_->getAdvisedMemorySize() != 0) ? filterCompactCachePools(allocator_->getPoolsOverLimit()) : std::set<PoolId>{}; } template <typename CacheTrait> const std::string CacheAllocator<CacheTrait>::getCacheName() const { return config_.cacheName; } template <typename CacheTrait> PoolStats CacheAllocator<CacheTrait>::getPoolStats(PoolId poolId) const { const auto& pool = allocator_->getPool(poolId); const auto& allocSizes = pool.getAllocSizes(); auto mpStats = pool.getStats(); const auto& classIds = mpStats.classIds; // check if this is a compact cache. bool isCompactCache = false; { folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); isCompactCache = isCompactCachePool_[poolId]; } std::unordered_map<ClassId, CacheStat> cacheStats; uint64_t totalHits = 0; // cacheStats is only meaningful for pools that are not compact caches. // TODO export evictions, numItems etc from compact cache directly.
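// (For compact cache pools, cacheStats below is intentionally left empty.)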
if (!isCompactCache) { for (const ClassId cid : classIds) { const auto& container = getMMContainer(poolId, cid); uint64_t classHits = (*stats_.cacheHits)[poolId][cid].get(); cacheStats.insert( {cid, {allocSizes[cid], (*stats_.allocAttempts)[poolId][cid].get(), (*stats_.allocFailures)[poolId][cid].get(), (*stats_.fragmentationSize)[poolId][cid].get(), classHits, (*stats_.chainedItemEvictions)[poolId][cid].get(), (*stats_.regularItemEvictions)[poolId][cid].get(), container.getStats()}}); totalHits += classHits; } } PoolStats ret; ret.isCompactCache = isCompactCache; ret.poolName = allocator_->getPoolName(poolId); ret.poolSize = pool.getPoolSize(); ret.poolUsableSize = pool.getPoolUsableSize(); ret.poolAdvisedSize = pool.getPoolAdvisedSize(); ret.cacheStats = std::move(cacheStats); ret.mpStats = std::move(mpStats); ret.numPoolGetHits = totalHits; ret.evictionAgeSecs = stats_.perPoolEvictionAgeSecs_[poolId].estimate(); return ret; } template <typename CacheTrait> PoolEvictionAgeStats CacheAllocator<CacheTrait>::getPoolEvictionAgeStats( PoolId pid, unsigned int slabProjectionLength) const { PoolEvictionAgeStats stats; const auto& pool = allocator_->getPool(pid); const auto& allocSizes = pool.getAllocSizes(); for (ClassId cid = 0; cid < static_cast<ClassId>(allocSizes.size()); ++cid) { auto& mmContainer = getMMContainer(pid, cid); const auto numItemsPerSlab = allocator_->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); const auto projectionLength = numItemsPerSlab * slabProjectionLength; stats.classEvictionAgeStats[cid] = mmContainer.getEvictionAgeStat(projectionLength); } return stats; } template <typename CacheTrait> CacheMetadata CacheAllocator<CacheTrait>::getCacheMetadata() const noexcept { return CacheMetadata{kCachelibVersion, kCacheRamFormatVersion, kCacheNvmFormatVersion, config_.size}; } template <typename CacheTrait> void CacheAllocator<CacheTrait>::releaseSlab(PoolId pid, ClassId cid, SlabReleaseMode mode, const void* hint) { releaseSlab(pid, cid, Slab::kInvalidClassId, mode, hint); } template <typename CacheTrait> void CacheAllocator<CacheTrait>::releaseSlab(PoolId pid, ClassId victim, ClassId receiver, SlabReleaseMode mode, const void* hint) { stats_.numActiveSlabReleases.inc(); SCOPE_EXIT { stats_.numActiveSlabReleases.dec(); }; switch (mode) { case SlabReleaseMode::kRebalance: stats_.numReleasedForRebalance.inc(); break; case SlabReleaseMode::kResize: stats_.numReleasedForResize.inc(); break; case SlabReleaseMode::kAdvise: stats_.numReleasedForAdvise.inc(); break; } try { auto releaseContext = allocator_->startSlabRelease( pid, victim, receiver, mode, hint, [this]() -> bool { return shutDownInProgress_; }); // No work needed if the slab is already released if (releaseContext.isReleased()) { return; } releaseSlabImpl(releaseContext); if (!allocator_->allAllocsFreed(releaseContext)) { throw std::runtime_error( folly::sformat("Was not able to free all allocs. PoolId: {}, AC: {}", releaseContext.getPoolId(), releaseContext.getClassId())); } allocator_->completeSlabRelease(releaseContext); } catch (const exception::SlabReleaseAborted& e) { stats_.numAbortedSlabReleases.inc(); throw exception::SlabReleaseAborted(folly::sformat( "Slab release aborted while releasing " "a slab in pool {} victim {} receiver {}. 
Original ex msg: {}", pid, static_cast<int>(victim), static_cast<int>(receiver), e.what())); } } template <typename CacheTrait> SlabReleaseStats CacheAllocator<CacheTrait>::getSlabReleaseStats() const noexcept { std::lock_guard<std::mutex> l(workersMutex_); return SlabReleaseStats{stats_.numActiveSlabReleases.get(), stats_.numReleasedForRebalance.get(), stats_.numReleasedForResize.get(), stats_.numReleasedForAdvise.get(), poolRebalancer_ ? poolRebalancer_->getRunCount() : 0ULL, poolResizer_ ? poolResizer_->getRunCount() : 0ULL, memMonitor_ ? memMonitor_->getRunCount() : 0ULL, stats_.numMoveAttempts.get(), stats_.numMoveSuccesses.get(), stats_.numEvictionAttempts.get(), stats_.numEvictionSuccesses.get()}; } template <typename CacheTrait> void CacheAllocator<CacheTrait>::releaseSlabImpl( const SlabReleaseContext& releaseContext) { util::Throttler throttler(config_.throttleConfig); // Active allocations need to be freed before we can release this slab // The idea is: // 1. Iterate through each active allocation // 2. Under AC lock, acquire ownership of this active allocation // 3. If 2 is successful, Move or Evict // 4. Move on to the next item if current item is freed for (auto alloc : releaseContext.getActiveAllocations()) { // Need to mark an item for release before proceeding // If we can't mark as moving, it means the item is already freed const bool isAlreadyFreed = !markMovingForSlabRelease(releaseContext, alloc, throttler); if (isAlreadyFreed) { continue; } Item& item = *static_cast<Item*>(alloc); // Try to move this item and make sure we can free the memory const bool isMoved = moveForSlabRelease(releaseContext, item, throttler); // if moving fails, evict it if (!isMoved) { evictForSlabRelease(releaseContext, item, throttler); } XDCHECK(allocator_->isAllocFreed(releaseContext, alloc)); } } template <typename CacheTrait> void CacheAllocator<CacheTrait>::throttleWith(util::Throttler& t, std::function<void()> fn) { const unsigned int rateLimit = 1024; // execute every 1024 times we have actually throttled if (t.throttle() && (t.numThrottles() % rateLimit) == 0) { fn(); } } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::moveForSlabRelease( const SlabReleaseContext& ctx, Item& oldItem, util::Throttler& throttler) { if (!config_.moveCb) { return false; } bool isMoved = false; auto startTime = util::getCurrentTimeSec(); WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem); for (unsigned int itemMovingAttempts = 0; itemMovingAttempts < config_.movingTries; ++itemMovingAttempts) { stats_.numMoveAttempts.inc(); // Nothing to move and the key is likely also bogus for chained items. if (oldItem.isOnlyMoving()) { oldItem.unmarkMoving(); const auto res = releaseBackToAllocator(oldItem, RemoveContext::kNormal, false); XDCHECK(res == ReleaseRes::kReleased); return true; } if (!newItemHdl) { // try to allocate again if it previously wasn't successful newItemHdl = allocateNewItemForOldItem(oldItem); } // if we have a valid handle, try to move, if not, we retry. if (newItemHdl) { isMoved = tryMovingForSlabRelease(oldItem, newItemHdl); if (isMoved) { break; } } throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still trying to move Item: {}. " "Pool: {}, Class: {}.", util::getCurrentTimeSec() - startTime, oldItem.toString(), ctx.getPoolId(), ctx.getClassId()); }); } // Return false if we've exhausted moving tries. if (!isMoved) { return false; } // Since item has been moved, we can directly free it.
We don't need to // worry about any stats related changes, because there is another item // that's identical to this one to replace it. Here we just need to wait // until all users have dropped the item handles before we can proceed. startTime = util::getCurrentTimeSec(); while (!oldItem.isOnlyMoving()) { throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still waiting for refcount to " "drain Item: {}. Pool: {}, Class: {}.", util::getCurrentTimeSec() - startTime, oldItem.toString(), ctx.getPoolId(), ctx.getClassId()); }); } const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); allocator_->free(&oldItem); (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, oldItem)); stats_.numMoveSuccesses.inc(); return true; } template <typename CacheTrait> typename CacheAllocator<CacheTrait>::ItemHandle CacheAllocator<CacheTrait>::validateAndGetParentHandleForChainedMoveLocked( const ChainedItem& item, const Key& parentKey) { ItemHandle parentHandle{}; try { parentHandle = findInternal(parentKey); // If the parent is not the same as the parent of the chained item, // it means someone has replaced our old parent already. So we abort. if (!parentHandle || parentHandle.get() != &item.getParentItem(compressor_)) { return {}; } } catch (const exception::RefcountOverflow&) { return {}; } return parentHandle; } template <typename CacheTrait> typename CacheAllocator<CacheTrait>::WriteHandle CacheAllocator<CacheTrait>::allocateNewItemForOldItem(const Item& oldItem) { if (oldItem.isChainedItem()) { const auto& oldChainedItem = oldItem.asChainedItem(); const auto parentKey = oldChainedItem.getParentItem(compressor_).getKey(); // Grab lock to prevent anyone else from modifying the chain auto l = chainedItemLocks_.lockExclusive(parentKey); auto parentHandle = validateAndGetParentHandleForChainedMoveLocked( oldChainedItem, parentKey); if (!parentHandle) { return {}; } // Set up the destination for the move. Since oldChainedItem would have // the moving bit set, it won't be picked for eviction. auto newItemHdl = allocateChainedItemInternal(parentHandle, oldChainedItem.getSize()); if (!newItemHdl) { return {}; } XDCHECK_EQ(newItemHdl->getSize(), oldChainedItem.getSize()); auto parentPtr = parentHandle.get(); XDCHECK_EQ(reinterpret_cast<uintptr_t>(parentPtr), reinterpret_cast<uintptr_t>( &oldChainedItem.getParentItem(compressor_))); return newItemHdl; } const auto allocInfo = allocator_->getAllocInfo(static_cast<const void*>(&oldItem)); // Set up the destination for the move. Since oldItem would have the moving // bit set, it won't be picked for eviction. auto newItemHdl = allocateInternal(allocInfo.poolId, oldItem.getKey(), oldItem.getSize(), oldItem.getCreationTime(), oldItem.getExpiryTime()); if (!newItemHdl) { return {}; } XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); XDCHECK_EQ(reinterpret_cast<uintptr_t>(&getMMContainer(oldItem)), reinterpret_cast<uintptr_t>(&getMMContainer(*newItemHdl))); return newItemHdl; } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::tryMovingForSlabRelease( Item& oldItem, ItemHandle& newItemHdl) { // By holding onto a user-level synchronization object, we ensure moving // a regular item or chained item is synchronized with any potential // user-side mutation. 
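// A minimal sketch of such a sync object, assuming the application serializes
// its own mutations with a per-key mutex (the type, the try-lock policy, and
// the mutexFor() helper below are hypothetical; only the isValid() contract
// is taken from this function):
//
//   struct TryLockSync : SyncObj {
//     std::unique_lock<std::mutex> l;
//     explicit TryLockSync(std::mutex& m) : l(m, std::try_to_lock) {}
//     bool isValid() const override { return l.owns_lock(); }
//   };
//
// config_.movingSync would then return
// std::make_unique<TryLockSync>(mutexFor(key)) for the key being moved, or
// nullptr for keys that need no synchronization.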
std::unique_ptr<SyncObj> syncObj; if (config_.movingSync) { if (!oldItem.isChainedItem()) { syncObj = config_.movingSync(oldItem.getKey()); } else { // Copy the key so we have a valid key to work with if the chained // item is still valid. const std::string parentKey = oldItem.asChainedItem().getParentItem(compressor_).getKey().str(); if (oldItem.isOnlyMoving()) { // If chained item no longer has a refcount, its parent is already // being released, so we abort this attempt to move. return false; } syncObj = config_.movingSync(parentKey); } // We need to differentiate between the following three scenarios: // 1. nullptr indicates no move sync required for this particular item // 2. syncObj->isValid() == true meaning we've obtained the sync // 3. syncObj->isValid() == false meaning we need to abort and retry if (syncObj && !syncObj->isValid()) { return false; } } return oldItem.isChainedItem() ? moveChainedItem(oldItem.asChainedItem(), newItemHdl) : moveRegularItem(oldItem, newItemHdl); } template <typename CacheTrait> void CacheAllocator<CacheTrait>::evictForSlabRelease( const SlabReleaseContext& ctx, Item& item, util::Throttler& throttler) { XDCHECK(!config_.isEvictionDisabled()); auto startTime = util::getCurrentTimeSec(); while (true) { stats_.numEvictionAttempts.inc(); // if the item is already in a state where only the moving bit is set, // nothing needs to be done. We simply need to unmark moving bit and free // the item. if (item.isOnlyMoving()) { item.unmarkMoving(); const auto res = releaseBackToAllocator(item, RemoveContext::kNormal, false); XDCHECK(ReleaseRes::kReleased == res); return; } // Since we couldn't move, we now evict this item. Owning handle will be // the item's handle for regular/normal items and will be the parent // handle for chained items. auto owningHandle = item.isChainedItem() ? evictChainedItemForSlabRelease(item.asChainedItem()) : evictNormalItemForSlabRelease(item); // we managed to evict the corresponding owner of the item and have the // last handle for the owner. if (owningHandle) { const auto allocInfo = allocator_->getAllocInfo(static_cast<const void*>(&item)); if (owningHandle->hasChainedItem()) { (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId] .inc(); } else { (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId] .inc(); } stats_.numEvictionSuccesses.inc(); // we have the last handle. no longer need to hold on to the moving bit item.unmarkMoving(); XDCHECK(owningHandle->isExclusive()); // manually decrement the refcount to call releaseBackToAllocator const auto ref = decRef(*owningHandle); XDCHECK(ref == 0); const auto res = releaseBackToAllocator(*owningHandle.release(), RemoveContext::kEviction, false); XDCHECK(res == ReleaseRes::kReleased); return; } if (shutDownInProgress_) { item.unmarkMoving(); allocator_->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while trying to evict" " Item: {} Pool: {}, Class: {}.", item.toString(), ctx.getPoolId(), ctx.getClassId())); } throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still trying to evict Item: {}. " "Pool: {}, Class: {}.", util::getCurrentTimeSec() - startTime, item.toString(), ctx.getPoolId(), ctx.getClassId()) << (item.isChainedItem() ?
folly::sformat(" Parent: {}", item.asChainedItem() .getParentItem(compressor_) .toString()) : ""); }); } } template <typename CacheTrait> typename CacheAllocator<CacheTrait>::ItemHandle CacheAllocator<CacheTrait>::evictNormalItemForSlabRelease(Item& item) { XDCHECK(item.isMoving()); if (item.isOnlyMoving()) { return ItemHandle{}; } auto predicate = [](const Item& it) { return it.getRefCount() == 0; }; const bool evictToNvmCache = shouldWriteToNvmCache(item); auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) : typename NvmCacheT::PutToken{}; // We remove the item from both access and mm containers. It doesn't matter // if someone else calls remove on the item at this moment, the item cannot // be freed as long as we have the moving bit set. auto handle = accessContainer_->removeIf(item, std::move(predicate)); if (!handle) { return handle; } XDCHECK_EQ(reinterpret_cast<uintptr_t>(handle.get()), reinterpret_cast<uintptr_t>(&item)); XDCHECK_EQ(1u, handle->getRefCount()); removeFromMMContainer(item); // now that we are the only handle and we actually removed something from // the RAM cache, we enqueue it to nvmcache. if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { nvmCache_->put(handle, std::move(token)); } return handle; } template <typename CacheTrait> typename CacheAllocator<CacheTrait>::ItemHandle CacheAllocator<CacheTrait>::evictChainedItemForSlabRelease(ChainedItem& child) { XDCHECK(child.isMoving()); // We have the child marked as moving, but dont know anything about the // state of the parent. Unlike the case of regular eviction where we are // sure that the child is inside the MMContainer, ensuring its parent is // valid, we can not make any assumptions here. We try to find the parent // first through the access container and then verify that the parent's // chain points to the child before cleaning up the parent. If the parent // was in the process of being re-allocated or child was being removed // concurrently, we would synchronize here on one of the checks. Item& expectedParent = child.getParentItem(compressor_); // Grab exclusive lock since we are modifying the chain. at this point, we // dont know the state of the parent. so we need to do some validity checks // after we have the chained item lock to ensure that we got the lock off of // a valid state. const std::string parentKey = expectedParent.getKey().str(); auto l = chainedItemLocks_.lockExclusive(parentKey); // check if the child is still in mmContainer and the expected parent is // valid under the chained item lock. if (expectedParent.getKey() != parentKey || !child.isInMMContainer() || child.isOnlyMoving() || &expectedParent != &child.getParentItem(compressor_) || !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { return {}; } // search if the child is present in the chain auto parentHandle = findInternal(parentKey); if (!parentHandle || parentHandle != &expectedParent) { return {}; } ChainedItem* head = nullptr; { // scope for the handle auto headHandle = findChainedItem(expectedParent); head = headHandle ? &headHandle->asChainedItem() : nullptr; } bool found = false; while (head) { if (head == &child) { found = true; break; } head = head->getNext(compressor_); } if (!found) { return {}; } // if we found the child in the parent's chain, we remove it and ensure that // the handle we obtained was the last one. Before that, create a put token // to guard any racing cache find to avoid item re-appearing in NvmCache. 
const bool evictToNvmCache = shouldWriteToNvmCache(expectedParent); auto token = evictToNvmCache ? nvmCache_->createPutToken(expectedParent.getKey()) : typename NvmCacheT::PutToken{}; if (!accessContainer_->removeIf(expectedParent, parentEvictForSlabReleasePredicate)) { return {}; } // at this point, we should be the last handle owner XDCHECK_EQ(1u, parentHandle->getRefCount()); // We remove the parent from both access and mm containers. It doesn't // matter if someone else calls remove on the parent at this moment, it // cannot be freed since we hold an active item handle removeFromMMContainer(*parentHandle); // In case someone else had removed this chained item from its parent by now // So we check again to see if it has been unlinked from its parent if (!child.isInMMContainer() || child.isOnlyMoving()) { return {}; } // check after removing from the MMContainer that the parent is still not // being marked as moving. If parent is moving, it will release the child // item and we will wait for that. if (parentHandle->isMoving()) { return {}; } // now that we are the only handle and we actually removed something from // the RAM cache, we enqueue it to nvmcache. if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { DCHECK(parentHandle->hasChainedItem()); nvmCache_->put(parentHandle, std::move(token)); } return parentHandle; } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::removeIfExpired(const ItemHandle& handle) { if (!handle) { return false; } // We remove the item from both access and mm containers. // We want to make sure the caller is the only one holding the handle. auto removedHandle = accessContainer_->removeIf(*(handle.getInternal()), itemExpiryPredicate); if (removedHandle) { removeFromMMContainer(*(handle.getInternal())); return true; } return false; } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::markMovingForSlabRelease( const SlabReleaseContext& ctx, void* alloc, util::Throttler& throttler) { // MemoryAllocator::processAllocForRelease will execute the callback // if the item is not already free. So there are three outcomes here: // 1. Item not freed yet and marked as moving // 2. Item not freed yet but could not be marked as moving // 3. Item freed already // // For 1), return true // For 2), retry // For 3), return false to abort since no action is required // At first, we assume this item was already freed bool itemFreed = true; bool markedMoving = false; const auto fn = [&markedMoving, &itemFreed](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast<Item*>(memory); if (item->markMoving()) { markedMoving = true; } }; auto startTime = util::getCurrentTimeSec(); while (true) { allocator_->processAllocForRelease(ctx, alloc, fn); // If item is already freed we give up trying to mark the item moving // and return false, otherwise if marked as moving, we return true. if (itemFreed) { return false; } else if (markedMoving) { return true; } // Reset this to true, since we always assume an item is freed // when checking with the AllocationClass itemFreed = true; if (shutDownInProgress_) { XDCHECK(!static_cast<Item*>(alloc)->isMoving()); allocator_->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while still trying to mark" " as moving for Item: {}. 
Pool: {}, Class: {}.", static_cast<Item*>(alloc)->toString(), ctx.getPoolId(), ctx.getClassId())); } throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still trying to mark as moving for " "Item: {}. Pool: {}, Class: {}.", util::getCurrentTimeSec() - startTime, static_cast<Item*>(alloc)->toString(), ctx.getPoolId(), ctx.getClassId()); }); } } template <typename CacheTrait> template <typename CCacheT, typename... Args> CCacheT* CacheAllocator<CacheTrait>::addCompactCache(folly::StringPiece name, size_t size, Args&&... args) { if (!config_.isCompactCacheEnabled()) { throw std::logic_error("Compact cache is not enabled"); } folly::SharedMutex::WriteHolder lock(compactCachePoolsLock_); auto poolId = allocator_->addPool(name, size, {Slab::kSize}); isCompactCachePool_[poolId] = true; auto ptr = std::make_unique<CCacheT>( compactCacheManager_->addAllocator(name.str(), poolId), std::forward<Args>(args)...); auto it = compactCaches_.emplace(poolId, std::move(ptr)); XDCHECK(it.second); return static_cast<CCacheT*>(it.first->second.get()); } template <typename CacheTrait> template <typename CCacheT, typename... Args> CCacheT* CacheAllocator<CacheTrait>::attachCompactCache(folly::StringPiece name, Args&&... args) { auto& allocator = compactCacheManager_->getAllocator(name.str()); auto poolId = allocator.getPoolId(); // if a compact cache with this name already exists, return without creating // new instance folly::SharedMutex::WriteHolder lock(compactCachePoolsLock_); if (compactCaches_.find(poolId) != compactCaches_.end()) { return static_cast<CCacheT*>(compactCaches_[poolId].get()); } auto ptr = std::make_unique<CCacheT>(allocator, std::forward<Args>(args)...); auto it = compactCaches_.emplace(poolId, std::move(ptr)); XDCHECK(it.second); return static_cast<CCacheT*>(it.first->second.get()); } template <typename CacheTrait> const ICompactCache& CacheAllocator<CacheTrait>::getCompactCache( PoolId pid) const { folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); if (!isCompactCachePool_[pid]) { throw std::invalid_argument( folly::sformat("PoolId {} is not a compact cache", pid)); } auto it = compactCaches_.find(pid); if (it == compactCaches_.end()) { throw std::invalid_argument(folly::sformat( "PoolId {} belongs to an un-attached compact cache", pid)); } return *it->second; } template <typename CacheTrait> void CacheAllocator<CacheTrait>::setPoolOptimizerFor(PoolId poolId, bool enableAutoResizing) { optimizerEnabled_[poolId] = enableAutoResizing; } template <typename CacheTrait> void CacheAllocator<CacheTrait>::resizeCompactCaches() { compactCacheManager_->resizeAll(); } template <typename CacheTrait> typename CacheTrait::MMType::LruType CacheAllocator<CacheTrait>::getItemLruType( const Item& item) const { return getMMContainer(item).getLruType(item); } // The order of the serialization is as follows: // // This is also the order of deserialization in the constructor, when // we restore the cache allocator. 
// // --------------------------------- // | accessContainer_ | // | mmContainers_ | // | compactCacheManager_ | // | allocator_ | // | metadata_ | // --------------------------------- template <typename CacheTrait> folly::IOBufQueue CacheAllocator<CacheTrait>::saveStateToIOBuf() { if (stats_.numActiveSlabReleases.get() != 0) { throw std::logic_error( "There are still slabs being released at the moment"); } *metadata_.allocatorVersion_ref() = kCachelibVersion; *metadata_.ramFormatVersion_ref() = kCacheRamFormatVersion; *metadata_.cacheCreationTime_ref() = static_cast<int64_t>(cacheCreationTime_); *metadata_.mmType_ref() = MMType::kId; *metadata_.accessType_ref() = AccessType::kId; metadata_.compactCachePools_ref()->clear(); const auto pools = getPoolIds(); { folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); for (PoolId pid : pools) { for (unsigned int cid = 0; cid < (*stats_.fragmentationSize)[pid].size(); ++cid) { metadata_.fragmentationSize_ref()[pid][static_cast<ClassId>(cid)] = (*stats_.fragmentationSize)[pid][cid].get(); } if (isCompactCachePool_[pid]) { metadata_.compactCachePools_ref()->push_back(pid); } } } *metadata_.numChainedParentItems_ref() = stats_.numChainedParentItems.get(); *metadata_.numChainedChildItems_ref() = stats_.numChainedChildItems.get(); *metadata_.numAbortedSlabReleases_ref() = stats_.numAbortedSlabReleases.get(); auto serializeMMContainers = [](MMContainers& mmContainers) { MMSerializationTypeContainer state; for (unsigned int i = 0; i < mmContainers.size(); ++i) { for (unsigned int j = 0; j < mmContainers[i].size(); ++j) { if (mmContainers[i][j]) { state.pools_ref()[i][j] = mmContainers[i][j]->saveState(); } } } return state; }; MMSerializationTypeContainer mmContainersState = serializeMMContainers(mmContainers_); AccessSerializationType accessContainerState = accessContainer_->saveState(); MemoryAllocator::SerializationType allocatorState = allocator_->saveState(); CCacheManager::SerializationType ccState = compactCacheManager_->saveState(); AccessSerializationType chainedItemAccessContainerState = chainedItemAccessContainer_->saveState(); // serialize to an iobuf queue. The caller can then copy over the serialized // results into a single buffer. folly::IOBufQueue queue; Serializer::serializeToIOBufQueue(queue, metadata_); Serializer::serializeToIOBufQueue(queue, allocatorState); Serializer::serializeToIOBufQueue(queue, ccState); Serializer::serializeToIOBufQueue(queue, mmContainersState); Serializer::serializeToIOBufQueue(queue, accessContainerState); Serializer::serializeToIOBufQueue(queue, chainedItemAccessContainerState); return queue; } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::stopWorkers(std::chrono::seconds timeout) { bool success = true; success &= stopPoolRebalancer(timeout); success &= stopPoolResizer(timeout); success &= stopMemMonitor(timeout); success &= stopReaper(timeout); return success; } template <typename CacheTrait> typename CacheAllocator<CacheTrait>::ShutDownStatus CacheAllocator<CacheTrait>::shutDown() { using ShmShutDownRes = typename ShmManager::ShutDownRes; XLOG(DBG, "shutting down CacheAllocator"); if (shmManager_ == nullptr) { throw std::invalid_argument( "shutDown can only be called once from a cache manager created on " "shared memory. You may also be incorrectly constructing your " "allocator. 
Are you passing in " "AllocatorType::SharedMem* ?"); } XDCHECK(!config_.cacheDir.empty()); if (config_.enableFastShutdown) { shutDownInProgress_ = true; } stopWorkers(); const auto handleCount = getNumActiveHandles(); if (handleCount != 0) { XLOGF(ERR, "Found {} active handles while shutting down cache. Aborting", handleCount); return ShutDownStatus::kFailed; } const auto nvmShutDownStatusOpt = saveNvmCache(); saveRamCache(); const auto shmShutDownStatus = shmManager_->shutDown(); const auto shmShutDownSucceeded = (shmShutDownStatus == ShmShutDownRes::kSuccess); shmManager_.reset(); if (shmShutDownSucceeded) { if (!nvmShutDownStatusOpt || *nvmShutDownStatusOpt) return ShutDownStatus::kSuccess; if (nvmShutDownStatusOpt && !*nvmShutDownStatusOpt) return ShutDownStatus::kSavedOnlyDRAM; } XLOGF(ERR, "Could not shutdown DRAM cache cleanly. ShutDownRes={}", (shmShutDownStatus == ShmShutDownRes::kFailedWrite ? "kFailedWrite" : "kFileDeleted")); if (nvmShutDownStatusOpt && *nvmShutDownStatusOpt) { return ShutDownStatus::kSavedOnlyNvmCache; } return ShutDownStatus::kFailed; } template <typename CacheTrait> std::optional<bool> CacheAllocator<CacheTrait>::saveNvmCache() { if (!nvmCache_) { return std::nullopt; } // throw any exceptions from shutting down nvmcache since we don't know the // state of RAM as well. if (!nvmCache_->isEnabled()) { nvmCache_->shutDown(); return std::nullopt; } if (!nvmCache_->shutDown()) { XLOG(ERR, "Could not shutdown nvmcache cleanly"); return false; } nvmCacheState_.markSafeShutDown(); return true; } template <typename CacheTrait> void CacheAllocator<CacheTrait>::saveRamCache() { // serialize the cache state auto serializedBuf = saveStateToIOBuf(); std::unique_ptr<folly::IOBuf> ioBuf = serializedBuf.move(); ioBuf->coalesce(); void* infoAddr = shmManager_->createShm(detail::kShmInfoName, ioBuf->length()).addr; Serializer serializer(reinterpret_cast<uint8_t*>(infoAddr), reinterpret_cast<uint8_t*>(infoAddr) + ioBuf->length()); serializer.writeToBuffer(std::move(ioBuf)); } template <typename CacheTrait> typename CacheAllocator<CacheTrait>::MMContainers CacheAllocator<CacheTrait>::deserializeMMContainers( Deserializer& deserializer, const typename Item::PtrCompressor& compressor) { const auto container = deserializer.deserialize<MMSerializationTypeContainer>(); MMContainers mmContainers; for (auto& kvPool : *container.pools_ref()) { auto i = static_cast<PoolId>(kvPool.first); auto& pool = getPool(i); for (auto& kv : kvPool.second) { auto j = static_cast<ClassId>(kv.first); MMContainerPtr ptr = std::make_unique<typename MMContainerPtr::element_type>(kv.second, compressor); auto config = ptr->getConfig(); config.addExtraConfig(config_.trackTailHits ? pool.getAllocationClass(j).getAllocsPerSlab() : 0); ptr->setConfig(config); mmContainers[i][j] = std::move(ptr); } } // We need to drop the unevictableMMContainer in the deserializer. // TODO: remove this at version 17.
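// (Allocator versions <= 15 persisted one extra MM container; it is
// deserialized below purely to advance the stream and then discarded.)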
if (metadata_.allocatorVersion_ref() <= 15) { deserializer.deserialize<MMSerializationTypeContainer>(); } return mmContainers; } template <typename CacheTrait> typename CacheAllocator<CacheTrait>::MMContainers CacheAllocator<CacheTrait>::createEmptyMMContainers() { MMContainers mmContainers; for (unsigned int i = 0; i < mmContainers_.size(); i++) { for (unsigned int j = 0; j < mmContainers_[i].size(); j++) { if (mmContainers_[i][j]) { MMContainerPtr ptr = std::make_unique<typename MMContainerPtr::element_type>( mmContainers_[i][j]->getConfig(), compressor_); mmContainers[i][j] = std::move(ptr); } } } return mmContainers; } template <typename CacheTrait> serialization::CacheAllocatorMetadata CacheAllocator<CacheTrait>::deserializeCacheAllocatorMetadata( Deserializer& deserializer) { auto meta = deserializer.deserialize<serialization::CacheAllocatorMetadata>(); // TODO: // Once everyone is on v8 or later, remove the outer if. if (kCachelibVersion > 8) { if (*meta.ramFormatVersion() != kCacheRamFormatVersion) { throw std::runtime_error( folly::sformat("Expected cache ram format version {}. But found {}.", kCacheRamFormatVersion, *meta.ramFormatVersion())); } } if (*meta.accessType() != AccessType::kId) { throw std::invalid_argument( folly::sformat("Expected {}, got {} for AccessType", *meta.accessType(), AccessType::kId)); } if (*meta.mmType() != MMType::kId) { throw std::invalid_argument(folly::sformat("Expected {}, got {} for MMType", *meta.mmType(), MMType::kId)); } return meta; } template <typename CacheTrait> int64_t CacheAllocator<CacheTrait>::getNumActiveHandles() const { return handleCount_.getSnapshot(); } template <typename CacheTrait> int64_t CacheAllocator<CacheTrait>::getHandleCountForThread() const { return handleCount_.tlStats(); } template <typename CacheTrait> void CacheAllocator<CacheTrait>::resetHandleCountForThread_private() { handleCount_.tlStats() = 0; } template <typename CacheTrait> void CacheAllocator<CacheTrait>::adjustHandleCountForThread_private( int64_t delta) { handleCount_.tlStats() += delta; } template <typename CacheTrait> void CacheAllocator<CacheTrait>::initStats() { stats_.init(); // deserialize the fragmentation size for each pool and class.
for (const auto& pid : *metadata_.fragmentationSize_ref()) { for (const auto& cid : pid.second) { (*stats_.fragmentationSize)[pid.first][cid.first].set( static_cast<uint64_t>(cid.second)); } } // deserialize item counter stats stats_.numChainedParentItems.set(*metadata_.numChainedParentItems_ref()); stats_.numChainedChildItems.set(*metadata_.numChainedChildItems_ref()); stats_.numAbortedSlabReleases.set( static_cast<uint64_t>(*metadata_.numAbortedSlabReleases_ref())); } template <typename CacheTrait> void CacheAllocator<CacheTrait>::forEachChainedItem( const Item& parent, std::function<void(ChainedItem&)> func) { auto l = chainedItemLocks_.lockShared(parent.getKey()); auto headHandle = findChainedItem(parent); if (!headHandle) { return; } ChainedItem* head = &headHandle.get()->asChainedItem(); while (head) { func(*head); head = head->getNext(compressor_); } } template <typename CacheTrait> typename CacheAllocator<CacheTrait>::ItemHandle CacheAllocator<CacheTrait>::findChainedItem(const Item& parent) const { const auto cPtr = compressor_.compress(&parent); return chainedItemAccessContainer_->find( Key{reinterpret_cast<const char*>(&cPtr), ChainedItem::kKeySize}); } template <typename CacheTrait> template <typename Handle, typename Iter> CacheChainedAllocs<CacheAllocator<CacheTrait>, Handle, Iter> CacheAllocator<CacheTrait>::viewAsChainedAllocsT(const Handle& parent) { XDCHECK(parent); auto handle = parent.clone(); if (!handle) { throw std::invalid_argument("Failed to clone item handle"); } if (!handle->hasChainedItem()) { throw std::invalid_argument( folly::sformat("Failed to materialize chain. Parent does not have " "chained items. Parent: {}", parent->toString())); } auto l = chainedItemLocks_.lockShared(handle->getKey()); auto head = findChainedItem(*handle); return CacheChainedAllocs<CacheAllocator<CacheTrait>, Handle, Iter>{ std::move(l), std::move(handle), *head, compressor_}; } template <typename CacheTrait> GlobalCacheStats CacheAllocator<CacheTrait>::getGlobalCacheStats() const { GlobalCacheStats ret{}; stats_.populateGlobalCacheStats(ret); ret.numItems = accessContainer_->getStats().numKeys; const uint64_t currTime = util::getCurrentTimeSec(); ret.ramUpTime = currTime - cacheCreationTime_; ret.nvmUpTime = currTime - nvmCacheState_.getCreationTime(); ret.nvmCacheEnabled = nvmCache_ ? nvmCache_->isEnabled() : false; ret.reaperStats = getReaperStats(); ret.numActiveHandles = getNumActiveHandles(); return ret; } template <typename CacheTrait> CacheMemoryStats CacheAllocator<CacheTrait>::getCacheMemoryStats() const { const auto totalCacheSize = allocator_->getMemorySize(); auto addSize = [this](size_t a, PoolId pid) { return a + allocator_->getPool(pid).getPoolSize(); }; const auto regularPoolIds = getRegularPoolIds(); const auto ccCachePoolIds = getCCachePoolIds(); size_t regularCacheSize = std::accumulate( regularPoolIds.begin(), regularPoolIds.end(), 0ULL, addSize); size_t compactCacheSize = std::accumulate( ccCachePoolIds.begin(), ccCachePoolIds.end(), 0ULL, addSize); return CacheMemoryStats{totalCacheSize, regularCacheSize, compactCacheSize, allocator_->getAdvisedMemorySize(), memMonitor_ ? memMonitor_->getMaxAdvisePct() : 0, allocator_->getUnreservedMemorySize(), nvmCache_ ? 
nvmCache_->getSize() : 0, util::getMemAvailable(), util::getRSSBytes()}; } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::autoResizeEnabledForPool(PoolId pid) const { folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); if (isCompactCachePool_[pid]) { // compact caches need to be registered to enable auto resizing return optimizerEnabled_[pid]; } else { // by default all regular pools participate in auto resizing return true; } } template <typename CacheTrait> template <typename T> bool CacheAllocator<CacheTrait>::stopWorker(folly::StringPiece name, std::unique_ptr<T>& worker, std::chrono::seconds timeout) { std::lock_guard<std::mutex> l(workersMutex_); if (!worker) { return true; } bool ret = worker->stop(timeout); if (ret) { XLOGF(DBG1, "Stopped worker '{}'", name); } else { XLOGF(ERR, "Couldn't stop worker '{}', timeout: {} seconds", name, timeout.count()); } worker.reset(); return ret; } template <typename CacheTrait> template <typename T, typename... Args> bool CacheAllocator<CacheTrait>::startNewWorker( folly::StringPiece name, std::unique_ptr<T>& worker, std::chrono::milliseconds interval, Args&&... args) { if (!stopWorker(name, worker)) { return false; } std::lock_guard<std::mutex> l(workersMutex_); worker = std::make_unique<T>(*this, std::forward<Args>(args)...); bool ret = worker->start(interval, name); if (ret) { XLOGF(DBG1, "Started worker '{}'", name); } else { XLOGF(ERR, "Couldn't start worker '{}', interval: {} milliseconds", name, interval.count()); } return ret; } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::startNewPoolRebalancer( std::chrono::milliseconds interval, std::shared_ptr<RebalanceStrategy> strategy, unsigned int freeAllocThreshold) { return startNewWorker("PoolRebalancer", poolRebalancer_, interval, strategy, freeAllocThreshold); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::startNewPoolResizer( std::chrono::milliseconds interval, unsigned int poolResizeSlabsPerIter, std::shared_ptr<RebalanceStrategy> strategy) { return startNewWorker("PoolResizer", poolResizer_, interval, poolResizeSlabsPerIter, strategy); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::startNewPoolOptimizer( std::chrono::seconds regularInterval, std::chrono::seconds ccacheInterval, std::shared_ptr<PoolOptimizeStrategy> strategy, unsigned int ccacheStepSizePercent) { // For now we are asking the worker to wake up every second to see whether // it should do actual size optimization. Probably need to move to using // the same interval for both, with confirmation of further experiments. 
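// (The 1-second tick below only polls; the effective optimization cadence is
// still governed by regularInterval and ccacheInterval passed to the worker.)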
const auto workerInterval = std::chrono::seconds(1); return startNewWorker("PoolOptimizer", poolOptimizer_, workerInterval, strategy, regularInterval.count(), ccacheInterval.count(), ccacheStepSizePercent); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::startNewMemMonitor( std::chrono::milliseconds interval, MemoryMonitor::Config config, std::shared_ptr<RebalanceStrategy> strategy) { return startNewWorker("MemoryMonitor", memMonitor_, interval, std::move(config), strategy); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::startNewReaper( std::chrono::milliseconds interval, util::Throttler::Config reaperThrottleConfig) { return startNewWorker("Reaper", reaper_, interval, reaperThrottleConfig); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::stopPoolRebalancer( std::chrono::seconds timeout) { return stopWorker("PoolRebalancer", poolRebalancer_, timeout); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::stopPoolResizer(std::chrono::seconds timeout) { return stopWorker("PoolResizer", poolResizer_, timeout); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::stopPoolOptimizer( std::chrono::seconds timeout) { return stopWorker("PoolOptimizer", poolOptimizer_, timeout); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::stopMemMonitor(std::chrono::seconds timeout) { return stopWorker("MemoryMonitor", memMonitor_, timeout); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::stopReaper(std::chrono::seconds timeout) { return stopWorker("Reaper", reaper_, timeout); } template <typename CacheTrait> bool CacheAllocator<CacheTrait>::cleanupStrayShmSegments( const std::string& cacheDir, bool posix) { if (util::getStatIfExists(cacheDir, nullptr) && util::isDir(cacheDir)) { try { // cache dir exists. clean up only if there are no other processes // attached. if another process was attached, the following would fail. ShmManager::cleanup(cacheDir, posix); } catch (const std::exception& e) { XLOGF(ERR, "Error cleaning up {}. Exception: {}", cacheDir, e.what()); return false; } } else { // cache dir did not exist. Try to nuke the segments we know by name. // Any other concurrent process cannot be attached to the segments; even // if one is, we want to mark them for destruction. ShmManager::removeByName(cacheDir, detail::kShmInfoName, posix); ShmManager::removeByName(cacheDir, detail::kShmCacheName, posix); ShmManager::removeByName(cacheDir, detail::kShmHashTableName, posix); ShmManager::removeByName(cacheDir, detail::kShmChainedItemHashTableName, posix); } return true; } template <typename CacheTrait> uint64_t CacheAllocator<CacheTrait>::getItemPtrAsOffset(const void* ptr) { // Return uint64_t instead of uintptr_t to accommodate platforms where // the two differ (e.g. Mac OS 12) - causing template instantiation // errors downstream. // if this succeeds, the address is valid within the cache. allocator_->getAllocInfo(ptr); if (!isOnShm_ || !shmManager_) { throw std::invalid_argument("Shared memory not used"); } const auto& shm = shmManager_->getShmByName(detail::kShmCacheName); return reinterpret_cast<uint64_t>(ptr) - reinterpret_cast<uint64_t>(shm.getCurrentMapping().addr); } template <typename CacheTrait> std::unordered_map<std::string, double> CacheAllocator<CacheTrait>::getNvmCacheStatsMap() const { auto ret = nvmCache_ ? 
nvmCache_->getStatsMap() : std::unordered_map<std::string, double>{}; if (nvmAdmissionPolicy_) { auto policyStats = nvmAdmissionPolicy_->getCounters(); for (const auto& kv : policyStats) { ret[kv.first] = kv.second; } } return ret; } } // namespace cachelib } // namespace facebook
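//
// End-of-file note: an illustrative warm-restart sketch tying together the
// shutdown and persistence paths above (the trait, size, and path are
// hypothetical, not prescribed by this file):
//
//   using Cache = facebook::cachelib::LruAllocator;
//   Cache::Config cfg;
//   cfg.setCacheSize(1024 * 1024 * 1024)
//      .enableCachePersistence("/tmp/cache-dir");
//   {
//     Cache cache(Cache::SharedMemNew, cfg);  // fresh shared-memory cache
//     // ... allocate / insert / find ...
//     cache.shutDown();                       // saveRamCache()/saveNvmCache()
//   }
//   Cache warm(Cache::SharedMemAttach, cfg);  // re-attach to the saved state
//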