/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cachelib/allocator/memory/SlabAllocator.h"
#include <folly/Likely.h>
#include <folly/Random.h>
#include <folly/logging/xlog.h>
#include <folly/synchronization/SanitizeThread.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <chrono>
#include <memory>
#include <stdexcept>
#include "cachelib/common/Utils.h"
/* Missing madvise(2) flags on MacOS */
#ifndef MADV_REMOVE
#define MADV_REMOVE 0
#endif
#ifndef MADV_DONTDUMP
#define MADV_DONTDUMP 0
#endif
using namespace facebook::cachelib;
namespace {
size_t roundDownToSlabSize(size_t size) { return size - (size % sizeof(Slab)); }
} // namespace
// definitions to avoid ODR violation.
using PtrType = CompressedPtr::PtrType;
constexpr uint64_t SlabAllocator::kAddressMask;
constexpr PtrType CompressedPtr::kAllocIdxMask;
constexpr unsigned int CompressedPtr::kNumAllocIdxBits;
constexpr unsigned int CompressedPtr::kNumSlabIdxBits;
constexpr unsigned int SlabAllocator::kLockSleepMS;
constexpr size_t SlabAllocator::kPagesPerStep;
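// Sanity checks on the memory layout: the memory region, the slab memory
// start, the next-slab cursor, and every slab on the free list must all be
// valid. Called from the constructors; throws std::invalid_argument on any
// violation.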
void SlabAllocator::checkState() const {
if (memoryStart_ == nullptr || memorySize_ <= Slab::kSize) {
throw std::invalid_argument(
folly::sformat("Invalid memory spec. memoryStart = {}, size = {}",
memoryStart_,
memorySize_));
}
if (slabMemoryStart_ == nullptr || nextSlabAllocation_ == nullptr) {
throw std::invalid_argument(
folly::sformat("Invalid slabMemoryStart_ {} of nextSlabAllocation_ {}",
slabMemoryStart_,
nextSlabAllocation_));
}
// nextSlabAllocation_ should be valid.
if (nextSlabAllocation_ > getSlabMemoryEnd()) {
throw std::invalid_argument(
folly::sformat("Invalid nextSlabAllocation_ {}, with SlabMemoryEnd {}",
nextSlabAllocation_,
getSlabMemoryEnd()));
}
for (const auto slab : freeSlabs_) {
if (!isValidSlab(slab)) {
throw std::invalid_argument(folly::sformat("Invalid free slab {}", slab));
}
}
}
SlabAllocator::~SlabAllocator() {
stopMemoryLocker();
if (ownsMemory_) {
munmap(memoryStart_, memorySize_);
}
}
void SlabAllocator::stopMemoryLocker() {
if (memoryLocker_.joinable()) {
stopLocking_ = true;
memoryLocker_.join();
}
}
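// Owning constructor: mmaps its own slab-aligned, zeroed memory of `size`
// bytes and is responsible for unmapping it in the destructor. This flavor is
// not restorable across restarts.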
SlabAllocator::SlabAllocator(size_t size, const Config& config)
: SlabAllocator(util::mmapAlignedZeroedMemory(sizeof(Slab), size),
size,
true,
config) {
XDCHECK(!isRestorable());
}
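// Non-owning constructor: attaches to caller-provided memory (typically a
// shared memory segment), which is what makes the allocator restorable.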
SlabAllocator::SlabAllocator(void* memoryStart,
size_t memorySize,
const Config& config)
: SlabAllocator(memoryStart, memorySize, false, config) {
XDCHECK(isRestorable());
}
SlabAllocator::SlabAllocator(void* memoryStart,
size_t memorySize,
bool ownsMemory,
const Config& config)
: memoryStart_(memoryStart),
memorySize_(roundDownToSlabSize(memorySize)),
slabMemoryStart_(computeSlabMemoryStart(memoryStart_, memorySize_)),
nextSlabAllocation_(slabMemoryStart_),
ownsMemory_(ownsMemory) {
checkState();
static_assert(!(sizeof(Slab) & (sizeof(Slab) - 1)),
"slab size must be power of two");
if (config.excludeFromCoredump) {
excludeMemoryFromCoredump();
}
if (config.lockMemory) {
memoryLocker_ = std::thread{[this]() { lockMemoryAsync(); }};
}
XDCHECK_EQ(0u, reinterpret_cast<uintptr_t>(memoryStart_) % sizeof(Slab));
XDCHECK_EQ(0u, memorySize_ % sizeof(Slab));
XDCHECK(nextSlabAllocation_ != nullptr);
XDCHECK_EQ(reinterpret_cast<uintptr_t>(nextSlabAllocation_),
reinterpret_cast<uintptr_t>(slabMemoryStart_));
}
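// Restore constructor: rebuilds the allocator on top of an existing memory
// region from a previously saved SlabAllocatorObject. Validates that the slab
// size, minimum alloc size, and memory size match the saved state, then
// restores the per-pool sizes, the free list, and the advised-away slab list.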
SlabAllocator::SlabAllocator(const serialization::SlabAllocatorObject& object,
void* memoryStart,
size_t memSize,
const Config& config)
: memoryStart_(memoryStart),
memorySize_(*object.memorySize_ref()),
slabMemoryStart_(computeSlabMemoryStart(memoryStart_, memorySize_)),
nextSlabAllocation_(getSlabForIdx(*object.nextSlabIdx_ref())),
canAllocate_(*object.canAllocate_ref()),
ownsMemory_(false) {
if (Slab::kSize != *object.slabSize_ref()) {
throw std::invalid_argument(folly::sformat(
"current slab size {} does not match the previous one {}",
Slab::kSize,
*object.slabSize_ref()));
}
if (CompressedPtr::getMinAllocSize() != *object.minAllocSize_ref()) {
throw std::invalid_argument(folly::sformat(
"current min alloc size {} does not match the previous one {}",
CompressedPtr::getMinAllocSize(),
*object.minAllocSize_ref()));
}
XDCHECK(isRestorable());
const size_t currSize = roundDownToSlabSize(memSize);
if (memorySize_ != currSize) {
throw std::invalid_argument(folly::sformat(
"Memory size {} does not match the saved state's size {}",
currSize,
memorySize_));
}
if (config.excludeFromCoredump) {
excludeMemoryFromCoredump();
}
for (const auto& pair : *object.memoryPoolSize_ref()) {
const PoolId id = pair.first;
if (id >= static_cast<PoolId>(memoryPoolSize_.size())) {
throw std::invalid_argument(
folly::sformat("Invalid class id {}. Max Class Id {}",
id,
memoryPoolSize_.size() - 1));
}
memoryPoolSize_[id] = pair.second;
}
for (auto freeSlabIdx : *object.freeSlabIdxs_ref()) {
freeSlabs_.push_back(getSlabForIdx(freeSlabIdx));
}
for (auto advisedSlabIdx : *object.advisedSlabIdxs_ref()) {
// Slab headers in a previous release did not have the advised flag set.
// Without it, memory locking would touch advised-away slab pages unless we
// cold rolled. To avoid a cold roll, explicitly set the advised bit here.
auto header = getSlabHeader(advisedSlabIdx);
XDCHECK(header != nullptr);
header->setAdvised(true);
advisedSlabs_.push_back(getSlabForIdx(advisedSlabIdx));
}
if (config.lockMemory) {
memoryLocker_ = std::thread{[this]() { lockMemoryAsync(); }};
}
checkState();
}
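// Runs on a background thread. Touches one byte of every page to fault the
// cache memory in, sleeping for kLockSleepMS after every kPagesPerStep pages
// and skipping pages that belong to advised-away slabs. If some pages are
// still not resident at the end, falls back to mlock().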
void SlabAllocator::lockMemoryAsync() noexcept {
try {
// memory start is always page aligned since it is aligned to slab size.
auto* mem = reinterpret_cast<const uint8_t* const>(memoryStart_);
XDCHECK(util::isPageAlignedAddr(mem));
const size_t numPages = util::getNumPages(memorySize_);
const size_t pageSize = util::getPageSize();
size_t pageOffset = 0;
size_t numAdvisedAwayPages = 0;
while (pageOffset < numPages) {
if (stopLocking_) {
return;
}
auto pageAddr = mem + pageOffset * pageSize;
// Avoid touching advised away pages.
const auto header = getSlabHeader(pageAddr);
if (header && header->isAdvised()) {
++numAdvisedAwayPages;
} else {
// This relies on the pages used by the allocator being shared memory
// pages. For memory that is not shared, touching it won't page it in
// until the page is written to; we fall back to mlock for that and
// require the caller to set the appropriate rlimits. Use volatile so
// the compiler does not optimize the read away in opt mode.
volatile const uint8_t val = *pageAddr;
(void)val;
}
++pageOffset;
if (pageOffset % kPagesPerStep == 0) {
/* sleep override */
std::this_thread::sleep_for(std::chrono::milliseconds(kLockSleepMS));
}
}
// Verify everything got paged in. If it didn't, the mlock fallback below
// will end up locking advised-away pages as well, which means we'll start
// off with some unusable cache memory.
const auto numInCore = util::getNumResidentPages(memoryStart_, memorySize_);
if (numInCore != numPages - numAdvisedAwayPages) {
XLOGF(ERR,
"could not page in all memory. expected resident pages = {}, "
"numInCore = {}. Trying to mlock.",
numPages - numAdvisedAwayPages, numInCore);
// try mlock to see if that helps.
const int rv = mlock(memoryStart_, memorySize_);
if (rv != 0) {
XLOGF(ERR, "could not mlock. errno = {}", errno);
}
}
} catch (const std::exception& e) {
XLOGF(ERR, "Exception during locking memory {}", e.what());
}
}
namespace {
unsigned int numSlabs(size_t memorySize) noexcept {
return static_cast<unsigned int>(memorySize / sizeof(Slab));
}
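// Number of slabs needed to store the headers for all slabs, rounded up to a
// whole slab.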
unsigned int numSlabsForHeaders(size_t memorySize) noexcept {
const size_t headerSpace = sizeof(SlabHeader) * numSlabs(memorySize);
return static_cast<unsigned int>((headerSpace + sizeof(Slab) - 1) /
sizeof(Slab));
}
} // namespace
unsigned int SlabAllocator::getNumUsableSlabs(size_t memorySize) noexcept {
return numSlabs(memorySize) - numSlabsForHeaders(memorySize);
}
unsigned int SlabAllocator::getNumUsableSlabs() const noexcept {
return getNumUsableAndAdvisedSlabs() -
static_cast<unsigned int>(numSlabsReclaimable());
}
unsigned int SlabAllocator::getNumUsableAndAdvisedSlabs() const noexcept {
return static_cast<unsigned int>(getSlabMemoryEnd() - slabMemoryStart_);
}
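// Returns the start of usable slab memory, i.e. the address just past the
// leading slabs reserved for slab headers.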
Slab* SlabAllocator::computeSlabMemoryStart(void* memoryStart,
size_t memorySize) {
// compute the number of slabs we can have.
const auto numHeaderSlabs = numSlabsForHeaders(memorySize);
if (numSlabs(memorySize) <= numHeaderSlabs) {
throw std::invalid_argument("not enough memory for slabs");
}
if (memoryStart == nullptr ||
reinterpret_cast<uintptr_t>(memoryStart) % sizeof(Slab)) {
throw std::invalid_argument(
folly::sformat("Invalid memory start {}", memoryStart));
}
// reserve the first numHeaderSlabs for storing the header info for all the
// slabs.
return reinterpret_cast<Slab*>(memoryStart) + numHeaderSlabs;
}
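// Grabs a slab from the free list if one exists; otherwise carves the next
// slab out of un-slabbed memory. Returns nullptr once all memory has been
// slabbed and the free list is empty, and remembers that via canAllocate_.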
Slab* SlabAllocator::makeNewSlabImpl() {
// early return without any locks.
if (!canAllocate_) {
return nullptr;
}
LockHolder l(lock_);
// grab a free slab if it exists.
if (!freeSlabs_.empty()) {
auto slab = freeSlabs_.back();
freeSlabs_.pop_back();
return slab;
}
XDCHECK_EQ(0u,
reinterpret_cast<uintptr_t>(nextSlabAllocation_) % sizeof(Slab));
// check if we have any more memory left.
if (allMemorySlabbed()) {
// free list is empty and we have slabbed all the memory.
canAllocate_ = false;
return nullptr;
}
// allocate a new slab.
return nextSlabAllocation_++;
}
// This does not hold the lock since the expectation is that it's used with
// new/freed/advised-away slabs, which are not in active use.
void SlabAllocator::initializeHeader(Slab* slab, PoolId id) {
auto* header = getSlabHeader(slab);
XDCHECK(header != nullptr);
header = new (header) SlabHeader(id);
}
Slab* SlabAllocator::makeNewSlab(PoolId id) {
Slab* slab = makeNewSlabImpl();
if (slab == nullptr) {
return nullptr;
}
memoryPoolSize_[id] += sizeof(Slab);
// initialize the header for the slab.
initializeHeader(slab, id);
return slab;
}
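// Returns the slab to the free list, deducts it from its pool's size, and
// resets the header's allocation info.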
void SlabAllocator::freeSlab(Slab* slab) {
// find the header for the slab.
auto* header = getSlabHeader(slab);
XDCHECK(header != nullptr);
if (header == nullptr) {
throw std::runtime_error(folly::sformat("Invalid Slab {}", slab));
}
memoryPoolSize_[header->poolId] -= sizeof(Slab);
// grab the lock
LockHolder l(lock_);
freeSlabs_.push_back(slab);
canAllocate_ = true;
header->resetAllocInfo();
}
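// Releases the slab's backing pages to the kernel with MADV_REMOVE and moves
// the slab to the advised-away list. Returns true on success (or when
// pretendMadvise_ is set); on failure the advised bit is cleared and false is
// returned.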
bool SlabAllocator::adviseSlab(Slab* slab) {
// find the header for the slab.
auto* header = getSlabHeader(slab);
if (header == nullptr) {
throw std::runtime_error(folly::sformat("Invalid Slab {}", slab));
}
// Mark the slab as advised in the header prior to advising, to prevent it
// from being touched during memory locking.
header->setAdvised(true);
// Ask the kernel to release this slab via madvise. Do this while not
// holding the lock, since the MADV_REMOVE work happens inline.
auto ret = madvise((void*)slab->memoryAtOffset(0), Slab::kSize, MADV_REMOVE);
if (!ret || pretendMadvise_) {
LockHolder l(lock_);
advisedSlabs_.push_back(slab);
// Note: resetAllocInfo() does not reset the header flags, so the advised
// bit set above stays set.
header->resetAllocInfo();
return true;
}
// Unset the flag since we failed to advise this slab away
header->setAdvised(false);
return false;
}
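// Takes one slab off the advised-away list, touches each of its pages to
// fault the memory back in, and reinitializes its header for the given pool.
// Returns nullptr if there is nothing to reclaim.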
Slab* FOLLY_NULLABLE SlabAllocator::reclaimSlab(PoolId id) {
Slab* slab = nullptr;
{
LockHolder l(lock_);
if (!advisedSlabs_.empty()) {
auto it = advisedSlabs_.begin();
slab = *it;
advisedSlabs_.erase(it);
}
}
if (!slab) {
return nullptr;
}
const size_t numPages = util::getNumPages(sizeof(Slab));
const size_t pageSize = util::getPageSize();
auto* mem = reinterpret_cast<const uint8_t* const>(slab->memoryAtOffset(0));
XDCHECK(util::isPageAlignedAddr(mem));
for (size_t pageOffset = 0; pageOffset < numPages; pageOffset++) {
// Use volatile so the compiler does not optimize the read away in opt
// mode.
volatile const uint8_t val = *(mem + pageOffset * pageSize);
(void)val;
}
memoryPoolSize_[id] += sizeof(Slab);
// initialize the header for the slab.
initializeHeader(slab, id);
return slab;
}
SlabHeader* SlabAllocator::getSlabHeader(
const Slab* const slab) const noexcept {
if ([&] {
// TODO(T79149875): Fix data race exposed by TSAN.
folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__);
return isValidSlab(slab);
}()) {
return [&] {
// TODO(T79149875): Fix data race exposed by TSAN.
folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__);
return getSlabHeader(slabIdx(slab));
}();
}
return nullptr;
}
bool SlabAllocator::isMemoryInSlab(const void* ptr,
const Slab* slab) const noexcept {
if (!isValidSlab(slab)) {
return false;
}
return getSlabForMemory(ptr) == slab;
}
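// Picks a uniformly random byte offset in the slab memory and snaps it to the
// start of the allocation containing it. Returns nullptr if the chosen slab
// has no header or a zero alloc size (e.g. it has not been carved into an
// allocation class yet).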
const void* SlabAllocator::getRandomAlloc() const noexcept {
// disregard the space we use for the slab headers.
const auto validMaxOffset =
memorySize_ - (reinterpret_cast<uintptr_t>(slabMemoryStart_) -
reinterpret_cast<uintptr_t>(memoryStart_));
// pick a random location in the memory.
const auto offset = folly::Random::rand64(0, validMaxOffset);
const auto* memory = reinterpret_cast<void*>(
reinterpret_cast<uintptr_t>(slabMemoryStart_) + offset);
const auto* slab = getSlabForMemory(memory);
const auto* header = getSlabHeader(slab);
if (header == nullptr) {
return nullptr;
}
XDCHECK_GE(reinterpret_cast<uintptr_t>(memory),
reinterpret_cast<uintptr_t>(slab));
const auto allocSize = header->allocSize;
if (allocSize == 0) {
return nullptr;
}
const auto maxAllocIdx = Slab::kSize / allocSize - 1;
auto allocIdx = (reinterpret_cast<uintptr_t>(memory) -
reinterpret_cast<uintptr_t>(slab)) /
allocSize;
allocIdx = allocIdx > maxAllocIdx ? maxAllocIdx : allocIdx;
return reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(slab) +
allocSize * allocIdx);
}
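// Serializes the allocator state (sizes, slab indices for the free and
// advised-away lists, per-pool sizes) so it can be restored by the
// deserialization constructor above. Stops the memory locker thread first.
// Only valid for the restorable (non-owning) flavor.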
serialization::SlabAllocatorObject SlabAllocator::saveState() {
if (!isRestorable()) {
throw std::logic_error("Can not save state when memory is mmaped");
}
// stop async thread that is paging in memory if it is still running.
stopMemoryLocker();
serialization::SlabAllocatorObject object;
*object.memorySize_ref() = memorySize_;
*object.nextSlabIdx_ref() = slabIdx(nextSlabAllocation_);
*object.canAllocate_ref() = canAllocate_;
for (PoolId id = 0; id < static_cast<PoolId>(memoryPoolSize_.size()); ++id) {
object.memoryPoolSize_ref()[id] = memoryPoolSize_[id];
}
for (auto slab : freeSlabs_) {
object.freeSlabIdxs_ref()->push_back(slabIdx(slab));
}
for (auto slab : advisedSlabs_) {
object.advisedSlabIdxs_ref()->push_back(slabIdx(slab));
}
*object.slabSize_ref() = Slab::kSize;
*object.minAllocSize_ref() = CompressedPtr::getMinAllocSize();
return object;
}
// for benchmarking purposes.
const unsigned int kMarkerBits = 6;
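// Alternate compression scheme used for benchmarking: store the offset of the
// pointer from slabMemoryStart_ in units of 2^kMarkerBits (64) bytes. The
// truncated low bits are recovered in unCompressAlt() by snapping the marker
// back up to an allocation boundary, which relies on every alloc size being at
// least 1 << kMarkerBits bytes (see the XDCHECK in unCompressAlt()).
// Example (hypothetical 128-byte alloc class): a pointer 4096 bytes past
// slabMemoryStart_ compresses to 4096 >> 6 == 64, and decompresses back to the
// same pointer since offset 4096 already sits on a 128-byte boundary.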
CompressedPtr SlabAllocator::compressAlt(const void* ptr) const {
if (ptr == nullptr) {
return CompressedPtr{};
}
ptrdiff_t delta = reinterpret_cast<const uint8_t*>(ptr) -
reinterpret_cast<const uint8_t*>(slabMemoryStart_);
return CompressedPtr{
static_cast<CompressedPtr::PtrType>(delta >> kMarkerBits)};
}
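// Recovers the original pointer from the truncated marker produced by
// compressAlt() by rounding the marker up to the next allocation boundary in
// its slab (see the diagram below).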
void* SlabAllocator::unCompressAlt(const CompressedPtr cPtr) const {
if (cPtr.isNull()) {
return nullptr;
}
const auto markerOffset = cPtr.getRaw() << kMarkerBits;
const void* markerPtr =
reinterpret_cast<const uint8_t*>(slabMemoryStart_) + markerOffset;
const auto* header = getSlabHeader(markerPtr);
const auto allocSize = header->allocSize;
XDCHECK_GE(allocSize, 1u << kMarkerBits);
auto slab = getSlabForMemory(markerPtr);
auto slabOffset = reinterpret_cast<uintptr_t>(markerPtr) -
reinterpret_cast<uintptr_t>(slab);
XDCHECK_LT(slabOffset, Slab::kSize);
/*
* Since the marker is to the left of the desired allocation, now
* we want to find the alloc boundary to the right of this marker.
* But we start off by finding the distance to the alloc
* boundary on our left, which we call delta.
* Then the distance to the right is allocSize - delta:
*
* I M I
* <-- delta ---------><-- allocSize - delta -->
*
* Since allocs start at the beginning of the slab, and are all allocSize
* bytes big, delta is just slabOffset % allocSize. If delta is 0, then the
* marker is already at an alloc boundary.
*/
const auto delta = slabOffset % allocSize;
if (delta) {
slabOffset += (allocSize - delta);
}
return slab->memoryAtOffset(slabOffset);
}
void SlabAllocator::excludeMemoryFromCoredump() const {
// Always dump the headers; they are very useful for debugging when we have
// pointers and need to find information. The slab headers occupy only a few
// slabs, on the order of 4-8MB.
auto slabMemStartPtr = reinterpret_cast<uint8_t*>(slabMemoryStart_);
const size_t headerBytes =
slabMemStartPtr - reinterpret_cast<uint8_t*>(memoryStart_);
const size_t slabBytes = memorySize_ - headerBytes;
XDCHECK_LT(slabBytes, memorySize_);
if (madvise(slabMemStartPtr, slabBytes, MADV_DONTDUMP)) {
throw std::system_error(errno, std::system_category(),
"madvise failed to exclude memory from coredump");
}
}