/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#if !defined(_WINDOWS) && !defined(__EMSCRIPTEN__)
#include "hermes/Support/Compiler.h"
#include "hermes/Support/ErrorHandling.h"
#include "hermes/Support/OSCompat.h"
#include <cassert>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <limits>
#include <sstream>
#include <vector>
#include <signal.h>
#include <sys/mman.h>
#include <sys/resource.h>
#if defined(__linux__)
#if !defined(RUSAGE_THREAD)
#define RUSAGE_THREAD 1
#endif
#endif // __linux__
#include <sys/types.h>
#include <unistd.h>
#ifdef __MACH__
#include <mach/mach.h>
#ifdef __APPLE__
#include <pthread.h>
#endif // __APPLE__
#endif // __MACH__
#ifdef __linux__
#if !defined(_POSIX_TIMERS) || _POSIX_TIMERS <= 0
#error "Timers not supported on this Android platform."
#endif
#ifndef CLOCK_THREAD_CPUTIME_ID
#error "CLOCK_THREAD_CPUTIME_ID not supported by clock_gettime"
#endif
#include <sched.h>
#include <sys/syscall.h>
#include <time.h>
#endif // __linux__
#if defined(__linux__) || defined(__ANDROID__)
#include <sys/prctl.h>
#endif
#ifdef __ANDROID__
#ifndef PR_SET_VMA
#define PR_SET_VMA 0x53564d41
#endif
#ifndef PR_SET_VMA_ANON_NAME
#define PR_SET_VMA_ANON_NAME 0
#endif
#endif // __ANDROID__
#include "llvh/Support/raw_ostream.h"
namespace hermes {
namespace oscompat {
#ifndef NDEBUG
static size_t testPgSz = 0;
void set_test_page_size(size_t pageSz) {
testPgSz = pageSz;
}
void reset_test_page_size() {
testPgSz = 0;
}
#endif
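
/// Query the page size from the operating system, bypassing any test
/// override.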
static inline size_t page_size_real() {
return getpagesize();
}
size_t page_size() {
#ifndef NDEBUG
if (testPgSz != 0) {
return testPgSz;
}
#endif
return page_size_real();
}
#ifndef NDEBUG
static constexpr size_t unsetVMAllocLimit = std::numeric_limits<size_t>::max();
static size_t totalVMAllocLimit = unsetVMAllocLimit;
void set_test_vm_allocate_limit(size_t totSz) {
totalVMAllocLimit = totSz;
}
void unset_test_vm_allocate_limit() {
totalVMAllocLimit = unsetVMAllocLimit;
}
#endif // !NDEBUG
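
/// Reserve \p sz bytes of private, anonymous, read-write memory via mmap.
/// In debug builds this also enforces the test VM allocation limit.
/// Returns the mapped address, or the errno-derived error code on failure.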
static llvh::ErrorOr<void *> vm_allocate_impl(size_t sz) {
#ifndef NDEBUG
if (LLVM_UNLIKELY(sz > totalVMAllocLimit)) {
return make_error_code(OOMError::TestVMLimitReached);
} else if (LLVM_UNLIKELY(totalVMAllocLimit != unsetVMAllocLimit)) {
totalVMAllocLimit -= sz;
}
#endif // !NDEBUG
void *result = mmap(
nullptr, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (result == MAP_FAILED) {
// Since mmap is a POSIX API, even on macOS, errno should use the POSIX
// generic_category.
return std::error_code(errno, std::generic_category());
}
return result;
}
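
/// Round \p p up to the next multiple of \p alignment.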
static char *alignAlloc(void *p, size_t alignment) {
return reinterpret_cast<char *>(
llvh::alignTo(reinterpret_cast<uintptr_t>(p), alignment));
}
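
/// Allocate \p sz bytes of page-aligned virtual memory. In debug builds, if a
/// test page size larger than the real one is in effect, the allocation is
/// aligned to the test page size instead.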
llvh::ErrorOr<void *> vm_allocate(size_t sz) {
assert(sz % page_size() == 0);
#ifndef NDEBUG
if (testPgSz != 0 && testPgSz > static_cast<size_t>(page_size_real())) {
return vm_allocate_aligned(sz, testPgSz);
}
#endif // !NDEBUG
return vm_allocate_impl(sz);
}
llvh::ErrorOr<void *> vm_allocate_aligned(size_t sz, size_t alignment) {
assert(sz > 0 && sz % page_size() == 0);
assert(alignment > 0 && alignment % page_size() == 0);
// Opportunistically allocate without alignment constraint,
// and see if the memory happens to be aligned.
// While this may be unlikely on the first allocation request,
// subsequent allocation requests have a good chance.
auto result = vm_allocate_impl(sz);
if (!result) {
return result;
}
void *mem = *result;
if (mem == alignAlloc(mem, alignment)) {
return mem;
}
// Free the opportunistic allocation.
oscompat::vm_free(mem, sz);
// This time, allocate a larger section to ensure that it contains
// a subsection that satisfies the request.
// Use *real* page size here since that's what vm_allocate_impl guarantees.
const size_t excessSize = sz + alignment - page_size_real();
result = vm_allocate_impl(excessSize);
if (!result)
return result;
void *raw = *result;
char *aligned = alignAlloc(raw, alignment);
size_t excessAtFront = aligned - static_cast<char *>(raw);
size_t excessAtBack = excessSize - excessAtFront - sz;
if (excessAtFront)
oscompat::vm_free(raw, excessAtFront);
if (excessAtBack)
oscompat::vm_free(aligned + sz, excessAtBack);
return aligned;
}
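
/// Unmap a region previously obtained from vm_allocate or
/// vm_allocate_aligned. In debug builds the freed size is credited back to
/// the test VM allocation limit.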
void vm_free(void *p, size_t sz) {
auto ret = munmap(p, sz);
assert(!ret && "Failed to free memory region.");
(void)ret;
#ifndef NDEBUG
if (LLVM_UNLIKELY(totalVMAllocLimit != unsetVMAllocLimit) && p) {
totalVMAllocLimit += sz;
}
#endif
}
void vm_free_aligned(void *p, size_t sz) {
vm_free(p, sz);
}
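
/// Hint that the \p sz bytes starting at \p p may be backed by transparent
/// huge pages. Only Linux/Android honor the hint; elsewhere this is a no-op.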
void vm_hugepage(void *p, size_t sz) {
assert(
reinterpret_cast<uintptr_t>(p) % page_size() == 0 &&
"Precondition: pointer is page-aligned.");
#if defined(__linux__) || defined(__ANDROID__)
// Since the alloc is aligned, it may benefit from huge pages.
madvise(p, sz, MADV_HUGEPAGE);
#endif
}
void vm_unused(void *p, size_t sz) {
#ifndef NDEBUG
const size_t PS = page_size();
assert(
reinterpret_cast<intptr_t>(p) % PS == 0 &&
"Precondition: pointer is page-aligned.");
#endif
/// Change the flag we pass to \p madvise based on the platform, so that we are
/// always acting to reduce memory pressure, as perceived by that platform.
#if defined(__MACH__)
/// On the mach kernel, \p MADV_FREE causes the OS to deduct this memory from
/// the process's physical footprint.
#define MADV_UNUSED MADV_FREE
#elif defined(__linux__)
/// On linux, telling the OS that we \p MADV_DONTNEED some pages will cause it
/// to immediately deduct their size from the process's resident set.
#define MADV_UNUSED MADV_DONTNEED
#else
#error "Don't know how to return memory to the OS on this platform."
#endif // __MACH__, __linux__
madvise(p, sz, MADV_UNUSED);
#undef MADV_UNUSED
}
void vm_prefetch(void *p, size_t sz) {
assert(
reinterpret_cast<intptr_t>(p) % page_size() == 0 &&
"Precondition: pointer is page-aligned.");
madvise(p, sz, MADV_WILLNEED);
}
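
/// Attach a human-readable name to an anonymous mapping so that it can be
/// identified in /proc/self/maps. Only supported on Android
/// (PR_SET_VMA_ANON_NAME); a no-op elsewhere.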
void vm_name(void *p, size_t sz, const char *name) {
#ifdef __ANDROID__
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p, sz, name);
#else
(void)p;
(void)sz;
(void)name;
#endif // __ANDROID__
}
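
/// Change the protection of the pages in [p, p+sz): ReadWrite maps to
/// PROT_READ | PROT_WRITE, any other mode to PROT_NONE. Returns true on
/// success.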
bool vm_protect(void *p, size_t sz, ProtectMode mode) {
auto prot = PROT_NONE;
if (mode == ProtectMode::ReadWrite) {
prot = PROT_WRITE | PROT_READ;
}
int err = mprotect(p, sz, prot);
return err != -1;
}
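
/// Forward an access-pattern hint (MADV_RANDOM or MADV_SEQUENTIAL) for the
/// page-aligned region to madvise. Returns true on success.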
bool vm_madvise(void *p, size_t sz, MAdvice advice) {
#ifndef NDEBUG
const size_t PS = page_size();
assert(
reinterpret_cast<intptr_t>(p) % PS == 0 &&
"Precondition: pointer is page-aligned.");
#endif
int param = MADV_NORMAL;
switch (advice) {
case MAdvice::Random:
param = MADV_RANDOM;
break;
case MAdvice::Sequential:
param = MADV_SEQUENTIAL;
break;
}
return madvise(p, sz, param) == 0;
}
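
/// Estimate the footprint of the mapping containing [start, end), in pages:
/// the number of dirtied pages reported by vm_region_64 on Mach, or the
/// mapping's Rss parsed from /proc/self/smaps on Linux.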
llvh::ErrorOr<size_t> vm_footprint(char *start, char *end) {
#ifdef __MACH__
const task_t self = mach_task_self();
vm_address_t vAddr = reinterpret_cast<vm_address_t>(start);
vm_size_t vSz = static_cast<vm_size_t>(end - start);
vm_region_extended_info_data_t info;
mach_msg_type_number_t fields = VM_REGION_EXTENDED_INFO_COUNT;
mach_port_t unused;
auto ret = vm_region_64(
self,
&vAddr,
&vSz,
VM_REGION_EXTENDED_INFO,
// The expected contents, and requisite size of this struct depend on the
// previous and next parameters to this function respectively. We cast it
// to a "generic" info type to indicate this.
reinterpret_cast<vm_region_info_t>(&info),
&fields,
&unused);
if (ret != KERN_SUCCESS)
return std::error_code(errno, std::generic_category());
return info.pages_dirtied;
#else
auto rStart = reinterpret_cast<uintptr_t>(start);
auto rEnd = reinterpret_cast<uintptr_t>(end);
char label[] = "Rss:";
std::ifstream smaps("/proc/self/smaps");
while (smaps) {
std::string firstToken;
smaps >> firstToken;
// Ignore the rest of the line.
smaps.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
if (firstToken.find_last_of(':') != std::string::npos) {
// We are inside an entry, rather than at the start of one, so we should
// ignore this line.
continue;
}
// The first token should be the mapping's virtual address range if this is
// the first line of a mapping's entry, so we extract it.
std::stringstream ris(firstToken);
uintptr_t mStart, mEnd;
ris >> std::hex >> mStart;
// Ignore '-'
ris.ignore();
ris >> mEnd;
// The working assumption is that the kernel will not split a single memory
// region allocated by \p mmap across multiple entries in the smaps output.
if (mStart <= rStart && rEnd <= mEnd) {
// Found the start of the section pertaining to our memory map
break;
}
}
while (smaps) {
std::string line;
std::getline(smaps, line);
if (line.compare(0, sizeof(label) - 1, label) != 0) {
continue;
}
std::stringstream lis(line);
lis.ignore(line.length(), ' '); // Pop the label
size_t rss;
std::string unit;
lis >> std::skipws >> rss >> unit;
assert(unit == "kB");
return rss * 1024 / page_size();
}
return std::error_code(errno, std::generic_category());
#endif
}
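
/// Count how many of the pages overlapping [p, p+sz) are resident in RAM,
/// via mincore. If \p runs is non-null it receives the lengths of alternating
/// resident/non-resident runs, beginning with a resident run. For example
/// (hypothetical), a 5-page region with residency pattern 1,1,0,0,1 yields a
/// return value of 3 and \p runs of {2, 2, 1}. Returns -1 if mincore fails.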
int pages_in_ram(const void *p, size_t sz, llvh::SmallVectorImpl<int> *runs) {
const auto PS = page_size();
{
// Align region start down to page boundary.
const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
const size_t adjust = addr % PS;
p = reinterpret_cast<const void *>(addr - adjust);
sz += adjust;
}
// Total number of pages that the region overlaps.
const size_t mapSize = (sz + PS - 1) / PS;
#ifdef __linux__
using MapElm = unsigned char;
#else
using MapElm = char;
#endif
std::vector<MapElm> bitMap(mapSize);
if (mincore(const_cast<void *>(p), sz, bitMap.data())) {
return -1;
}
// Total pages in RAM.
int totalIn = 0;
bool currentRunStatus = true;
if (runs)
runs->push_back(0);
for (auto elm : bitMap) {
// Lowest bit tells whether in RAM.
bool thisStatus = (elm & 1);
totalIn += thisStatus;
if (runs) {
if (thisStatus != currentRunStatus)
runs->push_back(0);
currentRunStatus = thisStatus;
++runs->back();
}
}
return totalIn;
}
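
/// Return the peak resident set size of the process in bytes. ru_maxrss is
/// reported in kilobytes everywhere except macOS (which uses bytes), hence
/// the conditional scaling.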
uint64_t peak_rss() {
rusage ru;
if (getrusage(RUSAGE_SELF, &ru)) {
// failed
return 0;
}
uint64_t rss = ru.ru_maxrss;
#if !defined(__APPLE__) || !defined(__MACH__)
// Linux maxrss is in kilobytes, expand into bytes.
rss *= 1024;
#endif
return rss;
}
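
/// Return the current resident set size of the process in bytes, from
/// task_info on Mach or /proc/self/statm on Linux. Returns 0 on failure.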
uint64_t current_rss() {
#if defined(__APPLE__) && defined(__MACH__)
struct mach_task_basic_info info;
mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
if (task_info(
mach_task_self(),
MACH_TASK_BASIC_INFO,
(task_info_t)&info,
&infoCount) != KERN_SUCCESS)
return 0;
// resident_size is already reported in bytes.
return info.resident_size;
#else
FILE *fp = fopen("/proc/self/statm", "r");
if (!fp) {
return 0;
}
long rss = 0;
// The first field is total program size, second field is resident set size.
if (fscanf(fp, "%*d %ld", &rss) != 1) {
fclose(fp);
return 0;
}
fclose(fp);
// The RSS number from statm is a count of pages. Multiply by the real page
// size to get the value in bytes.
return rss * page_size_real();
#endif
}
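
/// Sum the Private_Dirty fields of /proc/self/smaps and return the total in
/// bytes. Linux only; other platforms return 0.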
uint64_t current_private_dirty() {
#if defined(__linux__)
uint64_t sum = 0;
FILE *fp = fopen("/proc/self/smaps", "r");
if (!fp) {
return 0;
}
static const char kPrefix[] = "Private_Dirty:";
constexpr size_t kPrefixLen = sizeof(kPrefix) - 1;
char buf[128]; // Just needs to fit the lines we care about.
while (fgets(buf, sizeof(buf), fp))
if (strncmp(buf, kPrefix, kPrefixLen) == 0)
sum += atoll(buf + kPrefixLen);
fclose(fp);
return sum * 1024;
#else
return 0;
#endif
}
#if defined(__linux__)
static bool overlap(uintptr_t a, size_t asize, uintptr_t b, size_t bsize) {
// An empty interval has no overlap.
if (asize == 0 || bsize == 0)
return false;
// Order by start address.
if (a > b)
return overlap(b, bsize, a, asize);
// Overlap iff the first interval extends beyond the start of the second.
return a + asize > b;
}
#endif
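
/// Collect the protection strings (e.g. "rw-p") of every /proc/self/maps
/// entry that overlaps [p, p+sz). Returns {"unknown"} if the maps file cannot
/// be opened, and an empty vector on non-Linux platforms.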
std::vector<std::string> get_vm_protect_modes(const void *p, size_t sz) {
std::vector<std::string> modes;
#if defined(__linux__)
unsigned long long begin;
unsigned long long end;
char mode[4 + 1];
FILE *fp = fopen("/proc/self/maps", "r");
if (!fp) {
modes.emplace_back("unknown");
return modes;
}
while (fscanf(fp, "%llx-%llx %4s", &begin, &end, mode) == 3) {
if (overlap(
reinterpret_cast<uintptr_t>(p),
sz,
static_cast<uintptr_t>(begin),
static_cast<size_t>(end - begin))) {
modes.push_back(mode);
}
// Discard remainder of the line.
int result;
do {
result = fgetc(fp);
} while (result != '\n' && result > 0);
}
#endif
return modes;
}
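
/// Report the number of voluntary and involuntary context switches for the
/// current thread (Linux) or whole process (other platforms). On failure both
/// outputs are left at -1 and false is returned.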
bool num_context_switches(long &voluntary, long &involuntary) {
voluntary = involuntary = -1;
rusage ru;
// Only Linux is known to have RUSAGE_THREAD.
#if defined(__linux__)
const int who = RUSAGE_THREAD;
#else
const int who = RUSAGE_SELF;
#endif
if (getrusage(who, &ru)) {
// failed
return false;
}
voluntary = ru.ru_nvcsw;
involuntary = ru.ru_nivcsw;
return true;
}
// Platform-specific implementations of thread_id
#if defined(__APPLE__) && defined(__MACH__)
uint64_t thread_id() {
uint64_t tid = 0;
auto ret = pthread_threadid_np(nullptr, &tid);
assert(ret == 0 && "pthread_threadid_np shouldn't fail for current thread");
(void)ret;
return tid;
}
#elif defined(__ANDROID__)
uint64_t thread_id() {
return gettid();
}
#elif defined(__linux__)
uint64_t thread_id() {
return syscall(__NR_gettid);
}
#else
#error "Thread ID not supported on this platform"
#endif
void set_thread_name(const char *name) {
// Set the thread name for TSAN, which doesn't share the OS's thread-name
// mapping. This macro expands to nothing when TSAN is disabled.
TsanThreadName(name);
#if defined(__linux__) || defined(__ANDROID__)
prctl(PR_SET_NAME, name);
#elif defined(__APPLE__)
::pthread_setname_np(name);
#endif
// Do nothing if the platform doesn't support it.
}
// Platform-specific implementations of thread_cpu_time
#if defined(__APPLE__) && defined(__MACH__)
std::chrono::microseconds thread_cpu_time() {
using namespace std::chrono;
struct thread_basic_info tbi;
mach_port_t self = pthread_mach_thread_np(pthread_self());
mach_msg_type_number_t fields = THREAD_BASIC_INFO_COUNT;
if (thread_info(self, THREAD_BASIC_INFO, (thread_info_t)&tbi, &fields) !=
KERN_SUCCESS) {
return microseconds::max();
}
microseconds::rep total = 0;
total += tbi.user_time.microseconds;
total += static_cast<microseconds::rep>(tbi.user_time.seconds) * 1000000;
total += tbi.system_time.microseconds;
total += static_cast<microseconds::rep>(tbi.system_time.seconds) * 1000000;
return microseconds(total);
}
#elif defined(__linux__) // !(__APPLE__ && __MACH__)
std::chrono::microseconds thread_cpu_time() {
using namespace std::chrono;
struct timespec ts;
if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
return microseconds::max();
}
microseconds::rep total = 0;
total += ts.tv_nsec / 1000;
total += ts.tv_sec * 1000000;
return microseconds(total);
}
#else // !(__APPLE__ && __MACH__), !__linux__
#error "Thread CPU Time not supported on this platform"
#endif // thread_cpu_time: (__APPLE__ && __MACH__), __linux__
// Platform-specific implementations of thread_page_fault_count
#if defined(__APPLE__) && defined(__MACH__)
bool thread_page_fault_count(int64_t *outMinorFaults, int64_t *outMajorFaults) {
task_events_info eventsInfo;
mach_msg_type_number_t count = TASK_EVENTS_INFO_COUNT;
kern_return_t kr = task_info(
mach_task_self(), TASK_EVENTS_INFO, (task_info_t)&eventsInfo, &count);
if (kr == KERN_SUCCESS) {
*outMinorFaults = eventsInfo.faults;
*outMajorFaults = eventsInfo.pageins;
}
return kr == KERN_SUCCESS;
}
#elif defined(__linux__) // !(__APPLE__ && __MACH__)
bool thread_page_fault_count(int64_t *outMinorFaults, int64_t *outMajorFaults) {
struct rusage stats = {};
int ret = getrusage(RUSAGE_THREAD, &stats);
if (ret == 0) {
*outMinorFaults = stats.ru_minflt;
*outMajorFaults = stats.ru_majflt;
}
return ret == 0;
}
#else // !(__APPLE__ && __MACH__), !__linux__
#error "Thread page fault count not supported on this platform"
#endif // thread_page_fault_count: (__APPLE__ && __MACH__), __linux__
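
/// Return the current thread's name, via prctl(PR_GET_NAME) on Android and
/// pthread_getname_np elsewhere. Returns the empty string on failure.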
std::string thread_name() {
constexpr int kMaxThreadNameSize = 100;
int ret = 0;
char threadName[kMaxThreadNameSize];
#ifdef __ANDROID__
ret = prctl(PR_GET_NAME, threadName);
#else
ret = pthread_getname_np(pthread_self(), threadName, sizeof(threadName));
#endif
if (ret != 0) {
// Failure to get the thread name is non-fatal; simply return an empty name.
perror("thread_name failed");
return "";
}
return threadName;
}
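
/// sched_getaffinity / sched_getcpu: report the CPUs the current thread may
/// run on (as a vector of booleans indexed by CPU number, trimmed after the
/// last set bit) and the CPU it is currently running on. Implemented only on
/// Linux; other platforms return an empty vector and -1 respectively.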
#ifdef __linux__
std::vector<bool> sched_getaffinity() {
std::vector<bool> v;
cpu_set_t mask;
CPU_ZERO(&mask);
int status = ::sched_getaffinity(0, sizeof(mask), &mask);
if (status != 0) {
return v;
}
int lastSet = -1;
for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
v.push_back(CPU_ISSET(cpu, &mask));
if (v.back())
lastSet = cpu;
}
// Trim trailing zeroes.
v.resize(lastSet + 1);
return v;
}
int sched_getcpu() {
return ::sched_getcpu();
}
#else
std::vector<bool> sched_getaffinity() {
// Not yet supported.
return std::vector<bool>();
}
int sched_getcpu() {
// Not yet supported.
return -1;
}
#endif
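
/// Set (overwriting any existing value) or remove an environment variable.
/// Both return true on success.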
bool set_env(const char *name, const char *value) {
// Enforce the contract of this function that value must not be empty
assert(*value != '\0' && "value cannot be empty string");
return setenv(name, value, 1) == 0;
}
bool unset_env(const char *name) {
return unsetenv(name) == 0;
}
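
/// On destruction, record the base pointer of the current signal alternate
/// stack in a static so that leak-detection tools treat it as reachable
/// rather than reporting it as leaked.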
/*static*/
void *SigAltStackLeakSuppressor::stackRoot_{nullptr};
SigAltStackLeakSuppressor::~SigAltStackLeakSuppressor() {
stack_t oldAltStack;
if (sigaltstack(nullptr, &oldAltStack) == 0) {
stackRoot_ = oldAltStack.ss_sp;
}
}
} // namespace oscompat
} // namespace hermes
#endif // !_WINDOWS && !__EMSCRIPTEN__