// lib/VM/Profiler/SamplingProfilerPosix.cpp
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "hermes/VM/Profiler/SamplingProfiler.h"
#ifdef HERMESVM_SAMPLING_PROFILER_POSIX
#include "hermes/Support/ThreadLocal.h"
#include "hermes/VM/Callable.h"
#include "hermes/VM/HostModel.h"
#include "hermes/VM/Profiler/ChromeTraceSerializerPosix.h"
#include "hermes/VM/RuntimeModule-inline.h"
#include "hermes/VM/StackFrame-inline.h"
#include "llvh/Support/Compiler.h"
#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>
#include <cassert>
#include <chrono>
#include <cmath>
#include <csignal>
#include <random>
#include <thread>
namespace hermes {
namespace vm {
/// Name of the semaphore used to synchronize the sampling (timer) thread
/// with the SIGPROF handler.
const char *const kSamplingDoneSemaphoreName = "/samplingDoneSem";
std::atomic<SamplingProfiler::GlobalProfiler *>
SamplingProfiler::GlobalProfiler::instance_{nullptr};
std::atomic<SamplingProfiler *>
SamplingProfiler::GlobalProfiler::profilerForSig_{nullptr};
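// profilerForSig_ doubles as a hand-off flag between the timer thread and the
// SIGPROF handler: the timer thread publishes the target profiler with a
// release store before signaling, and the handler clears it when sampling is
// done. The paired acquire loads on each side order the writes to domains_
// and sampleStorage_ across the two threads.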
void SamplingProfiler::GlobalProfiler::registerRuntime(
SamplingProfiler *profiler) {
std::lock_guard<std::mutex> lockGuard(profilerLock_);
profilers_.insert(profiler);
#if defined(__ANDROID__) && defined(HERMES_FACEBOOK_BUILD)
  assert(
      threadLocalProfilerForLoom_.get() == nullptr &&
      "multiple Hermes runtimes on the same thread");
threadLocalProfilerForLoom_.set(profiler);
#endif
}
void SamplingProfiler::GlobalProfiler::unregisterRuntime(
SamplingProfiler *profiler) {
std::lock_guard<std::mutex> lockGuard(profilerLock_);
  bool succeeded = profilers_.erase(profiler);
  // TODO: should we allow nested register/register -> unregister/unregister
  // calls?
  assert(succeeded && "unregistering a runtime that was never registered");
  (void)succeeded;
#if defined(__ANDROID__) && defined(HERMES_FACEBOOK_BUILD)
  assert(
      threadLocalProfilerForLoom_.get() == profiler &&
      "invalid Hermes runtime for this thread");
threadLocalProfilerForLoom_.set(nullptr);
#endif
}
void SamplingProfiler::registerDomain(Domain *domain) {
// If domain is not already registered, add it to the list.
auto it = std::find(domains_.begin(), domains_.end(), domain);
if (it == domains_.end())
domains_.push_back(domain);
}
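/// GC root marking: report the domains we hold strong references to so the
/// collector keeps them alive while buffered samples may still reference
/// their code.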
void SamplingProfiler::markRoots(RootAcceptor &acceptor) {
std::lock_guard<std::mutex> lockGuard(runtimeDataLock_);
for (Domain *&domain : domains_) {
acceptor.acceptPtr(domain);
}
}
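/// Install \p handler as the process-wide SIGPROF signal action, with
/// SA_RESTART so interrupted syscalls are resumed. Returns the result of
/// sigaction().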
int SamplingProfiler::GlobalProfiler::invokeSignalAction(void (*handler)(int)) {
struct sigaction actions;
memset(&actions, 0, sizeof(actions));
sigemptyset(&actions.sa_mask);
  // Allow interrupted I/O primitives to restart.
actions.sa_flags = SA_RESTART;
actions.sa_handler = handler;
return sigaction(SIGPROF, &actions, nullptr);
}
bool SamplingProfiler::GlobalProfiler::registerSignalHandlers() {
if (isSigHandlerRegistered_) {
return true;
}
if (invokeSignalAction(profilingSignalHandler) != 0) {
perror("signal handler registration failed");
return false;
}
isSigHandlerRegistered_ = true;
return true;
}
bool SamplingProfiler::GlobalProfiler::unregisterSignalHandler() {
if (!isSigHandlerRegistered_) {
return true;
}
// Restore to default.
if (invokeSignalAction(SIG_DFL) != 0) {
perror("signal handler unregistration failed");
return false;
}
isSigHandlerRegistered_ = false;
return true;
}
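/// SIGPROF handler, executed on the runtime thread being sampled. It must be
/// async-signal-safe: it writes only into pre-reserved storage (domains_
/// capacity is reserved by the timer thread beforehand) and performs no heap
/// allocation before waking the timer thread via samplingDoneSem_.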
void SamplingProfiler::GlobalProfiler::profilingSignalHandler(int signo) {
// Ensure that writes made on the timer thread before setting the current
// profiler are correctly acquired.
SamplingProfiler *localProfiler;
while (!(localProfiler = profilerForSig_.load(std::memory_order_acquire))) {
}
  // Avoid clobbering errno in the signal handler: save the old value and
  // restore it before returning.
auto oldErrno = errno;
auto &curThreadRuntime = localProfiler->runtime_;
auto profilerInstance = instance_.load();
  // Sampling the stack touches GC objects (e.g., closures), so only do so
  // when the heap is not in the middle of a GC.
if (LLVM_LIKELY(!curThreadRuntime.getHeap().inGC())) {
assert(
profilerInstance != nullptr &&
"Why is GlobalProfiler::instance_ not initialized yet?");
profilerInstance->sampledStackDepth_ = localProfiler->walkRuntimeStack(
profilerInstance->sampleStorage_, SaveDomains::Yes);
} else {
    // GC in progress; copy the pre-captured stack instead.
if (localProfiler->preGCStackDepth_ > 0) {
profilerInstance->sampleStorage_ = localProfiler->preGCStackStorage_;
profilerInstance->sampledStackDepth_ = localProfiler->preGCStackDepth_;
} else {
      // This GC (e.g., MallocGC) did not record a JS stack.
      // TODO: fix this for all GCs.
profilerInstance->sampledStackDepth_ = 0;
}
}
// Ensure that writes made in the handler are visible to the timer thread.
  profilerForSig_.store(nullptr, std::memory_order_release);
  if (!profilerInstance->samplingDoneSem_.notifyOne()) {
errno = oldErrno;
abort(); // Something is wrong.
}
errno = oldErrno;
}
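/// Sample the stack of every registered runtime, one runtime at a time:
/// publish the target profiler in profilerForSig_, send SIGPROF to the
/// runtime thread, wait on samplingDoneSem_, then spin until the handler
/// clears profilerForSig_ so its writes are visible here. Returns false if
/// waiting on the semaphore fails.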
bool SamplingProfiler::GlobalProfiler::sampleStack() {
for (SamplingProfiler *localProfiler : profilers_) {
auto targetThreadId = localProfiler->currentThread_;
std::lock_guard<std::mutex> lk(localProfiler->runtimeDataLock_);
// Ensure there are no allocations in the signal handler by keeping ample
// reserved space.
localProfiler->domains_.reserve(
localProfiler->domains_.size() + kMaxStackDepth);
size_t domainCapacityBefore = localProfiler->domains_.capacity();
(void)domainCapacityBefore;
// Guarantee that the runtime thread will not proceed until it has acquired
// the updates to domains_.
profilerForSig_.store(localProfiler, std::memory_order_release);
// Signal target runtime thread to sample stack.
pthread_kill(targetThreadId, SIGPROF);
    // Threading: samplingDoneSem_ synchronizes this thread with the signal
    // handler, so that at most one signal is in flight at a time.
if (!samplingDoneSem_.wait()) {
return false;
}
// Guarantee that this thread will observe all changes made to data
// structures in the signal handler.
while (profilerForSig_.load(std::memory_order_acquire) != nullptr) {
}
assert(
localProfiler->domains_.capacity() == domainCapacityBefore &&
"Must not dynamically allocate in signal handler");
assert(
sampledStackDepth_ <= sampleStorage_.stack.size() &&
"How can we sample more frames than storage?");
localProfiler->sampledStacks_.emplace_back(
sampleStorage_.tid,
sampleStorage_.timeStamp,
sampleStorage_.stack.begin(),
sampleStorage_.stack.begin() + sampledStackDepth_);
}
return true;
}
void SamplingProfiler::GlobalProfiler::timerLoop() {
oscompat::set_thread_name("hermes-sampling-profiler");
constexpr double kMeanMilliseconds = 10;
constexpr double kStdDevMilliseconds = 5;
std::random_device rd{};
std::mt19937 gen{rd()};
  // Draw each sleep duration from a normal distribution so that the timer
  // thread does not sample stacks at a predictable period.
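  // With the values above (mean 10 ms, stddev 5 ms), roughly 95% of sleep
  // intervals fall within 0-20 ms.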
std::normal_distribution<> distribution{
kMeanMilliseconds, kStdDevMilliseconds};
std::unique_lock<std::mutex> uniqueLock(profilerLock_);
while (enabled_) {
if (!sampleStack()) {
return;
}
    const uint64_t millis = std::round(std::fabs(distribution(gen)));
// TODO: make sampling rate configurable.
enabledCondVar_.wait_for(
uniqueLock, std::chrono::milliseconds(millis), [this]() {
return !enabled_;
});
}
}
uint32_t SamplingProfiler::walkRuntimeStack(
StackTrace &sampleStorage,
SaveDomains saveDomains,
uint32_t startIndex) {
unsigned count = startIndex;
// TODO: capture leaf frame IP.
const Inst *ip = nullptr;
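  // Within the loop, `ip` holds the saved IP captured from the previously
  // visited (callee) frame; that is the return address into the current
  // frame, so getOffsetOf(ip) yields the current frame's bytecode offset.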
for (ConstStackFramePtr frame : runtime_.getStackFrames()) {
// Whether we successfully captured a stack frame or not.
bool capturedFrame = true;
auto &frameStorage = sampleStorage.stack[count];
    // Check if it is a pure JS frame.
auto *calleeCodeBlock = frame.getCalleeCodeBlock();
if (calleeCodeBlock != nullptr) {
frameStorage.kind = StackFrame::FrameKind::JSFunction;
frameStorage.jsFrame.functionId = calleeCodeBlock->getFunctionID();
frameStorage.jsFrame.offset =
(ip == nullptr ? 0 : calleeCodeBlock->getOffsetOf(ip));
auto *module = calleeCodeBlock->getRuntimeModule();
assert(module != nullptr && "Cannot fetch runtimeModule for code block");
frameStorage.jsFrame.module = module;
// Don't execute a read or write barrier here because this is a signal
// handler.
if (saveDomains == SaveDomains::Yes)
registerDomain(module->getDomainForSamplingProfiler());
} else if (
auto *nativeFunction =
dyn_vmcast<NativeFunction>(frame.getCalleeClosureUnsafe())) {
frameStorage.kind = vmisa<FinalizableNativeFunction>(nativeFunction)
? StackFrame::FrameKind::FinalizableNativeFunction
: StackFrame::FrameKind::NativeFunction;
frameStorage.nativeFrame = nativeFunction->getFunctionPtr();
} else {
// TODO: handle BoundFunction.
capturedFrame = false;
}
// Update ip to caller for next iteration.
ip = frame.getSavedIP();
if (capturedFrame) {
++count;
if (count >= sampleStorage.stack.size()) {
break;
}
}
}
sampleStorage.tid = oscompat::thread_id();
sampleStorage.timeStamp = std::chrono::steady_clock::now();
return count;
}
/*static*/ SamplingProfiler::GlobalProfiler *
SamplingProfiler::GlobalProfiler::get() {
  // We intentionally leak this object to avoid any possibility of the
  // instance being accessed after it is destroyed during shutdown.
static GlobalProfiler *instance = new GlobalProfiler{};
return instance;
}
SamplingProfiler::GlobalProfiler::GlobalProfiler() {
instance_.store(this);
#if defined(__ANDROID__) && defined(HERMES_FACEBOOK_BUILD)
profilo_api()->register_external_tracer_callback(
TRACER_TYPE_JAVASCRIPT, collectStackForLoom);
#endif
}
bool SamplingProfiler::GlobalProfiler::enabled() {
std::lock_guard<std::mutex> lockGuard(profilerLock_);
return enabled_;
}
#if defined(__ANDROID__) && defined(HERMES_FACEBOOK_BUILD)
/*static*/ StackCollectionRetcode SamplingProfiler::collectStackForLoom(
ucontext_t *ucontext,
int64_t *frames,
uint16_t *depth,
uint16_t max_depth) {
auto profilerInstance = GlobalProfiler::instance_.load();
SamplingProfiler *localProfiler =
profilerInstance->threadLocalProfilerForLoom_.get();
if (localProfiler == nullptr) {
// No runtime in this thread.
return StackCollectionRetcode::NO_STACK_FOR_THREAD;
}
Runtime &curThreadRuntime = localProfiler->runtime_;
  // Sampling the stack touches GC objects (e.g., closures), so only do so
  // when the heap is not in the middle of a GC.
uint32_t sampledStackDepth = 0;
if (!curThreadRuntime.getHeap().inGC()) {
assert(
profilerInstance != nullptr &&
"Why is GlobalProfiler::instance_ not initialized yet?");
// Do not register domains for Loom profiling, since we don't use them for
// symbolication.
sampledStackDepth = localProfiler->walkRuntimeStack(
profilerInstance->sampleStorage_, SaveDomains::No);
} else {
// TODO: log "GC in process" meta event.
sampledStackDepth = 0;
}
  // Loom stack frames are encoded as follows:
  //   1. The most significant bit of the 64-bit address is set for native
  //      frames.
  //   2. JS frame: segment ID in the high 32 bits, virtual bytecode offset
  //      in the low 32 bits, MSB unset.
  //   3. Native/Finalizable frame: the function address, with the MSB set.
  // TODO: enhance this when supporting more frame types.
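  // For example (illustrative values): segment ID 2 with virtual offset 0x10
  // encodes as 0x0000000200000010, while a native function at address 0x7f00
  // encodes as 0x8000000000007f00.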
sampledStackDepth = std::min(sampledStackDepth, (uint32_t)max_depth);
for (uint32_t i = 0; i < sampledStackDepth; ++i) {
constexpr uint64_t kNativeFrameMask = ((uint64_t)1 << 63);
const StackFrame &stackFrame = profilerInstance->sampleStorage_.stack[i];
switch (stackFrame.kind) {
case StackFrame::FrameKind::JSFunction: {
auto *bcProvider = stackFrame.jsFrame.module->getBytecode();
uint32_t virtualOffset = bcProvider->getVirtualOffsetForFunction(
stackFrame.jsFrame.functionId) +
stackFrame.jsFrame.offset;
uint32_t segmentID = bcProvider->getSegmentID();
uint64_t frameAddress = ((uint64_t)segmentID << 32) + virtualOffset;
        assert(
            (frameAddress & kNativeFrameMask) == 0 &&
            "Segment ID must leave the MSB unset");
frames[i] = frameAddress;
break;
}
case StackFrame::FrameKind::NativeFunction:
frames[i] = ((uint64_t)stackFrame.nativeFrame | kNativeFrameMask);
break;
case StackFrame::FrameKind::FinalizableNativeFunction:
frames[i] =
((uint64_t)stackFrame.finalizableNativeFrame | kNativeFrameMask);
break;
default:
llvm_unreachable("Loom: unknown frame kind");
}
}
*depth = sampledStackDepth;
if (*depth == 0) {
return StackCollectionRetcode::EMPTY_STACK;
}
return StackCollectionRetcode::SUCCESS;
}
#endif
SamplingProfiler::SamplingProfiler(Runtime &runtime)
: currentThread_{pthread_self()}, runtime_{runtime} {
threadNames_[oscompat::thread_id()] = oscompat::thread_name();
GlobalProfiler::get()->registerRuntime(this);
}
SamplingProfiler::~SamplingProfiler() {
if (pthread_self() != currentThread_) {
    ::hermes::hermes_fatal(
        "SamplingProfiler must be destroyed on the same thread it was "
        "created on");
}
GlobalProfiler::get()->unregisterRuntime(this);
}
void SamplingProfiler::dumpSampledStackGlobal(llvh::raw_ostream &OS) {
auto globalProfiler = GlobalProfiler::get();
std::lock_guard<std::mutex> lk(globalProfiler->profilerLock_);
if (!globalProfiler->profilers_.empty()) {
auto *localProfiler = *globalProfiler->profilers_.begin();
localProfiler->dumpSampledStack(OS);
}
}
void SamplingProfiler::dumpSampledStack(llvh::raw_ostream &OS) {
std::lock_guard<std::mutex> lk(runtimeDataLock_);
OS << "dumpSamples called from runtime\n";
OS << "Total " << sampledStacks_.size() << " samples\n";
for (unsigned i = 0; i < sampledStacks_.size(); ++i) {
auto &sample = sampledStacks_[i];
uint64_t timeStamp = sample.timeStamp.time_since_epoch().count();
OS << "[" << i << "]: tid[" << sample.tid << "], ts[" << timeStamp << "] ";
    // The leaf frame is at stack[0], so iterate in reverse to print a
    // root => leaf representation.
for (auto iter = sample.stack.rbegin(); iter != sample.stack.rend();
++iter) {
const StackFrame &frame = *iter;
switch (frame.kind) {
case StackFrame::FrameKind::JSFunction:
OS << "[JS] " << frame.jsFrame.functionId << ":"
<< frame.jsFrame.offset;
break;
case StackFrame::FrameKind::NativeFunction:
OS << "[Native] " << reinterpret_cast<uintptr_t>(frame.nativeFrame);
break;
case StackFrame::FrameKind::FinalizableNativeFunction:
OS << "[HostFunction]";
break;
default:
llvm_unreachable("Unknown frame kind");
}
OS << " => ";
}
OS << "\n";
}
}
void SamplingProfiler::dumpChromeTraceGlobal(llvh::raw_ostream &OS) {
auto globalProfiler = GlobalProfiler::get();
std::lock_guard<std::mutex> lk(globalProfiler->profilerLock_);
if (!globalProfiler->profilers_.empty()) {
auto *localProfiler = *globalProfiler->profilers_.begin();
localProfiler->dumpChromeTrace(OS);
}
}
void SamplingProfiler::dumpChromeTrace(llvh::raw_ostream &OS) {
std::lock_guard<std::mutex> lk(runtimeDataLock_);
auto pid = getpid();
ChromeTraceSerializer serializer(
ChromeTraceFormat::create(pid, threadNames_, sampledStacks_));
serializer.serialize(OS);
clear();
}
void SamplingProfiler::serializeInDevToolsFormat(llvh::raw_ostream &OS) {
std::lock_guard<std::mutex> lk(runtimeDataLock_);
hermes::vm::serializeAsProfilerProfile(
OS, ChromeTraceFormat::create(getpid(), threadNames_, sampledStacks_));
clear();
}
bool SamplingProfiler::enable() {
return GlobalProfiler::get()->enable();
}
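/// Start sampling: open the done-semaphore, install the SIGPROF handler, set
/// enabled_, and spawn the timer thread that drives the sampling loop.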
bool SamplingProfiler::GlobalProfiler::enable() {
std::lock_guard<std::mutex> lockGuard(profilerLock_);
if (enabled_) {
return true;
}
if (!samplingDoneSem_.open(kSamplingDoneSemaphoreName)) {
return false;
}
if (!registerSignalHandlers()) {
return false;
}
enabled_ = true;
// Start timer thread.
timerThread_ = std::thread(&GlobalProfiler::timerLoop, this);
return true;
}
bool SamplingProfiler::disable() {
return GlobalProfiler::get()->disable();
}
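/// Stop sampling: close the semaphore, restore the default SIGPROF handler,
/// and clear enabled_ under profilerLock_, then wake and join the timer
/// thread outside the lock.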
bool SamplingProfiler::GlobalProfiler::disable() {
{
std::lock_guard<std::mutex> lockGuard(profilerLock_);
if (!enabled_) {
// Already disabled.
return true;
}
if (!samplingDoneSem_.close()) {
return false;
}
// Unregister handlers before shutdown.
if (!unregisterSignalHandler()) {
return false;
}
    // Tell the timer thread to exit.
enabled_ = false;
}
// Notify the timer thread that it has been disabled.
enabledCondVar_.notify_all();
  // Wait for the timer thread to exit. This prevents the timer thread from
  // reading memory that is freed after the main thread exits. The join happens
  // outside profilerLock_ because the timer thread must acquire that lock to
  // exit.
timerThread_.join();
return true;
}
void SamplingProfiler::clear() {
sampledStacks_.clear();
// Release all strong roots to domains.
domains_.clear();
// TODO: keep thread names that are still in use.
threadNames_.clear();
}
void SamplingProfiler::onGCEvent(
GCEventKind kind,
const std::string &extraInfo) {
assert(
!runtime_.getHeap().inGC() &&
"Cannot be in a GC when setting a GC event");
switch (kind) {
case GCEventKind::CollectionStart: {
assert(
preGCStackDepth_ == 0 && "preGCStackDepth_ is not reset after GC?");
if (LLVM_LIKELY(!GlobalProfiler::get()->enabled())) {
return;
}
recordPreGCStack(extraInfo);
break;
}
case GCEventKind::CollectionEnd:
preGCStackDepth_ = 0;
break;
default:
llvm_unreachable("Unknown GC event");
}
}
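/// Capture the current JS stack into preGCStackStorage_ with a synthetic
/// GCFrame holding \p extraInfo as the leaf, so that samples taken while the
/// collection runs can still attribute time to the GC.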
void SamplingProfiler::recordPreGCStack(const std::string &extraInfo) {
GCFrameInfo gcExtraInfo = nullptr;
if (!extraInfo.empty()) {
std::pair<std::unordered_set<std::string>::iterator, bool> retPair =
gcEventExtraInfoSet_.insert(extraInfo);
gcExtraInfo = &(*(retPair.first));
}
auto &leafFrame = preGCStackStorage_.stack[0];
leafFrame.kind = StackFrame::FrameKind::GCFrame;
leafFrame.gcFrame = gcExtraInfo;
std::lock_guard<std::mutex> lk(runtimeDataLock_);
  // The leaf frame slot (index 0) is already used, so fill from index 1.
preGCStackDepth_ = walkRuntimeStack(preGCStackStorage_, SaveDomains::Yes, 1);
}
bool operator==(
const SamplingProfiler::StackFrame &left,
const SamplingProfiler::StackFrame &right) {
if (left.kind != right.kind) {
return false;
}
switch (left.kind) {
case SamplingProfiler::StackFrame::FrameKind::JSFunction:
return left.jsFrame.functionId == right.jsFrame.functionId &&
left.jsFrame.offset == right.jsFrame.offset;
case SamplingProfiler::StackFrame::FrameKind::NativeFunction:
return left.nativeFrame == right.nativeFrame;
case SamplingProfiler::StackFrame::FrameKind::FinalizableNativeFunction:
return left.finalizableNativeFrame == right.finalizableNativeFrame;
case SamplingProfiler::StackFrame::FrameKind::GCFrame:
return left.gcFrame == right.gcFrame;
default:
llvm_unreachable("Unknown frame kind");
}
}
} // namespace vm
} // namespace hermes
#endif // HERMESVM_SAMPLING_PROFILER_POSIX