// reverie-ptrace/src/perf.rs
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
//! Provides a more rustic interface to a minimal set of `perf` functionality.
//!
//! Explicitly missing (because they are unnecessary) perf features include:
//! * Grouping
//! * Sample type flags
//! * Reading any kind of sample events
//! * BPF
//! * Hardware breakpoints
//!
//! The arguments and behaviors in this module generally correspond exactly to
//! those of `perf_event_open(2)`. No attempts are made to paper over the
//! non-determinism/weirdness of `perf`. For example, counter increments are
//! dropped whenever an event fires on a running thread.
//! [`PerfCounter::DISABLE_SAMPLE_PERIOD`] can be used to avoid this for
//! sampling events.
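//!
//! # Example
//!
//! A minimal sketch of counting retired branch instructions on the calling
//! thread. The import path is illustrative and depends on how this module is
//! re-exported, so the block is not compiled as a doctest.
//!
//! ```ignore
//! use crate::perf::{Builder, Event, HardwareEvent, PerfCounter};
//!
//! // pid = 0 monitors the calling thread; cpu = -1 follows it across CPUs.
//! let counter = Builder::new(0, -1)
//!     .event(Event::Hardware(HardwareEvent::BranchInstructions))
//!     // A huge period effectively disables sampling; we only want counts.
//!     .sample_period(PerfCounter::DISABLE_SAMPLE_PERIOD)
//!     .create()
//!     .unwrap();
//! counter.reset().unwrap();
//! counter.enable().unwrap();
//! // ... run the code being measured ...
//! counter.disable().unwrap();
//! let branches = counter.ctr_value().unwrap();
//! ```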
use crate::validation::{check_for_pmu_bugs, PmuValidationError};
use core::ptr::NonNull;
use lazy_static::lazy_static;
use nix::{
sys::signal::Signal,
unistd::{sysconf, SysconfVar},
};
use perf_event_open_sys::{bindings as perf, ioctls};
use reverie::Errno;
use reverie::Tid;
use tracing::{info, warn};
#[allow(unused_imports)] // only used if we have an error
use std::compile_error;
pub use perf::perf_event_header;
lazy_static! {
static ref PMU_BUG: Result<(), PmuValidationError> = check_for_pmu_bugs();
}
// Not available in the libc crate
const F_SETOWN_EX: libc::c_int = 15;
const F_SETSIG: libc::c_int = 10;
const F_OWNER_TID: libc::c_int = 0;
#[repr(C)]
struct f_owner_ex {
pub type_: libc::c_int,
pub pid: libc::pid_t,
}
/// An incomplete enumeration of events perf can monitor
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Event {
#[allow(dead_code)] // used in tests
/// A perf-supported hardware event.
Hardware(HardwareEvent),
/// A perf-supported software event.
Software(SoftwareEvent),
/// A raw CPU event. The inner value will have a CPU-specific meaning.
Raw(u64),
}
/// An incomplete enumeration of hardware events perf can monitor.
#[allow(dead_code)] // used in tests
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum HardwareEvent {
/// Count retired instructions. Can be affected by hardware interrupt counts.
Instructions,
/// Count retired branch instructions.
BranchInstructions,
}
/// An incomplete enumeration of software events perf can monitor.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SoftwareEvent {
/// A placeholder event that counts nothing.
Dummy,
}
/// A perf counter with a very limited range of configurability.
/// Construct via [`Builder`].
#[derive(Debug)]
pub struct PerfCounter {
fd: libc::c_int,
mmap: Option<NonNull<perf::perf_event_mmap_page>>,
}
impl Event {
fn attr_type(self) -> u32 {
match self {
Event::Hardware(_) => perf::perf_type_id_PERF_TYPE_HARDWARE,
Event::Software(_) => perf::perf_type_id_PERF_TYPE_SOFTWARE,
Event::Raw(_) => perf::perf_type_id_PERF_TYPE_RAW,
}
}
fn attr_config(self) -> u64 {
match self {
Event::Raw(x) => x,
Event::Hardware(HardwareEvent::Instructions) => {
perf::perf_hw_id_PERF_COUNT_HW_INSTRUCTIONS.into()
}
Event::Hardware(HardwareEvent::BranchInstructions) => {
perf::perf_hw_id_PERF_COUNT_HW_BRANCH_INSTRUCTIONS.into()
}
Event::Software(SoftwareEvent::Dummy) => perf::perf_sw_ids_PERF_COUNT_SW_DUMMY.into(),
}
}
}
/// Builder for a [`PerfCounter`]. Contains only the subset of attributes that
/// this API allows to be set to non-default values.
#[derive(Debug, Clone)]
pub struct Builder {
pid: libc::pid_t,
cpu: libc::c_int,
evt: Event,
sample_period: u64,
precise_ip: u32,
fast_reads: bool,
}
impl Builder {
/// Initialize the builder. The initial configuration is for a software
/// counting event that never increments.
///
/// `pid` accepts a *TID* from `gettid(2)`. Passing `getpid(2)` will
/// monitor the main thread of the calling thread group. Passing `0`
/// monitors the calling thread. Passing `-1` monitors all threads on
/// the specified CPU.
///
/// `cpu` should almost always be `-1`, which tracks the specified `pid`
/// across all CPUs. Non-negative integers track only the specified `pid`
/// on that CPU.
///
/// Passing `-1` for both `pid` and `cpu` will result in an error.
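    ///
    /// A sketch of the common argument combinations (`some_tid` is a
    /// placeholder for a thread id obtained from `gettid(2)`):
    ///
    /// ```ignore
    /// let this_thread = Builder::new(0, -1);         // calling thread, any CPU
    /// let other_thread = Builder::new(some_tid, -1); // specific thread, any CPU
    /// let whole_cpu = Builder::new(-1, 2);           // every thread, CPU 2 only
    /// ```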
pub fn new(pid: libc::pid_t, cpu: libc::c_int) -> Self {
Self {
pid,
cpu,
evt: Event::Software(SoftwareEvent::Dummy),
sample_period: 0,
precise_ip: 0,
fast_reads: false,
}
}
/// Select the event to monitor.
pub fn event(&mut self, evt: Event) -> &mut Self {
self.evt = evt;
self
}
/// Set the period for sample collection. Default is 0, which creates a
/// counting event.
///
/// Because this module always sets `wakeup_events` to 1, this also
/// specifies after how many events an overflow notification should be
    /// raised. If a signal has been set up with
    /// [`PerfCounter::set_signal_delivery`], this corresponds to one sent
/// signal. Overflow notifications are sent whenever the counter reaches a
/// multiple of `sample_period`.
///
/// If you only want accurate counts, pass
/// `DISABLE_SAMPLE_PERIOD`. Passing `0` will also work, but will create a
/// _counting_ event that cannot become a _sampling event_ via the
/// `PERF_EVENT_IOC_PERIOD` ioctl.
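    ///
    /// A sketch of the two common configurations (`builder` is any `Builder`):
    ///
    /// ```ignore
    /// builder.sample_period(PerfCounter::DISABLE_SAMPLE_PERIOD); // counting only
    /// builder.sample_period(100_000); // overflow notification every 100_000 events
    /// ```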
pub fn sample_period(&mut self, period: u64) -> &mut Self {
self.sample_period = period;
self
}
/// Set `precise_ip` on the underlying perf attribute structure. Valid
/// values are 0-3; the underlying field is 2 bits.
///
/// Non-zero values will cause perf to attempt to lower the skid of *samples*
/// (but not necessarily notifications), usually via hardware features like
/// Intel PEBS.
///
/// Use with caution: experiments have shown that counters with non-zero
/// `precise_ip` can drop events under certain circumstances. See
/// `experiments/test_consistency.c` for more information.
pub fn precise_ip(&mut self, precise_ip: u32) -> &mut Self {
self.precise_ip = precise_ip;
self
}
/// Enable fast reads via shared memory with the kernel for the latest
/// counter value.
pub fn fast_reads(&mut self, enable: bool) -> &mut Self {
self.fast_reads = enable;
self
}
/// Render the builder into a `PerfCounter`. Created counters begin in a
/// disabled state. Additional initialization steps should be performed,
/// followed by a call to [`PerfCounter::enable`].
pub fn create(&self) -> Result<PerfCounter, Errno> {
let mut attr = perf::perf_event_attr::default();
attr.size = core::mem::size_of_val(&attr) as u32;
attr.type_ = self.evt.attr_type();
attr.config = self.evt.attr_config();
attr.__bindgen_anon_1.sample_period = self.sample_period;
attr.set_disabled(1); // user must enable later
attr.set_exclude_kernel(1); // we only care about user code
attr.set_exclude_guest(1);
attr.set_exclude_hv(1); // unlikely this is supported, but it doesn't hurt
attr.set_pinned(1); // error state if we are descheduled from the PMU
attr.set_precise_ip(self.precise_ip.into());
attr.__bindgen_anon_2.wakeup_events = 1; // generate a wakeup (overflow) after one sample event
let pid = self.pid;
let cpu = self.cpu;
let group_fd: libc::c_int = -1; // always create a new group
let flags = perf::PERF_FLAG_FD_CLOEXEC; // marginally more safe if we fork+exec
let fd = Errno::result(unsafe {
libc::syscall(libc::SYS_perf_event_open, &attr, pid, cpu, group_fd, flags)
})?;
let fd = fd as libc::c_int;
let mmap = if self.fast_reads {
let res = Errno::result(unsafe {
libc::mmap(
core::ptr::null_mut(),
get_mmap_size(),
libc::PROT_READ, // leaving PROT_WRITE unset lets us passively read
libc::MAP_SHARED,
fd,
0,
)
});
match res {
Ok(ptr) => Some(NonNull::new(ptr as *mut _).unwrap()),
Err(e) => {
close_perf_fd(fd);
return Err(e);
}
}
} else {
None
};
Ok(PerfCounter { fd, mmap })
}
pub(crate) fn check_for_pmu_bugs(&mut self) -> &mut Self {
if let Err(pmu_error) = &*PMU_BUG {
warn!("Pmu bugs detected: {:?}", pmu_error);
}
self
}
}
impl PerfCounter {
/// Perf counters cannot be switched from sampling to non-sampling, so
/// setting their period to this large value effectively disables overflows
/// and sampling.
pub const DISABLE_SAMPLE_PERIOD: u64 = 1 << 60;
/// Call the `PERF_EVENT_IOC_ENABLE` ioctl. Enables increments of the
/// counter and event generation.
pub fn enable(&self) -> Result<(), Errno> {
Errno::result(unsafe { ioctls::ENABLE(self.fd, 0) }).and(Ok(()))
}
    /// Call the `PERF_EVENT_IOC_DISABLE` ioctl. Disables increments of the
/// counter and event generation.
pub fn disable(&self) -> Result<(), Errno> {
Errno::result(unsafe { ioctls::DISABLE(self.fd, 0) }).and(Ok(()))
}
/// Corresponds exactly to the `PERF_EVENT_IOC_REFRESH` ioctl.
#[allow(dead_code)]
pub fn refresh(&self, count: libc::c_int) -> Result<(), Errno> {
assert!(count != 0); // 0 is undefined behavior
        Errno::result(unsafe { ioctls::REFRESH(self.fd, count) }).and(Ok(()))
}
/// Call the `PERF_EVENT_IOC_RESET` ioctl. Resets the counter value to 0,
/// which results in delayed overflow events.
pub fn reset(&self) -> Result<(), Errno> {
Errno::result(unsafe { ioctls::RESET(self.fd, 0) }).and(Ok(()))
}
/// Call the `PERF_EVENT_IOC_PERIOD` ioctl. This causes the counter to
/// behave as if `ticks` was the original argument to `sample_period` in
/// the builder.
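    ///
    /// Sketch: re-arm an existing `counter` so that the next overflow fires
    /// after another 50_000 events.
    ///
    /// ```ignore
    /// counter.reset().unwrap();
    /// counter.set_period(50_000).unwrap();
    /// ```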
pub fn set_period(&self, ticks: u64) -> Result<(), Errno> {
// The bindings are wrong for this ioctl. The method signature takes a
// u64, but the actual ioctl expects a pointer to a u64. Thus, we use
// the constant manually.
        // This ioctl shouldn't mutate its argument per its API. But in case it
// does, create a mutable copy to avoid Rust UB.
let mut ticks = ticks;
Errno::result(unsafe {
libc::ioctl(
self.fd,
perf::perf_event_ioctls_PERIOD as _,
&mut ticks as *mut u64,
)
})
.and(Ok(()))
}
/// Call the `PERF_EVENT_IOC_ID` ioctl. Returns a unique identifier for this
/// perf counter.
#[allow(dead_code)]
pub fn id(&self) -> Result<u64, Errno> {
let mut res = 0u64;
Errno::result(unsafe { ioctls::ID(self.fd, &mut res as *mut u64) })?;
Ok(res)
}
/// Sets up overflow events to deliver a `SIGPOLL`-style signal, with the
/// signal number specified in `signal`, to the specified `thread`.
///
    /// There is no reason this couldn't be called at any point, but typical use
/// cases will set up signal delivery once or not at all.
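    ///
    /// A sketch of a sampling counter that signals another thread (`tid` is a
    /// placeholder, e.g. the value returned by `gettid(2)` on that thread):
    ///
    /// ```ignore
    /// let counter = Builder::new(tid.as_raw(), -1)
    ///     .event(Event::Hardware(HardwareEvent::BranchInstructions))
    ///     .sample_period(100_000)
    ///     .create()
    ///     .unwrap();
    /// counter.set_signal_delivery(tid.into(), Signal::SIGSTKFLT).unwrap();
    /// counter.enable().unwrap();
    /// ```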
pub fn set_signal_delivery(&self, thread: Tid, signal: Signal) -> Result<(), Errno> {
let owner = f_owner_ex {
type_: F_OWNER_TID,
pid: thread.as_raw(),
};
Errno::result(unsafe { libc::fcntl(self.fd, F_SETOWN_EX, &owner as *const _) })?;
Errno::result(unsafe { libc::fcntl(self.fd, libc::F_SETFL, libc::O_ASYNC) })?;
Errno::result(unsafe { libc::fcntl(self.fd, F_SETSIG, signal as i32) })?;
Ok(())
}
/// Read the current value of the counter.
pub fn ctr_value(&self) -> Result<u64, Errno> {
let mut value = 0u64;
let expected_bytes = std::mem::size_of_val(&value);
loop {
let res =
unsafe { libc::read(self.fd, &mut value as *mut u64 as *mut _, expected_bytes) };
if res == -1 {
let errno = Errno::last();
if errno != Errno::EINTR {
return Err(errno);
}
}
if res == 0 {
// EOF: this only occurs when attr.pinned = 1 and our event was descheduled.
                // This unrecoverably gives us inaccurate counts.
panic!("pinned perf event descheduled!")
}
if res == expected_bytes as isize {
break;
}
}
Ok(value)
}
/// Perform a fast read, which doesn't involve a syscall in the fast path.
/// This falls back to a slow syscall read where necessary, including if
/// fast reads weren't enabled in the `Builder`.
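    ///
    /// Sketch: fast reads must be requested at creation time via
    /// [`Builder::fast_reads`].
    ///
    /// ```ignore
    /// let counter = Builder::new(0, -1).fast_reads(true).create().unwrap();
    /// counter.enable().unwrap();
    /// let value = counter.ctr_value_fast().unwrap();
    /// ```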
pub fn ctr_value_fast(&self) -> Result<u64, Errno> {
match self.mmap {
Some(ptr) => {
// SAFETY: self.mmap is constructed as the correct page or not at all
let res = unsafe { self.ctr_value_fast_loop(ptr) };
// TODO: remove this assertion after we're confident in correctness
debug_assert_eq!(res, self.ctr_value_fallback());
res
}
None => self.ctr_value_fallback(),
}
}
#[cold]
fn ctr_value_fallback(&self) -> Result<u64, Errno> {
self.ctr_value()
}
/// Safety: `ptr` must refer to the metadata page corresponding to self.fd.
#[deny(unsafe_op_in_unsafe_fn)]
#[inline(always)]
unsafe fn ctr_value_fast_loop(
&self,
ptr: NonNull<perf::perf_event_mmap_page>,
) -> Result<u64, Errno> {
// This implements synchronization with the kernel via a seqlock,
// see https://www.kernel.org/doc/html/latest/locking/seqlock.html.
// Also see experiments/perf_fast_reads.c for more details on fast reads.
use std::ptr::addr_of_mut;
let ptr = ptr.as_ptr();
let mut seq;
let mut running;
let mut enabled;
let mut count;
loop {
// Acquire a lease on the seqlock -- even values are outside of
// writers' critical sections.
loop {
// SAFETY: ptr->lock is valid and aligned
seq = unsafe { read_once(addr_of_mut!((*ptr).lock)) };
if seq & 1 == 0 {
break;
}
}
smp_rmb(); // force re-reads of other data
let index;
            // SAFETY: these reads are synchronized by the correct reads of the
            // seqlock. We don't use them until the outer loop has finished,
            // which guarantees the reads were serialized.
unsafe {
running = (*ptr).time_running;
enabled = (*ptr).time_enabled;
count = (*ptr).offset;
index = (*ptr).index;
}
if index != 0 {
// `index` being non-zero indicates we need to read from the
// hardware counter and add it to our count. Instead, we
// fallback to the slow path for a few reasons:
// 1. This only works if we're on the same core, which is basically
                // never true for our use case.
// 2. Reads of an active PMU are racy.
// 3. The PMU should almost never be active, because we should
// generally only read from stopped processes.
return self.ctr_value_fallback();
}
smp_rmb();
// SAFETY: ptr->lock is valid and aligned
if seq == unsafe { read_once(addr_of_mut!((*ptr).lock)) } {
// if seq is unchanged, we didn't race with writer
break;
}
}
// This check must be outside the loop to ensure our reads were actually
// serialized with any writes.
if running != enabled {
// Non-equal running/enabled time indicates the event was
// descheduled at some point, meaning our counts are inaccurate.
// This is not recoverable. The slow-read equivalent is getting EOF
// when attr.pinned = 1.
panic!("fast-read perf event was probably descheduled!")
}
Ok(count as u64)
}
/// Return the underlying perf fd.
pub fn raw_fd(&self) -> libc::c_int {
self.fd
}
}
fn close_perf_fd(fd: libc::c_int) {
Errno::result(unsafe { libc::close(fd) }).expect("Could not close perf fd");
}
fn close_mmap(ptr: *mut perf::perf_event_mmap_page) {
Errno::result(unsafe { libc::munmap(ptr as *mut _, get_mmap_size()) })
.expect("Could not munmap ring buffer");
}
impl Drop for PerfCounter {
fn drop(&mut self) {
if let Some(ptr) = self.mmap {
close_mmap(ptr.as_ptr());
}
close_perf_fd(self.fd);
}
}
// Safety:
// The mmap region is never written to. Multiple readers then race with the
// kernel as any single thread would. Though the reads are racy, that is the
// intended behavior of the perf api.
unsafe impl std::marker::Send for PerfCounter {}
unsafe impl std::marker::Sync for PerfCounter {}
fn get_mmap_size() -> usize {
// Use a single page; we only want the perf metadata
sysconf(SysconfVar::PAGE_SIZE)
.unwrap()
.unwrap()
.try_into()
.unwrap()
}
/// Force a relaxed atomic load. Like Linux's READ_ONCE.
/// SAFETY: caller must ensure v points to valid data and is aligned
#[inline(always)]
#[deny(unsafe_op_in_unsafe_fn)]
unsafe fn read_once(v: *mut u32) -> u32 {
use std::sync::atomic::{AtomicU32, Ordering::Relaxed};
    // SAFETY: AtomicU32 is guaranteed to have the same in-memory representation as u32
// SAFETY: The UnsafeCell inside AtomicU32 allows aliasing with *mut
// SAFETY: The reference doesn't escape this function, so any lifetime is ok
let av: &AtomicU32 = unsafe { &*(v as *const AtomicU32) };
av.load(Relaxed)
}
#[inline(always)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn smp_rmb() {
use std::sync::atomic::{compiler_fence, Ordering::SeqCst};
compiler_fence(SeqCst);
}
// Test if we have PMU access by doing a check for a basic hardware event.
fn test_perf_pmu_support() -> bool {
// Do a raw perf_event_open because our default configuration has flags that
// might be the actual cause of the error, which we want to catch separately.
let evt = Event::Hardware(HardwareEvent::Instructions);
let mut attr = perf::perf_event_attr::default();
attr.size = core::mem::size_of_val(&attr) as u32;
attr.type_ = evt.attr_type();
attr.config = evt.attr_config();
attr.__bindgen_anon_1.sample_period = PerfCounter::DISABLE_SAMPLE_PERIOD;
attr.set_exclude_kernel(1); // lowers permission requirements
let pid: libc::pid_t = 0; // track this thread
let cpu: libc::c_int = -1; // across any CPU
let group_fd: libc::c_int = -1;
let flags = perf::PERF_FLAG_FD_CLOEXEC;
let res = Errno::result(unsafe {
libc::syscall(libc::SYS_perf_event_open, &attr, pid, cpu, group_fd, flags)
});
match res {
Ok(fd) => {
Errno::result(unsafe { libc::close(fd as libc::c_int) })
.expect("perf feature check: close(fd) failed");
return true;
}
Err(Errno::ENOENT) => info!("Perf feature check failed due to ENOENT"),
Err(Errno::EPERM) => info!("Perf feature check failed due to EPERM"),
Err(Errno::EACCES) => info!("Perf feature check failed due to EACCES"),
Err(e) => panic!("Unexpected error during perf feature check: {}", e),
}
false
}
lazy_static! {
static ref IS_PERF_SUPPORTED: bool = test_perf_pmu_support();
}
/// Returns true if the current system configuration supports use of perf for
/// hardware events.
pub fn is_perf_supported() -> bool {
*IS_PERF_SUPPORTED
}
/// Return early if `is_perf_supported()` is `false`. Useful for guarding
/// tests.
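///
/// Sketch of typical use in a test:
///
/// ```ignore
/// #[test]
/// fn needs_perf() {
///     ret_without_perf!();
///     // ... body that requires a working PMU ...
/// }
/// ```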
#[macro_export]
macro_rules! ret_without_perf {
() => {
if !$crate::is_perf_supported() {
return;
}
};
    ($expr:expr) => {
if !$crate::is_perf_supported() {
return ($expr);
}
};
}
/// Perform exactly `count+1` conditional branch instructions. Useful for
/// testing timer-related code.
#[cfg(target_arch = "x86_64")]
#[cfg(not(feature = "llvm_asm"))]
#[inline(never)]
pub fn do_branches(mut count: u64) {
// Anything but assembly is unreliable between debug and release
unsafe {
// Loop until carry flag is set, indicating underflow
core::arch::asm!(
"2:",
"sub {0}, 1",
"jnc 2b",
inout(reg) count,
)
}
assert_eq!(count, u64::MAX);
}
/// Perform exactly `count+1` conditional branch instructions. Useful for
/// testing timer-related code.
#[cfg(target_arch = "x86_64")]
#[cfg(feature = "llvm_asm")]
#[inline(never)]
pub fn do_branches(count: u64) {
// Anything but assembly is unreliable between debug and release
#[allow(deprecated)]
unsafe {
// Loop until carry flag is set, indicating underflow
llvm_asm!("
mov $0, %rax
perf_test_branch_loop:
subq $$1, %rax
jnc perf_test_branch_loop
"
: /* no output */
: "r"(count)
: "cc", "rax"
);
}
}
#[cfg(test)]
mod test {
use super::*;
use nix::unistd::gettid;
#[test]
fn trace_self() {
ret_without_perf!();
let pc = Builder::new(gettid().as_raw(), -1)
.sample_period(PerfCounter::DISABLE_SAMPLE_PERIOD)
.event(Event::Hardware(HardwareEvent::BranchInstructions))
.create()
.unwrap();
pc.reset().unwrap();
pc.enable().unwrap();
const ITERS: u64 = 10000;
do_branches(ITERS);
pc.disable().unwrap();
let ctr = pc.ctr_value().unwrap();
assert!(ctr >= ITERS);
assert!(ctr <= ITERS + 100); // `.disable()` overhead
}
#[test]
fn trace_other_thread() {
ret_without_perf!();
use std::sync::mpsc::sync_channel;
let (tx1, rx1) = sync_channel(0); // send TID
        let (tx2, rx2) = sync_channel(0); // start guest spinning
const ITERS: u64 = 100000;
let handle = std::thread::spawn(move || {
tx1.send(gettid()).unwrap();
rx2.recv().unwrap();
do_branches(ITERS);
});
let pc = Builder::new(rx1.recv().unwrap().as_raw(), -1)
.sample_period(PerfCounter::DISABLE_SAMPLE_PERIOD)
.event(Event::Hardware(HardwareEvent::BranchInstructions))
.create()
.unwrap();
pc.enable().unwrap();
tx2.send(()).unwrap(); // tell thread to start
handle.join().unwrap();
let ctr = pc.ctr_value().unwrap();
assert!(ctr >= ITERS);
assert!(ctr <= ITERS + 5000, "{}", ctr); // overhead from channel operations
}
#[test]
fn deliver_signal() {
ret_without_perf!();
use std::mem::MaybeUninit;
use std::sync::mpsc::sync_channel;
let (tx1, rx1) = sync_channel(0); // send TID
        let (tx2, rx2) = sync_channel(0); // start guest spinning
// SIGSTKFLT defaults to TERM, so if any thread but the traced one
// receives the signal, the test will fail due to process exit.
const MARKER_SIGNAL: Signal = Signal::SIGSTKFLT;
const SPIN_BRANCHES: u64 = 50000; // big enough to "absorb" noise from debug/release
const SPINS_PER_EVENT: u64 = 10;
const SAMPLE_PERIOD: u64 = SPINS_PER_EVENT * SPIN_BRANCHES + (SPINS_PER_EVENT / 4);
fn signal_is_pending() -> bool {
unsafe {
let mut mask = MaybeUninit::<libc::sigset_t>::zeroed();
libc::sigemptyset(mask.as_mut_ptr());
libc::sigpending(mask.as_mut_ptr());
libc::sigismember(mask.as_ptr(), MARKER_SIGNAL as _) == 1
}
}
let handle = std::thread::spawn(move || {
unsafe {
let mut mask = MaybeUninit::<libc::sigset_t>::zeroed();
libc::sigemptyset(mask.as_mut_ptr());
libc::sigaddset(mask.as_mut_ptr(), MARKER_SIGNAL as _);
libc::sigprocmask(libc::SIG_BLOCK, mask.as_ptr(), std::ptr::null_mut());
}
tx1.send(gettid()).unwrap();
rx2.recv().unwrap();
let mut count = 0;
loop {
count += 1;
do_branches(SPIN_BRANCHES);
if signal_is_pending() {
break;
}
}
assert_eq!(count, SPINS_PER_EVENT);
});
let tid = rx1.recv().unwrap();
let pc = Builder::new(tid.as_raw(), -1)
.sample_period(SAMPLE_PERIOD)
.event(Event::Hardware(HardwareEvent::BranchInstructions))
.create()
.unwrap();
pc.set_signal_delivery(tid.into(), MARKER_SIGNAL).unwrap();
pc.enable().unwrap();
tx2.send(()).unwrap(); // tell thread to start
handle.join().unwrap(); // propagate panics
}
}