kprobe_queue.c
// SPDX-License-Identifier: Apache-2.0
/* Copyright (c) 2024 Elastic NV */
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <time.h>
#include <unistd.h>
#include "quark.h"
#define PERF_MMAP_PAGES 16 /* Must be power of 2 */
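/*
* Layout of the fields selected in perf_attr_init() (PERF_SAMPLE_TID |
* PERF_SAMPLE_TIME | PERF_SAMPLE_CPU); with sample_id_all set, the kernel
* also appends this as a trailer to non-sample records.
*/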
struct perf_sample_id {
u32 pid;
u32 tid;
u64 time; /* See raw_event_insert() */
u32 cpu;
u32 cpu_unused;
};
struct perf_record_fork {
struct perf_event_header header;
u32 pid;
u32 ppid;
u32 tid;
u32 ptid;
u64 time;
struct perf_sample_id sample_id;
};
struct perf_record_exit {
struct perf_event_header header;
u32 pid;
u32 ppid;
u32 tid;
u32 ptid;
u64 time;
struct perf_sample_id sample_id;
};
struct perf_record_comm {
struct perf_event_header header;
u32 pid;
u32 tid;
char comm[];
/* followed by sample_id */
};
/*
* Kernels might actually have a different common area; so far we only
* need common_type, so hold onto that.
*/
struct perf_sample_data_hdr {
/* this is the actual id from tracefs eg: sched_process_exec/id */
u16 common_type;
/* ... */
};
struct perf_sample_data_loc {
u16 offset;
u16 size;
};
struct perf_record_sample {
struct perf_event_header header;
struct perf_sample_id sample_id;
u32 data_size;
char data[];
};
struct perf_record_lost {
struct perf_event_header header;
u64 id;
u64 lost;
struct perf_sample_id sample_id;
};
struct perf_event {
union {
struct perf_event_header header;
struct perf_record_fork fork;
struct perf_record_exit exit;
struct perf_record_comm comm;
struct perf_record_sample sample;
struct perf_record_lost lost;
};
};
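/*
* Per-cpu perf ring buffer: one metadata page followed by PERF_MMAP_PAGES
* data pages. data_tmp_tail is our private read position; it only becomes
* visible to the kernel when perf_mmap_consume() stores it in data_tail.
* Events that wrap around the end of the ring are linearized into
* wrapped_event_buf by perf_mmap_read().
*/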
struct perf_mmap {
struct perf_event_mmap_page *metadata;
size_t mapped_size;
size_t data_size;
size_t data_mask;
u8 *data_start;
u64 data_tmp_tail;
u8 wrapped_event_buf[4096] __aligned(8);
};
struct perf_group_leader {
TAILQ_ENTRY(perf_group_leader) entry;
int fd;
int cpu;
struct perf_event_attr attr;
struct perf_mmap mmap;
};
/*
* Forbid padding on samples/wire structures
*/
#ifndef NO_PUSH_PRAGMA
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wpadded"
#endif /* NO_PUSH_PRAGMA */
struct exec_sample {
struct perf_sample_data_loc filename;
s32 pid;
s32 old_pid;
};
#define MAX_PWD 7
/* Sorted by alignment restriction, 64->32->16->8 */
struct task_sample {
/* 64bit */
u64 probe_ip;
u64 cap_inheritable;
u64 cap_permitted;
u64 cap_effective;
u64 cap_bset;
u64 cap_ambient;
u64 start_boottime;
u64 tty_addr;
u64 root_k;
u64 mnt_root_k;
u64 mnt_mountpoint_k;
u64 pwd_k[MAX_PWD];
/* 32bit */
struct perf_sample_data_loc root_s;
struct perf_sample_data_loc mnt_root_s;
struct perf_sample_data_loc mnt_mountpoint_s;
struct perf_sample_data_loc pwd_s[MAX_PWD];
struct perf_sample_data_loc comm;
u32 uid;
u32 gid;
u32 suid;
u32 sgid;
u32 euid;
u32 egid;
u32 pgid;
u32 sid;
u32 pid;
u32 tid;
u32 ppid;
s32 exit_code;
u32 tty_major;
u32 tty_minor_start;
u32 tty_minor_index;
u32 uts_inonum;
u32 ipc_inonum;
u32 mnt_inonum;
u32 net_inonum;
/* 16bit */
/* 8bit */
};
struct exec_connector_sample {
struct task_sample task_sample; /* must be 8 byte aligned */
/* 64bit */
u64 argc;
u64 stack[55]; /* sync with kprobe_defs */
};
#ifndef NO_PUSH_PRAGMA
#pragma GCC diagnostic pop
#endif /* NO_PUSH_PRAGMA */
/*
* End samples/wire structures
*/
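/*
* One perf fd per installed kprobe per cpu; its output is redirected into
* that cpu's group leader ring (PERF_EVENT_IOC_SET_OUTPUT).
*/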
struct kprobe_state {
TAILQ_ENTRY(kprobe_state) entry;
struct kprobe *k;
struct perf_event_attr attr;
int fd;
int cpu;
int group_fd;
};
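/*
* A kprobe definition, see kprobe_defs.h: the traced kernel function, the
* sample kind it produces and its arguments. Each argument names the fetch
* register, the fetch type and an arg_dsl expression whose offsets are
* resolved at runtime through BTF in kprobe_make_arg().
*/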
struct kprobe_arg {
const char *name;
const char *reg;
const char *typ;
const char *arg_dsl;
};
struct kprobe {
const char *target;
int sample_kind;
int is_kret;
struct kprobe_arg args[];
};
struct path_ctx {
const char *root;
u64 root_k;
const char *mnt_root;
u64 mnt_root_k;
const char *mnt_mountpoint;
u64 mnt_mountpoint_k;
struct {
const char *pwd;
u64 pwd_k;
} pwd[MAX_PWD];
};
/*
* Kprobe sample formats
*/
enum sample_kinds {
INVALID_SAMPLE,
EXEC_SAMPLE,
WAKE_UP_NEW_TASK_SAMPLE,
EXIT_THREAD_SAMPLE,
EXEC_CONNECTOR_SAMPLE
};
/*
* Attributes of sample, maps id to kind and data_offset
*/
struct sample_attr {
int id;
int kind;
size_t data_offset;
};
/*
* The actual probe definitions, they're too big and ugly so they get a separate
* file
*/
#include "kprobe_defs.h"
/*
* Queue backend state
*/
TAILQ_HEAD(perf_group_leaders, perf_group_leader);
TAILQ_HEAD(kprobe_states, kprobe_state);
/* We only use 4, bump when needed */
#define MAX_SAMPLE_ATTR 8
struct kprobe_queue {
struct perf_group_leaders perf_group_leaders;
int num_perf_group_leaders;
struct kprobe_states kprobe_states;
int qid;
/* matches each sample event to a kind like EXEC_SAMPLE, FOO_SAMPLE */
struct sample_attr id_to_sample_attr[MAX_SAMPLE_ATTR];
};
static int kprobe_queue_populate(struct quark_queue *);
static int kprobe_queue_update_stats(struct quark_queue *);
static void kprobe_queue_close(struct quark_queue *);
struct quark_queue_ops queue_ops_kprobe = {
.open = kprobe_queue_open,
.populate = kprobe_queue_populate,
.update_stats = kprobe_queue_update_stats,
.close = kprobe_queue_close,
};
static ssize_t parse_data_offset(const char *);
static const char *
str_of_dataloc(struct perf_record_sample *sample,
const struct perf_sample_data_loc *data_loc)
{
return (sample->data + data_loc->offset);
}
static const struct sample_attr *
sample_attr_of_id(struct kprobe_queue *kqq, int id)
{
int i;
for (i = 0; i < MAX_SAMPLE_ATTR; i++) {
if (kqq->id_to_sample_attr[i].id == id)
return (&kqq->id_to_sample_attr[i]);
}
return (NULL);
}
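/*
* Reserve a free sample_attr slot for a new tracefs id and parse the payload
* offset out of its format file. The caller only fills in the slot once all
* remaining setup has succeeded.
*/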
static struct sample_attr *
sample_attr_prepare(struct kprobe_queue *kqq, int id, const char *format,
ssize_t *data_offset)
{
int i;
struct sample_attr *sattr;
if (sample_attr_of_id(kqq, id) != NULL) {
qwarnx("id already allocated");
return (NULL);
}
*data_offset = parse_data_offset(format);
if (*data_offset == -1) {
qwarnx("can't parse data offset");
return (NULL);
}
sattr = NULL;
for (i = 0; i < MAX_SAMPLE_ATTR; i++) {
if (kqq->id_to_sample_attr[i].kind == INVALID_SAMPLE) {
sattr = &kqq->id_to_sample_attr[i];
break;
}
}
if (sattr == NULL)
qwarnx("no more free sample attr slots");
return (sattr);
}
static inline int
sample_data_id(struct perf_record_sample *sample)
{
struct perf_sample_data_hdr *h = (struct perf_sample_data_hdr *)sample->data;
return (h->common_type);
}
static inline const void *
sample_data_body(struct perf_record_sample *sample, const struct sample_attr *sattr)
{
return (sample->data + sattr->data_offset);
}
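/*
* Rebuild the current working directory from the dentry name components in
* pwd[], written leaf to root. We stop when a component's kernel address
* matches root_k, or substitute the mount point name and stop when it
* matches mnt_root_k. Returns a heap-allocated string ("/" if nothing was
* collected), or NULL if a component doesn't fit.
*/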
static char *
build_path(struct path_ctx *ctx)
{
int i, done;
char *p, path[PATH_MAX];
const char *pwd, *ppwd;
u64 pwd_k;
p = &path[sizeof(path) - 1];
*p = 0;
done = 0;
for (i = 0; i < (int)nitems(ctx->pwd) && !done; i++) {
pwd_k = ctx->pwd[i].pwd_k;
pwd = ctx->pwd[i].pwd;
if (pwd_k == ctx->root_k)
break;
if (pwd_k == ctx->mnt_root_k) {
pwd = ctx->mnt_mountpoint;
done = 1;
}
/* XXX this strlen sucks as we had the length on the wire */
ppwd = pwd + strlen(pwd);
/* +1 is the / */
/* XXX this is way too dangerous XXX */
if (((ppwd - pwd) + 1) > (p - path))
return (NULL);
while (ppwd != pwd)
*--p = *--ppwd;
*--p = '/';
}
if (*p == 0)
*--p = '/';
return (strdup(p));
}
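/*
* Copy a task_sample off the wire into a raw_task: capabilities, ids, tty,
* namespace inode numbers and comm. Exit samples record the exit code and
* event time and carry no cwd; everything else gets cwd rebuilt via
* build_path().
*/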
static void
task_sample_to_raw_task(struct kprobe_queue *kqq, const struct sample_attr *sattr,
struct perf_record_sample *sample, struct raw_task *task)
{
const struct task_sample *w = sample_data_body(sample, sattr);
struct path_ctx pctx;
int i;
task->cap_inheritable = w->cap_inheritable;
task->cap_permitted = w->cap_permitted;
task->cap_effective = w->cap_effective;
task->cap_bset = w->cap_bset;
task->cap_ambient = w->cap_ambient;
task->start_boottime = w->start_boottime;
task->uid = w->uid;
task->gid = w->gid;
task->suid = w->suid;
task->sgid = w->sgid;
task->euid = w->euid;
task->egid = w->egid;
task->pgid = w->pgid;
task->sid = w->sid;
task->ppid = w->ppid;
if (w->tty_addr) {
task->tty_major = w->tty_major;
task->tty_minor = w->tty_minor_start + w->tty_minor_index;
}
task->uts_inonum = w->uts_inonum;
task->ipc_inonum = w->ipc_inonum;
task->mnt_inonum = w->mnt_inonum;
task->net_inonum = w->net_inonum;
/* cwd below */
strlcpy(task->comm, str_of_dataloc(sample, &w->comm),
sizeof(task->comm));
if (sattr->kind == EXIT_THREAD_SAMPLE) {
task->exit_code = (w->exit_code >> 8) & 0xff;
task->exit_time_event = sample->sample_id.time;
task->cwd = NULL;
/* No cwd on exit */
return;
}
task->exit_code = -1;
task->exit_time_event = 0;
/* Consider moving all this inside build_path() */
pctx.root = str_of_dataloc(sample, &w->root_s);
pctx.root_k = w->root_k;
pctx.mnt_root = str_of_dataloc(sample, &w->mnt_root_s);
pctx.mnt_root_k = w->mnt_root_k;
pctx.mnt_mountpoint = str_of_dataloc(sample,
&w->mnt_mountpoint_s);
pctx.mnt_mountpoint_k = w->mnt_mountpoint_k;
for (i = 0; i < (int)nitems(pctx.pwd); i++) {
pctx.pwd[i].pwd = str_of_dataloc(sample,
&w->pwd_s[i]);
pctx.pwd[i].pwd_k = w->pwd_k[i];
}
if ((task->cwd = build_path(&pctx)) == NULL)
qwarn("can't build path");
}
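/*
* Convert a PERF_RECORD_SAMPLE into a raw_event. The tracefs common_type id
* is mapped back to a sample kind via sample_attr_of_id(), then the payload
* is decoded into RAW_EXEC, RAW_WAKE_UP_NEW_TASK, RAW_EXIT_THREAD or
* RAW_EXEC_CONNECTOR. New-task/exit events for threads are dropped unless
* QQ_THREAD_EVENTS is set.
*/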
static struct raw_event *
perf_sample_to_raw(struct quark_queue *qq, struct perf_record_sample *sample)
{
struct kprobe_queue *kqq = qq->queue_be;
const struct sample_attr *sattr;
int id, kind;
struct raw_event *raw = NULL;
id = sample_data_id(sample);
sattr = sample_attr_of_id(kqq, id);
if (sattr != NULL)
kind = sattr->kind;
else
kind = INVALID_SAMPLE;
switch (kind) {
case EXEC_SAMPLE: {
const struct exec_sample *exec = sample_data_body(sample, sattr);
if ((raw = raw_event_alloc(RAW_EXEC)) == NULL)
return (NULL);
if (exec->filename.size == 0) {
raw_event_free(raw);
return (NULL);
}
/* size includes NUL */
raw->exec.filename = malloc(exec->filename.size);
if (raw->exec.filename == NULL) {
raw_event_free(raw);
return (NULL);
}
memcpy(raw->exec.filename,
sample->data + exec->filename.offset,
exec->filename.size);
/* don't trust the kernel that much */
raw->exec.filename[exec->filename.size - 1] = 0;
break;
}
case WAKE_UP_NEW_TASK_SAMPLE: /* FALLTHROUGH */
case EXIT_THREAD_SAMPLE: {
const struct task_sample *w = sample_data_body(sample, sattr);
int raw_type;
/*
* ev->sample.sample_id.pid is the parent; if the new task has
* the same pid as it, then this is a thread event
*/
if ((qq->flags & QQ_THREAD_EVENTS) == 0
&& w->pid != w->tid)
return (NULL);
raw_type = kind == WAKE_UP_NEW_TASK_SAMPLE ?
RAW_WAKE_UP_NEW_TASK : RAW_EXIT_THREAD;
if ((raw = raw_event_alloc(raw_type)) == NULL)
return (NULL);
/*
* Cheat, make it look like a child event
*/
if (raw_type == RAW_WAKE_UP_NEW_TASK) {
raw->pid = w->pid;
raw->tid = w->tid;
}
task_sample_to_raw_task(kqq, sattr, sample, &raw->task);
break;
}
case EXEC_CONNECTOR_SAMPLE: {
const char *start, *p, *end;
int i;
const struct exec_connector_sample *exec_sample;
struct raw_exec_connector *exec;
exec_sample = sample_data_body(sample, sattr);
if ((raw = raw_event_alloc(RAW_EXEC_CONNECTOR)) == NULL)
return (NULL);
exec = &raw->exec_connector;
start = p = (const char *)&exec_sample->stack[0];
end = start + sizeof(exec_sample->stack);
for (i = 0; i < (int)exec_sample->argc && p < end; i++)
p += strnlen(p, end - p) + 1;
if (p >= end)
p = end;
exec->args_len = p - start;
if (exec->args_len == 0)
exec->args = NULL;
else {
if ((exec->args = malloc(exec->args_len)) == NULL) {
raw_event_free(raw);
return (NULL);
}
memcpy(exec->args, start, exec->args_len);
exec->args[exec->args_len - 1] = 0;
}
task_sample_to_raw_task(kqq, sattr, sample, &exec->task);
break;
}
default:
qwarnx("unknown or invalid sample id=%d", id);
return (NULL);
}
return (raw);
}
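/*
* Convert one ring buffer record into a raw_event, or NULL if it doesn't
* produce one. The sample_id trailer provides pid, tid, time and cpu;
* PERF_RECORD_LOST only bumps the lost counter.
*/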
static struct raw_event *
perf_event_to_raw(struct quark_queue *qq, struct perf_event *ev)
{
struct raw_event *raw = NULL;
struct perf_sample_id *sid = NULL;
ssize_t n;
switch (ev->header.type) {
case PERF_RECORD_SAMPLE:
raw = perf_sample_to_raw(qq, &ev->sample);
if (raw != NULL)
sid = &ev->sample.sample_id;
break;
case PERF_RECORD_COMM:
/*
* Suppress comm events due to exec, as we can fetch comm
* directly from the task struct
*/
if (ev->header.misc & PERF_RECORD_MISC_COMM_EXEC)
return (NULL);
if ((qq->flags & QQ_THREAD_EVENTS) == 0 &&
ev->comm.pid != ev->comm.tid)
return (NULL);
if ((raw = raw_event_alloc(RAW_COMM)) == NULL)
return (NULL);
n = strlcpy(raw->comm.comm, ev->comm.comm,
sizeof(raw->comm.comm));
/*
* Yes, comm is variable length, maximum 16. The kernel
* guarantees alignment on an 8-byte boundary for the sample_id,
* which means we have to calculate the next boundary.
*/
sid = (struct perf_sample_id *)
ALIGN_UP(ev->comm.comm + n + 1, 8);
break;
case PERF_RECORD_FORK:
case PERF_RECORD_EXIT:
/*
* As long as we are still using PERF_RECORD_COMM events, the
* kernel implies we want FORK and EXIT as well, see
* core.c:perf_event_task_match(), this is likely unintended
* behaviour.
*/
break;
case PERF_RECORD_LOST:
qq->stats.lost += ev->lost.lost;
break;
default:
qwarnx("unhandled type %d", ev->header.type);
return (NULL);
break;
}
if (sid != NULL) {
/* FORK/WAKE_UP_NEW_TASK overloads pid and tid */
if (raw->pid == 0)
raw->pid = sid->pid;
if (raw->tid == 0)
raw->tid = sid->tid;
raw->opid = sid->pid;
raw->time = sid->time;
raw->cpu = sid->cpu;
}
return (raw);
}
static int
perf_mmap_init(struct perf_mmap *mm, int fd)
{
mm->mapped_size = (1 + PERF_MMAP_PAGES) * getpagesize();
mm->metadata = mmap(NULL, mm->mapped_size, PROT_READ|PROT_WRITE,
MAP_SHARED, fd, 0);
if (mm->metadata == MAP_FAILED)
return (-1);
mm->data_size = PERF_MMAP_PAGES * getpagesize();
mm->data_mask = mm->data_size - 1;
mm->data_start = (uint8_t *)mm->metadata + getpagesize();
mm->data_tmp_tail = mm->metadata->data_tail;
return (0);
}
static inline uint64_t
perf_mmap_load_head(struct perf_event_mmap_page *metadata)
{
return (__atomic_load_n(&metadata->data_head, __ATOMIC_ACQUIRE));
}
static inline void
perf_mmap_update_tail(struct perf_event_mmap_page *metadata, uint64_t tail)
{
return (__atomic_store_n(&metadata->data_tail, tail, __ATOMIC_RELEASE));
}
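/*
* Peek at the next complete event in the ring. data_head is loaded with
* acquire semantics and only our private data_tmp_tail advances here; the
* kernel-visible tail is published later by perf_mmap_consume(). Events
* that wrap past the end of the buffer are copied into wrapped_event_buf.
*/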
static struct perf_event *
perf_mmap_read(struct perf_mmap *mm)
{
struct perf_event_header *evh;
uint64_t data_head;
int diff;
ssize_t leftcont; /* contiguous size left */
data_head = perf_mmap_load_head(mm->metadata);
diff = data_head - mm->data_tmp_tail;
evh = (struct perf_event_header *)
(mm->data_start + (mm->data_tmp_tail & mm->data_mask));
/* Do we have at least one complete event */
if (diff < (int)sizeof(*evh) || diff < evh->size)
return (NULL);
/* Guard that we will always be able to fit a wrapped event */
if (unlikely(evh->size > sizeof(mm->wrapped_event_buf)))
errx(1, "getting an event larger than wrapped buf");
/* How much contiguous space there is left */
leftcont = mm->data_size - (mm->data_tmp_tail & mm->data_mask);
/* Everything fits without wrapping */
if (likely(evh->size <= leftcont)) {
mm->data_tmp_tail += evh->size;
return ((struct perf_event *)evh);
}
/*
* Slow path, we have to copy the event out in a linear buffer. Start
* from the remaining end
*/
memcpy(mm->wrapped_event_buf, evh, leftcont);
/* Copy the wrapped portion from the beginning */
memcpy(mm->wrapped_event_buf + leftcont, mm->data_start, evh->size - leftcont);
/* Record where our future tail will be on consume */
mm->data_tmp_tail += evh->size;
return ((struct perf_event *)mm->wrapped_event_buf);
}
static inline void
perf_mmap_consume(struct perf_mmap *mmap)
{
perf_mmap_update_tail(mmap->metadata, mmap->data_tmp_tail);
}
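/* glibc has no wrapper for perf_event_open(2), go through syscall(2) */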
static int
perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu,
int group_fd, unsigned long flags)
{
return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
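/*
* Older kernels reject attribute bits they don't know about with EINVAL,
* so retry with progressively fewer features: first without comm_exec,
* then without the explicit clockid.
*/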
static int
perf_event_open_degradable(struct perf_event_attr *attr, pid_t pid, int cpu,
int group_fd, unsigned long flags)
{
int r;
again:
r = perf_event_open(attr, pid, cpu, group_fd, flags);
if (r >= 0)
return (r);
else if (r == -1 && errno != EINVAL)
return (-1);
/* start degrading until it works */
if (attr->comm_exec) {
attr->comm_exec = 0;
goto again;
}
if (attr->use_clockid) {
attr->use_clockid = 0;
attr->clockid = 0;
goto again;
}
return (r);
}
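/*
* Open a path relative to tracefs, trying /sys/kernel/tracing first and the
* older debugfs location second. The tail must be relative.
*/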
static int
open_tracing(int flags, const char *fmt, ...)
{
va_list ap;
int dfd, fd, i, r, saved_errno;
char tail[MAXPATHLEN];
char *paths[] = {
"/sys/kernel/tracing",
"/sys/kernel/debug/tracing",
};
va_start(ap, fmt);
r = vsnprintf(tail, sizeof(tail), fmt, ap);
va_end(ap);
if (r == -1 || r >= (int)sizeof(tail))
return (-1);
if (tail[0] == '/')
return (errno = EINVAL, -1);
saved_errno = 0;
for (i = 0; i < (int)nitems(paths); i++) {
if ((dfd = open(paths[i], O_PATH)) == -1) {
if (!saved_errno && errno != ENOENT)
saved_errno = errno;
qwarn("open: %s", paths[i]);
continue;
}
fd = openat(dfd, tail, flags);
close(dfd);
if (fd == -1) {
if (!saved_errno && errno != ENOENT)
saved_errno = errno;
qwarn("open: %s", tail);
continue;
}
return (fd);
}
if (saved_errno)
errno = saved_errno;
return (-1);
}
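/*
* Read the numeric id of a tracepoint or kprobe (events/.../id); this is
* the value that goes into perf_event_attr.config.
*/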
static int
fetch_tracing_id(const char *tail)
{
int id, fd;
char idbuf[16];
const char *errstr;
ssize_t n;
fd = open_tracing(O_RDONLY, "%s", tail);
if (fd == -1)
return (-1);
n = qread(fd, idbuf, sizeof(idbuf));
close(fd);
if (n <= 0)
return (-1);
idbuf[n - 1] = 0;
id = strtonum(idbuf, 1, INT_MAX, &errstr);
if (errstr != NULL) {
qwarnx("strtonum: %s", errstr);
return (errno = ERANGE, -1);
}
return (id);
}
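/*
* Parse a tracefs format file and return the offset of the first
* event-specific field, i.e. the first "offset:" after the blank line that
* ends the common_* fields. This is where the wire structures above start
* within the raw sample data.
*/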
static ssize_t
parse_data_offset(const char *path)
{
int fd;
FILE *f;
char *line, *s, *e;
const char *errstr;
ssize_t n, data_offset;
size_t line_len;
int past_common;
fd = open_tracing(O_RDONLY, path);
if (fd == -1)
return (-1);
f = fdopen(fd, "r");
if (f == NULL) {
close(fd);
return (-1);
}
past_common = 0;
line = NULL;
line_len = 0;
data_offset = -1;
while ((n = getline(&line, &line_len, f)) != -1) {
if (!past_common) {
past_common = !strcmp(line, "\n");
continue;
}
s = strstr(line, "offset:");
if (s == NULL)
break;
s += strlen("offset:");
e = strchr(s, ';');
if (e == NULL)
break;
*e = 0;
data_offset = strtonum(s, 0, SSIZE_MAX, &errstr);
if (errstr)
data_offset = -1;
break;
}
free(line);
fclose(f);
return (data_offset);
}
#define TOKSZ 256
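/*
* Split a parenthesized arg_dsl expression "(left OP right)" into its two
* tokens and the operator, looking only at the top parenthesis level.
*/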
static int
kprobe_exp_split(const char *exp1, char left[TOKSZ], char *op, char right[TOKSZ])
{
int paren_depth;
char exp[1024];
char *p, *start_left, *start_right;
if (strlcpy(exp, exp1, sizeof(exp)) >= sizeof(exp)) {
qwarnx("expression too long");
return (-1);
}
paren_depth = 0;
start_left = start_right = NULL;
*op = 0;
for (p = exp; *p != 0; p++) {
switch (*p) {
case '(':
if (++paren_depth == 1)
start_left = p + 1;
break;
case '+': /* FALLTHROUGH */
case '-': /* FALLTHROUGH */
case '*': /* FALLTHROUGH */
if (paren_depth > 1)
break;
if (*op != 0) {
qwarnx("multiple operators");
return (-1);
}
*op = *p;
*p = 0;
start_right = p + 1;
break;
case ')':
if (--paren_depth == 0)
*p = 0;
if (paren_depth < 0) {
qwarnx("unbalanced parenthesis");
return (-1);
}
break;
default:
break;
}
}
if (*op == 0) {
qwarnx("no operator");
return (-1);
}
if (start_left == NULL || start_right == NULL) {
qwarnx("syntax error");
return (-1);
}
if (strlcpy(left, start_left, TOKSZ) >= TOKSZ) {
qwarnx("left token overflow");
return (-1);
}
if (strlen(left) == 0) {
qwarnx("empty left token");
return (-1);
}
if (strlcpy(right, start_right, TOKSZ) >= TOKSZ) {
qwarnx("right token overflow");
return (-1);
}
if (strlen(right) == 0) {
qwarnx("empty right token");
return (-1);
}
return (0);
}
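/*
* Resolve an arg_dsl expression to a byte offset: a plain number is used
* as is, a name is looked up in BTF via quark_btf_offset(), and a
* parenthesized expression is split and combined with +, - or *. A
* ".member" suffix after the closing parenthesis adds one more resolved
* offset.
*/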
static int
kprobe_exp(const char *exp1, ssize_t *off1, struct quark_btf *qbtf)
{
ssize_t off, off_left, off_right;
char exp[1024], op, *end;
char left[TOKSZ], right[TOKSZ];
bzero(left, sizeof(left));
bzero(right, sizeof(right));
if (strlcpy(exp, exp1, sizeof(exp)) >= sizeof(exp)) {
qwarnx("expression too long");
return (-1);
}
switch (*exp) {
case '(': {
if ((end = strrchr(exp, ')')) == NULL) {
qwarnx("unclosed parenthesis: %s", exp);
return (-1);
}
if (kprobe_exp_split(exp, left, &op, right) == -1) {
qwarnx("can't split expression: %s", exp);
return (-1);
}
if (kprobe_exp(left, &off_left, qbtf) == -1) {
qwarnx("left expression is unresolved: %s", exp);
return (-1);
}
if (kprobe_exp(right, &off_right, qbtf) == -1) {
qwarnx("right expression is unresolved: %s", exp);
return (-1);
}
switch (op) {
case '+':
off = off_left + off_right;
break;
case '-':
off = off_left - off_right;
break;
case '*':
off = off_left * off_right;
break;
default:
qwarnx("invalid operator `%c`: %s", op, exp);
return (-1);
}
/* If there is a dot after `)`, recurse */
if (*(end + 1) == '.') {
if (kprobe_exp(end + 2, &off_left, qbtf) == -1) {
qwarnx("expression unresolved: %s", exp);
return (-1);
}
off += off_left;
}
break;
}
default: {
const char *errstr;
off = strtonum(exp, INT32_MIN, INT32_MAX, &errstr);
if (errstr == NULL)
break;
if ((off = quark_btf_offset(qbtf, exp)) == -1) {
qwarnx("expression unresolved: %s", exp);
return (-1);
}
break;
}}
*off1 = off;
return (0);
}
/*
* Old kernels have some offsets in different structures, not just under a
* different name (see btf_alternatives{}). We handle those differences here
* by detecting them at runtime and issuing the correct kprobe_arg.
*/
static struct kprobe_arg *
kprobe_kludge_arg(struct kprobe *k, struct kprobe_arg *karg,
struct quark_btf *qbtf)
{
/*
* For TASK_SAMPLE, pgid and sid depend on fetching pids, which in newer
* kernels are deep within signal_struct, but older kernels have it
* within task_struct. So if signal_struct.pids exists, it's the "new"
* version.
*/
if ((k == &kp_wake_up_new_task ||
k == &kp_exit ||
k == &kp_exec_connector) &&
!strcmp(karg->name, "pgid")) {
if (quark_btf_offset(qbtf, "signal_struct.pids") == -1)
return (&ka_task_old_pgid);
return (&ka_task_new_pgid);
}
if ((k == &kp_wake_up_new_task ||
k == &kp_exit ||
k == &kp_exec_connector) &&
!strcmp(karg->name, "sid")) {
if (quark_btf_offset(qbtf, "signal_struct.pids") == -1)
return (&ka_task_old_sid);
return (&ka_task_new_sid);
}
/* No kludges found, carry on */
return (karg);
}
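/*
* Build one fetch argument of the kprobe string. Each whitespace-separated
* token in arg_dsl resolves to an offset and wraps the previous expression
* in one more dereference, starting from the base register and ending up as
* "name=+off(+off(...(%reg))):type".
*/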
static char *
kprobe_make_arg(struct kprobe *k, struct kprobe_arg *karg,
struct quark_btf *qbtf)
{
int i;
ssize_t off;
char *p, **pp, *last, *kstr, *tokens[128], *arg_dsl;
karg = kprobe_kludge_arg(k, karg, qbtf);
kstr = NULL;
if ((arg_dsl = strdup(karg->arg_dsl)) == NULL)
return (NULL);
i = 0;
for (p = strtok_r(arg_dsl, " ", &last);
p != NULL;
p = strtok_r(NULL, " ", &last)) {
/* Last is sentinel */
if (i == ((int)nitems(tokens) - 1)) {
qwarnx("too many tokens");
free(arg_dsl);
return (NULL);
}
tokens[i++] = p;
}
tokens[i] = NULL;
if (asprintf(&kstr, "%%%s", karg->reg) == -1) {
free(arg_dsl);
return (NULL);
}
for (pp = tokens; *pp != NULL; pp++) {
p = *pp;
last = kstr;
if (kprobe_exp(p, &off, qbtf) == -1 ||
asprintf(&kstr, "+%zd(%s)", off, last) == -1) {
free(arg_dsl);
free(last);
return (NULL);
}
free(last);
}
last = kstr;
if (asprintf(&kstr, "%s=%s:%s", karg->name, last, karg->typ) == -1) {
free(arg_dsl);
free(last);
return (NULL);
}
free(last);
free(arg_dsl);
return (kstr);
}
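/*
* Probe names embed our pid and a per-queue id so that concurrent quark
* instances don't collide in tracefs.
*/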
static void
kprobe_tracefs_name(struct kprobe *k, u64 qid, char *buf, size_t len)
{
snprintf(buf, len, "quark_%s_%llu_%llu", k->target, (u64)getpid(), qid);
}
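/*
* Assemble the full kprobe_events line: "p:NAME target arg..." for kprobes,
* "r:NAME target arg..." for kretprobes.
*/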
static char *
kprobe_build_string(struct kprobe *k, char *name, struct quark_btf *qbtf)
{
struct kprobe_arg *karg;
char *p, *o, *a;
int r;
r = asprintf(&p, "%c:%s %s", k->is_kret ? 'r' : 'p',
name, k->target);
if (r == -1)
return (NULL);
for (karg = k->args; karg->name != NULL; karg++) {
a = kprobe_make_arg(k, karg, qbtf);
if (a == NULL) {
free(p);
return (NULL);
}
o = p;
r = asprintf(&p, "%s %s", o, a);
free(o);
free(a);
if (r == -1)
return (NULL);
}
return (p);
}
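/* Remove our probe from tracefs by writing "-:NAME" to kprobe_events */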
static int
kprobe_uninstall(struct kprobe *k, u64 qid)
{
char buf[4096];
ssize_t n;
int fd;
char fsname[MAXPATHLEN];
kprobe_tracefs_name(k, qid, fsname, sizeof(fsname));
if ((fd = open_tracing(O_WRONLY | O_APPEND, "kprobe_events")) == -1)
return (-1);
if (snprintf(buf, sizeof(buf), "-:%s", fsname) >=
(int)sizeof(buf)) {
close(fd);
return (-1);
}
n = qwrite(fd, buf, strlen(buf));
close(fd);
if (n == -1)
return (-1);
return (0);
}
/*
* Builds the kprobe string and "installs" it in tracefs; mapping it to a perf
* ring happens later and belongs to kprobe_state. This separation makes
* library cleanup easier.
*/
static int
kprobe_install(struct kprobe *k, u64 qid, struct quark_btf *qbtf)
{
int fd;
ssize_t n;
char *kstr;
char fsname[MAXPATHLEN];
kprobe_tracefs_name(k, qid, fsname, sizeof(fsname));
if (kprobe_uninstall(k, qid) == -1 && errno != ENOENT)
qwarn("kprobe_uninstall");
if ((kstr = kprobe_build_string(k, fsname, qbtf)) == NULL)
return (-1);
if ((fd = open_tracing(O_WRONLY, "kprobe_events")) == -1) {
free(kstr);
return (-1);
}
n = qwrite(fd, kstr, strlen(kstr));
close(fd);
free(kstr);
if (n == -1)
return (-1);
return (0);
}
static int
kprobe_install_all(u64 qid)
{
int i, r;
struct quark_btf *qbtf;
if ((qbtf = quark_btf_open()) == NULL) {
qwarnx("can't initialize btf");
return (-1);
}
r = 0;
for (i = 0; all_kprobes[i] != NULL; i++) {
if (kprobe_install(all_kprobes[i], qid, qbtf) == -1) {
qwarnx("kprobe %s failed", all_kprobes[i]->target);
/* Uninstall the ones that succeeded */
while (--i >= 0)
kprobe_uninstall(all_kprobes[i], qid);
r = -1;
break;
}
}
quark_btf_close(qbtf);
return (r);
}
static void
kprobe_uninstall_all(u64 qid)
{
int i;
for (i = 0; all_kprobes[i] != NULL; i++)
kprobe_uninstall(all_kprobes[i], qid);
}
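/*
* Common attributes for all our tracepoint/kprobe events: sample every hit,
* record TID, TIME, CPU and the raw tracepoint payload, use the monotonic
* clock and start disabled; groups are enabled later in kprobe_queue_open().
*/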
static void
perf_attr_init(struct perf_event_attr *attr, int id)
{
bzero(attr, sizeof(*attr));
attr->type = PERF_TYPE_TRACEPOINT;
attr->size = sizeof(*attr);
attr->config = id;
/* attr->config = PERF_COUNT_SW_DUMMY; */
attr->sample_period = 1; /* we want all events */
attr->sample_type =
PERF_SAMPLE_TID |
PERF_SAMPLE_TIME |
PERF_SAMPLE_CPU |
PERF_SAMPLE_RAW;
attr->use_clockid = 1;
attr->clockid = CLOCK_MONOTONIC;
attr->disabled = 1;
}
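/*
* One group leader per cpu, opened on sched_process_exec: it owns the mmap
* ring, and every kprobe on that cpu redirects its output into it.
*/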
static struct perf_group_leader *
perf_open_group_leader(struct kprobe_queue *kqq, int cpu)
{
struct perf_group_leader *pgl;
int id;
struct sample_attr *sattr;
ssize_t data_offset;
/* By putting EXEC on the group leader we save one fd per cpu */
if ((id = fetch_tracing_id("events/sched/sched_process_exec/id")) == -1)
return (NULL);
sattr = NULL;
data_offset = -1;
if (cpu == 0) {
sattr = sample_attr_prepare(kqq, id,
"events/sched/sched_process_exec/format", &data_offset);
if (sattr == NULL)
return (NULL);
}
pgl = calloc(1, sizeof(*pgl));
if (pgl == NULL)
return (NULL);
perf_attr_init(&pgl->attr, id);
/*
* We will still get task events as long as we set comm, see
* perf_event_to_raw().
*/
pgl->attr.comm = 1;
pgl->attr.comm_exec = 1;
pgl->attr.sample_id_all = 1; /* add sample_id to all types */
pgl->attr.watermark = 1;
pgl->attr.wakeup_watermark = (PERF_MMAP_PAGES * getpagesize()) / 10;
pgl->fd = perf_event_open_degradable(&pgl->attr, -1, cpu, -1, 0);
if (pgl->fd == -1) {
qwarn("perf_event_open_degradable");
free(pgl);
return (NULL);
}
if (perf_mmap_init(&pgl->mmap, pgl->fd) == -1) {
close(pgl->fd);
free(pgl);
return (NULL);
}
pgl->cpu = cpu;
/* Take the slot now that there are no error paths */
if (sattr != NULL) {
sattr->id = id;
sattr->kind = EXEC_SAMPLE;
sattr->data_offset = (size_t)data_offset;
}
return (pgl);
}
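/*
* Open the perf event of an installed kprobe on one cpu and redirect its
* output into the cpu's group leader ring via PERF_EVENT_IOC_SET_OUTPUT.
*/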
static struct kprobe_state *
perf_open_kprobe(struct kprobe_queue *kqq, struct kprobe *k,
u64 qid, int cpu, int group_fd)
{
int id;
char buf[MAXPATHLEN];
char fsname[MAXPATHLEN];
struct kprobe_state *ks;
ssize_t data_offset;
struct sample_attr *sattr;
kprobe_tracefs_name(k, qid, fsname, sizeof(fsname));
if (snprintf(buf, sizeof(buf), "events/kprobes/%s/id",
fsname) >= (int)sizeof(buf))
return (errno = ENAMETOOLONG, NULL);
if ((id = fetch_tracing_id(buf)) == -1)
return (NULL);
sattr = NULL;
data_offset = -1;
if (cpu == 0) {
if (snprintf(buf, sizeof(buf), "events/kprobes/%s/format",
fsname) >= (int)sizeof(buf))
return (errno = ENAMETOOLONG, NULL);
sattr = sample_attr_prepare(kqq, id, buf, &data_offset);
if (sattr == NULL)
return (NULL);
}
ks = calloc(1, sizeof(*ks));
if (ks == NULL)
return (NULL);
perf_attr_init(&ks->attr, id);
ks->fd = perf_event_open_degradable(&ks->attr, -1, cpu, group_fd, 0);
if (ks->fd == -1) {
qwarn("perf_event_open_degradable");
free(ks);
return (NULL);
}
/* Output our records in the group_fd */
if (ioctl(ks->fd, PERF_EVENT_IOC_SET_OUTPUT, group_fd) == -1) {
close(ks->fd);
free(ks);
return (NULL);
}
ks->k = k;
ks->cpu = cpu;
ks->group_fd = group_fd;
/* Take the slot now that there are no error paths */
if (sattr != NULL) {
sattr->id = id;
sattr->kind = ks->k->sample_kind;
sattr->data_offset = (size_t)data_offset;
}
return (ks);
}
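/*
* Backend open: install all kprobes in tracefs under a unique qid, open one
* group leader per configured cpu, attach every kprobe to every leader,
* reset and enable the groups, and register each leader fd with the queue's
* epoll instance. On failure, kprobe_queue_close() undoes whatever was set
* up.
*/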
int
kprobe_queue_open(struct quark_queue *qq)
{
struct kprobe_queue *kqq;
struct perf_group_leader *pgl;
struct kprobe *k;
struct kprobe_state *ks;
struct epoll_event ev;
int i;
u64 qid;
static u64 qids;
if ((qq->flags & QQ_KPROBE) == 0)
return (errno = ENOTSUP, -1);
qid = __atomic_fetch_add(&qids, 1, __ATOMIC_RELAXED);
if (kprobe_install_all(qid) == -1)
goto fail;
if ((kqq = calloc(1, sizeof(*kqq))) == NULL)
goto fail;
TAILQ_INIT(&kqq->perf_group_leaders);
kqq->num_perf_group_leaders = 0;
TAILQ_INIT(&kqq->kprobe_states);
kqq->qid = qid;
qq->queue_be = kqq;
for (i = 0; i < get_nprocs_conf(); i++) {
pgl = perf_open_group_leader(kqq, i);
if (pgl == NULL)
goto fail;
TAILQ_INSERT_TAIL(&kqq->perf_group_leaders, pgl, entry);
kqq->num_perf_group_leaders++;
}
i = 0;
while ((k = all_kprobes[i++]) != NULL) {
TAILQ_FOREACH(pgl, &kqq->perf_group_leaders, entry) {
ks = perf_open_kprobe(kqq, k, kqq->qid, pgl->cpu, pgl->fd);
if (ks == NULL)
goto fail;
TAILQ_INSERT_TAIL(&kqq->kprobe_states, ks, entry);
}
}
TAILQ_FOREACH(pgl, &kqq->perf_group_leaders, entry) {
/* XXX PERF_IOC_FLAG_GROUP see bugs */
if (ioctl(pgl->fd, PERF_EVENT_IOC_RESET,
PERF_IOC_FLAG_GROUP) == -1) {
qwarn("ioctl PERF_EVENT_IOC_RESET");
goto fail;
}
if (ioctl(pgl->fd, PERF_EVENT_IOC_ENABLE,
PERF_IOC_FLAG_GROUP) == -1) {
qwarn("ioctl PERF_EVENT_IOC_ENABLE");
goto fail;
}
}
qq->epollfd = epoll_create1(EPOLL_CLOEXEC);
if (qq->epollfd == -1) {
qwarn("epoll_create1");
goto fail;
}
TAILQ_FOREACH(pgl, &kqq->perf_group_leaders, entry) {
bzero(&ev, sizeof(ev));
ev.events = EPOLLIN;
ev.data.fd = pgl->fd;
if (epoll_ctl(qq->epollfd, EPOLL_CTL_ADD, pgl->fd, &ev) == -1) {
qwarn("epoll_ctl");
goto fail;
}
}
qq->queue_ops = &queue_ops_kprobe;
qq->stats.backend = QQ_KPROBE;
return (0);
fail:
kprobe_queue_close(qq);
return (-1);
}
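/*
* Drain the per-cpu rings round-robin into the queue, stopping when the
* queue is full or all rings are empty; returns the number of raw events
* produced.
*/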
static int
kprobe_queue_populate(struct quark_queue *qq)
{
struct kprobe_queue *kqq = qq->queue_be;
int empty_rings, num_rings, npop;
struct perf_group_leader *pgl;
struct perf_event *ev;
struct raw_event *raw;
num_rings = kqq->num_perf_group_leaders;
npop = 0;
/*
* We stop if the queue is full, or if we see all perf ring buffers
* empty.
*/
while (qq->length < qq->max_length) {
empty_rings = 0;
TAILQ_FOREACH(pgl, &kqq->perf_group_leaders, entry) {
ev = perf_mmap_read(&pgl->mmap);
if (ev == NULL) {
empty_rings++;
continue;
}
empty_rings = 0;
raw = perf_event_to_raw(qq, ev);
if (raw != NULL) {
if (raw_event_insert(qq, raw) == -1) {
raw_event_free(raw);
raw = NULL;
}
npop++;
}
perf_mmap_consume(&pgl->mmap);
}
if (empty_rings == num_rings)
break;
}
return (npop);
}
static int
kprobe_queue_update_stats(struct quark_queue *qq)
{
/* NADA */
return (0);
}
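/*
* Tear down in roughly the reverse order of open: disable and close the
* group leaders and their mmaps, close the per-kprobe fds, remove our
* kprobes from tracefs and close the epoll fd. Also used by the error path
* of kprobe_queue_open() on a partially opened queue.
*/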
static void
kprobe_queue_close(struct quark_queue *qq)
{
struct kprobe_queue *kqq = qq->queue_be;
struct perf_group_leader *pgl;
struct kprobe_state *ks;
if (kqq != NULL) {
/* Stop and close the perf rings */
while ((pgl = TAILQ_FIRST(&kqq->perf_group_leaders)) != NULL) {
/* XXX PERF_IOC_FLAG_GROUP see bugs */
if (pgl->fd != -1) {
if (ioctl(pgl->fd, PERF_EVENT_IOC_DISABLE,
PERF_IOC_FLAG_GROUP) == -1)
qwarnx("ioctl PERF_EVENT_IOC_DISABLE:");
close(pgl->fd);
}
if (pgl->mmap.metadata != NULL) {
if (munmap(pgl->mmap.metadata,
pgl->mmap.mapped_size) != 0)
qwarn("munmap");
}
TAILQ_REMOVE(&kqq->perf_group_leaders, pgl, entry);
free(pgl);
}
/* Clean up all state allocated to kprobes */
while ((ks = TAILQ_FIRST(&kqq->kprobe_states)) != NULL) {
if (ks->fd != -1)
close(ks->fd);
TAILQ_REMOVE(&kqq->kprobe_states, ks, entry);
free(ks);
}
kprobe_uninstall_all(kqq->qid);
free(kqq);
kqq = NULL;
qq->queue_be = NULL;
}
/* Clean up epoll instance */
if (qq->epollfd != -1) {
close(qq->epollfd);
qq->epollfd = -1;
}
}