non-GPL/Events/Lib/EbpfEvents.c

// SPDX-License-Identifier: Elastic-2.0

/*
 * Copyright 2021 Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under
 * one or more contributor license agreements. Licensed under the Elastic
 * License 2.0; you may not use this file except in compliance with the Elastic
 * License 2.0.
 */

#include "EbpfEvents.h"

#include <bpf/bpf.h>
#include <bpf/btf.h>
#include <bpf/libbpf.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/utsname.h>
#include <unistd.h>

#include "EventProbe.skel.h"

#define KERNEL_VERSION(maj, min, patch)                                                            \
    (((maj) << 16) | ((min) << 8) | ((patch) > 255 ? 255 : (patch)))

bool log_verbose = false;
static int verbose(const char *fmt, ...);

struct ring_buf_cb_ctx {
    ebpf_event_handler_fn cb;
    uint64_t events_mask;
};

struct ebpf_event_ctx {
    uint64_t features;
    struct ring_buffer *ringbuf;
    struct EventProbe_bpf *probe;
    struct ring_buf_cb_ctx *cb_ctx;
};

/* This is just a thin wrapper that calls the event context's saved callback */
static int ring_buf_cb(void *ctx, void *data, size_t size)
{
    struct ring_buf_cb_ctx *cb_ctx = ctx;
    if (cb_ctx == NULL) {
        return 0;
    }
    ebpf_event_handler_fn cb = cb_ctx->cb;
    if (cb == NULL) {
        return 0;
    }
    struct ebpf_event_header *evt = data;
    if (evt == NULL) {
        return 0;
    }
    if (evt->type & cb_ctx->events_mask) {
        return cb(evt);
    }
    return 0;
}

/* https://www.kernel.org/doc/html/latest/bpf/btf.html#btf-kind-union */
static int resolve_btf_field_off_recur(struct btf *btf, int base_off, const struct btf_type *t,
                                       const char *field)
{
    struct btf_member *m = btf_members(t);
    for (int i = 0; i < btf_vlen(t); i++, m++) {
        const char *name = btf__name_by_offset(btf, m->name_off);
        if (name == NULL)
            continue;
        if (strcmp(name, field) == 0)
            return (base_off + m->offset) / 8;

        const struct btf_type *m_type = btf__type_by_id(btf, m->type);
        if (!btf_is_struct(m_type) && !btf_is_union(m_type))
            continue;

        // Recursively traverse structs and unions
        int ret = resolve_btf_field_off_recur(btf, base_off + m->offset, m_type, field);
        if (ret == -1)
            continue;
        return ret;
    }
    return -1;
}

/* Find the BTF field relocation offset for a named field of a kernel struct */
static int resolve_btf_field_off(struct btf *btf, const char *struct_name, const char *field)
{
    int ret = -1;

    __s32 btf_id = btf__find_by_name_kind(btf, struct_name, BTF_KIND_STRUCT);
    if (btf_id <= 0)
        goto out;
    const struct btf_type *t = btf__type_by_id(btf, btf_id);
    if (t == NULL)
        goto out;
    if (!btf_is_struct(t))
        goto out;

    return resolve_btf_field_off_recur(btf, 0, t, field);

out:
    return ret;
}

static const struct btf_type *resolve_btf_type_by_func(struct btf *btf, const char *func)
{
    if (func == NULL) {
        goto out;
    }

    for (u_int i = 0; i < btf__type_cnt(btf); i++) {
        int btf_type = btf__resolve_type(btf, i);
        if (btf_type < 0)
            continue;

        const struct btf_type *btf_type_ptr = btf__type_by_id(btf, btf_type);
        if (!btf_is_func(btf_type_ptr))
            continue;

        const char *name = btf__name_by_offset(btf, btf_type_ptr->name_off);
        if (name == NULL)
            continue;
        if (strcmp(name, func))
            continue;

        int proto_btf_type = btf__resolve_type(btf, btf_type_ptr->type);
        if (proto_btf_type < 0)
            goto out;

        const struct btf_type *proto_btf_type_ptr = btf__type_by_id(btf, proto_btf_type);
        if (!btf_is_func_proto(proto_btf_type_ptr))
            continue;

        return proto_btf_type_ptr;
    }

out:
    verbose("resolve_btf_type_by_func(%s): not found\n", func);
    return NULL;
}
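/*
 * Illustrative use (not part of the build): assuming vmlinux BTF has been
 * loaded, the byte offset of `i_mode` within `struct inode` would be
 * resolved as:
 *
 *   int off = resolve_btf_field_off(btf, "inode", "i_mode"); // -1 if absent
 */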
/* Find the BTF type relocation index for a named argument of a kernel function */
static int resolve_btf_func_arg_idx(struct btf *btf, const char *func, const char *arg)
{
    int ret = -1;

    const struct btf_type *proto_btf_type_ptr = resolve_btf_type_by_func(btf, func);
    if (!proto_btf_type_ptr)
        goto out;

    struct btf_param *params = btf_params(proto_btf_type_ptr);
    for (int j = 0; j < btf_vlen(proto_btf_type_ptr); j++) {
        const char *cur_name = btf__name_by_offset(btf, params[j].name_off);
        if (cur_name == NULL) {
            continue;
        }
        if (strcmp(cur_name, arg) == 0) {
            ret = j;
            goto out;
        }
    }
    verbose("resolve_btf_func_arg_idx(%s, %s): not found\n", func, arg);

out:
    return ret;
}

/* Find the BTF relocation index for a func return value */
static int resolve_btf_func_ret_idx(struct btf *btf, const char *func)
{
    int ret = -1;

    const struct btf_type *proto_btf_type_ptr = resolve_btf_type_by_func(btf, func);
    if (!proto_btf_type_ptr)
        goto out;

    ret = btf_vlen(proto_btf_type_ptr);

out:
    return ret;
}

/* Given a function name and an argument name, returns the argument index
 * in the function signature.
 */
#define FILL_FUNC_ARG_IDX(obj, btf, func, arg)                                                     \
    ({                                                                                             \
        int __r = -1;                                                                              \
        int r   = resolve_btf_func_arg_idx(btf, #func, #arg);                                      \
        if (r >= 0)                                                                                \
            __r = 0;                                                                               \
        else                                                                                       \
            verbose("fill func arg idx (%s, %s): %d\n", #func, #arg, r);                           \
        obj->rodata->arg__##func##__##arg##__ = r;                                                 \
        __r;                                                                                       \
    })

/* Given a function name, returns the "ret" argument index. */
#define FILL_FUNC_RET_IDX(obj, btf, func)                                                          \
    ({                                                                                             \
        int __r = -1;                                                                              \
        int r   = resolve_btf_func_ret_idx(btf, #func);                                            \
        if (r >= 0)                                                                                \
            __r = 0;                                                                               \
        else                                                                                       \
            verbose("fill func ret idx (%s): %d\n", #func, r);                                     \
        obj->rodata->ret__##func##__ = r;                                                          \
        __r;                                                                                       \
    })

/* Given a function name and an argument name, returns whether the argument
 * exists or not.
 */
#define FILL_FUNC_ARG_EXISTS(obj, btf, func, arg)                                                  \
    ({                                                                                             \
        int __r = -1;                                                                              \
        int r   = resolve_btf_func_arg_idx(btf, #func, #arg);                                      \
        if (r >= 0) {                                                                              \
            obj->rodata->exists__##func##__##arg##__ = true;                                       \
            __r = 0;                                                                               \
        } else {                                                                                   \
            verbose("fill func arg exists (%s, %s): %d\n", #func, #arg, r);                        \
        }                                                                                          \
        __r;                                                                                       \
    })

/* Given a struct name and a field name, returns the field offset
 * within the struct.
 */
#define FILL_FIELD_OFFSET(obj, btf, struct, field)                                                 \
    ({                                                                                             \
        int __r = -1;                                                                              \
        int r   = resolve_btf_field_off(btf, #struct, #field);                                     \
        if (r >= 0)                                                                                \
            __r = 0;                                                                               \
        else                                                                                       \
            verbose("fill field offset (%s, %s): %d\n", #struct, #field, r);                       \
        obj->rodata->off__##struct##__##field##__ = r;                                             \
        __r;                                                                                       \
    })

/* Given a function name, returns whether it exists in the provided BTF. */
#define BTF_FUNC_EXISTS(btf, func) ({ (bool)resolve_btf_type_by_func(btf, #func); })

/* Given a struct name and a field name, returns whether the field exists
 * within the struct. */
#define BTF_FIELD_EXISTS(btf, struct, field)                                                       \
    ({                                                                                             \
        bool __r = false;                                                                          \
        if (resolve_btf_field_off(btf, #struct, #field) >= 0)                                      \
            __r = true;                                                                            \
        __r;                                                                                       \
    })
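/*
 * For reference, FILL_FUNC_ARG_IDX(obj, btf, vfs_unlink, dentry) resolves the
 * index of the `dentry` parameter of vfs_unlink() and stores it in the
 * skeleton rodata variable `arg__vfs_unlink__dentry__` (a negative value if
 * the argument is not found in BTF), which the BPF side reads at load time.
 */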
/* Fill context relocations for kernel functions.
 * You can add additional functions here by using the macros defined above.
 *
 * Rodata constants must be declared in `EventProbe.bpf.c` via the relative
 * helper macros.
 */
static int probe_fill_relos(struct btf *btf, struct EventProbe_bpf *obj)
{
    int err = 0;

    err = err ?: FILL_FUNC_RET_IDX(obj, btf, inet_csk_accept);

    err = err ?: FILL_FUNC_ARG_IDX(obj, btf, vfs_unlink, dentry);
    err = err ?: FILL_FUNC_RET_IDX(obj, btf, vfs_unlink);

    if (FILL_FUNC_ARG_EXISTS(obj, btf, vfs_rename, rd)) {
        /* We are on a 5.12- kernel */
        err = err ?: FILL_FUNC_ARG_IDX(obj, btf, vfs_rename, old_dentry);
        err = err ?: FILL_FUNC_ARG_IDX(obj, btf, vfs_rename, new_dentry);
    }
    err = err ?: FILL_FUNC_RET_IDX(obj, btf, vfs_rename);

    if (BTF_FIELD_EXISTS(btf, iov_iter, __iov))
        err = err ?: FILL_FIELD_OFFSET(obj, btf, iov_iter, __iov);

    err = err ?: FILL_FUNC_ARG_IDX(obj, btf, do_truncate, filp);
    err = err ?: FILL_FUNC_RET_IDX(obj, btf, do_truncate);

    if (BTF_FIELD_EXISTS(btf, inode, __i_atime))
        err = err ?: FILL_FIELD_OFFSET(obj, btf, inode, __i_atime);
    if (BTF_FIELD_EXISTS(btf, inode, __i_mtime))
        err = err ?: FILL_FIELD_OFFSET(obj, btf, inode, __i_mtime);
    if (BTF_FIELD_EXISTS(btf, inode, __i_ctime))
        err = err ?: FILL_FIELD_OFFSET(obj, btf, inode, __i_ctime);

    return err;
}
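/*
 * Sketch of wiring up an additional relocation (illustrative; `vfs_fsync`
 * and the matching rodata declaration in EventProbe.bpf.c are hypothetical
 * additions, not part of this probe):
 *
 *   err = err ?: FILL_FUNC_ARG_IDX(obj, btf, vfs_fsync, file);
 */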
static int probe_resize_maps(struct EventProbe_bpf *obj)
{
    int ncpu = libbpf_num_possible_cpus();
    if (ncpu < 0) {
        verbose("could not determine number of CPUs: %d\n", ncpu);
        return ncpu;
    }

    int err = 0;
    if ((err = bpf_map__set_max_entries(obj->maps.event_buffer_map, ncpu)) < 0) {
        verbose("could not resize event buffer map: %d\n", err);
        return err;
    }

    return 0;
}

/* Some programs in the skeleton are mutually exclusive, based on local kernel features. */
static int probe_set_autoload(struct btf *btf, struct EventProbe_bpf *obj, uint64_t features)
{
    int err            = 0;
    bool has_bpf_tramp = features & EBPF_FEATURE_BPF_TRAMP;

    // The do_renameat2 kprobe and fentry probes are mutually exclusive.
    // Disable auto-loading of the kprobe if `do_renameat2` exists in BTF and
    // bpf trampolines are supported on the current arch, and vice-versa.
    if (has_bpf_tramp && BTF_FUNC_EXISTS(btf, do_renameat2)) {
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__do_renameat2, false);
    } else {
        err = err ?: bpf_program__set_autoload(obj->progs.fentry__do_renameat2, false);
    }

    // The tcp_v6_connect kprobes and fexit probe are mutually exclusive.
    // Disable auto-loading of the kprobes if `tcp_v6_connect` exists in BTF
    // and bpf trampolines are supported on the current arch, and vice-versa.
    if (has_bpf_tramp && BTF_FUNC_EXISTS(btf, tcp_v6_connect)) {
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__tcp_v6_connect, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__tcp_v6_connect, false);
    } else {
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__tcp_v6_connect, false);
    }

    // tty_write BTF information is not available on all supported kernels due
    // to a pahole bug, see:
    // https://rhysre.net/how-an-obscure-arm64-link-option-broke-our-bpf-probe.html
    //
    // If BTF is not present, we can't attach a fentry/ program to it, so
    // fall back to a kprobe.
    if (has_bpf_tramp && BTF_FUNC_EXISTS(btf, tty_write)) {
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__tty_write, false);
    } else {
        err = err ?: bpf_program__set_autoload(obj->progs.fentry__tty_write, false);
    }

    // vfs_writev BTF information is not available on all supported kernels.
    // If BTF is not present, we can't attach a fentry/ program to it, so
    // fall back to a kprobe.
    if (has_bpf_tramp && BTF_FUNC_EXISTS(btf, vfs_writev)) {
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__vfs_writev, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__vfs_writev, false);
    } else {
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__vfs_writev, false);
    }

    // bpf trampolines are only implemented for x86. Disable auto-loading of
    // all fentry/fexit progs if EBPF_FEATURE_BPF_TRAMP is not in `features`
    // and enable the k[ret]probe counterparts, and vice-versa.
    if (has_bpf_tramp) {
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__do_unlinkat, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__mnt_want_write, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__vfs_unlink, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__vfs_unlink, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__do_filp_open, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__vfs_rename, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__vfs_rename, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__taskstats_exit, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__commit_creds, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__inet_csk_accept, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__tcp_v4_connect, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__tcp_v4_connect, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__tcp_close, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__chmod_common, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__chmod_common, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__do_truncate, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__do_truncate, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__vfs_write, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__vfs_write, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kprobe__chown_common, false);
        err = err ?: bpf_program__set_autoload(obj->progs.kretprobe__chown_common, false);
    } else {
        err = err ?: bpf_program__set_autoload(obj->progs.fentry__do_unlinkat, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fentry__mnt_want_write, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fentry__vfs_unlink, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__vfs_unlink, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__do_filp_open, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fentry__vfs_rename, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__vfs_rename, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fentry__taskstats_exit, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fentry__commit_creds, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__inet_csk_accept, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__tcp_v4_connect, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fentry__tcp_close, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__chmod_common, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__do_truncate, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__vfs_write, false);
        err = err ?: bpf_program__set_autoload(obj->progs.fexit__chown_common, false);
    }

    return err;
}
static int probe_attach_cgroup(struct EventProbe_bpf *obj)
{
    int cgroup_fd;

    /*
     * Hardcoded for now, consider making the path an argument.
     */
    cgroup_fd = open("/sys/fs/cgroup", O_RDONLY);
    if (cgroup_fd == -1)
        return -1;

#define ATTACH_OR_FAIL(_program)                                                                   \
    if (bpf_program__attach_cgroup(obj->progs._program, cgroup_fd) == NULL) {                      \
        close(cgroup_fd);                                                                          \
        return -1;                                                                                 \
    }

    ATTACH_OR_FAIL(skb_egress);
    ATTACH_OR_FAIL(skb_ingress);
    ATTACH_OR_FAIL(sock_create);
    ATTACH_OR_FAIL(sock_release);
    ATTACH_OR_FAIL(sendmsg4);
    ATTACH_OR_FAIL(connect4);
    ATTACH_OR_FAIL(recvmsg4);
#undef ATTACH_OR_FAIL

    close(cgroup_fd);
    return 0;
}

static bool system_has_bpf_tramp(void)
{
    /*
     * This is somewhat fragile, but as far as I can see it is the most robust
     * possible way to detect BPF trampoline support on any given kernel
     * (i.e. whether we can load "fentry/" and "fexit/" programs). BPF
     * trampoline support was introduced on x86 with kernel commit
     * fec56f5890d93fc2ed74166c397dc186b1c25951 in 5.5.
     *
     * To detect it, you not only need to load a BPF trampoline program, you
     * also need to _attach_ to that program. Loading will succeed even if
     * BPF trampoline support is absent; only attaching will fail.
     *
     * To load + attach, we need to pass in attach_btf_id a BTF id
     * corresponding to the BTF type (of kind BTF_KIND_FUNC) of a valid
     * function in the kernel that this program is supposed to be attached
     * to; loading will otherwise fail. The most robust thing to do here
     * would be to iterate over the list of all BTF types and just pick the
     * first one where kind == BTF_KIND_FUNC (i.e. pick an arbitrary function
     * that we know exists on the currently running kernel). Unfortunately
     * this isn't possible, as some functions are marked with the __init
     * attribute in the kernel, thus they cease to exist after bootup and
     * can't be attached to.
     *
     * Instead we just use the taskstats_exit function. It's been in the
     * kernel since 2006 and we already attach to it with a BPF probe, so if
     * it's removed, more visible parts of the code should break as well,
     * indicating this needs to be updated.
     */
    int prog_fd, attach_fd, btf_id;
    bool ret = true;

    struct btf *btf = btf__load_vmlinux_btf();
    if (libbpf_get_error(btf)) {
        verbose("could not load system BTF (does the kernel have BTF?)\n");
        ret = false;
        goto out;
    }

    /*
     * r0 = 0
     * exit
     *
     * This could be done more clearly with the BPF_MOV64_IMM and
     * BPF_EXIT_INSN macros in the kernel sources, but unfortunately they're
     * not exported to userspace.
     */
    struct bpf_insn insns[] = {
        {.code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .src_reg = 0, .off = 0, .imm = 0},
        {.code = BPF_EXIT | BPF_JMP, .dst_reg = 0, .src_reg = 0, .off = 0, .imm = 0}};
    int insns_cnt = 2;
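    /*
     * For reference, with the kernel-internal macros mentioned above (not
     * exported to userspace), the same two instructions would read:
     *
     *   struct bpf_insn insns[] = {
     *       BPF_MOV64_IMM(BPF_REG_0, 0), // r0 = 0
     *       BPF_EXIT_INSN(),             // exit
     *   };
     */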
    btf_id = btf__find_by_name(btf, "taskstats_exit");

    LIBBPF_OPTS(bpf_prog_load_opts, opts, .log_buf = NULL, .log_level = 0,
                .expected_attach_type = BPF_TRACE_FENTRY, .attach_btf_id = btf_id);

    prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL", insns, insns_cnt, &opts);
    if (prog_fd < 0) {
        ret = false;
        goto out_free_btf;
    }

    /*
     * NB: This is a confusingly named API: bpf(BPF_RAW_TRACEPOINT_OPEN, ...)
     * is used to attach an already-loaded BPF trampoline program (in
     * addition to a raw tracepoint).
     *
     * A new, more intuitively named API called BPF_LINK_CREATE was added
     * later (see kernel commit 8462e0b46fe2d4c56d0a7de705228e3bf1da03d9),
     * but the BPF_RAW_TRACEPOINT_OPEN approach should continue to work on
     * all kernels due to the kernel's userspace API guarantees.
     */
    attach_fd = bpf_raw_tracepoint_open(NULL, prog_fd);
    if (attach_fd < 0) {
        ret = false;
        goto out_close_prog_fd;
    }

    /* Successfully attached, we know BPF trampolines work, clean everything up */
    close(attach_fd);

out_close_prog_fd:
    close(prog_fd);
out_free_btf:
    btf__free(btf);
out:
    return ret;
}

static uint64_t detect_system_features(void)
{
    uint64_t features = 0;

    if (system_has_bpf_tramp())
        features |= EBPF_FEATURE_BPF_TRAMP;

    return features;
}

static bool system_has_btf(void)
{
    struct btf *btf = btf__load_vmlinux_btf();
    if (libbpf_get_error(btf)) {
        verbose("Kernel does not support BTF, bpf events are not supported\n");
        return false;
    }
    btf__free(btf);
    return true;
}

static uint64_t get_kernel_version(void)
{
    int maj = 0, min = 0, patch = 0;

    // Ubuntu kernels do not report the true upstream kernel source version in
    // utsname.release; they report the "ABI version", which is the upstream
    // kernel major.minor with some extra ABI information, e.g.:
    // 5.15.0-48-generic. The upstream patch version is always set to 0.
    //
    // Ubuntu provides a file under procfs that reports the actual upstream
    // source version, so we use that instead if it exists.
    if (access("/proc/version_signature", R_OK) == 0) {
        FILE *f = fopen("/proc/version_signature", "r");
        if (f) {
            // Example: Ubuntu 5.15.0-48.54-generic 5.15.53
            if (fscanf(f, "%*s %*s %d.%d.%d\n", &maj, &min, &patch) == 3) {
                fclose(f);
                return KERNEL_VERSION(maj, min, patch);
            }
            fclose(f);
        }
        verbose("Ubuntu version file exists but could not be parsed, using uname\n");
    }

    struct utsname un;
    if (uname(&un) == -1) {
        verbose("uname failed: %d: %s\n", errno, strerror(errno));
        return 0;
    }

    char *debian_start = strstr(un.version, "Debian");
    if (debian_start != NULL) {
        // We're running on Debian.
        //
        // Like Ubuntu, what Debian reports in the un.release buffer is the
        // "ABI version", which is the major.minor of the upstream, with the
        // patch always set to 0 (and some further ABI numbers), e.g.:
        // 5.10.0-18-amd64
        //
        // See the following docs for more info:
        // https://kernel-team.pages.debian.net/kernel-handbook/ch-versions.html
        //
        // Unlike Ubuntu, Debian does not provide a special procfs file
        // indicating the actual upstream source. Instead, it puts the actual
        // upstream source version into the un.version field, after the string
        // "Debian":
        //
        // $ uname -a
        // Linux bullseye 5.10.0-18-amd64 #1 SMP Debian 5.10.140-1 (2022-09-02) x86_64 GNU/Linux
        //
        // $ uname -v
        // #1 SMP Debian 5.10.140-1 (2022-09-02)
        //
        // Due to this, we pull the upstream kernel source version out of
        // un.version here.
        if (sscanf(debian_start, "Debian %d.%d.%d", &maj, &min, &patch) != 3) {
            verbose("could not parse uname version string: %s\n", un.version);
            return 0;
        }
        return KERNEL_VERSION(maj, min, patch);
    }

    // We're not on Ubuntu or Debian, so un.release should tell us the actual
    // upstream source version.
    if (sscanf(un.release, "%d.%d.%d", &maj, &min, &patch) != 3) {
        verbose("could not parse uname release string: %s\n", un.release);
        return 0;
    }
    return KERNEL_VERSION(maj, min, patch);
}
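/*
 * Worked example of the version encoding: an upstream 5.10.16 kernel yields
 * KERNEL_VERSION(5, 10, 16) == (5 << 16) | (10 << 8) | 16 == 0x050a10, which
 * compares correctly against the minimum version enforced below.
 */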
static bool kernel_version_is_supported(void)
{
    // We only support Linux 5.10.16+.
    //
    // Linux commit e114dd64c0071500345439fc79dd5e0f9d106ed (landed in 5.11
    // and was backported to 5.10.16) fixed a verifier bug that (as of
    // 9/28/2022) causes our probes to fail to load.
    //
    // Theoretically, we could push support back to 5.8 without any
    // foundational changes (the BPF ringbuffer was added in 5.8; we'd need
    // to use per-cpu perf buffers prior to that), but, for the time being,
    // it's been decided that this is more hassle than it's worth.
    uint64_t kernel_version = get_kernel_version();
    if (kernel_version < KERNEL_VERSION(5, 10, 16)) {
        verbose("kernel version is < 5.10.16 (version code: %" PRIx64
                "), bpf events are not supported\n",
                kernel_version);
        return false;
    }
    return true;
}

static int libbpf_verbose_print(enum libbpf_print_level lvl, const char *fmt, va_list args)
{
    return vfprintf(stderr, fmt, args);
}

static int verbose(const char *fmt, ...)
{
    if (!log_verbose)
        return 0;

    va_list args;
    va_start(args, fmt);
    int ret = vfprintf(stderr, fmt, args);
    va_end(args);
    return ret;
}

int ebpf_set_verbose_logging()
{
    libbpf_set_print(libbpf_verbose_print);
    log_verbose = true;
    return 0;
}

uint64_t ebpf_event_ctx__get_features(struct ebpf_event_ctx *ctx)
{
    return ctx->features;
}
int ebpf_event_ctx__new(struct ebpf_event_ctx **ctx, ebpf_event_handler_fn cb, uint64_t events)
{
    struct EventProbe_bpf *probe = NULL;
    struct btf *btf              = NULL;

    // Even if both of these checks pass, our probes aren't 100% guaranteed
    // to load (e.g. maybe someone compiled a kernel without kprobes or bpf
    // trampolines). However, checking these two things should cover the vast
    // majority of failure cases, allowing us to print a more understandable
    // message than what you'd get if you just tried to load the probes.
    if (!kernel_version_is_supported() || !system_has_btf()) {
        verbose("this system does not support BPF events (see logs)\n");
        return -ENOTSUP;
    }

    // Ideally we'd be calling
    //
    //     libbpf_set_strict_mode(LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK);
    //
    // to automatically detect if `RLIMIT_MEMLOCK` needs increasing. However,
    // with kernel 5.10.109+ on GKE, it incorrectly detects that bpf uses
    // memcg instead of the memlock rlimit, so it does nothing.
    //
    // The check for memcg loads a program with the `bpf_ktime_get_coarse_ns`
    // helper in order to check for memcg memory accounting, which was added
    // around the same time the memory accounting change took place (5.11).
    // This helper is backported to 5.10.109+, making the detection mechanism
    // faulty, so instead we just blindly set `RLIMIT_MEMLOCK` to infinity
    // for now.
    struct rlimit rlim = {
        .rlim_cur = RLIM_INFINITY,
        .rlim_max = RLIM_INFINITY,
    };
    int err = setrlimit(RLIMIT_MEMLOCK, &rlim);
    if (err != 0)
        goto out_destroy_probe;

    uint64_t features = detect_system_features();

    btf = btf__load_vmlinux_btf();
    if (libbpf_get_error(btf)) {
        verbose("could not load system BTF (does the kernel have BTF?)\n");
        err = -ENOENT;
        goto out_destroy_probe;
    }

    probe = EventProbe_bpf__open();
    if (probe == NULL) {
        /* EventProbe_bpf__open doesn't report errors, hard to find something
         * that fits perfectly here */
        err = -ENOENT;
        verbose("EventProbe_bpf__open: %d\n", err);
        goto out_destroy_probe;
    }

    probe->rodata->consumer_pid = getpid();

    err = probe_fill_relos(btf, probe);
    if (err != 0) {
        verbose("probe_fill_relos: %d\n", err);
        goto out_destroy_probe;
    }

    err = probe_resize_maps(probe);
    if (err != 0) {
        verbose("probe_resize_maps: %d\n", err);
        goto out_destroy_probe;
    }

    err = probe_set_autoload(btf, probe, features);
    if (err != 0) {
        verbose("probe_set_autoload: %d\n", err);
        goto out_destroy_probe;
    }

    err = EventProbe_bpf__load(probe);
    if (err != 0) {
        verbose("EventProbe_bpf__load: %d\n", err);
        goto out_destroy_probe;
    }

    err = EventProbe_bpf__attach(probe);
    if (err != 0) {
        verbose("EventProbe_bpf__attach: %d\n", err);
        goto out_destroy_probe;
    }

    err = probe_attach_cgroup(probe);
    if (err != 0) {
        verbose("probe_attach_cgroup: %d\n", err);
        goto out_destroy_probe;
    }

    if (!ctx)
        goto out_destroy_probe;

    *ctx = calloc(1, sizeof(struct ebpf_event_ctx));
    if (*ctx == NULL) {
        err = -ENOMEM;
        goto out_destroy_probe;
    }
    (*ctx)->probe    = probe;
    (*ctx)->features = features;
    probe            = NULL;

    struct ring_buffer_opts rb_opts;
    rb_opts.sz = sizeof(rb_opts);

    (*ctx)->cb_ctx = calloc(1, sizeof(struct ring_buf_cb_ctx));
    if ((*ctx)->cb_ctx == NULL) {
        err = -ENOMEM;
        goto out_destroy_probe;
    }

    (*ctx)->cb_ctx->cb          = cb;
    (*ctx)->cb_ctx->events_mask = events;

    (*ctx)->ringbuf = ring_buffer__new(bpf_map__fd((*ctx)->probe->maps.ringbuf), ring_buf_cb,
                                       (*ctx)->cb_ctx, &rb_opts);
    if ((*ctx)->ringbuf == NULL) {
        /* ring_buffer__new doesn't report errors, hard to find something
         * that fits perfectly here */
        err = -ENOENT;
        goto out_destroy_probe;
    }

    return ring_buffer__epoll_fd((*ctx)->ringbuf);

out_destroy_probe:
    btf__free(btf);
    if (probe)
        EventProbe_bpf__destroy(probe);
    ebpf_event_ctx__destroy(ctx);
    return err;
}

int ebpf_event_ctx__next(struct ebpf_event_ctx *ctx, int timeout)
{
    if (!ctx)
        return -1;
    int consumed = ring_buffer__poll(ctx->ringbuf, timeout);
    return consumed > 0 ? 0 : consumed;
}
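/*
 * Sketch of a typical consumer (illustrative only; EBPF_EVENT_MASK and the
 * error handling are placeholders, see EbpfEvents.h for the real event-type
 * constants):
 *
 *   static int handle_event(struct ebpf_event_header *evt) { ...; return 0; }
 *
 *   struct ebpf_event_ctx *ctx = NULL;
 *   int epoll_fd = ebpf_event_ctx__new(&ctx, handle_event, EBPF_EVENT_MASK);
 *   if (epoll_fd >= 0) {
 *       while (running)
 *           ebpf_event_ctx__poll(ctx, 100); // timeout in ms
 *       ebpf_event_ctx__destroy(&ctx);
 *   }
 */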
int ebpf_event_ctx__poll(struct ebpf_event_ctx *ctx, int timeout)
{
    if (!ctx)
        return -1;
    return ring_buffer__poll(ctx->ringbuf, timeout);
}

int ebpf_event_ctx__read_stats(struct ebpf_event_ctx *ctx, struct ebpf_event_stats *ees)
{
    struct ebpf_event_stats pcpu_ees[libbpf_num_possible_cpus()];
    uint32_t zero = 0;
    int i;

    if (!ctx || !ees)
        return -1;

    if (bpf_map__lookup_elem(ctx->probe->maps.ringbuf_stats, &zero, sizeof(zero), pcpu_ees,
                             sizeof(pcpu_ees), 0) != 0) {
        return -1;
    }

    memset(ees, 0, sizeof(*ees));

    for (i = 0; i < libbpf_num_possible_cpus(); i++) {
        ees->lost += pcpu_ees[i].lost;
        ees->sent += pcpu_ees[i].sent;
        ees->dns_zero_body += pcpu_ees[i].dns_zero_body;
    }

    return 0;
}

int ebpf_event_ctx__consume(struct ebpf_event_ctx *ctx)
{
    if (!ctx)
        return -1;
    return ring_buffer__consume(ctx->ringbuf);
}

void ebpf_event_ctx__destroy(struct ebpf_event_ctx **ctx)
{
    if (!ctx)
        return;

    if (*ctx) {
        if ((*ctx)->ringbuf) {
            ring_buffer__free((*ctx)->ringbuf);
        }
        if ((*ctx)->probe) {
            EventProbe_bpf__destroy((*ctx)->probe);
        }
        if ((*ctx)->cb_ctx) {
            free((*ctx)->cb_ctx);
            (*ctx)->cb_ctx = NULL;
        }
        free(*ctx);
        *ctx = NULL;
    }
}

struct bpf_map *ebpf_event_get_trustlist_map(struct ebpf_event_ctx *ctx)
{
    if (NULL == ctx) {
        verbose("ebpf ctx is NULL\n");
        return NULL;
    }
    struct EventProbe_bpf *probe = ctx->probe;
    if (NULL == probe) {
        verbose("ebpf events probe is NULL\n");
        return NULL;
    }
    struct bpf_map *map = probe->maps.elastic_ebpf_events_trusted_pids;
    if (NULL == map) {
        verbose("ebpf trusted pids map is NULL\n");
        return NULL;
    }
    return map;
}

static int ebpf_clear_process_trustlist(int map_fd)
{
    uint8_t key_buf[64]      = {0};
    uint8_t next_key_buf[64] = {0};

    // Get the first key.
    if (bpf_map_get_next_key(map_fd, NULL, key_buf) < 0) {
        if (errno == ENOENT) {
            // Map is already empty.
            return 0;
        }
        // Failure (perhaps not supported).
        verbose("Error getting next key while clearing trusted pids map, errno=%d\n", errno);
        return -1;
    }

    // Iterate over the map. A return value of 0 means 'key' exists and
    // 'next_key' has been set.
    while (0 == bpf_map_get_next_key(map_fd, key_buf, next_key_buf)) {
        (void)bpf_map_delete_elem(map_fd, key_buf);
        memcpy(key_buf, next_key_buf, sizeof(key_buf));
    }

    // -1 was returned, so 'key' is the last element; delete it.
    (void)bpf_map_delete_elem(map_fd, key_buf);

    return 0;
}

int ebpf_set_process_trustlist(struct bpf_map *map, uint32_t *pids, int count)
{
    int rv = 0;

    if (!map || libbpf_get_error(map)) {
        verbose("Error: invalid trustlist map, errno=%d\n", errno);
        return -1;
    }

    int map_fd = bpf_map__fd(map);
    if (map_fd < 0) {
        verbose("Error: invalid trustlist map fd, errno=%d\n", errno);
        return -1;
    }

    // First clear the entire map.
    rv = ebpf_clear_process_trustlist(map_fd);
    if (rv) {
        verbose("Error: failed to clear trusted pids map, errno=%d\n", errno);
        return rv;
    }

    // Add entries to the trustlist.
    for (int i = 0; i < count; i++) {
        uint32_t val = 1;
        uint32_t pid = pids[i];
        rv           = bpf_map_update_elem(map_fd, &pid, &val, BPF_ANY);
        if (rv) {
            verbose("Error: failed to add entry to trusted pids map, errno=%d\n", errno);
            return rv;
        }
    }
    return rv;
}
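/*
 * Sketch of trustlist usage (illustrative; the PID values are examples):
 *
 *   uint32_t pids[] = {1234, 5678};
 *   struct bpf_map *map = ebpf_event_get_trustlist_map(ctx);
 *   if (map != NULL)
 *       (void)ebpf_set_process_trustlist(map, pids, 2);
 */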