metric/system/cgroup/util.go (362 lines of code) (raw):
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package cgroup
import (
"bufio"
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/elastic/elastic-agent-libs/logp"
"github.com/elastic/elastic-agent-system-metrics/metric/system/resolve"
)
// cgroupContainerCache is a performance helper used for
// cases where we're in a container and we need to fetch our cgroup
// path from the host system. We want to cache these results, since traversing
// /hostfs/sys/fs/cgroup is a bit intensive.
// This value is also unlikely to change more than once.
// See guessContainerCgroupPath() below for more context.
type cgroupContainerCache struct {
	// mut guards cgPath.
	mut sync.Mutex
	// cgPath is the cached cgroup path; the empty string means "nothing cached".
	cgPath string
}

// get returns the cached cgroup path, or "" when nothing has been cached yet.
func (cgc *cgroupContainerCache) get() string {
	cgc.mut.Lock()
	defer cgc.mut.Unlock()
	return cgc.cgPath
}

// set replaces the cached cgroup path with update.
func (cgc *cgroupContainerCache) set(update string) {
	cgc.mut.Lock()
	defer cgc.mut.Unlock()
	cgc.cgPath = update
}

// cgroupContainerPath is the package-wide cache consulted by
// guessContainerCgroupPath(). The zero value of cgroupContainerCache is an
// empty, ready-to-use cache, so it is initialised at declaration instead of in
// an init() function; that way it is already valid while other package-level
// variables are being initialised.
var cgroupContainerPath = &cgroupContainerCache{}
var (
// ErrCgroupsMissing indicates that /proc/cgroups was not found. This means
// that cgroups were disabled at compile time (CONFIG_CGROUPS=n) or that
// an invalid rootfs path was given.
// SupportedSubsystems() returns it when opening /proc/cgroups fails with a
// "does not exist" error.
ErrCgroupsMissing = errors.New("cgroups not found or unsupported by OS")
)
// mountinfo represents the subset of the fields contained in a
// /proc/[pid]/mountinfo line that parseMountinfoLine() extracts.
type mountinfo struct {
// mountpoint is the fifth whitespace-separated field of the line.
mountpoint string
// filesystemType is the first field after the "-" separator.
filesystemType string
// superOptions is the third field after the "-" separator, split on ",".
superOptions []string
}
// Mountpoints organizes info about V1 and V2 cgroup mountpoints.
// V2 uses a "unified" hierarchy, so we have less to keep track of.
type Mountpoints struct {
// V1Mounts maps a cgroup V1 subsystem name to the mountpoint of that subsystem.
V1Mounts map[string]string
// V2Loc is the cgroup V2 mountpoint chosen by getProperV2Paths(); it is
// empty when no cgroup2 mount was found.
V2Loc string
// ContainerizedRootMount is the cgroup path of the current process as
// returned by guessContainerCgroupPath(). SubsystemMountpoints() only fills
// it in when a V2 mountpoint exists, an alternate hostfs is set, and we are
// in a private cgroup namespace.
ContainerizedRootMount string
}
// ControllerPath wraps the controller path of a single cgroup controller
// (V2) or subsystem (V1) that a process belongs to.
type ControllerPath struct {
// ControllerPath is the cgroup path as used by ProcessCgroupPaths(): the
// path field of the /proc/[pid]/cgroup line, possibly replaced by the
// hierarchy override or prefixed with the containerized root mount.
ControllerPath string
// FullPath is the cgroup mountpoint joined with ControllerPath.
FullPath string
// IsV2 is true for cgroup V2 entries and false for V1 entries.
IsV2 bool
}
// PathList contains the V1 and V2 controller paths of a process.
// The V1 and V2 cgroups are kept separate so we don't have hybrid cgroups
// fighting for one namespace.
type PathList struct {
// V1 is keyed by the subsystem name listed in /proc/[pid]/cgroup.
V1 map[string]ControllerPath
// V2 is keyed by the controller name derived from the *.stat files found
// in the cgroup directory (see ProcessCgroupPaths).
V2 map[string]ControllerPath
}
// cgroupNSStateFetch is an indirection over isCgroupNSPrivate() that allows
// tests to bypass the real namespace detection by swapping the function out.
var cgroupNSStateFetch = isCgroupNSPrivate
// Flatten returns every controller path of the list in a single slice, for
// callers that do not need the map keys. All V1 entries come first, followed
// by all V2 entries; the order inside each group follows Go's map iteration
// order and is therefore unspecified.
func (pl PathList) Flatten() []ControllerPath {
	flat := make([]ControllerPath, 0, len(pl.V1)+len(pl.V2))
	for _, group := range []map[string]ControllerPath{pl.V1, pl.V2} {
		for _, cp := range group {
			flat = append(flat, cp)
		}
	}
	return flat
}
// parseMountinfoLine extracts the mountpoint, filesystem type and super
// options from one line of /proc/[pid]/mountinfo on Linux. The line format is
// described in section 3.5 of
// https://www.kernel.org/doc/Documentation/filesystems/proc.txt.
//
// An error is returned when the line has fewer than 10 whitespace-separated
// fields, when it has no "-" separator field, or when fewer than 3 fields
// follow the separator.
func parseMountinfoLine(line string) (mountinfo, error) {
	var parsed mountinfo

	tokens := strings.Fields(line)
	if len(tokens) < 10 {
		return parsed, fmt.Errorf("invalid mountinfo line, expected at least "+
			"10 fields but got %d from line='%s'", len(tokens), line)
	}
	parsed.mountpoint = tokens[4]

	// The number of optional fields varies, so the filesystem details are
	// located relative to the first "-" token rather than by fixed position.
	sepAt := -1
	for idx := range tokens {
		if tokens[idx] == "-" {
			sepAt = idx
			break
		}
	}
	if sepAt < 0 {
		return parsed, fmt.Errorf("invalid mountinfo line, separator ('-') not "+
			"found in line='%s'", line)
	}

	trailing := tokens[sepAt+1:]
	if len(trailing) < 3 {
		return parsed, fmt.Errorf("invalid mountinfo line, expected at least "+
			"3 fields after separator but got %d from line='%s'",
			len(trailing), line)
	}

	parsed.filesystemType = trailing[0]
	parsed.superOptions = strings.Split(trailing[2], ",")
	return parsed, nil
}
// SupportedSubsystems returns the subsystems that are supported by the
// kernel, as listed in /proc/cgroups (resolved through rootfs). The returned
// map contains an entry for each subsystem that is not explicitly disabled.
//
// ErrCgroupsMissing is returned when /proc/cgroups does not exist; any other
// error from opening or scanning the file is returned as-is.
func SupportedSubsystems(rootfs resolve.Resolver) (map[string]struct{}, error) {
	procCgroups, err := os.Open(rootfs.ResolveHostFS("/proc/cgroups"))
	if os.IsNotExist(err) {
		return nil, ErrCgroupsMissing
	}
	if err != nil {
		return nil, err
	}
	defer procCgroups.Close()

	supported := make(map[string]struct{})
	scanner := bufio.NewScanner(procCgroups)
	for scanner.Scan() {
		row := scanner.Text()

		// Skip the header line.
		if strings.HasPrefix(row, "#") {
			continue
		}

		// Format:  subsys_name    hierarchy      num_cgroups    enabled
		// Example: cpuset         4              1              1
		cols := strings.Fields(row)
		if len(cols) == 0 {
			continue
		}

		// Subsystems whose enabled flag is "0" were turned off via the
		// cgroup_disable kernel command-line boot parameter; leave them out.
		if len(cols) > 3 && cols[3] == "0" {
			continue
		}

		supported[cols[0]] = struct{}{}
	}

	return supported, scanner.Err()
}
// SubsystemMountpoints returns the cgroup mountpoints found in
// /proc/self/mountinfo (resolved through rootfs).
//
// For cgroup V1, Mountpoints.V1Mounts maps each name in subsystems to the first
// mountpoint whose super options mention it. For cgroup V2, every cgroup2
// mountpoint is collected and getProperV2Paths() picks the one stored in
// Mountpoints.V2Loc. Mounts whose mountpoint does not start with the resolved
// hostfs root are ignored.
//
// An error is returned when the file cannot be opened, when a line cannot be
// parsed, or when scanning fails. In the last case the Mountpoints gathered so
// far are returned together with the scanner error.
func SubsystemMountpoints(rootfs resolve.Resolver, subsystems map[string]struct{}) (Mountpoints, error) {
	// TODO: will we run into mount namespace issues if we use /proc/self/mountinfo?
	mountinfoFile, err := os.Open(rootfs.ResolveHostFS("/proc/self/mountinfo"))
	if err != nil {
		return Mountpoints{}, err
	}
	defer mountinfoFile.Close()

	// The hostfs root does not change while we scan, so resolve it only once.
	hostRoot := rootfs.ResolveHostFS("")

	v1Mounts := map[string]string{}
	var possibleV2Paths []string

	sc := bufio.NewScanner(mountinfoFile)
	for sc.Scan() {
		// https://www.kernel.org/doc/Documentation/filesystems/proc.txt
		// Example:
		// 25 21 0:20 / /cgroup/cpu rw,relatime - cgroup cgroup rw,cpu
		line := strings.TrimSpace(sc.Text())
		if line == "" {
			continue
		}

		mount, err := parseMountinfoLine(line)
		if err != nil {
			return Mountpoints{}, err
		}

		// if the mountpoint from the subsystem has a different root than ours, it probably belongs to something else.
		if !strings.HasPrefix(mount.mountpoint, hostRoot) {
			continue
		}

		switch mount.filesystemType {
		case "cgroup": // cgroup V1
			for _, opt := range mount.superOptions {
				// Sometimes the subsystem name is written like "name=blkio".
				fields := strings.SplitN(opt, "=", 2)
				if len(fields) > 1 {
					opt = fields[1]
				}

				// Only options that name a requested subsystem are of interest.
				if _, found := subsystems[opt]; !found {
					continue
				}
				// Keep the first mountpoint seen for each subsystem.
				if _, exists := v1Mounts[opt]; !exists {
					v1Mounts[opt] = mount.mountpoint
				}
			}
		case "cgroup2": // cgroup V2
			possibleV2Paths = append(possibleV2Paths, mount.mountpoint)
		}
	}

	mountInfo := Mountpoints{
		V1Mounts: v1Mounts,
		V2Loc:    getProperV2Paths(rootfs, possibleV2Paths),
	}

	// we only care about a containerized root path if we're trying to monitor a host system
	// from inside a container.
	// This logic helps us properly fetch the cgroup path when we're running inside a container
	// with a private namespace
	if mountInfo.V2Loc != "" && rootfs.IsSet() && cgroupNSStateFetch() {
		mountInfo.ContainerizedRootMount, err = guessContainerCgroupPath(mountInfo.V2Loc, os.Getpid())
		// treat this as a non-fatal error. If we end up needing this value, the lookups will fail down the line
		if err != nil {
			// %v, not %w: the wrapping verb is only understood by fmt.Errorf.
			logp.L().Debugf("could not fetch cgroup path inside container: %v", err)
		}
	}

	return mountInfo, sc.Err()
}
// isCgroupNSPrivate returns true if we're running inside a container with a
// private cgroup namespace. It returns false if we're in a host (public)
// namespace, or if /proc/self/cgroup cannot be read.
// Note that this function only makes sense *inside* a container. Outside it will probably always return false.
func isCgroupNSPrivate() bool {
	// we don't care about hostfs here, since we're just concerned about
	// detecting the environment we're running under.
	raw, err := os.ReadFile("/proc/self/cgroup")
	if err != nil {
		// %v, not %w: the wrapping verb is only understood by fmt.Errorf.
		logp.L().Debugf("error reading /proc/self/cgroup to detect docker namespace settings: %v", err)
		return false
	}

	// if we have a path of just "/" that means we're in our own private namespace
	// if it's something else, we're probably in a host namespace.
	// Only the text after the last ":" of the whole file is inspected.
	segments := strings.Split(strings.TrimSpace(string(raw)), ":")
	return segments[len(segments)-1] == "/"
}
// guessContainerCgroupPath tries to find the cgroup path for the
// currently-running container, assuming we are running in a container.
// see https://docs.docker.com/config/containers/runmetrics/#find-the-cgroup-for-a-given-container
// We need to know the root cgroup we're running under, as
// for monitoring a v2 system with a private namespace, we'll get relative paths
// for the cgroup of a pid, see https://github.com/elastic/elastic-agent-system-metrics/issues/139
// This will only work on v2 cgroups, I haven't run into this on a system with cgroups v1 yet;
// not sure if docker namespacing behaves the same.
//
// v2Loc is the cgroup V2 mountpoint to search and OurPid is the PID to look
// for. The returned path is relative to v2Loc; it is "" (with a nil error)
// when no *procs* file under v2Loc lists OurPid. A successful lookup is stored
// in cgroupContainerPath and re-validated on the next call before it is reused.
func guessContainerCgroupPath(v2Loc string, OurPid int) (string, error) {
	// check the cache first
	if cachePath := cgroupContainerPath.get(); cachePath != "" {
		// check the validity of the cache;
		// if we get a read error, assume the cache is invalid and move on
		rawFile, err := os.ReadFile(filepath.Join(v2Loc, cachePath, "cgroup.procs"))
		if err == nil && foundMatchingPidInProcsFile(OurPid, string(rawFile)) {
			return cachePath, nil
		}
	}

	// pattern:
	// if in a private cgroup namespace,
	// traverse over the root cgroup path, look for *.procs files
	// go through all of the *.procs files until we have one that contains our pid
	// that path is our cgroup

	// errFound is returned from the walk callback to stop the traversal as soon
	// as our PID has been located, instead of visiting the rest of the tree.
	errFound := errors.New("matching cgroup found")

	foundCgroupPath := ""
	err := filepath.WalkDir(v2Loc, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if d.IsDir() || !strings.Contains(d.Name(), "procs") {
			return nil
		}

		pidfile, err := os.ReadFile(path)
		if err != nil {
			return nil //nolint: nilerr // we can get lots of weird permissions errors here, so don't fail on an error
		}
		if foundMatchingPidInProcsFile(OurPid, string(pidfile)) {
			foundCgroupPath = path
			return errFound
		}
		return nil
	})
	if err != nil && !errors.Is(err, errFound) {
		return "", fmt.Errorf("error traversing paths to find cgroup: %w", err)
	}

	if foundCgroupPath == "" {
		return "", nil
	}

	// strip to cgroup path
	cgroupDir := filepath.Dir(foundCgroupPath)
	relativePath := strings.TrimPrefix(cgroupDir, v2Loc)
	cgroupContainerPath.set(relativePath)
	return relativePath, nil
}
// foundMatchingPidInProcsFile is a helper for guessContainerCgroupPath
// that tells us if we have a matching process in a cgroup.procs file.
//
// fileData is the raw file content, expected to hold one decimal PID per line.
// Blank lines, whitespace-only lines and lines that do not parse as an integer
// are skipped, so a single malformed entry cannot hide a matching PID that
// appears later in the file.
func foundMatchingPidInProcsFile(ourPid int, fileData string) bool {
	want := int64(ourPid)
	for _, rawPid := range strings.Split(fileData, "\n") {
		entry := strings.TrimSpace(rawPid)
		if entry == "" {
			continue
		}
		pid, err := strconv.ParseInt(entry, 10, 64)
		if err != nil {
			// Not a PID; keep looking at the remaining lines.
			continue
		}
		if pid == want {
			return true
		}
	}
	return false
}
// getProperV2Paths picks the cgroup V2 mountpoint to use out of all cgroup2
// mountpoints found in mountinfo.
//
// when we're reading from a host mountinfo path from inside a container
// (i.e) `/hostfs/proc/self/mountinfo`, we can get a set of cgroup2 mountpoints like this:
// 1718 1686 0:26 / /hostfs/sys/fs/cgroup rw,nosuid,nodev,noexec,relatime master:4 - cgroup2 cgroup2 rw,seclabel
// 1771 1770 0:26 / /hostfs/var/lib/docker/overlay2/1b570230fa3ec3679e354b0c219757c739f91d774ebc02174106488606549da0/merged/sys/fs/cgroup ro,nosuid,nodev,noexec,relatime - cgroup2 cgroup rw,seclabel
// That latter mountpoint, just a link to the overlayfs, is almost guaranteed to throw a permissions error.
//
// Selection rules, in order:
//   - no candidates: return "".
//   - a single candidate: return it unchanged.
//   - drop every candidate containing "overlay2"; if nothing is left, fall
//     back to the last original candidate.
//   - with an alternate hostfs set, return the last remaining candidate that
//     contains the resolved hostfs root, if there is one.
//   - otherwise return the last remaining candidate.
func getProperV2Paths(rootfs resolve.Resolver, possibleV2Paths []string) string {
	switch len(possibleV2Paths) {
	case 0:
		return ""
	case 1:
		return possibleV2Paths[0]
	}

	// try to sort out anything that looks like a docker fs
	candidates := make([]string, 0, len(possibleV2Paths))
	for _, candidate := range possibleV2Paths {
		if !strings.Contains(candidate, "overlay2") {
			candidates = append(candidates, candidate)
		}
	}

	// if we have no correct paths, give up and use the last one
	// the "last one" idiom preserves behavior before we got more clever with looking for the V2 paths
	if len(candidates) == 0 {
		fallback := possibleV2Paths[len(possibleV2Paths)-1]
		logp.L().Debugf("could not find correct cgroupv2 path, reverting to path that may produce errors: %s", fallback)
		return fallback
	}

	lastCandidate := candidates[len(candidates)-1]

	// if no hostfs is set, just use the last element
	if !rootfs.IsSet() {
		return lastCandidate
	}

	// if we're using an alternate hostfs, assume we want to monitor the host system, from inside a container
	// and use the last path that contains it
	root := rootfs.ResolveHostFS("")
	for i := len(candidates) - 1; i >= 0; i-- {
		if strings.Contains(candidates[i], root) {
			return candidates[i]
		}
	}

	logp.L().Debugf("An alternate hostfs was specified, but could not find any cgroup mountpoints that contain a hostfs. Using: %s", lastCandidate)
	return lastCandidate
}
// ProcessCgroupPaths returns the cgroups to which a process belongs and the
// pathname of the cgroup relative to the mountpoint of the subsystem.
//
// It reads /proc/<pid>/cgroup (resolved through the reader's hostfs) line by
// line. V1 lines produce one PathList.V1 entry per listed subsystem. The V2
// line ("0::/...") produces one PathList.V2 entry per file in the cgroup
// directory whose name contains "stat", keyed by the file name with a ".stat"
// suffix trimmed. V2 results are cached per cgroup directory for 5 minutes.
//
// The error from opening the cgroup file is returned unwrapped so callers can
// detect file-not-found errors. Other errors (cgroup version lookup, reading
// the V2 cgroup directory, scanning the file) are wrapped with context.
func (r *Reader) ProcessCgroupPaths(pid int) (PathList, error) {
	cgroupPath := filepath.Join("proc", strconv.Itoa(pid), "cgroup")
	cgroup, err := os.Open(r.rootfsMountpoint.ResolveHostFS(cgroupPath))
	if err != nil {
		return PathList{}, err // return the bare error so other events can use any file not found errors
	}
	defer cgroup.Close()

	version, err := r.CgroupsVersion(pid)
	if err != nil {
		return PathList{}, fmt.Errorf("error finding cgroup version for pid %d: %w", pid, err)
	}

	// Whether we are in a private cgroup namespace with an alternate hostfs
	// does not depend on the line being parsed, so evaluate it once instead of
	// once per line. The cheap hostfs check goes first, matching
	// SubsystemMountpoints().
	inPrivateNS := r.rootfsMountpoint.IsSet() && cgroupNSStateFetch()

	cPaths := PathList{V1: map[string]ControllerPath{}, V2: map[string]ControllerPath{}}
	sc := bufio.NewScanner(cgroup)
	for sc.Scan() {
		// http://man7.org/linux/man-pages/man7/cgroups.7.html
		// Format: hierarchy-ID:subsystem-list:cgroup-path
		// Example:
		// 2:cpu:/docker/b29faf21b7eff959f64b4192c34d5d67a707fe8561e9eaa608cb27693fba4242
		line := sc.Text()

		// Split into at most 3 fields so that a cgroup path which itself
		// contains ':' is kept intact instead of the line being skipped.
		fields := strings.SplitN(line, ":", 3)
		if len(fields) != 3 {
			continue
		}

		path := fields[2]
		if r.cgroupsHierarchyOverride != "" {
			path = r.cgroupsHierarchyOverride
		}

		// on newer docker versions (1.41+?), docker will do namespacing with cgroups
		// such that we'll get a cgroup path like `0::/../../user.slice/user-1000.slice/session-520.scope`
		// `man 7 cgroups` says the following about the path field in the `cgroup` file (emphasis mine):
		//
		// This field contains the pathname of the control group
		// in the hierarchy to which the process belongs. This
		// pathname is **relative to the mount point of the hierarchy**.
		//
		// However, when we try to append something like `/../..` to another path, we obviously blow things up.
		// we need to use the absolute path of the container cgroup
		if inPrivateNS {
			if r.cgroupMountpoints.ContainerizedRootMount == "" {
				logp.L().Debugf("cgroup for process %d contains a relative cgroup path (%s), but we were not able to find a root cgroup. Cgroup monitoring for this PID may be incomplete",
					pid, path)
			} else {
				logp.L().Debugf("using root mount %s and path %s", r.cgroupMountpoints.ContainerizedRootMount, path)
				path = filepath.Join(r.cgroupMountpoints.ContainerizedRootMount, path)
			}
		}

		// cgroup V2
		// cgroup v2 controllers will always start with this string
		if strings.HasPrefix(line, "0::/") {
			// if you're running inside a container
			// that's operating with a hybrid cgroups config,
			// the containerized process won't see the V2 mount
			// inside /proc/self/mountinfo if docker is using cgroups V1
			// For this very annoying edge case, revert to the hostfs flag
			// If it's not set, warn the user that they've hit this.

			// we skip reading paths in case there are cgroups V1 controllers, we are at the cgroup V2 root and the cgroup V2 mount is not available
			// instead of returning an error because we don't want to break V1 metric collection for misconfigured hybrid systems that have only
			// a cgroup V2 root but don't have any other controllers. This case happens when cgroup V2 FS is mounted at a special location but not used
			if version == CgroupsV1 && line == "0::/" && r.cgroupMountpoints.V2Loc == "" {
				continue
			}

			controllerPath := filepath.Join(r.cgroupMountpoints.V2Loc, path)
			if r.cgroupMountpoints.V2Loc == "" && !r.rootfsMountpoint.IsSet() {
				logp.L().Debugf(`PID %d contains a cgroups V2 path (%s) but no V2 mountpoint was found.
This may be because metricbeat is running inside a container on a hybrid system.
To monitor cgroups V2 processess in this way, mount the unified (V2) hierarchy inside
the container as /sys/fs/cgroup/unified and start the system module with the hostfs setting.`, pid, line)
				continue
			} else if r.cgroupMountpoints.V2Loc == "" && r.rootfsMountpoint.IsSet() {
				controllerPath = r.rootfsMountpoint.ResolveHostFS(filepath.Join("/sys/fs/cgroup/unified", path))
			}

			// Check if there is an entry for controllerPath already cached.
			r.v2ControllerPathCache.Lock()
			cacheEntry, ok := r.v2ControllerPathCache.cache[controllerPath]
			if ok {
				// If the cached entry for controllerPath is not older than 5 minutes,
				// return the cached entry.
				if time.Since(cacheEntry.added) < 5*time.Minute {
					cPaths.V2 = cacheEntry.pathList.V2
					r.v2ControllerPathCache.Unlock()
					continue
				}

				// Consider the existing entry for controllerPath invalid, as it is
				// older than 5 minutes.
				delete(r.v2ControllerPathCache.cache, controllerPath)
			}
			r.v2ControllerPathCache.Unlock()

			cgpaths, err := os.ReadDir(controllerPath)
			if err != nil {
				return cPaths, fmt.Errorf("error fetching cgroupV2 controllers for cgroup location '%s' and path line '%s': %w", r.cgroupMountpoints.V2Loc, line, err)
			}
			// In order to produce the same kind of data for cgroups V1 and V2 controllers,
			// We iterate over the group, and look for controllers, since the V2 unified system doesn't list them under the PID
			for _, singlePath := range cgpaths {
				if strings.Contains(singlePath.Name(), "stat") {
					controllerName := strings.TrimSuffix(singlePath.Name(), ".stat")
					cPaths.V2[controllerName] = ControllerPath{ControllerPath: path, FullPath: controllerPath, IsV2: true}
				}
			}

			// NOTE(review): the cached PathList shares its maps with the value
			// returned to the caller.
			r.v2ControllerPathCache.Lock()
			r.v2ControllerPathCache.cache[controllerPath] = pathListWithTime{
				added:    time.Now(),
				pathList: cPaths,
			}
			r.v2ControllerPathCache.Unlock()
			// cgroup v1
		} else {
			subsystems := strings.Split(fields[1], ",")
			for _, subsystem := range subsystems {
				fullPath := filepath.Join(r.cgroupMountpoints.V1Mounts[subsystem], path)
				cPaths.V1[subsystem] = ControllerPath{ControllerPath: path, FullPath: fullPath, IsV2: false}
			}
		}
	}

	// Wrap the scanner's own error; the outer err is always nil at this point.
	if err := sc.Err(); err != nil {
		return cPaths, fmt.Errorf("error scanning cgroup file: %w", err)
	}

	return cPaths, nil
}