metric/system/process/process_linux_common.go (409 lines of code) (raw):

// Licensed to Elasticsearch B.V. under one or more contributor // license agreements. See the NOTICE file distributed with // this work for additional information regarding copyright // ownership. Elasticsearch B.V. licenses this file to you under // the Apache License, Version 2.0 (the "License"); you may // not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. //go:build freebsd || linux package process import ( "bytes" "errors" "fmt" "io" "os" "os/user" "strconv" "strings" "syscall" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent-libs/mapstr" "github.com/elastic/elastic-agent-libs/opt" "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) // Indulging in one non-const global variable for the sake of storing boot time // This value obviously won't change while this code is running. var bootTime uint64 = 0 // system tick multiplier, see C.sysconf(C._SC_CLK_TCK) const ticks = 100 // GetSelfPid returns the process we're running as. // for cases of self-monitoring this requires some actual thought; // if we use os.Getpid() and we're running inside a container, that PID will // only be valid inside the container, and an attempt to fetch metrics from // `/hostfs/proc/` for that pid will fail. If we're using a hostfs, revert to _that_ for fetching the pid metrics. func GetSelfPid(hostfs resolve.Resolver) (int, error) { if !hostfs.IsSet() { return os.Getpid(), nil } statRaw, err := os.ReadFile(hostfs.ResolveHostFS("/proc/self/stat")) if err != nil { return 0, fmt.Errorf("error reading from self/stat while searching for our PID in a container: %w", err) } parts := strings.Split(string(statRaw), " ") pidRaw := parts[0] pid, err := strconv.ParseInt(pidRaw, 10, 64) if err != nil { return 0, fmt.Errorf("error parsing int from `stat` while searching for our pid in a container: %w", err) } return int(pid), nil } // FetchPids is the linux implementation of FetchPids func (procStats *Stats) FetchPids() (ProcsMap, []ProcState, error) { dir, err := os.Open(procStats.Hostfs.ResolveHostFS("proc")) if err != nil { return nil, nil, fmt.Errorf("error reading from procfs %s: %w", procStats.Hostfs.ResolveHostFS("/"), err) } defer dir.Close() const readAllDirnames = -1 // see os.File.Readdirnames doc names, err := dir.Readdirnames(readAllDirnames) if err != nil { return nil, nil, fmt.Errorf("error reading directory names: %w", err) } procMap := make(ProcsMap, len(names)) plist := make([]ProcState, 0, len(names)) var wrappedErr error // Iterate over the directory, fetch just enough info so we can filter based on user input. logger := logp.L() for _, name := range names { if !dirIsPid(name) { continue } // Will this actually fail? pid, err := strconv.Atoi(name) if err != nil { logger.Debugf("Error converting PID name %s", name) continue } procMap, plist, err = procStats.pidIter(pid, procMap, plist) wrappedErr = errors.Join(wrappedErr, err) } return procMap, plist, toNonFatal(wrappedErr) } func FillPidMetrics(hostfs resolve.Resolver, pid int, state ProcState, filter func(string) bool) (ProcState, error) { // Memory Data var err error state.Memory, err = getMemData(hostfs, pid) if err != nil { return state, fmt.Errorf("error getting memory data for pid %d: %w", pid, err) } // CPU Data state.CPU, err = getCPUTime(hostfs, pid) if err != nil { return state, fmt.Errorf("error getting CPU data for pid %d: %w", pid, err) } // CLI args if len(state.Args) == 0 { state.Args, err = getArgs(hostfs, pid) if err != nil { return state, fmt.Errorf("error getting CLI args for pid %d: %w", pid, err) } } // FD metrics state.FD, err = getFDStats(hostfs, pid) if err != nil { return state, fmt.Errorf("error getting FD metrics for pid %d: %w", pid, err) } if state.Env == nil { // env vars state.Env, _ = getEnvData(hostfs, pid, filter) } state.Exe, state.Cwd, err = getProcStringData(hostfs, pid) // skip permission errors and file not found errors // see https://github.com/elastic/elastic-agent-system-metrics/issues/135 for a bit more context, // depending on the permissions/caps that this is running with, the /exe symlink may have different levels of permission restrictions. // A kernel proc will also return file not found. if err != nil && !errors.Is(err, os.ErrPermission) && !errors.Is(err, os.ErrNotExist) { // ignore permission errors return state, fmt.Errorf("error getting metadata for pid %d: %w", pid, err) } state.Username, err = getUser(hostfs, pid) if err != nil { return state, fmt.Errorf("error creating username for pid %d: %w", pid, err) } // the /proc/[pid]/io metrics require SYS_PTRACE when run from inside docker state.IO, err = getIOData(hostfs, pid) if err != nil { return state, NonFatalErr{Err: fmt.Errorf("/io unavailable; if running inside a container, use SYS_PTRACE: %w", err)} } return state, nil } // GetInfoForPid fetches and parses the process information of the process // identified by pid from /proc/[PID]/stat func GetInfoForPid(hostFS resolve.Resolver, pid int) (ProcState, error) { path := hostFS.Join("proc", strconv.Itoa(pid), "stat") data, err := os.ReadFile(path) // Transform the error into a more sensible error in cases where the directory doesn't exist, i.e the process is gone if err != nil { if os.IsNotExist(err) { return ProcState{}, syscall.ESRCH } return ProcState{}, fmt.Errorf("error reading procdir %s: %w", path, err) } state, err := parseProcStat(data) state.Pid = opt.IntWith(pid) if err != nil { return state, fmt.Errorf("failed to parse information for pid %d': %w", pid, err) } return state, nil } func parseProcStat(data []byte) (ProcState, error) { const minFields = 36 state := ProcState{} // Extract the comm value with is surrounded by parentheses. lIdx := bytes.Index(data, []byte("(")) rIdx := bytes.LastIndex(data, []byte(")")) if lIdx < 0 || rIdx < 0 || lIdx >= rIdx || rIdx+2 >= len(data) { return state, fmt.Errorf("failed to extract 'comm' field from '%v'", string(data)) } state.Name = string(data[lIdx+1 : rIdx]) // Extract the rest of the fields that we are interested in. fields := bytes.Fields(data[rIdx+2:]) if len(fields) <= minFields { return state, fmt.Errorf("expected at least %d stat fields from '%v'", minFields, string(data)) } // See https://man7.org/linux/man-pages/man5/proc.5.html for all fields. interests := bytes.Join([][]byte{ fields[0], // state fields[1], // ppid fields[2], // pgrp fields[17], // num_threads }, []byte(" ")) var procState string var ppid, pgid, numThreads int _, err := fmt.Fscan(bytes.NewBuffer(interests), &procState, &ppid, &pgid, &numThreads, ) if err != nil { return state, fmt.Errorf("failed to parse stat fields from '%s': %w", string(data), err) } state.State = getProcState(procState[0]) state.Ppid = opt.IntWith(ppid) state.Pgid = opt.IntWith(pgid) state.NumThreads = opt.IntWith(numThreads) return state, nil } func getProcStringData(hostfs resolve.Resolver, pid int) (string, string, error) { exe, err := os.Readlink(hostfs.Join("proc", strconv.Itoa(pid), "exe")) if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) { // pass through errors return "", "", err } else if err != nil { return "", "", fmt.Errorf("error fetching exe from pid %d: %w", pid, err) } cwd, err := os.Readlink(hostfs.Join("proc", strconv.Itoa(pid), "cwd")) if errors.Is(err, os.ErrPermission) { return "", "", err } else if err != nil { return "", "", fmt.Errorf("error fetching cwd for pid %d: %w", pid, err) } return exe, cwd, nil } func dirIsPid(name string) bool { if name[0] < '0' || name[0] > '9' { return false } return true } func getUser(hostfs resolve.Resolver, pid int) (string, error) { status, err := getProcStatus(hostfs, pid) if err != nil { return "", fmt.Errorf("error fetching user ID for pid %d: %w", pid, err) } uidValues, ok := status["Uid"] if !ok { return "", fmt.Errorf("field Uid not found in proc status: %w", err) } uidStrings := strings.Fields(uidValues) var userFinal string user, err := user.LookupId(uidStrings[0]) if err == nil { userFinal = user.Username } else { userFinal = uidStrings[0] } return userFinal, nil } func getEnvData(hostfs resolve.Resolver, pid int, filter func(string) bool) (mapstr.M, error) { path := hostfs.Join("proc", strconv.Itoa(pid), "environ") data, err := os.ReadFile(path) if errors.Is(err, os.ErrPermission) { // pass through permission errors return nil, err } else if err != nil { return nil, fmt.Errorf("error opening file %s: %w", path, err) } env := mapstr.M{} pairs := bytes.Split(data, []byte{0}) for _, kv := range pairs { parts := bytes.SplitN(kv, []byte{'='}, 2) if len(parts) != 2 { continue } key := string(bytes.TrimSpace(parts[0])) if key == "" { continue } if filter == nil || filter(key) { env[key] = string(bytes.TrimSpace(parts[1])) } } return env, nil } func getMemData(hostfs resolve.Resolver, pid int) (ProcMemInfo, error) { // Memory data state := ProcMemInfo{} path := hostfs.Join("proc", strconv.Itoa(pid), "statm") data, err := os.ReadFile(path) if err != nil { return state, fmt.Errorf("error opening file %s: %w", path, err) } fields := strings.Fields(string(data)) size, err := strconv.ParseUint(fields[0], 10, 64) if err != nil { return state, fmt.Errorf("error parsing memory size %s: %w", fields[0], err) } state.Size = opt.UintWith(size << 12) rss, err := strconv.ParseUint(fields[1], 10, 64) if err != nil { return state, fmt.Errorf("error parsing memory rss %s: %w", fields[1], err) } state.Rss.Bytes = opt.UintWith(rss << 12) share, _ := strconv.ParseUint(fields[2], 10, 64) state.Share = opt.UintWith(share << 12) return state, nil } func getIOData(hostfs resolve.Resolver, pid int) (ProcIOInfo, error) { state := ProcIOInfo{} path := hostfs.Join("proc", strconv.Itoa(pid), "io") data, err := os.ReadFile(path) if err != nil { return state, fmt.Errorf("error fetching IO metrics: %w", err) } for _, metric := range strings.Split(string(data), "\n") { raw := strings.Split(metric, ": ") if len(raw) < 2 { continue } value, err := strconv.ParseUint(raw[1], 10, 64) if err != nil { return state, fmt.Errorf("error converting counters '%s' in io stat file: %w", raw, err) } switch raw[0] { case "rchar": state.ReadChar = opt.UintWith(value) case "wchar": state.WriteChar = opt.UintWith(value) case "syscr": state.ReadSyscalls = opt.UintWith(value) case "syscw": state.WriteSyscalls = opt.UintWith(value) case "read_bytes": state.ReadBytes = opt.UintWith(value) case "write_bytes": state.WriteBytes = opt.UintWith(value) case "cancelled_write_bytes": state.CancelledWriteBytes = opt.UintWith(value) } } return state, nil } func getCPUTime(hostfs resolve.Resolver, pid int) (ProcCPUInfo, error) { state := ProcCPUInfo{} pathCPU := hostfs.Join("proc", strconv.Itoa(pid), "stat") data, err := os.ReadFile(pathCPU) if err != nil { return state, fmt.Errorf("error opening file %s: %w", pathCPU, err) } fields := strings.Fields(string(data)) user, err := strconv.ParseUint(fields[13], 10, 64) if err != nil { return state, fmt.Errorf("error parsing user CPU times for pid %d: %w", pid, err) } sys, err := strconv.ParseUint(fields[14], 10, 64) if err != nil { return state, fmt.Errorf("error parsing system CPU times for pid %d: %w", pid, err) } btime, err := getLinuxBootTime(hostfs) if err != nil { return state, fmt.Errorf("error feting boot time for pid %d: %w", pid, err) } // convert to milliseconds from USER_HZ // This effectively means our definition of "ticks" throughout the process code is a millisecond state.User.Ticks = opt.UintWith(user * (1000 / ticks)) state.System.Ticks = opt.UintWith(sys * (1000 / ticks)) state.Total.Ticks = opt.UintWith(opt.SumOptUint(state.User.Ticks, state.System.Ticks)) startTime, err := strconv.ParseUint(fields[21], 10, 64) if err != nil { return state, fmt.Errorf("error parsing start time value %s for pid %d: %w", fields[21], pid, err) } startTime /= ticks startTime += btime startTime *= 1000 state.StartTime = unixTimeMsToTime(startTime) return state, nil } func getArgs(hostfs resolve.Resolver, pid int) ([]string, error) { path := hostfs.Join("proc", strconv.Itoa(pid), "cmdline") data, err := os.ReadFile(path) if err != nil { return nil, fmt.Errorf("error opening file %s: %w", path, err) } bbuf := bytes.NewBuffer(data) var args []string for { arg, err := bbuf.ReadBytes(0) if err == io.EOF { break } trimmedArg := string(arg[0 : len(arg)-1]) args = append(args, trimmedArg) } return args, nil } func getFDStats(hostfs resolve.Resolver, pid int) (ProcFDInfo, error) { state := ProcFDInfo{} path := hostfs.Join("proc", strconv.Itoa(pid), "limits") data, err := os.ReadFile(path) if err != nil { return state, fmt.Errorf("error opening file %s: %w", path, err) } for _, line := range strings.Split(string(data), "\n") { if strings.HasPrefix(line, "Max open files") { fields := strings.Fields(line) if len(fields) == 6 { softLimit, err := strconv.ParseUint(fields[3], 10, 64) if err != nil { return state, fmt.Errorf("error parsing limits value %s for pid %d: %w", fields[3], pid, err) } state.Limit.Soft = opt.UintWith(softLimit) hardLimit, err := strconv.ParseUint(fields[4], 10, 64) if err != nil { return state, fmt.Errorf("error parsing limits value %s for pid %d: %w", fields[3], pid, err) } state.Limit.Hard = opt.UintWith(hardLimit) } } } pathFD := hostfs.Join("proc", strconv.Itoa(pid), "fd") fds, err := os.ReadDir(pathFD) if errors.Is(err, os.ErrPermission) { // ignore permission errors, passthrough other data return state, nil } else if err != nil { return state, fmt.Errorf("error reading FD directory for pid %d: %w", pid, err) } state.Open = opt.UintWith(uint64(len(fds))) return state, nil } // getLinuxBootTime fetches the static unix time for when the system was booted. func getLinuxBootTime(hostfs resolve.Resolver) (uint64, error) { if bootTime != 0 { return bootTime, nil } path := hostfs.Join("proc", "stat") // grab system boot time data, err := os.ReadFile(path) if err != nil { return 0, fmt.Errorf("error opening file %s: %w", path, err) } statVals := strings.Split(string(data), "\n") for _, line := range statVals { if strings.HasPrefix(line, "btime") { btime, err := strconv.ParseUint(line[6:], 10, 64) if err != nil { return 0, fmt.Errorf("error reading boot time: %w", err) } bootTime = btime return btime, nil } } return 0, fmt.Errorf("no boot time find in file %s: %w", path, err) } func getProcStatus(hostfs resolve.Resolver, pid int) (map[string]string, error) { status := make(map[string]string, 42) path := hostfs.Join("proc", strconv.Itoa(pid), "status") data, err := os.ReadFile(path) if err != nil { return nil, fmt.Errorf("error opening file %s: %w", path, err) } for _, line := range strings.Split(string(data), "\n") { fields := strings.SplitN(line, ":", 2) if len(fields) == 2 { status[fields[0]] = strings.TrimSpace(fields[1]) } } return status, err } func getProcState(b byte) PidState { state, ok := PidStates[b] if ok { return state } return Unknown } func FillMetricsRequiringMoreAccess(_ int, state ProcState) (ProcState, error) { return state, nil }