internal/resource/oom_monitor_linux.go (202 lines of code) (raw):

// Copyright 2024 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux package resource import ( "bufio" "context" "fmt" "os" "path/filepath" "strings" "time" "unsafe" "github.com/GoogleCloudPlatform/galog" "github.com/GoogleCloudPlatform/google-guest-agent/internal/utils/file" "golang.org/x/sys/unix" "github.com/GoogleCloudPlatform/google-guest-agent/internal/events" ) // oomV1Watcher implements the OOM watcher based on cgroupv1. type oomV1Watcher struct { // name is the name of the process that is being monitored. name string // fd is the eventfd that cgroup notifies on when an OOM event occurs. fd int // procCgroupDir is the process cgroup directory path where the constraints // and control files are located. procCgroupDir string } // NewOOMWatcher initializes a new watcher for memory event for a cgroup. // Reference: https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt // section "10. OOM Control". func (c cgroupv1Client) NewOOMWatcher(ctx context.Context, constraint Constraint, _ time.Duration) (events.Watcher, error) { dir := filepath.Join(c.cgroupsDir, c.memoryController, constraint.Name) if !file.Exists(dir, file.TypeDir) { return nil, fmt.Errorf("cgroup directory %q not found, cannot set up oom watcher", dir) } // https://man7.org/linux/man-pages/man2/eventfd.2.html fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC) if err != nil { return nil, fmt.Errorf("failed to create eventfd: %w", err) } oomControlFile := filepath.Join(dir, "memory.oom_control") eventFile, err := os.Open(oomControlFile) if err != nil { return nil, fmt.Errorf("failed to open %q: %w", oomControlFile, err) } defer eventFile.Close() eventControlPath := filepath.Join(dir, "cgroup.event_control") data := fmt.Sprintf("%d %d", fd, eventFile.Fd()) if err := os.WriteFile(eventControlPath, []byte(data), 0755); err != nil { return nil, fmt.Errorf("failed to write %q: %w", eventControlPath, err) } watcher := &oomV1Watcher{fd: fd, procCgroupDir: dir, name: constraint.Name} return watcher, nil } // ID returns the unique ID of the watcher. func (c *oomV1Watcher) ID() string { return fmt.Sprintf("cgroupv1_oom_watcher_%s", c.name) } // Events returns the list of events that the watcher supports. func (c *oomV1Watcher) Events() []string { return []string{fmt.Sprintf("cgroupv1_oom_event_%s", c.name)} } // Run runs the watcher for the cgroup, and it waits until either an OOM event // happens or the cgroup is removed. If an OOM kill is detected, a non-nil // error is returned. The watcher is expected to run as long as the process and // the cgroup are active. When the cgroup is removed, the watcher takes // responsibility for closing the eventfd. This watcher is executed for each // plugin process. Plugin removal handles the deletion of cgroups associated // with it. When the cgroup is deleted, the watcher detects it, closes and // releases the file descriptor, and unregisters the watcher. func (c *oomV1Watcher) Run(ctx context.Context, evType string) (bool, any, error) { galog.V(2).Debugf("Running watcher for cgroup: %s, process: %s", c.procCgroupDir, c.name) buf := make([]byte, 8) // Cgroup notifies the eventfd when an OOM event occurs. This is a blocking // call and returns when an event is received. OOM or cgroup removal causes // an event to be sent on eventfd. if _, err := unix.Read(c.fd, buf); err != nil { // Close the fd only when watcher is removed. If any error occurs, it is not // actionable, just return original error. unix.Close(c.fd) return false, nil, fmt.Errorf("failed to read eventfd: %w", err) } now := time.Now() eventControlPath := filepath.Join(c.procCgroupDir, "cgroup.event_control") // When a cgroup is removed, an event is sent on eventfd. If the group is // gone, stop watching, release fd and return error instead of notifying. if !file.Exists(eventControlPath, file.TypeFile) { // If any error occurs, it is not actionable, just return actual error. unix.Close(c.fd) return false, nil, fmt.Errorf("cgroup %s is removed, removing watcher", c.procCgroupDir) } // Renew and continue watching for a new event for the cgroup. galog.V(2).Debugf("Identified oom event for cgroup: %s, process: %s", c.procCgroupDir, c.name) return true, &OOMEvent{Name: c.name, Timestamp: now}, nil } // oomV2Watcher implements the cgroupv2 based OOM watcher. type oomV2Watcher struct { // name is the name of the process that is being monitored. name string // inotifyFd is file descriptor on which inotify modify event for file // `memory.events` is received. inotifyFd int // epollFd is the epoll file descriptor on which inotifyFd is registered. // Watcher monitors this fd for I/O events. epollFd int // memoryEventsFile is the file path of the `memory.events` file for this // process. On `inotify` modify event, the watcher reads this file to verify // if there is a new oom kill event. memoryEventsFile string // prevOOMKillCount is the last known count of OOM kills for this cgroup. This // is used to track delta and detect new OOM kill event. prevOOMKillCount int // epollWaitTimeout is the interval in milliseconds to wait for an inotify // epoll event. epollWaitTimeout int } // NewOOMWatcher initializes a new watcher for memory event for a cgroup. func (c cgroupv2Client) NewOOMWatcher(ctx context.Context, constraint Constraint, interval time.Duration) (events.Watcher, error) { // https://man7.org/linux/man-pages/man7/inotify.7.html inotifyFd, err := unix.InotifyInit() if err != nil { return nil, fmt.Errorf("unable to initialize inotify instance: %w", err) } memEventsPath := filepath.Join(c.cgroupsDir, guestAgentCgroupDir, constraint.Name, "memory.events") // We don't care about watch descriptor and can be ignored. We directly // monitor the inotifyFd for I/O events and close it when we are done. _, err = unix.InotifyAddWatch(inotifyFd, memEventsPath, unix.IN_MODIFY) if err != nil { unix.Close(inotifyFd) return nil, fmt.Errorf("unable to add inotify file modify watcher for %q: %w", memEventsPath, err) } epollFd, err := unix.EpollCreate1(0) if err != nil { unix.Close(inotifyFd) return nil, fmt.Errorf("unable to create epoll instance: %w", err) } watcher := &oomV2Watcher{inotifyFd: inotifyFd, memoryEventsFile: memEventsPath, name: constraint.Name, epollFd: epollFd, epollWaitTimeout: int(interval.Milliseconds())} // https://man7.org/linux/man-pages/man2/epoll_ctl.2.html // Create poll event type to wait for `inotifyFd` to be available for reading. pollEvent := unix.EpollEvent{Events: unix.EPOLLIN, Fd: int32(inotifyFd)} if err := unix.EpollCtl(epollFd, unix.EPOLL_CTL_ADD, inotifyFd, &pollEvent); err != nil { watcher.close() return nil, fmt.Errorf("unable to add inotify fd [%d] to epoll: %w", inotifyFd, err) } return watcher, nil } // ID returns the unique ID of the watcher. func (c *oomV2Watcher) ID() string { return fmt.Sprintf("cgroupv2_oom_watcher_%s", c.name) } // Events returns the list of events that the watcher supports. func (c *oomV2Watcher) Events() []string { return []string{fmt.Sprintf("cgroupv2_oom_event_%s", c.name)} } // close closes the inotify and epoll file descriptors. This is called when the // watcher is removed and error is not actionable, just log it. func (c *oomV2Watcher) close() { galog.Debugf("Closing inotify fd: %d, epoll fd: %d for process: %s", c.inotifyFd, c.epollFd, c.name) if err := unix.Close(c.inotifyFd); err != nil { galog.Debugf("Failed to close inotify fd: %v", err) } if err := unix.Close(c.epollFd); err != nil { galog.Debugf("Failed to close epoll fd: %v", err) } } // Run runs the cgroupv2 based oom watcher to detect oom event for a process. // // Each time a new OOM kill occurs, the cgroup writes the event to the file. A // watcher process monitors this file for modifications using inotify. However, // since the cgroup V2 filesystem doesn't support file removal inotify events, // we use epoll to wait for an inotify event on the fd with a timeout. If an // event is received, the file is read to verify if there's a new OOM kill // event. If no event is received or epoll timeouts we check if the cgroup // still exists and whether we should continue monitoring it. All file // descriptors are automatically closed when the watcher is removed. func (c *oomV2Watcher) Run(ctx context.Context, evType string) (bool, any, error) { galog.Debugf("Running watcher for cgroup: %s, process: %s", c.memoryEventsFile, c.name) // Return from watcher only when there is a new oom kill event. for ctx.Err() == nil { // If cgroup is removed, stop watching and return error. This can happen // when the plugin is removed. if !file.Exists(c.memoryEventsFile, file.TypeFile) { galog.Debugf("Cgroup %s is removed, removing watcher", c.memoryEventsFile) c.close() return false, nil, fmt.Errorf("cgroup %s is removed, removing watcher", c.memoryEventsFile) } events := make([]unix.EpollEvent, 1) // Wait for inotify event with specified timeout. This is a blocking call // and returns when an event is received or timeout occurs. Timeout allows // us to check if the cgroup is still active. n, err := unix.EpollWait(c.epollFd, events, c.epollWaitTimeout) if err != nil { c.close() return false, nil, fmt.Errorf("failed to epoll wait: %w", err) } if n == 0 { galog.V(2).Debugf("Timedout, no epoll event found, continue waiting") continue } // We shouldn't get any other events than what we are watching for. This // is just a safety check to avoid blocking reads and any unexpected errors. if events[0].Fd != int32(c.inotifyFd) { galog.Debugf("Ignoring unknown epoll event: %+v", events[0]) continue } // In this case, we are only watching for single `inotify` file descriptor. // Try to read the event and check if there is a new oom kill event directly. // If there were multiple file descriptors, we would have to check the // `events` array for the file descriptor that we are interested in. isOOM, err := c.readInotifyEvent() if err != nil { galog.Debugf("Failed to check oom event: %v", err) continue } // We detected a new oom kill event, notify subscribers and continue // watching for a new event. if isOOM { galog.V(2).Debugf("Detected oom event for cgroup: %s, process: %s", c.memoryEventsFile, c.name) return true, &OOMEvent{Name: c.name, Timestamp: time.Now()}, nil } } // We will reach here only when the context is cancelled. Cancel the watcher // release descriptors and return original ctx error. c.close() return false, nil, ctx.Err() } // readInotifyEvent reads the inotify event and returns true if new oom kill // event is detected. func (c *oomV2Watcher) readInotifyEvent() (bool, error) { galog.V(2).Debugf("Attempting to read inotify event for cgroup: %s, process: %s", c.memoryEventsFile, c.name) // When reading inotify events, the recommended size for the buffer is a // multiple of the size of unix.InotifyEvent plus the maximum filename length. // This ensures we can always read at least one complete event, even if it // includes the maximum allowed filename length. buf := make([]byte, unix.SizeofInotifyEvent+unix.PathMax+1) // Read call here will not block as we have already waited for `fd` to be // ready for reading in epoll. readBytes, err := unix.Read(c.inotifyFd, buf) if err != nil { unix.Close(c.inotifyFd) return false, fmt.Errorf("failed to read inotify event: %w", err) } if readBytes < unix.SizeofInotifyEvent { return false, fmt.Errorf("invalid inotify event size: %d, expected at least: %d", readBytes, unix.SizeofInotifyEvent) } var offset uint32 for offset <= uint32(readBytes-unix.SizeofInotifyEvent) { event := (*unix.InotifyEvent)(unsafe.Pointer(&buf[offset])) offset += unix.SizeofInotifyEvent + event.Len if event.Mask&unix.IN_MODIFY != unix.IN_MODIFY { galog.V(2).Debugf("Ignoring unknown inotify event: %+v", event) continue } galog.Debugf("Reading oom kill count from %q", c.memoryEventsFile) newOOMKillCount, err := readOOMKillCount(c.memoryEventsFile) if err != nil { return false, fmt.Errorf("failed to read oom kill count from %q: %w", c.memoryEventsFile, err) } galog.Debugf("New oom kill count: %d, prev oom kill count: %d", newOOMKillCount, c.prevOOMKillCount) if newOOMKillCount != c.prevOOMKillCount { c.prevOOMKillCount = newOOMKillCount return true, nil } } return false, nil } // readOOMKillCount supports reading flat-keyed file and returns the number of // times process belonging to this cgroup was killed by any kind of OOM killer. // Format of flat-keyed file is simple `key value` space separated. func readOOMKillCount(path string) (int, error) { var count int file, err := os.Open(path) if err != nil { return 0, fmt.Errorf("error opening %q file: %w", path, err) } defer file.Close() scanner := bufio.NewScanner(file) for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) if strings.HasPrefix(line, "oom_kill") { // Format: oom_kill <count>. _, err := fmt.Sscanf(line, "oom_kill %d", &count) if err != nil { return 0, fmt.Errorf("error parsing line %q from %q: %w", line, path, err) } break } } if err := scanner.Err(); err != nil { return 0, fmt.Errorf("error reading %q file: %w", path, err) } return count, nil }