bistro/processes/AsyncCGroupReaper.cpp

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bistro/bistro/processes/AsyncCGroupReaper.h"

#include <signal.h>

#include <algorithm>
#include <unordered_set>

#include <boost/filesystem.hpp>
#include <folly/File.h>
#include <folly/FileUtil.h>
#include <folly/gen/File.h>
#include <gflags/gflags.h>
#include <glog/logging.h>

#include "bistro/bistro/utils/Exception.h"

// Future: make some dedicated tests patterned on the integration tests
// with TestTaskSubprocessQueue, but less clunky, and deeper.

DEFINE_int32(
  cgroup_freeze_timeout_ms, 10000,
  "If a cgroup fails to freeze for this long, it will be thawed and "
  "re-frozen to try to mitigate the race between the OOM notifier and the "
  "freezer. If you set this too low, a freeze will never succeed, breaking "
  "cgroup killing."
);

namespace facebook { namespace bistro {

namespace {

// Make via asyncCGroupReaper() below. Patterned on AsyncSubprocess.
class AsyncCGroupReaper : public folly::AsyncTimeout {
public:
  AsyncCGroupReaper(
    folly::EventBase* event_base,
    cpp2::CGroupOptions cgopts,
    std::string cgname,
    uint32_t min_wait_ms,
    uint32_t max_wait_ms
  ) : folly::AsyncTimeout(event_base),
      cgroupOpts_(std::move(cgopts)),
      cgroupName_(std::move(cgname)),
      minWaitMs_(min_wait_ms),
      maxWaitMs_(max_wait_ms),
      curWaitMs_(min_wait_ms) {}

  // Since Promise has a nontrivial destructor, it may be bad to `delete
  // this` in the constructor -- possibly even "undefined behavior"-bad :)
  void initialize(folly::Future<folly::Unit>* empty_cgroups) {
    // DO NOT use objects with non-empty destructors here, we `delete this`.
    *empty_cgroups = emptyCGroups_.getFuture();
    // Reduce latency: check eagerly if there are no cgroups to wait for.
    if (auto nonempty_subsystem = findNonEmptySubsystem()) {
      // Make one kill attempt immediately, since it takes at least 2 calls
      // to actually signal the cgroup (1 to freeze, and 1 to signal). In
      // the common case of "no D-state tasks", this should cut down cgroup
      // kill latency to 1-5 minWaitMs_ intervals.
      workOnKillingCGroupTasks(*nonempty_subsystem);  // noexcept
      // The first delay is exempt from exponential backoff.
      myScheduleTimeout(minWaitMs_);
      // If AsyncCGroupReaper were created outside the EventBase thread, the
      // object could already have been destroyed by timeoutExpired() at
      // this point.
      return;  // Do not self-destruct.
    }
    emptyCGroups_.setValue();
    delete this;  // Better do **nothing** else, `this` is gone.
  }

  void timeoutExpired() noexcept override {
    // DO NOT use objects with non-empty destructors here, we `delete this`.
    totalWaitMs_ += curWaitMs_;
    // Check before trying to kill. The other order is less efficient -- a
    // successful kill is not instant, so an immediate re-check via
    // findNonEmptySubsystem() would usually still find live processes.
    if (auto nonempty_subsystem = findNonEmptySubsystem()) {
      LOG(WARNING) << "Trying to reap intransigent task with cgroup "
        << cgroupName_ << " for over " << totalWaitMs_ << " ms";
      workOnKillingCGroupTasks(*nonempty_subsystem);
      myScheduleTimeout(maxWaitMs_);  // Timeout exponentially grows to max.
      return;  // Not done yet, don't self-destruct.
    }
    LOG(WARNING) << "Intransigent task with cgroup " << cgroupName_
      << " exited after " << totalWaitMs_ << " ms";
    emptyCGroups_.setValue();
    delete this;  // Better do **nothing** else, `this` is gone.
  }
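  // A worked example of the retry cadence (hypothetical numbers, assuming
  // min_wait_ms = 100 and max_wait_ms = 10000): initialize() schedules the
  // first check 100ms out, and each timeoutExpired() then calls
  // myScheduleTimeout(maxWaitMs_), which quadruples the delay up to the cap:
  //   100ms, 400ms, 1600ms, 6400ms, 10000ms, 10000ms, ...
  // So a task that survives the first SIGKILL (e.g. one stuck in D state)
  // gets re-checked at an exponentially decaying rate instead of in a busy
  // loop.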
private:
  void myScheduleTimeout(uint32_t ms) {
    curWaitMs_ = std::min(ms, curWaitMs_ << 2);  // 4x exponential backoff
    scheduleTimeout(curWaitMs_);
  }

  std::string cgroupDir(const std::string& subsystem) const {
    return (
      boost::filesystem::path(*cgroupOpts_.root_ref()) / subsystem
        / *cgroupOpts_.slice_ref() / cgroupName_
    ).native();
  }

  /**
   * Returns folly::none if none of the task's cgroups have processes in
   * them. Removes empty cgroup directories, which is important when the
   * system `release_agent` reaper is not configured.
   */
  folly::Optional<std::string> findNonEmptySubsystem() noexcept {
    // While checking /cgroup.procs files, keep in mind that the kernel
    // `release_agent` can remove a cgroup from under us at any time.
    for (const auto& subsystem : *cgroupOpts_.subsystems_ref()) {
      auto dir = cgroupDir(subsystem);
      boost::system::error_code ec;
      if (!boost::filesystem::is_directory(dir, ec) || ec) {
        continue;  // The cgroup was likely already reaped.
      }
      // Don't trust `filesystem::is_empty`, since cgroupfs metadata is bogus.
      try {
        folly::File procs_file(dir + "/cgroup.procs");
        char c;  // Reading one byte is enough to prove non-emptiness.
        ssize_t bytes_read = folly::readFull(procs_file.fd(), &c, 1);
        if (bytes_read == 1) {
          if (c < '0' || c > '9') {
            LOG(WARNING) << dir << "/cgroup.procs starts with bad char: " << c;
          }
          return subsystem;  // At least one cgroup contains processes.
        } else if (bytes_read == -1) {
          PLOG(WARNING) << dir << "/cgroup.procs is unreadable";
        } else if (bytes_read == 0) {
          // Empty cgroup, but the directory still exists -- try to reap it.
          if (boost::filesystem::remove(dir, ec) && !ec) {
            LOG(INFO) << "Removed empty cgroup " << dir;
          } else if (ec) {
            LOG(WARNING) << "Failed to remove empty cgroup: " << dir << ": "
              << ec.message();
          }  // else: nothing existed at `dir`, and no error occurred.
        } else {
          LOG(FATAL) << "read() returned bad value: " << bytes_read;
        }
      } catch (const std::exception& ex) {
        // Maybe we raced the system `release_agent` to reap the directory?
        LOG(WARNING) << dir << "/cgroup.procs is unreadable: " << ex.what();
      }
      // Either the cgroup is empty, or a read error made us assume it's gone.
    }
    return folly::none;
  }

  // Helper for workOnKillingCGroupTasks().
  template <class... Args>
  void badFreezer(Args&&... args) noexcept {
    LOG(WARNING) << "Not sending SIGKILL to tasks in the " << cgroupName_
      << " cgroups, since the `freezer` subsystem"
      << folly::to<std::string>(std::forward<Args>(args)...)
      << ", which would make it easy to kill the wrong processes.";
  }

  // Helper for workOnKillingCGroupTasks().
  // Nothing can be done on error, so this does not have a return value.
  void freezerWrite(
      const std::string& freezer_state_path,
      const std::string& value) noexcept {
    if (!folly::writeFile(
      value,
      freezer_state_path.c_str(),
      O_WRONLY | O_APPEND  // No O_CREAT: if the path is bad, append is safer.
    )) {
      badFreezer("'s state is unwritable: ", strError());
    }
  }
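  // To make the cgroupfs interaction concrete, a sketch of the paths
  // involved (hypothetical values -- the real root, slice & name come from
  // CGroupOptions and the task):
  //   cgroupDir("freezer") == "/sys/fs/cgroup/freezer/bistro.slice/<cgname>"
  // and freezerWrite(path, "FROZEN") is roughly the shell equivalent of
  //   echo -n FROZEN >> /sys/fs/cgroup/freezer/bistro.slice/<cgname>/freezer.state
  // The append-only open means that a write to an already-reaped cgroup
  // fails, instead of creating a stray regular file at that path.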
  // Helper for workOnKillingCGroupTasks() -- do NOT call directly.
  void killCGroupTasks(const std::string& subsystem) noexcept {
    // Uses /cgroup.procs instead of /tasks, since it seems awkward and
    // unnecessary to signal each thread of a running process individually.
    std::unordered_set<uint64_t> pids;  // cgroup.procs need not be unique
    try {
      pids = folly::gen::byLine(
        folly::File(cgroupDir(subsystem) + "/cgroup.procs")
      ) | folly::gen::eachTo<uint64_t>()
        | folly::gen::as<decltype(pids)>();
    } catch (const std::exception& ex) {
      // Can happen if the system `release_agent` reaped the cgroup, e.g.
      // because all processes quit after our last findNonEmptySubsystem().
      LOG(WARNING) << "Killing cgroup " << cgroupName_ << ": " << ex.what();
    }
    for (auto pid : pids) {
      // FATAL since none of the POSIX error conditions can occur, unless
      // we have a serious bug like signaling the wrong PID.
      PLOG_IF(FATAL, ::kill(pid, SIGKILL) == -1)
        << "Failed to kill " << pid << " from cgroup " << cgroupName_;
    }
  }

  /**
   * Only sends signals if the `freezer` subsystem is available, or if the
   * (dangerous) flag `killWithoutFreezer` is set.
   *
   * == How is `freezer` used? ==
   *
   * Signaling a cgroup's tasks is racy: between the time we read
   * `/cgroup.procs` and the time we send the signal, a process could exit,
   * and its PID could be recycled -- killing the wrong process. This
   * mitigates the race by freezing the cgroup, sending SIGKILL, and thawing
   * the cgroup. Since freezing a cgroup takes considerable time, the work
   * is spread over multiple calls to workOnKillingCGroupTasks() -- this is
   * not a standalone synchronous operation.
   *
   * This gets called repeatedly, every curWaitMs_. Each iteration focuses
   * on doing one state transition from this list:
   *  - THAWED => FREEZING or FROZEN
   *  - FREEZING => THAWED (if the freeze times out)
   *  - FROZEN => signaled & THAWED
   * Some of these take time, so it makes sense not to do them synchronously,
   * and instead to spread them out over multiple iterations.
   */
  void workOnKillingCGroupTasks(const std::string& nonempty_subsys) noexcept {
    // Signaling cgroups without `freezer` is racy and dangerous -- see the
    // docstring in AsyncCGroupReaper.h.
    if (std::find(
      cgroupOpts_.subsystems_ref()->begin(),
      cgroupOpts_.subsystems_ref()->end(),
      "freezer"
    ) == cgroupOpts_.subsystems_ref()->end()) {
      if (*cgroupOpts_.killWithoutFreezer_ref()) {
        // The client asked for a racy, un-frozen kill, so go for it.
        killCGroupTasks(nonempty_subsys);
      } else {
        badFreezer(" is not enabled");
      }
      return;
    }
    auto freezer_state_path = cgroupDir("freezer") + "/freezer.state";
    std::string freezer_state;
    if (!folly::readFile(freezer_state_path.c_str(), freezer_state)) {
      badFreezer("'s state is unreadable: ", strError());
    } else if (freezer_state == "THAWED\n") {
      freezeWaitedMs_ = 0;
      freezerWrite(freezer_state_path, "FROZEN");
      // The next workOnKillingCGroupTasks() normally sees FREEZING or FROZEN.
    } else if (freezer_state == "FREEZING\n") {
      freezeWaitedMs_ += curWaitMs_;
      // Freezing can race with OOM-notifier logic; for examples, see:
      //   https://issues.apache.org/jira/browse/MESOS-1689 & MESOS-1758
      //
      // Our simple mitigation is to time out a cgroup stuck in FREEZING,
      // re-thaw it, and try again.
      //
      // Future: Mesos decided to put its tasks into PID namespaces, which
      // lets the kernel kill the whole subtree. I am not sure that approach
      // suits Bistro's low-overhead isolation, since PID namespaces require
      // a chroot & a separate /proc mount for the child processes to have a
      // good experience. Relevant diff:
      //   https://reviews.apache.org/r/25966/
      if (freezeWaitedMs_ > std::max(1, FLAGS_cgroup_freeze_timeout_ms)) {
        // freezeWaitedMs_ is not zeroed, so this can retry the next time.
        freezerWrite(freezer_state_path, "THAWED");
        // The next workOnKillingCGroupTasks() normally re-attempts the
        // freeze.
      }
    } else if (freezer_state == "FROZEN\n") {
      // We are now as sure as possible that signaling these PIDs is safe.
      // We can only kill `freezer` tasks -- if another subsystem has other
      // PIDs, it's not safe to signal them anyhow.
      killCGroupTasks("freezer");
      // Non-D-state processes will receive the SIGKILL and exit shortly
      // after the thaw.
      freezerWrite(freezer_state_path, "THAWED");
    } else {
      badFreezer("'s state is invalid (", freezer_state, ")");
    }
  }
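  // A typical tick-by-tick trace (a sketch, assuming `freezer` is enabled
  // and no OOM-notifier race):
  //   tick 1: freezer.state == "THAWED\n"  => write FROZEN
  //   tick 2: freezer.state == "FROZEN\n"  => SIGKILL every PID in
  //           freezer/cgroup.procs, then write THAWED
  //   tick 3: findNonEmptySubsystem() sees empty cgroups, removes their
  //           directories, and the emptyCGroups_ promise is fulfilled.
  // If instead the state stays "FREEZING\n" past --cgroup_freeze_timeout_ms,
  // the cgroup is thawed and the cycle restarts at tick 1.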
  const cpp2::CGroupOptions cgroupOpts_;
  const std::string cgroupName_;
  // Start with a short timeout, in case the leaked processes die normally.
  const uint32_t minWaitMs_;
  // Intransigent processes that are not responding to SIGKILL are probably
  // in D state, so we exponentially back off to a longer delay.
  const uint32_t maxWaitMs_;
  // Used for exponential backoff, and for freeze timeouts.
  uint32_t curWaitMs_;
  uint64_t totalWaitMs_{0};  // Sum of all curWaitMs_
  // Lets freeze attempts time out, since freezing can race with the
  // OOM killer.
  uint64_t freezeWaitedMs_{0};

  folly::Promise<folly::Unit> emptyCGroups_;  // Fulfilled on self-destruction
};

}  // anonymous namespace

folly::Future<folly::Unit> asyncCGroupReaper(
    folly::EventBase* event_base,
    cpp2::CGroupOptions cgopts,
    std::string cgname,
    uint32_t min_wait_ms,  // Set to pollMs(TaskSubprocessOptions)
    // Only D-state tasks should end up retrying for a long time. Note that
    // this should not be overly long, since freezing + signaling takes at
    // least two calls to workOnKillingCGroupTasks().
    uint32_t max_wait_ms) {
  folly::Future<folly::Unit> empty_cgroups;
  (new AsyncCGroupReaper(
    event_base, std::move(cgopts), std::move(cgname), min_wait_ms, max_wait_ms
  ))->initialize(&empty_cgroups);
  return empty_cgroups;
}

}}  // namespace facebook::bistro
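// A minimal usage sketch (hypothetical caller -- real call sites build the
// CGroupOptions from the task's subprocess options). initialize() schedules
// a timeout, so the reaper should be created on the EventBase thread:
//
//   folly::EventBase evb;
//   cpp2::CGroupOptions cgopts;  // assume root, slice & subsystems are set
//   evb.runInEventBaseThread([&]() {
//     facebook::bistro::asyncCGroupReaper(
//       &evb, cgopts, "some_task_cgroup", 100, 10000
//     ).thenValue([](folly::Unit) {
//       LOG(INFO) << "All of the task's cgroups are empty & removed";
//     });
//   });
//   evb.loop();  // Runs the reaper's timeouts until the future is fulfilled.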