syncd/TimerWatchdog.cpp (120 lines of code) (raw):

#include "TimerWatchdog.h" #include "swss/logger.h" #include <chrono> #define MUTEX std::lock_guard<std::mutex> _lock(m_mutex); using namespace syncd; TimerWatchdog::TimerWatchdog( _In_ int64_t warnTimespan): m_run(true), m_warnTimespan(warnTimespan), m_callback(nullptr), m_kco(nullptr) { SWSS_LOG_ENTER(); int64_t start = getTimeSinceEpoch(); m_startTimestamp = start; m_endTimestamp = start; m_lastCheckTimestamp = start; m_thread = std::make_shared<std::thread>(&TimerWatchdog::threadFunction, this); // m_thread->detach() } TimerWatchdog::~TimerWatchdog() { SWSS_LOG_ENTER(); m_run = false; m_thread->join(); } void TimerWatchdog::setStartTime() { MUTEX; SWSS_LOG_ENTER(); m_kco = nullptr; do { m_startTimestamp = getTimeSinceEpoch(); } while (m_startTimestamp <= m_endTimestamp); // make sure new start time is always past last end time } void TimerWatchdog::setEndTime() { MUTEX; SWSS_LOG_ENTER(); do { m_endTimestamp = getTimeSinceEpoch(); } while (m_endTimestamp <= m_startTimestamp); // make sure new end time is always past last start time auto span = m_endTimestamp - m_startTimestamp; if (span > m_warnTimespan) { SWSS_LOG_ERROR("event '%s' took %ld ms to execute", m_eventName.c_str(), span/1000); logEventData(); } m_eventName = ""; m_kco = nullptr; } void TimerWatchdog::logEventData() { SWSS_LOG_ENTER(); // this api must be executed under mutex if (m_kco) { const auto& key = kfvKey(*m_kco); const auto& op = kfvOp(*m_kco); const auto& values = kfvFieldsValues(*m_kco); SWSS_LOG_ERROR("op: %s, key: %s", op.c_str(), key.c_str()); for (const auto &v: values) { SWSS_LOG_ERROR("fv: %s: %s", fvField(v).c_str(), fvValue(v).c_str()); } } } void TimerWatchdog::setEventData( _In_ const std::string& eventName, _In_ const swss::KeyOpFieldsValuesTuple* kco) { MUTEX; SWSS_LOG_ENTER(); m_eventName = eventName; m_kco = kco; } void TimerWatchdog::setCallback( _In_ std::function<void(uint64_t)> callback) { MUTEX; SWSS_LOG_ENTER(); m_callback = callback; } void TimerWatchdog::threadFunction() { SWSS_LOG_ENTER(); SWSS_LOG_NOTICE("starting timer watchdog thread"); while (m_run) { std::this_thread::sleep_for(std::chrono::seconds(1)); MUTEX; // to protect event name // we make local copies, since executing functions can be so fast that // when we will read second time start timestamp it can be different // than previous one int64_t start = m_startTimestamp; int64_t end = m_endTimestamp; int64_t now = getTimeSinceEpoch(); // now needs to be obtained after obtaining start int64_t span = end - start; if (span < 0 && start > m_lastCheckTimestamp) { // this means start > end, so new function is currently executing, // or that function hanged, so see how long that function is // executing, this negative span can be arbitrary long even hours, // and that is fine, since we don't know when OA makes next // function call span = now - start; // this must be always non negative SWSS_LOG_NOTICE("time span %ld ms for '%s'", span/1000, m_eventName.c_str()); // TODO remove this if (span < 0) SWSS_LOG_THROW("negative span 'now - start': %ld - %ld", now, start); if (span > m_warnTimespan) { m_lastCheckTimestamp = start; // function probably hanged SWSS_LOG_ERROR("time span WD exceeded %ld ms for %s", span/1000, m_eventName.c_str()); logEventData(); } continue; } m_lastCheckTimestamp = start; } SWSS_LOG_NOTICE("ending timer watchdog thread"); } int64_t TimerWatchdog::getTimeSinceEpoch() { SWSS_LOG_ENTER(); return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now().time_since_epoch()).count(); }