core/monitor/Monitor.h (150 lines of code) (raw):
/*
* Copyright 2022 iLogtail Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <condition_variable>
#include <future>
#include <mutex>
#include <string>
#include <unordered_map>
#include "MetricManager.h"
#include "MetricTypes.h"
#if defined(_MSC_VER)
#include <Windows.h>
#endif
// Forward declaration of the sls_logs::LogGroup class so that the type can be
// named without pulling in its full definition.
// NOTE(review): no declaration in the visible part of this header refers to
// LogGroup; confirm whether includers rely on it before removing it.
namespace sls_logs {
class LogGroup;
}
namespace logtail {
// CpuStat is one snapshot of CPU accounting data together with the usage value
// derived from it. The first group of fields is platform specific; the fields
// after the preprocessor block exist on every platform.
struct CpuStat {
#if defined(__linux__)
    // Linux fields (presumably cumulative time counters; confirm units in the
    // code that fills them in).
    uint64_t mSysTotalTime;
    uint64_t mSysTime;
    uint64_t mUserTime;
#elif defined(_MSC_VER)
    // Windows fields: previous CPU readings, processor count and a handle
    // (presumably to the current process; confirm in the implementation).
    ULARGE_INTEGER mLastCPU;
    ULARGE_INTEGER mLastSysCPU;
    ULARGE_INTEGER mLastUserCPU;
    int mNumProcessors;
    HANDLE mSelf;
#endif

    // Fields shared by all platforms.
    int32_t mViolateNum;
    float mCpuUsage;

    // A new object is put into its reset state right away.
    // Reset() is only declared here; its definition lives outside this header.
    CpuStat() { this->Reset(); }
    void Reset();
};
// MemStat holds the memory statistics of the process.
// Both fields carry default member initializers, so a default-constructed
// MemStat starts in the same zeroed state that Reset() produces (matching
// CpuStat, whose constructor calls Reset()). Previously the fields were
// indeterminate until Reset() was called explicitly.
struct MemStat {
    // Resident set size value (unit is decided by the code that fills it in).
    int64_t mRss = 0;
    // Violation counter (presumably consecutive limit violations; see the
    // CheckSoft*Limit helpers of the monitor).
    int32_t mViolateNum = 0;

    // Reset puts every field back to zero.
    void Reset() {
        mRss = 0;
        mViolateNum = 0;
    }
};
// OsCpuStat holds system-wide CPU accounting values.
// All fields carry default member initializers, so a default-constructed
// OsCpuStat starts in the same zeroed state that Reset() produces. Previously
// the fields were indeterminate until Reset() was called explicitly.
struct OsCpuStat {
    // Non-idle and total counters (presumably cumulative CPU time; confirm in
    // the code that fills them in).
    int64_t mNoIdle = 0;
    int64_t mTotal = 0;
    // Derived system CPU usage.
    float mOsCpuUsage = 0.0f;

    // Reset puts every field back to zero.
    void Reset() {
        mNoIdle = 0;
        mTotal = 0;
        mOsCpuUsage = 0;
    }
};
// LogtailMonitor is a non-copyable singleton (obtained through GetInstance())
// that watches the CPU and memory usage of the Logtail process.
class LogtailMonitor {
public:
    // Copying is disabled; the only instance is reached via GetInstance().
    LogtailMonitor(const LogtailMonitor&) = delete;
    LogtailMonitor& operator=(const LogtailMonitor&) = delete;

    static LogtailMonitor* GetInstance();

    bool Init();
    void Stop();

    uint32_t GetCpuCores();

    // GetRealtimeCpuLevel returns a value that indicates the current CPU usage
    // level: the realtime CPU usage divided by the scaled CPU usage upper limit.
    // LogInput uses it for flow control.
    float GetRealtimeCpuLevel() {
        const float realtimeUsage = mRealtimeCpuStat.mCpuUsage;
        return realtimeUsage / mScaledCpuUsageUpLimit;
    }

private:
    LogtailMonitor();
    ~LogtailMonitor() = default;

    void Monitor();

    // GetCpuStat reads the current CPU statistics of the Logtail process into
    // @cpuStat.
    // @return true if the statistics were obtained successfully.
    bool GetCpuStat(CpuStat& cpuStat);

    // GetMemStat reads the current memory statistics of the Logtail process
    // and stores them in mMemStat.
    bool GetMemStat();

    // CalCpuStat computes the CPU usage from (@curCpu - @savedCpu) and, once
    // the calculation is done, saves @curCpu into @savedCpu.
    void CalCpuStat(const CpuStat& curCpu, CpuStat& savedCpu);

    // CheckSoftCpuLimit checks whether the current CPU usage exceeds the limit.
    // @return true if the CPU usage exceeds the limit continuously.
    bool CheckSoftCpuLimit();

    // CheckSoftMemLimit checks whether the memory usage exceeds the limit.
    // @return true if the memory usage exceeds the limit continuously.
    bool CheckSoftMemLimit();

    bool CheckHardMemLimit();

    // SendStatusProfile collects the status profile and sends it to the server.
    // @suicide tells whether the target LogStore is logtail_suicide_profile.
    // Sending is asynchronous, so a caller should wait for several seconds
    // after this call before invoking _exit(1).
    bool SendStatusProfile(bool suicide);

    // DumpMonitorInfo writes simple monitor information to local storage.
    bool DumpMonitorInfo(time_t monitorTime);

#if defined(__linux__)
    // GetLoadAvg returns the system load information.
    std::string GetLoadAvg();

    // CalCpuCores calculates the number of CPU cores.
    bool CalCpuCores();

    // CalOsCpuStat calculates the system CPU usage and saves it into
    // mOsCpuStatForScale.
    bool CalOsCpuStat();

    // CheckScaledCpuUsageUpLimit updates mScaledCpuUsageUpLimit according to
    // the current status and the configured limits, so that Logtail can adjust
    // its CPU usage.
    void CheckScaledCpuUsageUpLimit();
#endif

    // IsHostIpChanged checks whether the host IP changed while running.
    bool IsHostIpChanged();

    void Suicide();

    // State of the monitor thread and the primitives used to stop it.
    std::future<void> mThreadRes;
    std::mutex mThreadRunningMux;
    bool mIsThreadRunning = true;
    std::atomic_bool mShouldSuicide;
    std::condition_variable mStopCV;

    // Controls how often the status profile is reported.
    int32_t mStatusCount;

    // Used to calculate the realtime CPU level (updated every 1s).
    CpuStat mRealtimeCpuStat;
    // Used to calculate the CPU limit, updated regularly (30s by default).
    CpuStat mCpuStat;
    // Memory usage statistics.
    MemStat mMemStat;

    // Current scale-up level, updated by CheckScaledCpuUsageUpLimit.
    float mScaledCpuUsageUpLimit;

#if defined(__linux__)
    static const int32_t CPU_STAT_FOR_SCALE_ARRAY_SIZE = 2;

    int32_t mCpuCores = 0;
    CpuStat mCpuStatForScale;
    OsCpuStat mOsCpuStatForScale;

    // mCpuArrayForScale and mOsCpuArrayForScale store the latest two CPU usage
    // values of the ilogtail process and of the whole system.
    float mCpuArrayForScale[CPU_STAT_FOR_SCALE_ARRAY_SIZE];
    float mOsCpuArrayForScale[CPU_STAT_FOR_SCALE_ARRAY_SIZE];
    int32_t mCpuArrayForScaleIdx;
    float mScaledCpuUsageStep;
#endif

#ifdef APSARA_UNIT_TEST_MAIN
    // Unit tests are granted access to the private members.
    friend class ConfigUpdatorUnittest;
#endif
};
// LoongCollectorMonitor is a singleton (obtained through GetInstance()) that
// keeps a copy of the global self-monitor metrics and exposes setters for the
// agent-level gauges.
class LoongCollectorMonitor {
public:
    // Copying is disabled explicitly, consistent with LogtailMonitor; the only
    // instance is reached via GetInstance().
    LoongCollectorMonitor(const LoongCollectorMonitor&) = delete;
    LoongCollectorMonitor& operator=(const LoongCollectorMonitor&) = delete;

    static LoongCollectorMonitor* GetInstance();

    void Init();
    void Stop();

    // Accessors for the stored agent metric and the per-runner metrics.
    // @runnerName selects the runner whose metric is read or written.
    bool GetAgentMetric(SelfMonitorMetricEvent& event);
    void SetAgentMetric(const SelfMonitorMetricEvent& event);
    bool GetRunnerMetric(const std::string& runnerName, SelfMonitorMetricEvent& event);
    void SetRunnerMetric(const std::string& runnerName, const SelfMonitorMetricEvent& event);

    // Gauge setters; each forwards the value to the matching gauge member
    // through SET_GAUGE.
    void SetAgentCpu(double cpu) { SET_GAUGE(mAgentCpu, cpu); }
    void SetAgentMemory(uint64_t mem) { SET_GAUGE(mAgentMemory, mem); }
    void SetAgentGoMemory(uint64_t mem) { SET_GAUGE(mAgentGoMemory, mem); }
    void SetAgentGoRoutinesTotal(uint64_t total) { SET_GAUGE(mAgentGoRoutinesTotal, total); }
    // The two setters below do nothing when built with APSARA_UNIT_TEST_MAIN.
    void SetAgentOpenFdTotal(uint64_t total) {
#ifndef APSARA_UNIT_TEST_MAIN
        SET_GAUGE(mAgentOpenFdTotal, total);
#endif
    }
    void SetAgentConfigTotal(uint64_t total) {
#ifndef APSARA_UNIT_TEST_MAIN
        SET_GAUGE(mAgentConfigTotal, total);
#endif
    }

    // Process-wide host and agent information.
    static std::string mHostname;
    static std::string mIpAddr;
    static std::string mOsDetail;
    static std::string mUsername;
    static int32_t mSystemBootTime;
    static std::string mStartTime;

private:
    LoongCollectorMonitor();
    ~LoongCollectorMonitor();

    // A copy of the global-level metrics, updated by
    // SelfMonitorServer::PushSelfMonitorMetricEvents, in the format:
    // {MetricCategory: {key: MetricValue}}
    // Storing and retrieving Agent and Runner metrics is currently supported.
    struct GlobalMetrics {
        SelfMonitorMetricEvent mAgentMetric;
        std::unordered_map<std::string, SelfMonitorMetricEvent> mRunnerMetrics;
    };
    // NOTE(review): presumably guards mGlobalMetrics; confirm in the implementation.
    std::mutex mGlobalMetricsMux;
    GlobalMetrics mGlobalMetrics;

    // MetricRecord
    MetricsRecordRef mMetricsRecordRef;
    DoubleGaugePtr mAgentCpu;
    IntGaugePtr mAgentMemory;
    IntGaugePtr mAgentGoMemory;
    IntGaugePtr mAgentGoRoutinesTotal;
    IntGaugePtr mAgentOpenFdTotal;
    IntGaugePtr mAgentConfigTotal;
};
} // namespace logtail