nodemanager/utils/System.h (168 lines of code) (raw):

#ifndef SYSTEM_H #define SYSTEM_H #include <string> #include <map> #include "String.h" #include "Logger.h" #include "../common/ErrorCodes.h" #include "Enumerable.h" namespace hpc { namespace utils { enum class IpAddressVersion { V4 = 4, V6 = 6 }; struct System { public: typedef std::tuple<std::string, std::string, std::string, std::string, bool> NetInfo; // name, uuid, pci.bus_id, pci.device_id, total_memory, max_sm_clock, gpu %, fan %, memory %, used mem MB, power Watt, SM Clock MH, temperature C typedef struct _GpuInfo { std::string Name; std::string Uuid; std::string PciBusId; std::string DeviceId; float TotalMemoryMB; float MaxSMClock; float FanPercentage; float UsedMemoryMB; float PowerWatt; float CurrentSMClock; float Temperature; float GpuUtilization; // take 00:01 from B174:00:01.0 std::string GetPciBusDevice() const { auto tokens = String::Split(this->PciBusId, '.'); auto ids = String::Split(tokens[0], ':'); return String::Join(':', ids[1], ids[2]); } float GetUsedMemoryPercentage() const { return 100 * this->UsedMemoryMB / this->TotalMemoryMB; } } GpuInfo; typedef struct _GpuInfoList { std::vector<GpuInfo> GpuInfos; float GetTotalMemoryMB() const { return Enumerable::Sum<std::vector<GpuInfo>, float, GpuInfo>(this->GpuInfos, [] (const GpuInfo& i) { return i.TotalMemoryMB; }); } float GetMaxSMClock() const { return Enumerable::Avg<std::vector<GpuInfo>, float, GpuInfo>(this->GpuInfos, [] (const GpuInfo& i) { return i.MaxSMClock; }); } float GetFanPercentage() const { return Enumerable::First<std::vector<GpuInfo>, float, GpuInfo>(this->GpuInfos, [] (const GpuInfo& i) { return i.FanPercentage; }); } float GetUsedMemoryMB() const { return Enumerable::Sum<std::vector<GpuInfo>, float, GpuInfo>(this->GpuInfos, [] (const GpuInfo& i) { return i.UsedMemoryMB; }); } float GetPowerWatt() const { return Enumerable::Sum<std::vector<GpuInfo>, float, GpuInfo>(this->GpuInfos, [] (const GpuInfo& i) { return i.PowerWatt; }); } float GetCurrentSMClock() const { return Enumerable::Avg<std::vector<GpuInfo>, float, GpuInfo>(this->GpuInfos, [] (const GpuInfo& i) { return i.CurrentSMClock; }); } float GetTemperature() const { return Enumerable::Avg<std::vector<GpuInfo>, float, GpuInfo>(this->GpuInfos, [] (const GpuInfo& i) { return i.Temperature; }); } float GetGpuUtilization() const { return Enumerable::Avg<std::vector<GpuInfo>, float, GpuInfo>(this->GpuInfos, [] (const GpuInfo& i) { return i.GpuUtilization; }); } float GetUsedMemoryPercentage() const { return 100 * this->GetUsedMemoryMB() / this->GetTotalMemoryMB(); } std::vector<std::string> gpuInstanceNames; std::vector<std::string> GetGpuInstanceNames() { if (this->gpuInstanceNames.size() != this->GpuInfos.size()) { this->gpuInstanceNames.clear(); for (size_t i = 0; i < this->GpuInfos.size(); i++) { auto instanceName = String::Join("", GpuInfos[i].Name, '(', i, ')'); this->gpuInstanceNames.push_back(instanceName); } // this->gpuInstanceNames.push_back("_Total"); } return this->gpuInstanceNames; } } GpuInfoList; static std::vector<NetInfo> GetNetworkInfo(); static std::vector<std::string> GetIbDevices(); static std::string GetIpAddress(IpAddressVersion version, const std::string& name); static void CPUUsage(uint64_t &total, uint64_t &idle); static void Memory(uint64_t &available, uint64_t &total); static void CPU(int &cores, int &sockets); static std::map<std::string, uint64_t> GetNetworkUsage(); static void IbNetworkUsage(std::map<std::string, uint64_t> & networkUsage, bool logFailure = false); static int Vmstat(float &pagesPerSec, float &contextSwitchesPerSec); static int Iostat(float &bytesPerSec); static int IostatX(float &queueLength); static int FreeSpace(float &freeSpaceKB); static const std::string& GetNodeName(); static bool IsCGroupInstalled(); static const std::string& GetDistroInfo(); static int CreateUser( const std::string& userName, const std::string& password, bool isAdmin); static int AddSshKey( const std::string& userName, const std::string& key, const std::string& fileName, const std::string& filePermission, std::string& filePath); static int RemoveSshKey( const std::string& userName, const std::string& fileName); static int AddAuthorizedKey( const std::string& userName, const std::string& key, const std::string& filePermission, std::string& filePath); static int RemoveAuthorizedKey( const std::string& userName, const std::string& key); static int GetHomeDir(const std::string& userName, std::string& homeDir); static int DeleteUser(const std::string& userName); static int CreateTempFolder(char* folderTemplate, const std::string& userName); static int WriteStringToFile(const std::string& fileName, const std::string& contents); static int QueryGpuInfo(GpuInfoList& gpuInfo); template <typename ... Args> static int ExecuteCommandIn(const std::string& input, const std::string& cmd, const Args& ... args) { std::string command = String::Join(" ", cmd, args...); //Logger::Debug("Executing cmd: {0}", command); FILE* stream = popen(command.c_str(), "w"); int exitCode = (int)hpc::common::ErrorCodes::PopenError; if (stream) { if (!input.empty()) { fputs(input.c_str(), stream); } int ret = pclose(stream); exitCode = WEXITSTATUS(ret); } else { Logger::Error("Error when popen {0}", command); } return exitCode; } template <typename ... Args> static int ExecuteCommandOut(std::string& output, const std::string& cmd, const Args& ... args) { std::string command = String::Join(" ", cmd, args...); //Logger::Debug("Executing cmd: {0}", command); int exitCode = (int)hpc::common::ErrorCodes::PopenError; std::ostringstream result; FILE* stream = popen(command.c_str(), "r"); if (stream) { char buffer[512]; while (fgets(buffer, sizeof(buffer), stream) != nullptr) { result << buffer; } int ret = pclose(stream); exitCode = WEXITSTATUS(ret); } else { int err = errno; Logger::Error("Error when popen {0}, errno {1}", command, err); result << "error when popen " << command << ", errno " << err << std::endl; } output = result.str(); if (exitCode != 0) { Logger::Warn("Executing {0}, error code {1}", command, exitCode); } return exitCode; } protected: private: }; } } #endif // SYSTEM_H