cluster-trace-gpu-v2020/simulator/node.py (213 lines of code) (raw):

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from utils import print_fn


class Node(object):
    """Resource bookkeeping for one node of the simulated cluster.

    GPU and CPU quantities are stored in percent of a device (the
    constructor multiplies the device counts by 100).  For both resources
    the intended invariant, verified by ``check_rsrc``, is::

        num_x == svc_x + job_x + idl_x

    where ``svc_x`` is occupied by higher-priority services, ``job_x`` by
    the jobs in ``job_runn_list`` and ``idl_x`` is idle.
    """

    def __init__(self, id, num_gpus=8, num_cpus=96, mem=720,
                 job_runn_list=None, gpu_type=0):
        """Create a node with all of its resources idle.

        Args:
            id: node identifier (formatted with ``%3d`` in log output).
            num_gpus: number of GPU devices; stored as ``num_gpus * 100``.
            num_cpus: number of CPU cores; stored as ``num_cpus * 100``.
            mem: memory capacity, stored as given.
            job_runn_list: optional iterable of already running jobs; it is
                copied into a new list.  The job counters still start at 0.
            gpu_type: GPU type tag, stored and printed as given.
        """
        self.id = id

        self.num_gpus = num_gpus * 100  # in %
        self.idl_gpus = self.num_gpus  # in %
        self.svc_gpus = 0  # occupied by higher-priority services
        self.job_gpus = 0  # sum([j['num_gpu'] for j in self.job_runn_list])
        # num_gpus = svc_gpus + job_gpus + idl_gpus
        self.gpu_type = gpu_type

        self.num_cpus = num_cpus * 100  # in %
        self.idl_cpus = self.num_cpus  # in %
        self.svc_cpus = 0
        self.job_cpus = 0

        self.mem = mem
        self.idl_mem = mem
        self.svc_mem = 0
        self.job_mem = 0

        self.network_in = 0  # bw, or traffic amount
        self.network_out = 0

        # Always own a private list so the caller's list is never mutated.
        if job_runn_list is None:
            self.job_runn_list = list()
        else:
            self.job_runn_list = list(job_runn_list)

        print_fn(' Node[%3d]:(%3d GPUs,%4d CPUs) %s' % (id, num_gpus, num_cpus, gpu_type))

    @property
    def util_rate(self):
        """Rounded utilisation in percent.

        The mean of GPU and CPU utilisation when the node has GPUs,
        otherwise CPU utilisation alone.  Divides by ``num_cpus``, so a
        node with zero CPUs raises ``ZeroDivisionError``.
        """
        cpus_util = 1 - self.idl_cpus / self.num_cpus
        if self.num_gpus > 0:
            gpus_util = 1 - self.idl_gpus / self.num_gpus
            return round(100 * (gpus_util + cpus_util) / 2)
        return round(100 * cpus_util)

    def __repr__(self):
        # NOTE: the idle counters are recomputed from the svc/job counters
        # before formatting, so printing a node has this side effect.
        self.update_idl_gpus()
        self.update_idl_cpus()
        return 'N[%3d]: [(j %3d,i %3d)/%3d GPUs, (j %4d,i %4d)/%4d CPUs -- %3d Util.] %s' % (
            self.id, self.job_gpus, self.idl_gpus, self.num_gpus,
            self.job_cpus, self.idl_cpus, self.num_cpus,
            self.util_rate, self.gpu_type)

    def check_rsrc(self):
        """Assert that svc + idle + job equals capacity for GPUs and CPUs."""
        assert self.num_gpus == self.svc_gpus + self.idl_gpus + self.job_gpus
        assert self.num_cpus == self.svc_cpus + self.idl_cpus + self.job_cpus

    def update_idl_gpus(self):
        """Recompute idle GPUs as capacity minus service and job usage."""
        self.idl_gpus = self.num_gpus - self.svc_gpus - self.job_gpus

    def update_idl_cpus(self):
        """Recompute idle CPUs as capacity minus service and job usage."""
        self.idl_cpus = self.num_cpus - self.svc_cpus - self.job_cpus

    # ---- alloc/release job ----

    def alloc_job(self, job):
        """Place ``job`` on this node if its GPU and CPU demand both fit.

        Args:
            job: mapping with ``'num_gpu'`` and ``'num_cpu'`` entries.

        Returns:
            True when the job was placed, False when resources are short
            (in which case nothing is changed).
        """
        if self.alloc_res(num_gpus=job['num_gpu'], num_cpus=job['num_cpu']):
            self.job_runn_list.append(job)
            self.job_gpus += job['num_gpu']
            self.job_cpus += job['num_cpu']
            return True
        return False

    def release_job(self, job):
        """Remove ``job`` from this node and give its resources back.

        Returns:
            True on success.  False when ``release_res`` reports an
            over-release; the idle counters may then already have been
            clamped while the job stays registered.

        Raises:
            ValueError: if the release succeeds but ``job`` is not in
                ``job_runn_list``.
        """
        if self.release_res(num_gpus=job['num_gpu'], num_cpus=job['num_cpu']):
            self.job_runn_list.remove(job)
            self.job_gpus -= job['num_gpu']
            self.job_cpus -= job['num_cpu']
            return True
        return False

    # ---- alloc/release srv ----

    def set_svc_res_by_ratio(self, ratio=0):
        """Reserve ``ratio`` of GPU and CPU capacity for services.

        The reserved amounts are truncated with ``int`` and the idle
        counters are recomputed afterwards.
        """
        self.svc_gpus = int(ratio * self.num_gpus)
        self.svc_cpus = int(ratio * self.num_cpus)
        self.update_idl_gpus()
        self.update_idl_cpus()

    # ---- alloc/release resource ----

    def alloc_res(self, num_gpus=0, num_cpus=0):
        """Take GPUs and CPUs from the idle pool, all or nothing.

        Returns:
            True when both allocations succeeded; False otherwise, with
            any half that did succeed rolled back.
        """
        gpu = self.alloc_gpus(num_gpus)
        cpu = self.alloc_cpus(num_cpus)
        if gpu and cpu:
            return True
        # Roll back whichever half succeeded so a failure changes nothing.
        if gpu:
            self.release_gpus(num_gpus)
        if cpu:
            self.release_cpus(num_cpus)
        return False

    def release_res(self, num_gpus, num_cpus):
        """Return GPUs and CPUs to the idle pool.

        Both releases are always attempted; the result is True only when
        both succeeded.
        """
        cpu = self.release_cpus(num_cpus)
        gpu = self.release_gpus(num_gpus)
        return cpu and gpu

    # ---- alloc/release resource with best efforts ----

    def alloc_gpu_best_effort(self, num_gpus=0):
        """Move up to ``num_gpus`` from idle to service usage.

        Returns:
            num_gpus_left_to_alloc: the part of the request that did not
            fit (0 when everything fit).
        """
        assert num_gpus >= 0
        if num_gpus <= self.idl_gpus:
            self.svc_gpus += num_gpus
            self.idl_gpus -= num_gpus
            num_gpus = 0
        else:
            self.svc_gpus += self.idl_gpus
            num_gpus -= self.idl_gpus
            self.idl_gpus = 0
        return num_gpus

    def alloc_cpu_best_effort(self, num_cpus=0):
        """Move up to ``num_cpus`` from idle to service usage.

        Returns:
            num_cpus_left_to_alloc: the part of the request that did not
            fit (0 when everything fit).
        """
        assert num_cpus >= 0
        if num_cpus <= self.idl_cpus:
            self.svc_cpus += num_cpus
            self.idl_cpus -= num_cpus
            num_cpus = 0
        else:
            self.svc_cpus += self.idl_cpus
            num_cpus -= self.idl_cpus
            self.idl_cpus = 0
        return num_cpus

    def release_gpu_best_effort(self, num_gpus=0):
        """Move up to ``num_gpus`` from service usage back to idle.

        Returns:
            num_gpus_left_to_release: the part of the request exceeding
            the current service usage (0 when everything was released).
        """
        assert num_gpus >= 0
        if num_gpus <= self.svc_gpus:
            self.idl_gpus += num_gpus
            self.svc_gpus -= num_gpus
            num_gpus = 0
        else:
            self.idl_gpus += self.svc_gpus
            num_gpus -= self.svc_gpus
            self.svc_gpus = 0
        return num_gpus

    def release_cpu_best_effort(self, num_cpus=0):
        """Move up to ``num_cpus`` from service usage back to idle.

        Returns:
            num_cpus_left_to_release: the part of the request exceeding
            the current service usage (0 when everything was released).
        """
        assert num_cpus >= 0
        if num_cpus <= self.svc_cpus:
            self.idl_cpus += num_cpus
            self.svc_cpus -= num_cpus
            num_cpus = 0
        else:
            self.idl_cpus += self.svc_cpus
            num_cpus -= self.svc_cpus
            self.svc_cpus = 0
        return num_cpus

    # ---- GPU ----

    def get_idl_gpus(self):
        """Return the idle GPU amount."""
        return self.idl_gpus

    def alloc_gpus(self, num_gpus=0):
        """Take ``num_gpus`` from the idle pool if enough are free.

        Returns:
            True for success; False for failure (nothing is changed).
        """
        if num_gpus > self.idl_gpus:
            return False
        self.idl_gpus -= num_gpus
        return True

    def release_gpus(self, num_gpus=0):
        """Give ``num_gpus`` back to the idle pool.

        Returns:
            True for success.  False when the release would push the idle
            amount above ``num_gpus``; the idle amount is then clamped to
            ``num_gpus``.  Service usage is not part of this bound.
        """
        if self.idl_gpus + num_gpus > self.num_gpus:
            self.idl_gpus = self.num_gpus
            return False
        self.idl_gpus += num_gpus
        return True

    # ---- CPU ----

    def get_idl_cpus(self):
        """Return the idle CPU amount."""
        return self.idl_cpus

    def alloc_cpus(self, num_cpus=0):
        """Take ``num_cpus`` from the idle pool if enough are free.

        Returns:
            True for success; False for failure (nothing is changed).
        """
        if num_cpus > self.idl_cpus:
            return False
        self.idl_cpus -= num_cpus
        return True

    def release_cpus(self, num_cpus=0):
        """Give ``num_cpus`` back to the idle pool.

        Returns:
            True for success.  False when the release would push the idle
            amount above ``num_cpus``; the idle amount is then clamped to
            ``num_cpus``.  Service usage is not part of this bound.
        """
        if self.idl_cpus + num_cpus > self.num_cpus:
            self.idl_cpus = self.num_cpus
            return False
        self.idl_cpus += num_cpus
        return True

    # ---- network ----

    def add_network_load(self, in_load=0, out_load=0):
        """Add inbound/outbound load; both totals are rounded to 0.1."""
        self.network_in = round(self.network_in + in_load, 1)
        # Round the outbound total itself (it used to be overwritten with
        # the inbound total).
        self.network_out = round(self.network_out + out_load, 1)

    def release_network_load(self, in_load=0, out_load=0):
        """Subtract inbound/outbound load; both totals are rounded to 0.1."""
        self.network_in = round(self.network_in - in_load, 1)
        # Round the outbound total itself (it used to be overwritten with
        # the inbound total).
        self.network_out = round(self.network_out - out_load, 1)

    def set_network_load(self, in_load=0, out_load=0):
        """Set inbound/outbound load; both values are rounded to 0.1."""
        self.network_in = round(in_load, 1)
        # Store the outbound value itself (it used to be overwritten with
        # the inbound value).
        self.network_out = round(out_load, 1)

    def init_node(self, num_gpus=0, num_cpus=0, mem=0):
        """Reset capacity and idle amount for each non-zero argument.

        An argument equal to 0 leaves that resource untouched.  Service
        and job counters are not modified.

        NOTE(review): unlike ``__init__`` the values are stored as given,
        without the ``* 100`` scaling -- confirm callers pass percent units.
        """
        if num_gpus != 0:
            self.num_gpus = num_gpus
            self.idl_gpus = num_gpus
        if num_cpus != 0:
            self.num_cpus = num_cpus
            self.idl_cpus = num_cpus
        if mem != 0:
            self.mem = mem
            self.idl_mem = mem