def tic_job()

in cluster-trace-gpu-v2020/simulator/cluster.py [0:0]


    def tic_job(self, delta=1):
        """Advance the cluster clock by ``delta`` and update every running job.

        Unlike tic_svc(), it receives simulator's cur_time as its own
        cur_time, and it returns a "cur_time" value back to the simulator.

        Steps performed on each call:
          1. ``self.cur_time`` is increased by ``delta``; cluster utilisation
             is recorded when exporting is enabled and the clock is a
             multiple of 10000.
          2. ``self.retrieve_job_from_full_list()`` is called to refresh
             ``self.job_list``.
          3. Every running job accrues ``delta`` of ``on_time``. A job whose
             ``on_time`` reaches its ``duration`` is trimmed back to exactly
             ``duration`` (relevant only if delta > 1), marked done, released
             from its host node and appended to ``self.job_history``.
          4. With nothing running but jobs pending, the idle counter is
             bumped. With nothing running or pending but jobs still to
             arrive, the clock is fast-forwarded towards the next arrival.

        Args:
            delta: Number of time units to advance the clock by.

        Returns:
            ``self.cur_time`` while the simulation should keep going
            (exit_flag = 0), or ``-1`` when there is no running job, no
            pending job and no coming job (exit_flag = 1).
        """
        self.cur_time += delta
        if self.export_cluster_util and self.cur_time % 10000 == 0:
            self.record_cluster_util()
        self.retrieve_job_from_full_list()  # update self.job_list

        job_runn_list = self.job_runn_list
        if job_runn_list:
            for job in job_runn_list:
                job['on_time'] += delta
                job['progress'] = job['on_time'] * job['num_gpu']

                # Job done logic
                if job['on_time'] >= job['duration']:
                    # Time credited beyond the job's real end; non-zero only if delta > 1.
                    over_tic_time = job['on_time'] - job['duration']
                    job['on_time'] -= over_tic_time
                    job['progress'] -= over_tic_time * job['num_gpu']
                    job['done'] = 1

                    host_node_id = job['node']
                    host_node = self.node_dict.get(host_node_id)
                    suc = host_node.release_job(job=job)
                    assert suc, "node %s failed to release a finished job" % (host_node_id,)

                    # Completion time is measured from submission and excludes the overshoot.
                    job['jct'] = self.cur_time - over_tic_time - job['submit_time']  # deduct submit_time

                    self.job_history.add_done_job(job)

                    print_fn("%sDONE: %s || %s" % (self.log_prefix, _repr_job_done(job), job))

            return self.cur_time  # exit_flag = 0, still going

        # Nothing is running from here on.
        if self.job_list:  # empty cluster with job pending
            self.idle_cluster_counter += 1
            # Built once and reused by both log lines below.
            pending_repr = [_repr_job_preempt(e) for e in self.job_list]
            print_fn("%sIDLE cluster until jobs: %s" % (self.log_prefix, pending_repr))

            if self.idle_cluster_counter % 10000 == 0:
                print_fn('{} idle cluster: {}'.format(self.idle_cluster_counter, pending_repr), level=2)
            return self.cur_time  # exit_flag = 0, still going

        if self.job_full_list:  # i.e., empty cluster waiting for jobs to come
            # The last entry is treated as the earliest upcoming job. Waking
            # one stride before its submit_time lets the next tic land on it.
            wake_time = self.job_full_list[-1]['submit_time'] - delta
            # With delta > 1 the next arrival may be less than one stride
            # away, putting wake_time before cur_time. Clamp so the clock
            # never moves backwards; if equal, the stride is unnecessary.
            self.cur_time = max(self.cur_time, wake_time)
            return self.cur_time  # exit_flag = 0, still going

        # no running job, no pending job, no coming job => exit.
        return -1  # exit