def calculate_gpu_utilization()

in tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py [0:0]


    def calculate_gpu_utilization(self, global_start_time, global_end_time, steps_start_time, steps_end_time):
        # Make bucket_size a power of 10 in microseconds, so that the number of buckets falls in (10, 100].
        # Powers of 10 of microseconds are straightforward for users to reason about.
        # With too many buckets, each bucket's GPU utilization would be mostly either 0 or 1.
        def get_bucket_info(range_micro_seconds):
            max_buckets = 100
            bucket_size = 1
            while range_micro_seconds / bucket_size > max_buckets:
                bucket_size *= 10
            buckets = int(range_micro_seconds / bucket_size)
            unit = bucket_size
            unit_str = "us"
            if unit >= 1000:
                unit /= 1000
                unit_str = "ms"
                if unit >= 1000:
                    unit /= 1000
                    unit_str = "s"
            return int(bucket_size), int(buckets), int(unit), unit_str
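
        # Example for get_bucket_info (hypothetical numbers): a range of 2,345,678 us stops at
        # bucket_size = 100,000 us, giving 23 buckets and a display unit of 100 ms.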

        gpu_utilization_timeline = [[] for _ in range(consts.MAX_GPU_PER_NODE)]
        for gpu_id in self.gpu_ids:
            self.kernel_ranges_per_device[gpu_id] = merge_ranges(self.kernel_ranges_per_device[gpu_id])

            # The top-level number still considers only the profiled steps, to stay consistent with the overview's breakdown.
            kernel_ranges_all_steps = intersection_ranges_lists(
                self.kernel_ranges_per_device[gpu_id], [(steps_start_time, steps_end_time)])
            ranges_sum = get_ranges_sum(kernel_ranges_all_steps)
            self.gpu_utilization[gpu_id] = ranges_sum / (steps_end_time - steps_start_time)
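            # E.g. (hypothetical numbers): 600,000 us of kernel time inside a
            # 1,000,000 us step window gives a top-level utilization of 0.6.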

            # The timeline uses "PyTorch Profiler (0)" as its start,
            # so that the GPU utilization of the previous step's kernels can also be drawn.
            bucket_size, buckets, self.gpu_util_timeline_unit_size, self.gpu_util_timeline_unit_name = \
                get_bucket_info(global_end_time - global_start_time)
            buckets_ranges = []
            for i in range(buckets):
                buckets_ranges.append((global_start_time + i * bucket_size,
                                       global_start_time + (i + 1) * bucket_size if i < buckets - 1
                                       else global_end_time))  # The last bucket may be longer.
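            # E.g. (hypothetical): a global range of [0, 2,345,678) us with bucket_size 100,000 us
            # gives buckets [0, 100,000), [100,000, 200,000), ..., [2,200,000, 2,345,678).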
            gpu_utilization_timeline[gpu_id] = [0] * buckets
            if len(self.kernel_ranges_per_device[gpu_id]) > 0:
                current_range_index = 0
                current_range = self.kernel_ranges_per_device[gpu_id][current_range_index]
                current_bucket_index = 0
                current_bucket = buckets_ranges[0]
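                # Two-pointer sweep over the sorted kernel ranges and buckets: advance past
                # whichever interval ends first, accumulating any overlap into the current bucket.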
                while current_range_index < len(self.kernel_ranges_per_device[gpu_id]) and current_bucket_index < buckets:
                    if current_bucket[1] <= current_range[0]:
                        current_bucket_index += 1
                        current_bucket = buckets_ranges[current_bucket_index] if current_bucket_index < buckets \
                            else None
                    elif current_bucket[0] >= current_range[1]:
                        current_range_index += 1
                        if current_range_index < len(self.kernel_ranges_per_device[gpu_id]):
                            current_range = self.kernel_ranges_per_device[gpu_id][current_range_index]
                    else:
                        left_bound = max(current_range[0], current_bucket[0])
                        right_bound = min(current_range[1], current_bucket[1])
                        gpu_utilization_timeline[gpu_id][current_bucket_index] += (right_bound - left_bound)
                        if current_bucket[1] < current_range[1]:
                            current_bucket_index += 1
                            current_bucket = buckets_ranges[current_bucket_index] if current_bucket_index < buckets \
                                else None
                        else:
                            current_range_index += 1
                            if current_range_index < len(self.kernel_ranges_per_device[gpu_id]):
                                current_range = self.kernel_ranges_per_device[gpu_id][current_range_index]
                for i_bucket in range(buckets):
                    bucket_size = buckets_ranges[i_bucket][1] - buckets_ranges[i_bucket][0]
                    gpu_utilization_timeline[gpu_id][i_bucket] /= bucket_size
                    start_time = buckets_ranges[i_bucket][0]
                    self.gpu_util_buckets[gpu_id].append((start_time, gpu_utilization_timeline[gpu_id][i_bucket]))
                start_time = buckets_ranges[-1][1]
                self.gpu_util_buckets[gpu_id].append((start_time, 0))

        self.kernel_ranges_per_device = None  # Release memory.
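
For reference, below is a minimal, self-contained sketch of the same bucketing idea (the
name bucketed_utilization and the sample numbers are hypothetical, not part of
torch_tb_profiler's API). It assumes the kernel ranges are already sorted and merged,
as they are after merge_ranges above, and it ignores the adaptive bucket sizing.

def bucketed_utilization(kernel_ranges, global_start, global_end, bucket_size):
    # kernel_ranges: sorted, non-overlapping (start, end) pairs in microseconds.
    num_buckets = max(1, int((global_end - global_start) / bucket_size))
    bounds = [(global_start + i * bucket_size,
               global_start + (i + 1) * bucket_size if i < num_buckets - 1
               else global_end)  # The last bucket absorbs the remainder.
              for i in range(num_buckets)]
    busy = [0] * num_buckets
    for k_start, k_end in kernel_ranges:
        for i, (b_start, b_end) in enumerate(bounds):
            overlap = min(k_end, b_end) - max(k_start, b_start)
            if overlap > 0:
                busy[i] += overlap
    # Normalize busy time by each bucket's length to get a utilization in [0, 1].
    return [busy[i] / (b_end - b_start) for i, (b_start, b_end) in enumerate(bounds)]

# Example with hypothetical numbers: two kernels in a 400 us window, 100 us buckets.
print(bucketed_utilization([(0, 150), (300, 350)], 0, 400, 100))  # [1.0, 0.5, 0.0, 0.5]

This quadratic scan is simpler than the two-pointer sweep in the method above, but for
merged, sorted input it produces the same per-bucket values.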