in tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py [0:0]
def calculate_gpu_utilization(self, global_start_time, global_end_time, steps_start_time, steps_end_time):
# Pick bucket_size as a power of 10 in microseconds, so the unit is straightforward for users to understand,
# and keep the number of buckets in (10, 100].
# If there were too many buckets, each bucket's GPU utilization would degenerate to either 0 or 1.
def get_bucket_info(range_micro_seconds):
max_buckets = 100
bucket_size = 1
while range_micro_seconds / bucket_size > max_buckets:
bucket_size *= 10
buckets = int(range_micro_seconds / bucket_size)
unit = bucket_size
unit_str = "us"
if unit >= 1000:
unit /= 1000
unit_str = "ms"
if unit >= 1000:
unit /= 1000
unit_str = "s"
return int(bucket_size), int(buckets), int(unit), unit_str
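# Worked example (illustrative numbers only): a 250,000 us range gives
# bucket_size = 10,000 us and buckets = 25, displayed with a unit of 10 ms.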
gpu_utilization_timeline = [[] for _ in range(consts.MAX_GPU_PER_NODE)]
for gpu_id in self.gpu_ids:
self.kernel_ranges_per_device[gpu_id] = merge_ranges(self.kernel_ranges_per_device[gpu_id])
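# Merging first makes the kernel ranges non-overlapping and sorted, so concurrent
# kernels on the same device are not double-counted in the sums below.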
# The top-level utilization number is still computed over the steps range only, to stay consistent with the overview's breakdown.
kernel_ranges_all_steps = intersection_ranges_lists(
self.kernel_ranges_per_device[gpu_id], [(steps_start_time, steps_end_time)])
ranges_sum = get_ranges_sum(kernel_ranges_all_steps)
self.gpu_utilization[gpu_id] = ranges_sum / (steps_end_time - steps_start_time)
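# Worked example (illustrative numbers only): 3,000 us of merged kernel time inside
# a 10,000 us steps window yields a top-level utilization of 0.3 for this GPU.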
# The timeline uses "PyTorch Profiler (0)" as its start,
# so that the GPU utilization of the previous step's kernels can also be drawn.
bucket_size, buckets, self.gpu_util_timeline_unit_size, self.gpu_util_timeline_unit_name = \
get_bucket_info(global_end_time - global_start_time)
buckets_ranges = []
for i in range(buckets):
buckets_ranges.append((global_start_time + i * bucket_size,
global_start_time + (i + 1) * bucket_size if i < buckets - 1
else global_end_time)) # The last bucket may be longer.
gpu_utilization_timeline[gpu_id] = [0] * buckets
if len(self.kernel_ranges_per_device[gpu_id]) > 0:
current_range_index = 0
current_range = self.kernel_ranges_per_device[gpu_id][current_range_index]
current_bucket_index = 0
current_bucket = buckets_ranges[0]
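# Two-pointer sweep: walk the sorted, merged kernel ranges and the bucket ranges
# together, adding each overlap's length to the corresponding bucket's busy time.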
while current_range_index < len(self.kernel_ranges_per_device[gpu_id]) and current_bucket_index < buckets:
if current_bucket[1] <= current_range[0]:
current_bucket_index += 1
current_bucket = buckets_ranges[current_bucket_index] if current_bucket_index < buckets \
else None
elif current_bucket[0] >= current_range[1]:
current_range_index += 1
if current_range_index < len(self.kernel_ranges_per_device[gpu_id]):
current_range = self.kernel_ranges_per_device[gpu_id][current_range_index]
else:
left_bound = max(current_range[0], current_bucket[0])
right_bound = min(current_range[1], current_bucket[1])
gpu_utilization_timeline[gpu_id][current_bucket_index] += (right_bound - left_bound)
if current_bucket[1] < current_range[1]:
current_bucket_index += 1
current_bucket = buckets_ranges[current_bucket_index] if current_bucket_index < buckets \
else None
else:
current_range_index += 1
if current_range_index < len(self.kernel_ranges_per_device[gpu_id]):
current_range = self.kernel_ranges_per_device[gpu_id][current_range_index]
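# Convert the accumulated busy time into a utilization ratio per bucket, dividing by
# each bucket's actual width (the last bucket may be wider), then emit
# (bucket_start_time, utilization) points and a final (global_end_time, 0) point.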
for i_bucket in range(buckets):
bucket_width = buckets_ranges[i_bucket][1] - buckets_ranges[i_bucket][0]
gpu_utilization_timeline[gpu_id][i_bucket] /= bucket_width
start_time = buckets_ranges[i_bucket][0]
self.gpu_util_buckets[gpu_id].append((start_time, gpu_utilization_timeline[gpu_id][i_bucket]))
start_time = buckets_ranges[-1][1]
self.gpu_util_buckets[gpu_id].append((start_time, 0))
self.kernel_ranges_per_device = None # Release memory.
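# A minimal, self-contained sketch (not part of the original module) of the same
# overlap-accumulation idea, using a simple quadratic loop instead of the
# two-pointer sweep above. The function name and the toy inputs below are
# illustrative assumptions, not part of torch_tb_profiler.
def _busy_time_per_bucket(kernel_ranges, bucket_ranges):
    # kernel_ranges are assumed merged (non-overlapping) and sorted, as the
    # parser guarantees via merge_ranges(); bucket_ranges are contiguous.
    busy = [0] * len(bucket_ranges)
    for k_start, k_end in kernel_ranges:
        for i, (b_start, b_end) in enumerate(bucket_ranges):
            overlap = min(k_end, b_end) - max(k_start, b_start)
            if overlap > 0:
                busy[i] += overlap
    return busy

# Example: _busy_time_per_bucket([(10, 90), (150, 180)], [(0, 100), (100, 200)])
# returns [80, 30], i.e. 80 us and 30 us of kernel busy time per 100 us bucket.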