in utils/utils.py [0:0]
def extract_averages(file_path, args):
    """Parse a profiling log and aggregate per-section GPU timings.

    The log is organized as section headers of the form ``name:`` followed
    by stat lines containing ``time_gpu_avg: <float>`` and/or
    ``time_gpu_min: <float>``.  Values are multiplied by 1000 (per the
    original ``# us`` note — presumably ms -> us; TODO confirm) and bucketed:

    * sections containing ``atten`` (plus ``layernorm``) -> attention time
    * sections containing ``mlp`` (plus ``layernorm2``) -> MLP time
    * the ``param_time`` section -> gradient times (min -> forward,
      avg -> backward)
    * everything else -> kept individually under its section name

    Args:
        file_path: Path to the profiling log file to read.
        args: Options namespace; only ``args.recompute_activations`` is
            read (when truthy, flash-attention averages are counted twice).

    Returns:
        dict[str, int]: rounded integer timings with keys
        ``attention_forward``, ``attention_backward``, ``mlp_forward``,
        ``mlp_backward``, ``grad_forward``, ``grad_backward``, plus one
        entry per remaining section.
    """
    section_header_re = re.compile(r"^(\w+):")
    time_gpu_avg_re = re.compile(r"time_gpu_avg:\s+(\d+(\.\d+)?)")
    time_gpu_min_re = re.compile(r"time_gpu_min:\s+(\d+(\.\d+)?)")

    attention_avg_sum = 0.0
    mlp_avg_sum = 0.0
    grad_forward = 0.0
    grad_backward = 0.0
    other_avgs = {}

    with open(file_path, "r") as file:
        current_section = None
        for line in file:
            header_match = section_header_re.match(line)
            if header_match:
                current_section = header_match.group(1).strip()
            avg_match = time_gpu_avg_re.search(line)
            if current_section == "param_time":
                # Gradient timings: min -> forward, avg -> backward.
                # The min regex is only relevant here, so search lazily.
                min_match = time_gpu_min_re.search(line)
                if min_match:
                    grad_forward = float(min_match.group(1)) * 1000  # us
                if avg_match:
                    grad_backward = float(avg_match.group(1)) * 1000
            elif avg_match and current_section:
                avg_value = float(avg_match.group(1)) * 1000
                if "atten" in current_section or current_section == "layernorm":
                    # With activation recomputation, flash attention is
                    # executed twice, so its time is doubled.
                    if args.recompute_activations and 'flash' in current_section:
                        attention_avg_sum += avg_value * 2
                    else:
                        attention_avg_sum += avg_value
                elif "mlp" in current_section or current_section == "layernorm2":
                    mlp_avg_sum += avg_value
                else:
                    # Last value wins if a section reports multiple averages.
                    other_avgs[current_section] = avg_value

    # Round to integers; backward is assumed equal to forward for
    # attention and MLP.
    attention_forward = round(attention_avg_sum)
    mlp_forward = round(mlp_avg_sum)
    a100_compute_cache = {
        "attention_forward": attention_forward,
        "attention_backward": attention_forward,
        "mlp_forward": mlp_forward,
        "mlp_backward": mlp_forward,
        "grad_forward": round(grad_forward),
        "grad_backward": round(grad_backward),
    }
    # "param_time" stats were routed to grad_* above and never reach
    # other_avgs, so no key filter is needed here.
    a100_compute_cache.update({k: round(v) for k, v in other_avgs.items()})
    return a100_compute_cache