def extract_averages()

in utils/utils.py [0:0]
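
Parses a GPU profiling log into a dictionary of per-phase times in microseconds: summed attention and MLP forward/backward averages, gradient forward/backward times taken from the "param_time" section, and any remaining per-section averages.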


import re


def extract_averages(file_path, args):
    """Parse a profiling log and aggregate per-section GPU times in microseconds.

    `args` only needs to expose a `recompute_activations` attribute.
    """
    attention_avg_sum = 0.0
    mlp_avg_sum = 0.0
    other_avgs = {}
    grad_forward = 0.0
    grad_backward = 0.0

    # Section headers sit at column 0 (e.g. "attention:"); timing lines carry
    # "time_gpu_avg" / "time_gpu_min" values.
    section_header_re = re.compile(r"^(\w+):")
    time_gpu_avg_re = re.compile(r"time_gpu_avg:\s+(\d+(\.\d+)?)")
    time_gpu_min_re = re.compile(r"time_gpu_min:\s+(\d+(\.\d+)?)")

    with open(file_path, "r") as file:
        current_section = None

        for line in file:
            # A new section header (e.g. "attention:") switches the current section.
            header_match = section_header_re.match(line)
            if header_match:
                current_section = header_match.group(1).strip()

            avg_match = time_gpu_avg_re.search(line)
            min_match = time_gpu_min_re.search(line)
            if current_section == "param_time":
                # Gradient timings: the min time is used as the forward cost,
                # the avg time as the backward cost.
                if min_match:
                    grad_forward = float(min_match.group(1)) * 1000  # ms -> us
                if avg_match:
                    grad_backward = float(avg_match.group(1)) * 1000  # ms -> us
            elif avg_match and current_section:
                avg_value = float(avg_match.group(1)) * 1000  # ms -> us
                if "atten" in current_section or current_section == "layernorm":
                    # With activation recomputation, flash attention is re-run,
                    # so its time is counted twice.
                    if args.recompute_activations and "flash" in current_section:
                        attention_avg_sum += avg_value * 2
                    else:
                        attention_avg_sum += avg_value
                elif "mlp" in current_section or current_section == "layernorm2":
                    mlp_avg_sum += avg_value
                else:
                    other_avgs[current_section] = avg_value

    # Round to integers; backward passes are assumed to cost the same as forward.
    attention_forward = round(attention_avg_sum)
    attention_backward = attention_forward
    mlp_forward = round(mlp_avg_sum)
    mlp_backward = mlp_forward
    grad_backward = round(grad_backward)
    grad_forward = round(grad_forward)
    other_avgs_int = {k: round(v) for k, v in other_avgs.items() if k != "param_time"}

    a100_compute_cache = {
        "attention_forward": attention_forward,
        "attention_backward": attention_backward,
        "mlp_forward": mlp_forward,
        "mlp_backward": mlp_backward,
        "grad_forward": grad_forward,
        "grad_backward": grad_backward,
    }
    a100_compute_cache.update(other_avgs_int)

    return a100_compute_cache
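

Below is a minimal usage sketch. The log layout is an assumption inferred from the regexes (section headers at column 0, indented timing lines, values in milliseconds), `SimpleNamespace` stands in for the real `args` object (only `recompute_activations` is read), and the file name `profile.log` is illustrative:

from types import SimpleNamespace

# Hypothetical log snippet matching the parser's expectations: headers like
# "attention:" at column 0, timing lines indented beneath them.
sample_log = (
    "attention:\n"
    "    time_gpu_avg: 1.234\n"
    "mlp:\n"
    "    time_gpu_avg: 0.567\n"
    "param_time:\n"
    "    time_gpu_min: 0.100\n"
    "    time_gpu_avg: 0.300\n"
)

with open("profile.log", "w") as f:
    f.write(sample_log)

args = SimpleNamespace(recompute_activations=False)
print(extract_averages("profile.log", args))
# -> {'attention_forward': 1234, 'attention_backward': 1234,
#     'mlp_forward': 567, 'mlp_backward': 567,
#     'grad_forward': 100, 'grad_backward': 300}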