def workload_generate_aiob()

in workload_generator/AIOB_simAI_workload_generator.py [0:0]
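
This method assembles the simulated training iteration as a list of Work_Item entries: fixed data-parallel gradient/parameter synchronization items, embedding-gradient and (optional) MoE synchronization items, one set of per-layer forward/backward items for every gradient-accumulation micro-step, and finally cross-entropy and optimizer items. Compute times are looked up from the AIOB compute cache via _get_aiob_compute_time; communication sizes are derived from the model and parallelism arguments.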


    def workload_generate_aiob(self):
        # args.world_size --> total gpus number
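        # Number of gradient-accumulation micro-steps per iteration,
        # e.g. global_batch=1024, micro_batch=1, dp_num=64 -> ga_num=16.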
        self.ga_num = self.args.global_batch // (self.args.micro_batch * self.args.dp_num)
        if self.ga_num < 1:
            print(
                "[WARN]: ga_num < 1; please check the global_batch and micro_batch settings"
            )
        default_compute_time = 1
        compute_time = 0
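        # Activation message size (bytes) for one micro-batch's tensor-parallel
        # collectives, assuming 2-byte (fp16/bf16) activation elements.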
        tp_comm_size = (
            2 * self.args.micro_batch * self.args.seq_length * self.args.hidden_size
        )
        layers = self.get_model_details()
        total_params, moe_param_count = self._get_total_params()
        # self.workload.append(Work_Item(name="norm", forward_compute_time=0,
        #                         forward_comm = "BROADCAST", forward_comm_size= 8*self.args.micro_batch*self.args.seq_length,
        #                         backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
        #                         dp_compute_time=default_compute_time, dp_comm="NONE", dp_comm_size=0
        #                         ))
        forward_compute_time = _get_aiob_compute_time(
            self.compute_cache, "forward", "grad"
        )
        backward_compute_time = _get_aiob_compute_time(
            self.compute_cache, "backward", "grad"
        )
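        # Fixed per-iteration data-parallel items: an ALLGATHER of the non-MoE
        # parameters (2 bytes/param) and a REDUCESCATTER of their gradients
        # (4 bytes/param), presumably modeling a distributed-optimizer update,
        # plus a compute-only item for the gradient computation time.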
        self.workload.append(
            Work_Item(
                name="grad_gather",
                forward_compute_time=default_compute_time,
                forward_comm="NONE",
                forward_comm_size=0,
                backward_compute_time=default_compute_time,
                backward_comm="NONE",
                backward_comm_size=0,
                dp_compute_time=default_compute_time,
                dp_comm="ALLGATHER",
                dp_comm_size=2 * (total_params-moe_param_count),
            )
        )
        self.workload.append(
            Work_Item(
                name="grad_param_comm",
                forward_compute_time=default_compute_time,
                forward_comm="NONE",
                forward_comm_size=0,
                backward_compute_time=default_compute_time,
                backward_comm="NONE",
                backward_comm_size=0,
                dp_compute_time=default_compute_time,
                dp_comm="REDUCESCATTER",
                dp_comm_size=4 * (total_params-moe_param_count),
            )
        )
        self.workload.append(
            Work_Item(
                name="grad_param_compute",
                forward_compute_time=default_compute_time,
                forward_comm="NONE",
                forward_comm_size=0,
                backward_compute_time=forward_compute_time + backward_compute_time,
                backward_comm="NONE",
                backward_comm_size=0,
                dp_compute_time=default_compute_time,
                dp_comm="NONE",
                dp_comm_size=0,
            )
        )
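        # When sequence parallelism is disabled, an extra "layernorm" item with
        # a backward ALLREDUCE is emitted.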

        if not self.args.enable_sequence_parallel:
            self.workload.append(
                Work_Item(
                    name="layernorm",
                    forward_compute_time=default_compute_time,
                    forward_comm="NONE",
                    forward_comm_size=0,
                    backward_compute_time=default_compute_time,
                    backward_comm="ALLREDUCE",
                    backward_comm_size=2 * total_params,
                    dp_compute_time=default_compute_time,
                    dp_comm="NONE",
                    dp_comm_size=0,
                )
            )
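        # Embedding gradients: a backward ALLREDUCE across the tensor-parallel
        # group, or NONE when tensor_model_parallel_size == 1.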
        if self.args.tensor_model_parallel_size == 1:
            emb_backward_comm = "NONE"
        else:
            emb_backward_comm = "ALLREDUCE"
        self.workload.append(
            Work_Item(
                name="embedding_grads",
                forward_compute_time=default_compute_time,
                forward_comm="NONE",
                forward_comm_size=0,
                backward_compute_time=default_compute_time,
                backward_comm=emb_backward_comm,
                backward_comm_size=tp_comm_size,
                dp_compute_time=default_compute_time,
                dp_comm="NONE",
                dp_comm_size=0,
            )
        )
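        # If the expert-parallel size differs from the DP size, MoE parameters
        # get their own ALLGATHER/REDUCESCATTER items over the combined DP x EP
        # group (2 and 4 bytes per MoE parameter, presumably bf16 params and
        # fp32 gradients).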
        if self.args.expert_model_parallel_size != self.args.dp_num:
            self.workload.append(Work_Item(name="moe_grad_norm1", forward_compute_time=default_compute_time,
                                    forward_comm = "NONE", forward_comm_size= 0,
                                    backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
                                    dp_compute_time=default_compute_time, dp_comm="ALLGATHER_DP_EP", dp_comm_size=2*moe_param_count
                                    ))
            self.workload.append(Work_Item(name="moe_grad_norm2", forward_compute_time=default_compute_time,
                                    forward_comm = "NONE", forward_comm_size= 0,
                                    backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
                                    dp_compute_time=default_compute_time, dp_comm="REDUCESCATTER_DP_EP", dp_comm_size=4*moe_param_count
                                    ))
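        # Emit one set of per-layer forward/backward items for every
        # gradient-accumulation micro-step.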
        for _ in range(self.ga_num):
            for layer in layers:
                name = layer.layer_name
                forward_comm = backward_comm = backward_comm_2 = "NONE"
                forward_comm_size = tp_comm_size
                emb_comm_size = tp_comm_size
                backward_comm_size = 0
                dp_comm = "NONE"
                dp_comm_size = 0
                if self.args.enable_sequence_parallel:
                    if "embedding" in name:
                        if self.args.tensor_model_parallel_size == 1:
                            forward_comm = "NONE"
                            backward_comm = "NONE"
                        else:
                            forward_comm = "ALLREDUCE"
                            backward_comm = "NONE"
                        emb_compute_time = _get_aiob_compute_time(
                            self.compute_cache, "", "embedding"
                        )
                        self.workload.append(
                            Work_Item(
                                name=name,
                                forward_compute_time=emb_compute_time,
                                forward_comm=forward_comm,
                                forward_comm_size=emb_comm_size,
                                backward_compute_time=default_compute_time,
                                backward_comm=backward_comm,
                                backward_comm_size=backward_comm_size,
                                dp_compute_time=backward_compute_time,
                                dp_comm=dp_comm,
                                dp_comm_size=dp_comm_size,
                            )
                        )
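                    # Row-parallel linear under sequence parallelism: the forward
                    # output is REDUCESCATTERed and the backward ALLGATHER is
                    # overlapped with compute.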
                    if "row" in name:
                        forward_compute_time = _get_aiob_compute_time(
                            self.compute_cache, "forward", name.split("_")[0]
                        )
                        backward_compute_time = _get_aiob_compute_time(
                            self.compute_cache, "backward", name.split("_")[0]
                        )

                        if self.args.recompute_activations and 'attention' in name:
                            forward_compute_time *= 2
                        forward_compute_time = int(forward_compute_time / 2)
                        backward_compute_time = int(backward_compute_time / 2)
                        forward_comm_size_sp = tp_comm_size
                        if self.args.tensor_model_parallel_size == 1:
                            forward_comm = "NONE"
                            backward_comm = "NONE"
                        else:
                            forward_comm = "REDUCESCATTER"
                            backward_comm = "ALLGATHER"
                        self.workload.append(
                                Work_Item(
                                    name=name,
                                    forward_compute_time=forward_compute_time,
                                    forward_comm=forward_comm,
                                    forward_comm_size=forward_comm_size,
                                    backward_compute_time=backward_compute_time,
                                    backward_comm=backward_comm,
                                    backward_comm_size=forward_comm_size_sp,  # SP backward: overlapped all-gather
                                    dp_compute_time=backward_compute_time,
                                    dp_comm=dp_comm,
                                    dp_comm_size=dp_comm_size,
                                )
                            )
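                    # Column-parallel linear under sequence parallelism: the forward
                    # ALLGATHERs the sequence-sharded activations and the backward
                    # REDUCESCATTERs the gradients.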

                    elif "column" in name:
                        forward_compute_time = _get_aiob_compute_time(
                            self.compute_cache, "forward", name.split("_")[0]
                        )
                        backward_compute_time = _get_aiob_compute_time(
                            self.compute_cache, "backward", name.split("_")[0]
                        )

                        if self.args.recompute_activations and 'attention' in name:
                            forward_compute_time *= 2
                        forward_compute_time = int(forward_compute_time / 2)
                        backward_compute_time = int(backward_compute_time / 2)
                        if self.args.tensor_model_parallel_size == 1:
                            forward_comm = "NONE"
                            backward_comm = "NONE"
                            backward_comm_2 = "NONE"
                        else:
                            forward_comm = "ALLGATHER"
                            backward_comm = "REDUCESCATTER"
                            backward_comm_2 = "ALLGATHER"
                        self.workload.append(
                                Work_Item(
                                    name=name,
                                    forward_compute_time=forward_compute_time,
                                    forward_comm=forward_comm,
                                    forward_comm_size=forward_comm_size,
                                    backward_compute_time=backward_compute_time,
                                    backward_comm=backward_comm,
                                    backward_comm_size=backward_comm_size,
                                    dp_compute_time=backward_compute_time,
                                    dp_comm=dp_comm,
                                    dp_comm_size=dp_comm_size,
                                )
                            )
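                    # MoE layer: the gate/dispatch/combine path is emitted as seven
                    # Work_Items (ALLGATHER, ALLTOALL, ALLTOALL_EP dispatch,
                    # ALLGATHER/REDUCESCATTER around the experts, ALLTOALL_EP
                    # combine, ALLTOALL); with tensor_model_parallel_size == 1 only
                    # the two ALLTOALL_EP items carry traffic.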
                    elif "moelayer" in name:
                        forward_compute_time = _get_aiob_compute_time(
                            self.compute_cache, "forward", name.split("_")[0]
                        )
                        backward_compute_time = _get_aiob_compute_time(
                            self.compute_cache, "backward", name.split("_")[0]
                        )
                        if self.args.tensor_model_parallel_size == 1:
                            forward_comm1 = "NONE"
                            forward_comm2 = "NONE"
                            forward_comm3 = "ALLTOALL_EP"
                            forward_comm4 = "NONE"
                            forward_comm5 = "NONE"
                            forward_comm6 = "ALLTOALL_EP"
                            forward_comm7 = "NONE"
                        else:
                            forward_comm1 = "ALLGATHER"
                            forward_comm2 = "ALLTOALL"
                            forward_comm3 = "ALLTOALL_EP"
                            forward_comm4 = "ALLGATHER"
                            forward_comm5 = "REDUCESCATTER"
                            forward_comm6 = "ALLTOALL_EP"
                            forward_comm7 = "ALLTOALL"
                        if self.args.expert_model_parallel_size != 1:
                            self.workload.append(Work_Item(name=name, forward_compute_time=forward_compute_time,
                                        forward_comm = forward_comm1, forward_comm_size= 2*self.mbs*self.seq_len*self.num_experts,
                                        backward_compute_time=backward_compute_time, backward_comm=forward_comm1, backward_comm_size=2*self.mbs*self.seq_len*self.num_experts,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm2, forward_comm_size= tp_comm_size//self.tp,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm2, backward_comm_size=tp_comm_size//self.tp,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm3, forward_comm_size= tp_comm_size*self.topk//self.tp,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm3, backward_comm_size=tp_comm_size*self.topk//self.tp,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm4, forward_comm_size= tp_comm_size*self.topk,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm5, backward_comm_size=tp_comm_size*self.topk,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm5, forward_comm_size= tp_comm_size*self.topk,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm4, backward_comm_size=tp_comm_size*self.topk,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm6, forward_comm_size= tp_comm_size*self.topk//self.tp,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm6, backward_comm_size=tp_comm_size*self.topk//self.tp,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm7, forward_comm_size= tp_comm_size//self.tp,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm7, backward_comm_size=tp_comm_size//self.tp,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                        else:
                            self.workload.append(Work_Item(name=name, forward_compute_time=forward_compute_time,
                                        forward_comm = forward_comm1, forward_comm_size= 2*self.mbs*self.seq_len*self.num_experts,
                                        backward_compute_time=backward_compute_time, backward_comm=forward_comm1, backward_comm_size=2*self.mbs*self.seq_len*self.num_experts,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm2, forward_comm_size= tp_comm_size//self.tp,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm2, backward_comm_size=tp_comm_size//self.tp,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm3, forward_comm_size=1,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm3, backward_comm_size=1,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm4, forward_comm_size= tp_comm_size*self.topk,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm4, backward_comm_size=tp_comm_size*self.topk,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm5, forward_comm_size= tp_comm_size*self.topk,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm4, backward_comm_size=tp_comm_size*self.topk,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm6, forward_comm_size=1,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm6, backward_comm_size=1,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
                            self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                        forward_comm = forward_comm7, forward_comm_size= tp_comm_size//self.tp,
                                        backward_compute_time=default_compute_time, backward_comm=forward_comm7, backward_comm_size=tp_comm_size//self.tp,
                                        dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                        ))
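                # Tensor parallelism without sequence parallelism: each layer's
                # forward is an ALLREDUCE of the activations; the backward item
                # carries no collective here.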
                else:
                    if self.args.tensor_model_parallel_size == 1:
                        forward_comm = "NONE"
                        backward_comm = "NONE"
                    else:
                        forward_comm = "ALLREDUCE"
                        backward_comm = "NONE"
                    if self.args.recompute_activations and 'attention' in name:
                        forward_compute_time *= 2
                    if "embedding" in name:
                        emb_compute_time = _get_aiob_compute_time(
                            self.compute_cache, "", "embedding"
                        )
                        self.workload.append(
                            Work_Item(
                                name=name,
                                forward_compute_time=emb_compute_time,
                                forward_comm=forward_comm,
                                forward_comm_size=forward_comm_size,
                                backward_compute_time=default_compute_time,
                                backward_comm=backward_comm,
                                backward_comm_size=backward_comm_size,
                                dp_compute_time=backward_compute_time,
                                dp_comm=dp_comm,
                                dp_comm_size=dp_comm_size,
                            )
                        )
                    else:
                        forward_compute_time = _get_aiob_compute_time(
                            self.compute_cache, "forward", name.split("_")[0]
                        )
                        backward_compute_time = _get_aiob_compute_time(
                            self.compute_cache, "backward", name.split("_")[0]
                        )
                        self.workload.append(
                            Work_Item(
                                name=name,
                                forward_compute_time=forward_compute_time,
                                forward_comm=forward_comm,
                                forward_comm_size=forward_comm_size,
                                backward_compute_time=backward_compute_time,
                                backward_comm=backward_comm,
                                backward_comm_size=backward_comm_size,
                                dp_compute_time=backward_compute_time,
                                dp_comm=dp_comm,
                                dp_comm_size=dp_comm_size,
                            )
                        )
            # compute_time = _get_aiob_compute_time(self.compute_cache, "forward", "embedding")
            # self.workload.append(Work_Item(name="embedding_norm", forward_compute_time=compute_time,
            #                         forward_comm = "ALLREDUCE", forward_comm_size= self.args.vocab_size*self.args.hidden_size*2,
            #                         backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
            #                         dp_compute_time=default_compute_time, dp_comm="NONE", dp_comm_size=0
            #                         ))
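        # Three ALLREDUCE items model the cross-entropy loss reduction
        # (4 bytes per token of the micro-batch assumed).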
        for i in range(3):
            self.workload.append(
                Work_Item(
                    name="cross_entropy" + str(i + 1),
                    forward_compute_time=compute_time,
                    forward_comm="ALLREDUCE",
                    forward_comm_size=self.args.seq_length * self.args.micro_batch * 4,
                    backward_compute_time=compute_time,
                    backward_comm="NONE",
                    backward_comm_size=0,
                    dp_compute_time=compute_time,
                    dp_comm="NONE",
                    dp_comm_size=0,
                )
            )
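        # Four small ALLREDUCE items (4 bytes each) model the optimizer step.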

        for i in range(4):
            self.workload.append(
                Work_Item(
                    name="optimizer" + str(i + 1),
                    forward_compute_time=compute_time,
                    forward_comm="ALLREDUCE",
                    forward_comm_size=4,
                    backward_compute_time=compute_time,
                    backward_comm="NONE",
                    backward_comm_size=0,
                    dp_compute_time=compute_time,
                    dp_comm="NONE",
                    dp_comm_size=0,
                )
            )