in workload_generator/AIOB_simAI_workload_generator.py [0:0]
def workload_generate_aiob(self) -> None:
    """Populate ``self.workload`` with the Work_Item sequence for one training iteration.

    Emits, in order:
      1. Data-parallel gradient-sync items (ALLGATHER / REDUCESCATTER over the
         non-MoE parameter count, plus DP/EP variants when expert parallelism
         differs from data parallelism).
      2. Optional layernorm and embedding-gradient items.
      3. Per-layer forward/backward communication items, repeated once per
         gradient-accumulation step (``self.ga_num``), with distinct collective
         patterns for sequence-parallel, MoE, and plain tensor-parallel layers.
      4. Fixed-count cross-entropy and optimizer items.

    Compute times are looked up from ``self.compute_cache`` via
    ``_get_aiob_compute_time``; communication sizes are byte counts
    (2 * params presumably fp16/bf16 grads, 4 * params presumably fp32 --
    TODO confirm against the simulator's size convention).

    NOTE(review): several branches reference bare ``args`` instead of
    ``self.args`` (tensor/expert parallel sizes). Presumably a module-level
    ``args`` object exists and matches ``self.args`` -- verify; otherwise
    those paths raise NameError.
    """
    # args.world_size --> total gpus number
    # Gradient-accumulation steps per global batch; < 1 means the batch
    # configuration is inconsistent, so warn (but continue with ga_num as-is).
    self.ga_num = self.args.global_batch // (self.args.micro_batch * self.args.dp_num)
    if self.ga_num < 1:
        print(
            "[WARN]: ga num < 1, please confirm global_batch num and micro_batch num"
        )
    # Placeholder 1-cycle compute time for items whose cost is dominated by comm.
    default_compute_time = 1
    # NOTE(review): compute_time stays 0 and is used for the cross_entropy /
    # optimizer items at the bottom -- confirm a zero compute time is intended.
    compute_time = 0
    # Bytes moved by one TP collective on activations:
    # 2 (bytes/elem, fp16) * micro_batch * seq_length * hidden_size.
    tp_comm_size = (
        2 * self.args.micro_batch * self.args.seq_length * self.args.hidden_size
    )
    layers = self.get_model_details()
    total_params, moe_param_count = self._get_total_params()
    # self.workload.append(Work_Item(name="norm", forward_compute_time=0,
    #                         forward_comm = "BROADCAST", forward_comm_size= 8*self.args.micro_batch*self.args.seq_length,
    #                         backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
    #                         dp_compute_time=default_compute_time, dp_comm="NONE", dp_comm_size=0
    #                         ))
    # Cached compute times for the gradient-processing phase ("grad" kernels).
    forward_compute_time = _get_aiob_compute_time(
        self.compute_cache, "forward", "grad"
    )
    backward_compute_time = _get_aiob_compute_time(
        self.compute_cache, "backward", "grad"
    )
    # DP gradient all-gather over the dense (non-MoE) params; 2 bytes/param.
    self.workload.append(
        Work_Item(
            name="grad_gather",
            forward_compute_time=default_compute_time,
            forward_comm="NONE",
            forward_comm_size=0,
            backward_compute_time=default_compute_time,
            backward_comm="NONE",
            backward_comm_size=0,
            dp_compute_time=default_compute_time,
            dp_comm="ALLGATHER",
            dp_comm_size=2 * (total_params-moe_param_count),
        )
    )
    # DP gradient reduce-scatter over the dense params; 4 bytes/param
    # (presumably fp32 main grads -- TODO confirm).
    self.workload.append(
        Work_Item(
            name="grad_param_comm",
            forward_compute_time=default_compute_time,
            forward_comm="NONE",
            forward_comm_size=0,
            backward_compute_time=default_compute_time,
            backward_comm="NONE",
            backward_comm_size=0,
            dp_compute_time=default_compute_time,
            dp_comm="REDUCESCATTER",
            dp_comm_size=4 * (total_params-moe_param_count),
        )
    )
    # Pure-compute item modelling the grad-processing kernels (fwd + bwd cost).
    self.workload.append(
        Work_Item(
            name="grad_param_compute",
            forward_compute_time=default_compute_time,
            forward_comm="NONE",
            forward_comm_size=0,
            backward_compute_time=forward_compute_time + backward_compute_time,
            backward_comm="NONE",
            backward_comm_size=0,
            dp_compute_time=default_compute_time,
            dp_comm="NONE",
            dp_comm_size=0,
        )
    )
    # Without sequence parallelism, layernorm grads need a TP all-reduce.
    if not self.args.enable_sequence_parallel:
        self.workload.append(
            Work_Item(
                name="layernorm",
                forward_compute_time=default_compute_time,
                forward_comm="NONE",
                forward_comm_size=0,
                backward_compute_time=default_compute_time,
                backward_comm="ALLREDUCE",
                backward_comm_size=2 * total_params,
                dp_compute_time=default_compute_time,
                dp_comm="NONE",
                dp_comm_size=0,
            )
        )
    # Embedding-grad all-reduce is only needed when TP > 1.
    # NOTE(review): bare `args` here (not self.args) -- confirm a global exists.
    if args.tensor_model_parallel_size == 1 :
        emd_backward_comm = "NONE"
    else:
        emd_backward_comm = "ALLREDUCE"
    self.workload.append(
        Work_Item(
            name="embedding_grads",
            forward_compute_time=default_compute_time,
            forward_comm="NONE",
            forward_comm_size=0,
            backward_compute_time=default_compute_time,
            backward_comm=emd_backward_comm,
            backward_comm_size=tp_comm_size,
            dp_compute_time=default_compute_time,
            dp_comm="NONE",
            dp_comm_size=0,
        )
    )
    # MoE params sync over the DP/EP group when EP size differs from DP size.
    if self.args.expert_model_parallel_size != self.args.dp_num:
        self.workload.append(Work_Item(name="moe_grad_norm1", forward_compute_time=default_compute_time,
                                forward_comm = "NONE", forward_comm_size= 0,
                                backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
                                dp_compute_time=default_compute_time, dp_comm="ALLGATHER_DP_EP", dp_comm_size=2*moe_param_count
                                ))
        self.workload.append(Work_Item(name="moe_grad_norm2", forward_compute_time=default_compute_time,
                                forward_comm = "NONE", forward_comm_size= 0,
                                backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
                                dp_compute_time=default_compute_time, dp_comm="REDUCESCATTER_DP_EP", dp_comm_size=4*moe_param_count
                                ))
    # One pass per gradient-accumulation step, over every model layer.
    for _ in range(self.ga_num):
        for layer in layers:
            name = layer.layer_name
            # Per-layer defaults; branches below override as needed.
            forward_comm = backward_comm = backward_comm_2 = "NONE"
            forward_comm_size = tp_comm_size
            emb_comm_size = tp_comm_size
            backward_comm_size = 0
            dp_comm = "NONE"
            dp_comm_size = 0
            if self.args.enable_sequence_parallel:
                # --- sequence-parallel layer patterns ---
                if "embedding" in name:
                    # NOTE(review): bare `args` again (also below at the
                    # row/column/moe branches) -- confirm.
                    if args.tensor_model_parallel_size == 1 :
                        forward_comm = "NONE"
                        backward_comm = "NONE"
                    else:
                        forward_comm = "ALLREDUCE"
                        backward_comm = "NONE"
                    emb_compute_time = _get_aiob_compute_time(
                        self.compute_cache, "", "embedding"
                    )
                    # NOTE(review): dp_compute_time reuses backward_compute_time
                    # from the "grad" lookup above (or a previous layer
                    # iteration), not an embedding-specific value -- confirm
                    # this is intended.
                    self.workload.append(
                        Work_Item(
                            name=name,
                            forward_compute_time=emb_compute_time,
                            forward_comm=forward_comm,
                            forward_comm_size=emb_comm_size ,
                            backward_compute_time=default_compute_time,
                            backward_comm=backward_comm,
                            backward_comm_size=backward_comm_size,
                            dp_compute_time=backward_compute_time,
                            dp_comm=dp_comm,
                            dp_comm_size=dp_comm_size,
                        )
                    )
                # Row-parallel linear: forward reduce-scatter, backward
                # all-gather (the SP collective pair).
                if "row" in name:
                    forward_compute_time = _get_aiob_compute_time(
                        self.compute_cache, "forward", name.split("_")[0]
                    )
                    backward_compute_time = _get_aiob_compute_time(
                        self.compute_cache, "backward", name.split("_")[0]
                    )
                    # Activation recomputation replays the forward pass,
                    # doubling its compute cost for attention layers.
                    if self.args.recompute_activations and 'attention' in name:
                        forward_compute_time *= 2
                    # Halved because the attention/MLP cache entry covers two
                    # linear layers (row + column) -- presumably; confirm
                    # against the compute-cache granularity.
                    forward_compute_time = int(forward_compute_time / 2)
                    backward_compute_time = int(backward_compute_time / 2)
                    forward_comm_size_sp = tp_comm_size
                    if args.tensor_model_parallel_size == 1 :
                        forward_comm = "NONE"
                        backward_comm = "NONE"
                    else:
                        forward_comm = "REDUCESCATTER"
                        backward_comm = "ALLGATHER"
                    self.workload.append(
                        Work_Item(
                            name=name,
                            forward_compute_time=forward_compute_time,
                            forward_comm=forward_comm,
                            forward_comm_size=forward_comm_size,
                            backward_compute_time=backward_compute_time,
                            backward_comm=backward_comm,
                            backward_comm_size=forward_comm_size_sp,#sp overlap allgather
                            dp_compute_time=backward_compute_time,
                            dp_comm=dp_comm,
                            dp_comm_size=dp_comm_size,
                        )
                    )
                # Column-parallel linear: forward all-gather, backward
                # reduce-scatter (+ a second all-gather for weight grads).
                elif "column" in name:
                    forward_compute_time = _get_aiob_compute_time(
                        self.compute_cache, "forward", name.split("_")[0]
                    )
                    backward_compute_time = _get_aiob_compute_time(
                        self.compute_cache, "backward", name.split("_")[0]
                    )
                    if self.args.recompute_activations and 'attention' in name:
                        forward_compute_time *= 2
                    forward_compute_time = int(forward_compute_time / 2)
                    backward_compute_time = int(backward_compute_time / 2)
                    if args.tensor_model_parallel_size == 1 :
                        forward_comm = "NONE"
                        backward_comm = "NONE"
                        backward_comm_2 = "NONE"
                    else:
                        forward_comm = "ALLGATHER"
                        backward_comm = "REDUCESCATTER"
                        # NOTE(review): backward_comm_2 is set but never
                        # passed to Work_Item below -- possibly dead.
                        backward_comm_2 = "ALLGATHER"
                    self.workload.append(
                        Work_Item(
                            name=name,
                            forward_compute_time=forward_compute_time,
                            forward_comm=forward_comm,
                            forward_comm_size=forward_comm_size,
                            backward_compute_time=backward_compute_time,
                            backward_comm=backward_comm,
                            backward_comm_size=backward_comm_size,
                            dp_compute_time=backward_compute_time,
                            dp_comm=dp_comm,
                            dp_comm_size=dp_comm_size,
                        )
                    )
                # MoE layer: modelled as 7 sequential collectives
                # (gate all-gather, token dispatch all-to-alls, expert-side
                # all-gather/reduce-scatter, token combine all-to-alls).
                elif "moelayer" in name:
                    forward_compute_time = _get_aiob_compute_time(
                        self.compute_cache, "forward", name.split("_")[0]
                    )
                    backward_compute_time = _get_aiob_compute_time(
                        self.compute_cache, "backward", name.split("_")[0]
                    )
                    # With TP == 1 only the EP all-to-alls remain.
                    if args.tensor_model_parallel_size == 1 :
                        forward_comm1 = "NONE"
                        forward_comm2 = "NONE"
                        forward_comm3 = "ALLTOALL_EP"
                        forward_comm4 = "NONE"
                        forward_comm5 = "NONE"
                        forward_comm6 = "ALLTOALL_EP"
                        forward_comm7 = "NONE"
                    else:
                        forward_comm1 = "ALLGATHER"
                        forward_comm2 = "ALLTOALL"
                        forward_comm3 = "ALLTOALL_EP"
                        forward_comm4 = "ALLGATHER"
                        forward_comm5 = "REDUCESCATTER"
                        forward_comm6 = "ALLTOALL_EP"
                        forward_comm7 = "ALLTOALL"
                    if args.expert_model_parallel_size != 1:
                        # Gate/router all-gather: 2 bytes * tokens * num_experts.
                        self.workload.append(Work_Item(name=name, forward_compute_time=forward_compute_time,
                                forward_comm = forward_comm1, forward_comm_size= 2*self.mbs*self.seq_len*self.num_experts,
                                backward_compute_time=backward_compute_time, backward_comm=forward_comm1, backward_comm_size=2*self.mbs*self.seq_len*self.num_experts,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm2, forward_comm_size= tp_comm_size//self.tp,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm2, backward_comm_size=tp_comm_size//self.tp,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        # Token dispatch across EP ranks; scaled by top-k routing.
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm3, forward_comm_size= tp_comm_size*self.topk//self.tp,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm3, backward_comm_size=tp_comm_size*self.topk//self.tp,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        # Expert-side all-gather; backward uses the dual
                        # collective (reduce-scatter) -- and vice versa below.
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm4, forward_comm_size= tp_comm_size*self.topk,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm5, backward_comm_size=tp_comm_size*self.topk,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm5, forward_comm_size= tp_comm_size*self.topk,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm4, backward_comm_size=tp_comm_size*self.topk,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        # Token combine (reverse dispatch) across EP ranks.
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm6, forward_comm_size= tp_comm_size*self.topk//self.tp,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm6, backward_comm_size=tp_comm_size*self.topk//self.tp,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm7, forward_comm_size= tp_comm_size//self.tp,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm7, backward_comm_size=tp_comm_size//self.tp,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                    else:
                        # EP == 1: same 7-item shape, but the EP all-to-alls
                        # carry a 1-byte placeholder size.
                        self.workload.append(Work_Item(name=name, forward_compute_time=forward_compute_time,
                                forward_comm = forward_comm1, forward_comm_size= 2*self.mbs*self.seq_len*self.num_experts,
                                backward_compute_time=backward_compute_time, backward_comm=forward_comm1, backward_comm_size=2*self.mbs*self.seq_len*self.num_experts,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm2, forward_comm_size= tp_comm_size//self.tp,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm2, backward_comm_size=tp_comm_size//self.tp,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm3, forward_comm_size=1,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm3, backward_comm_size=1,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm4, forward_comm_size= tp_comm_size*self.topk,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm4, backward_comm_size=tp_comm_size*self.topk,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        # NOTE(review): backward here is forward_comm4, unlike
                        # the EP != 1 branch where fwd5/bwd4 pair with fwd4/bwd5
                        # -- possibly an oversight; confirm.
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm5, forward_comm_size= tp_comm_size*self.topk,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm4, backward_comm_size=tp_comm_size*self.topk,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm6, forward_comm_size=1,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm6, backward_comm_size=1,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
                        self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                forward_comm = forward_comm7, forward_comm_size= tp_comm_size//self.tp,
                                backward_compute_time=default_compute_time, backward_comm=forward_comm7, backward_comm_size=tp_comm_size//self.tp,
                                dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                ))
            else:
                # --- no sequence parallelism: plain TP all-reduce pattern ---
                if args.tensor_model_parallel_size == 1 :
                    forward_comm = "NONE"
                    backward_comm = "NONE"
                else:
                    forward_comm = "ALLREDUCE"
                    backward_comm = "NONE"
                # NOTE(review): this doubles the forward_compute_time left over
                # from a previous branch/iteration, and for non-embedding
                # layers it is immediately overwritten by the lookup below --
                # likely this was meant to run AFTER that lookup; confirm.
                if self.args.recompute_activations and 'attention' in name:
                    forward_compute_time *= 2
                if "embedding" in name:
                    emb_compute_time = _get_aiob_compute_time(
                        self.compute_cache, "", "embedding"
                    )
                    self.workload.append(
                        Work_Item(
                            name=name,
                            forward_compute_time=emb_compute_time,
                            forward_comm=forward_comm,
                            forward_comm_size=forward_comm_size,
                            backward_compute_time=default_compute_time,
                            backward_comm=backward_comm,
                            backward_comm_size=backward_comm_size,
                            dp_compute_time=backward_compute_time,
                            dp_comm=dp_comm,
                            dp_comm_size=dp_comm_size,
                        )
                    )
                else:
                    forward_compute_time = _get_aiob_compute_time(
                        self.compute_cache, "forward", name.split("_")[0]
                    )
                    backward_compute_time = _get_aiob_compute_time(
                        self.compute_cache, "backward", name.split("_")[0]
                    )
                    self.workload.append(
                        Work_Item(
                            name=name,
                            forward_compute_time=forward_compute_time,
                            forward_comm=forward_comm,
                            forward_comm_size=forward_comm_size,
                            backward_compute_time=backward_compute_time,
                            backward_comm=backward_comm,
                            backward_comm_size=backward_comm_size,
                            dp_compute_time=backward_compute_time,
                            dp_comm=dp_comm,
                            dp_comm_size=dp_comm_size,
                        )
                    )
    # compute_time = _get_aiob_compute_time(self.compute_cache, "forward", "embedding")
    # self.workload.append(Work_Item(name="embedding_norm", forward_compute_time=compute_time,
    #                     forward_comm = "ALLREDUCE", forward_comm_size= self.args.vocab_size*self.args.hidden_size*2,
    #                     backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
    #                     dp_compute_time=default_compute_time, dp_comm="NONE", dp_comm_size=0
    #                     ))
    # Loss computation: 3 all-reduces of 4 bytes/token (fp32 loss terms).
    for i in range(3):
        self.workload.append(
            Work_Item(
                name="cross_entropy" + str(i + 1),
                forward_compute_time=compute_time,
                forward_comm="ALLREDUCE",
                forward_comm_size=self.args.seq_length * self.args.micro_batch * 4,
                backward_compute_time=compute_time,
                backward_comm="NONE",
                backward_comm_size=0,
                dp_compute_time=compute_time,
                dp_comm="NONE",
                dp_comm_size=0,
            )
        )
    # Optimizer step: 4 tiny (4-byte) all-reduces, e.g. grad-norm scalars.
    for i in range(4):
        self.workload.append(
            Work_Item(
                name="optimizer" + str(i + 1),
                forward_compute_time=compute_time,
                forward_comm="ALLREDUCE",
                forward_comm_size=4,
                backward_compute_time=compute_time,
                backward_comm="NONE",
                backward_comm_size=0,
                dp_compute_time=compute_time,
                dp_comm="NONE",
                dp_comm_size=0,
            )
        )