in workload_generator/AIOB_simAI_workload_generator.py [0:0]
def workload_generate(self):
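    """Build the Work_Item stream that the simAI backend replays for one iteration.

    Emits gradient-norm collectives, per-layer TP/SP/MoE communication for each
    gradient-accumulation step, then embedding-sync, loss, and optimizer items.
    """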
    # args.world_size --> total number of GPUs
self.ga_num = self.args.global_batch // (self.args.micro_batch * self.args.dp_num)
    if self.ga_num < 1:
        print(
            "[WARN]: ga_num < 1; check that global_batch >= micro_batch * dp_num"
        )
default_compute_time = 1
compute_time = 0
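    # Per-collective TP message size: 2 bytes per element (half precision)
    # * micro_batch * seq_length * hidden_size.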
tp_comm_size = (
2 * self.args.micro_batch * self.args.seq_length * self.args.hidden_size
)
layers = self.get_model_details()
total_params, moe_param_count = self._get_total_params()
# print(f"Total params is {total_params}, moe params is {moe_param_count}")
# self.workload.append(Work_Item(name="norm", forward_compute_time=0,
# forward_comm = "BROADCAST", forward_comm_size= 8*self.args.micro_batch*self.args.seq_length,
# backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
# dp_compute_time=default_compute_time, dp_comm="NONE", dp_comm_size=0
# ))
forward_compute_time = default_compute_time
backward_compute_time = default_compute_time
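    # grad_norm: all-gather the 2-byte (fp16) parameters in forward, then
    # reduce-scatter the 4-byte (fp32) gradients across the DP group.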
self.workload.append(
Work_Item(
name="grad_norm",
forward_compute_time=forward_compute_time,
forward_comm="ALLGATHER",
forward_comm_size=2 * total_params,
backward_compute_time=backward_compute_time,
backward_comm="NONE",
backward_comm_size=0,
dp_compute_time=default_compute_time,
dp_comm="REDUCESCATTER",
dp_comm_size=4 * total_params,
)
)
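    # Without sequence parallelism, LayerNorm parameters are replicated across
    # TP ranks, so their gradients need an explicit all-reduce in backward.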
if not self.args.enable_sequence_parallel:
self.workload.append(
Work_Item(
name="layernorm",
forward_compute_time=default_compute_time,
forward_comm="NONE",
forward_comm_size=0,
backward_compute_time=default_compute_time,
backward_comm="ALLREDUCE",
backward_comm_size=2 * total_params,
dp_compute_time=default_compute_time,
dp_comm="NONE",
dp_comm_size=0,
)
)
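    # When the EP size differs from the DP size, MoE parameters belong to a
    # smaller data-parallel group and need their own DP x EP grad collectives.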
    if self.args.expert_model_parallel_size != self.args.dp_num:
self.workload.append(Work_Item(name="moe_grad_norm1", forward_compute_time=default_compute_time,
forward_comm = "NONE", forward_comm_size= 0,
backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
dp_compute_time=default_compute_time, dp_comm="ALLGATHER_DP_EP", dp_comm_size=2*moe_param_count
))
self.workload.append(Work_Item(name="moe_grad_norm2", forward_compute_time=default_compute_time,
forward_comm = "NONE", forward_comm_size= 0,
backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
dp_compute_time=default_compute_time, dp_comm="REDUCESCATTER_DP_EP", dp_comm_size=4*moe_param_count
))
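    # Emit the per-layer communication once per gradient-accumulation step.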
for _ in range(self.ga_num):
for layer in layers:
name = layer.layer_name
forward_comm = backward_comm = backward_comm_2 = "NONE"
forward_comm_size = tp_comm_size
backward_comm_size = tp_comm_size
dp_comm = "NONE"
dp_comm_size = 0
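            # With sequence parallelism, TP linears communicate via
            # all-gather / reduce-scatter pairs instead of all-reduce.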
if self.args.enable_sequence_parallel:
if "embedding" in name:
self.workload.append(
Work_Item(
name=name,
forward_compute_time=default_compute_time,
forward_comm=forward_comm,
forward_comm_size=forward_comm_size,
backward_compute_time=default_compute_time,
backward_comm=backward_comm,
backward_comm_size=backward_comm_size,
dp_compute_time=backward_compute_time,
dp_comm=dp_comm,
dp_comm_size=dp_comm_size,
)
)
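                # Row-parallel linear under SP: reduce-scatter in forward,
                # all-gather in backward; recomputed attention doubles the
                # forward volume.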
if "row" in name:
if self.args.recompute_activations and 'attention' in name:
forward_comm_size *= 2
forward_comm = "REDUCESCATTER"
backward_comm = "ALLGATHER"
self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
forward_comm = forward_comm, forward_comm_size= forward_comm_size,
backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=tp_comm_size,
dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
))
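                # Column-parallel linear under SP: all-gather in forward,
                # reduce-scatter in backward.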
if "column" in name:
if self.args.recompute_activations and 'attention' in name:
forward_comm_size *= 2
forward_comm = "ALLGATHER"
forward_comm2 = "NONE"
backward_comm = "REDUCESCATTER"
backward_comm_2 = "ALLGATHER"
self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
forward_comm = forward_comm, forward_comm_size= forward_comm_size,
backward_compute_time=default_compute_time, backward_comm="NONE", backward_comm_size=0,
dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
))
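                # MoE layer: modeled as seven collectives per pass (router
                # gather, TP/EP dispatch, expert-side gather/scatter, EP/TP
                # combine).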
if "moelayer" in name:
forward_comm1 = "ALLGATHER"
forward_comm2 = "ALLTOALL"
forward_comm3 = "ALLTOALL_EP"
forward_comm4 = "ALLGATHER"
forward_comm5 = "REDUCESCATTER"
forward_comm6 = "ALLTOALL_EP"
forward_comm7 = "ALLTOALL"
                    # Phase 1: router all-gathers the token-to-expert routing map.
                    self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                         forward_comm=forward_comm1, forward_comm_size=2 * self.seq_len * self.num_experts,
                                         backward_compute_time=default_compute_time, backward_comm=forward_comm1, backward_comm_size=2 * self.seq_len * self.num_experts,
                                         dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                         ))
                    # Phase 2: all-to-all token dispatch within the TP group.
                    self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                         forward_comm=forward_comm2, forward_comm_size=tp_comm_size // self.tp,
                                         backward_compute_time=default_compute_time, backward_comm=forward_comm2, backward_comm_size=tp_comm_size // self.tp,
                                         dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                         ))
                    # Phase 3: all-to-all dispatch across the EP group (top-k copies per token).
                    self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                         forward_comm=forward_comm3, forward_comm_size=tp_comm_size * self.topk // self.tp,
                                         backward_compute_time=default_compute_time, backward_comm=forward_comm3, backward_comm_size=tp_comm_size * self.topk // self.tp,
                                         dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                         ))
                    # Phase 4: all-gather dispatched tokens across the TP group.
                    self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                         forward_comm=forward_comm4, forward_comm_size=tp_comm_size * self.topk // self.tp,
                                         backward_compute_time=default_compute_time, backward_comm=forward_comm4, backward_comm_size=tp_comm_size * self.topk // self.tp,
                                         dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                         ))
                    # Phase 5: reduce-scatter expert outputs (backward uses the inverse all-gather).
                    self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                         forward_comm=forward_comm5, forward_comm_size=tp_comm_size * self.topk // self.tp,
                                         backward_compute_time=default_compute_time, backward_comm=forward_comm4, backward_comm_size=tp_comm_size * self.topk // self.tp,
                                         dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                         ))
                    # Phase 6: all-to-all combine across the EP group.
                    self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                         forward_comm=forward_comm6, forward_comm_size=tp_comm_size * self.topk // self.tp,
                                         backward_compute_time=default_compute_time, backward_comm=forward_comm6, backward_comm_size=tp_comm_size * self.topk // self.tp,
                                         dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                         ))
                    # Phase 7: all-to-all combine within the TP group.
                    self.workload.append(Work_Item(name=name, forward_compute_time=default_compute_time,
                                         forward_comm=forward_comm7, forward_comm_size=tp_comm_size // self.tp,
                                         backward_compute_time=default_compute_time, backward_comm=forward_comm7, backward_comm_size=tp_comm_size // self.tp,
                                         dp_compute_time=default_compute_time, dp_comm=dp_comm, dp_comm_size=dp_comm_size
                                         ))
else:
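                # Without sequence parallelism, Megatron-style TP performs one
                # all-reduce in forward and one in backward per layer.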
forward_comm = "ALLREDUCE"
backward_comm = "ALLREDUCE"
if self.args.recompute_activations and 'attention' in name:
forward_comm_size *= 2
if "embedding" in name:
self.workload.append(
Work_Item(
name=name,
forward_compute_time=default_compute_time,
forward_comm=forward_comm,
forward_comm_size=forward_comm_size,
backward_compute_time=default_compute_time,
backward_comm=backward_comm,
backward_comm_size=backward_comm_size,
dp_compute_time=backward_compute_time,
dp_comm=dp_comm,
dp_comm_size=dp_comm_size,
)
)
else:
self.workload.append(
Work_Item(
name=name,
forward_compute_time=default_compute_time,
forward_comm=forward_comm,
forward_comm_size=forward_comm_size,
backward_compute_time=default_compute_time,
backward_comm=backward_comm,
backward_comm_size=backward_comm_size,
dp_compute_time=default_compute_time,
dp_comm=dp_comm,
dp_comm_size=dp_comm_size,
)
)
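    # Sync the (tied) embedding weights: all-reduce vocab_size * hidden_size
    # half-precision values across the embedding group.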
self.workload.append(
Work_Item(
name="embedding_norm",
forward_compute_time=default_compute_time,
forward_comm="ALLREDUCE",
forward_comm_size=self.args.vocab_size * self.args.hidden_size * 2,
backward_compute_time=default_compute_time,
backward_comm="NONE",
backward_comm_size=0,
dp_compute_time=default_compute_time,
dp_comm="NONE",
dp_comm_size=0,
)
)
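    # Loss: three all-reduces of one fp32 value per token
    # (seq_length * micro_batch * 4 bytes) for the cross-entropy terms.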
for i in range(3):
self.workload.append(
Work_Item(
name="cross_entropy" + str(i + 1),
forward_compute_time=compute_time,
forward_comm="ALLREDUCE",
forward_comm_size=self.args.seq_length * self.args.micro_batch * 4,
backward_compute_time=compute_time,
backward_comm="NONE",
backward_comm_size=0,
dp_compute_time=compute_time,
dp_comm="NONE",
dp_comm_size=0,
)
)
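    # Optimizer step: four 4-byte all-reduces, modeling scalar syncs
    # (e.g. loss-scale / grad-norm checks).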
for i in range(4):
self.workload.append(
Work_Item(
name="optimizer" + str(i + 1),
forward_compute_time=compute_time,
forward_comm="ALLREDUCE",
forward_comm_size=4,
backward_compute_time=compute_time,
backward_comm="NONE",
backward_comm_size=0,
dp_compute_time=compute_time,
dp_comm="NONE",
dp_comm_size=0,
)
)