in workload_generator/generate_megatron_workload.py
def forward(self):
    args = self.args
    if self.tp_is_enable:
        # Small broadcast (5 * 8 bytes) within the tensor-parallel group at the
        # start of the forward step.
        self.workload.append(
            LogItem(
                comm_type=CommType.broadcast,
                comm_group=CommGroup.tp_group,
                comm_group_size=self.args.tensor_model_parallel_size,
                msg_size=5 * 8,
                stage="forward_step",
                src=0,
            )
        )
        # Broadcast of the micro-batch input data within the tensor-parallel group.
        self.workload.append(
            LogItem(
                comm_type=CommType.broadcast,
                comm_group=CommGroup.tp_group,
                comm_group_size=self.args.tensor_model_parallel_size,
                msg_size=8 * (args.world_size + args.seq_length * args.micro_batch),
                stage="forward_step",
                src=0,
            )
        )
    # Per-layer forward-pass communication produced by the mocked model.
    self.workload.extend(self.model.forward())
    # Vocab-parallel cross entropy issues three all-reduces over the tensor-parallel
    # group; for bf16 training, loss communication still uses float32 (4 bytes per element).
    for _ in range(3):
        self.workload.append(
            LogItem(
                comm_type=CommType.all_reduce,
                comm_group=CommGroup.tp_group,
                comm_group_size=self.args.tensor_model_parallel_size,
                msg_size=args.micro_batch * args.seq_length * 4,
                stage="forward_step._VocabParallelCrossEntropy",
            )
        )
    # average_losses_across_data_parallel_group: all-reduce one float32 scalar
    # across the data-parallel group.
    self.workload.append(
        LogItem(
            comm_type=CommType.all_reduce,
            comm_group=CommGroup.dp_group,
            comm_group_size=self.args.dp_num,
            msg_size=1 * 4,
            stage="forward_step.average_losses_across_data_parallel_group",
        )
    )
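
For reference, below is a minimal sketch of the supporting types this method relies on, inferred only from their usage above. The real CommType, CommGroup, and LogItem definitions live elsewhere in this repository; the field types, defaults, and enum members shown here are assumptions for illustration, not the actual API.

# Hypothetical sketch of the supporting types, inferred from usage in forward()
# above. Everything below is an assumption for illustration only.
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class CommType(Enum):
    broadcast = "broadcast"
    all_reduce = "all_reduce"


class CommGroup(Enum):
    tp_group = "tp_group"
    dp_group = "dp_group"


@dataclass
class LogItem:
    comm_type: CommType            # collective operation being modeled
    comm_group: CommGroup          # process group the collective runs in
    comm_group_size: int           # number of ranks in that group
    msg_size: int                  # message size in bytes
    stage: str = ""                # training stage / call-site label
    src: Optional[int] = None      # source rank for broadcasts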