in optimum/neuron/accelerate/optimizer.py [0:0]
import torch


def allreduce_sequence_parallel_gradients(optimizer):
    """
    All-reduce the gradients of layernorm (sequence-parallel) parameters across tensor model parallel
    ranks when sequence parallelism is used.

    Modified from megatron-lm:
    https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425
    """
    from neuronx_distributed.parallel_layers.mappings import reduce_from_tensor_model_parallel_region

    # Collect the gradients of every parameter that was marked as sequence-parallel
    # (e.g. layernorm weights), reading the parameter groups off the optimizer state.
    grads = []
    for param_group in optimizer.__getstate__()["param_groups"]:
        for group, params in param_group.items():
            if group == "params":
                for p in params:
                    if isinstance(p, torch.Tensor) and p.grad is not None:
                        sequence_parallel_param = getattr(p, "sequence_parallel_enabled", False)
                        if sequence_parallel_param:
                            grads.append(p.grad.data)
    # All-reduce each collected gradient across the tensor model parallel group.
    for grad in grads:
        # sum vs. average: sum
        reduce_from_tensor_model_parallel_region(grad)
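
A minimal sketch of where such a helper typically runs: after backward() has produced gradients and before optimizer.step(), so that the gradients of sequence-parallel (layernorm) parameters are summed across tensor model parallel ranks before the update. The names model, optimizer, loss_fn, and batch below are illustrative placeholders and are not part of the file above; only allreduce_sequence_parallel_gradients is.

def train_step(model, optimizer, loss_fn, batch):
    # Hypothetical training step; model/optimizer/loss_fn/batch are assumed to exist.
    optimizer.zero_grad()
    outputs = model(**batch)
    loss = loss_fn(outputs, batch["labels"])
    loss.backward()

    # Sum the gradients of sequence-parallel parameters (e.g. layernorms) across
    # the tensor model parallel group before the optimizer update.
    allreduce_sequence_parallel_gradients(optimizer)

    optimizer.step()
    return loss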