in src/sagemaker_training/torch_distributed.py [0:0]
def _setup(self):
# Configure EFA- and NCCL-related environment variables for a torchrun job.
#
# Reads self._instance_type and self._network_interface_name, writes the
# settings below into os.environ of the current process, and returns nothing.
logger.info("Starting distributed training through torchrun")
# EFA settings
if self._instance_type in SM_EFA_NCCL_INSTANCES:
# Enable EFA use: FI_PROVIDER names the libfabric provider to select.
os.environ["FI_PROVIDER"] = "efa"
if self._instance_type in SM_EFA_RDMA_INSTANCES:
# Use EFA's RDMA functionality for one-sided and two-sided transfer
os.environ["FI_EFA_USE_DEVICE_RDMA"] = "1"
# RDMAV_FORK_SAFE is the libibverbs fork-safety switch (see ibv_fork_init).
# NOTE(review): confirm whether this is meant for every instance type or
# only for the RDMA-capable ones checked above.
os.environ["RDMAV_FORK_SAFE"] = "1"
# Tell NCCL which network interface to use; str() because os.environ only
# accepts string values.
os.environ["NCCL_SOCKET_IFNAME"] = str(self._network_interface_name)
# Select NCCL's "simple" protocol via NCCL_PROTO.
# NOTE(review): the reason for pinning the protocol is not visible here.
os.environ["NCCL_PROTO"] = "simple"