in src/sagemaker_pytorch_container/training.py [0:0]
def _set_nccl_environment(network_interface_name):
    """Export the NCCL settings for this container into ``os.environ``.

    The individual knobs are described in the NCCL developer guide:
    https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html#ncclknobs

    Any value already present for one of these variables is overwritten.

    Args:
        network_interface_name: Name of the network interface to use for
            distributed training; written to ``NCCL_SOCKET_IFNAME``.
    """
    nccl_settings = (
        # Network interface used for inter-node communication.
        ('NCCL_SOCKET_IFNAME', network_interface_name),
        # Turn off the IB transport so that IP sockets are used instead.
        ('NCCL_IB_DISABLE', '1'),
        # Log level; switch to INFO for more NCCL debugging information.
        ('NCCL_DEBUG', 'WARN'),
    )
    # Applied in the order listed above, one plain assignment per variable.
    for variable, setting in nccl_settings:
        os.environ[variable] = setting