def _set_nccl_environment()

in src/sagemaker_pytorch_container/training.py [0:0]


def _set_nccl_environment(network_interface_name):
    """Export the NCCL settings used by the container into ``os.environ``.

    Three variables are written: ``NCCL_SOCKET_IFNAME``, ``NCCL_IB_DISABLE``
    and ``NCCL_DEBUG``. Any values already present for them are overwritten.

    Reference for the individual knobs:
    https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html#ncclknobs

    Args:
        network_interface_name: The name of the network interface to use for
            distributed training; stored as ``NCCL_SOCKET_IFNAME``.
    """
    nccl_settings = {
        # Network interface used for inter-node communication.
        'NCCL_SOCKET_IFNAME': network_interface_name,
        # Turn off the IB transport so IP sockets are used by default.
        'NCCL_IB_DISABLE': '1',
        # Switch to INFO to get more NCCL debugging information.
        'NCCL_DEBUG': 'WARN',
    }
    # Applied in insertion order, so the interface name is written first.
    os.environ.update(nccl_settings)