/* include/nccl_ofi_param.h */
/*
* Copyright (c) 2020-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
*/
#ifndef NCCL_OFI_PARAM_H_
#define NCCL_OFI_PARAM_H_
#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "nccl_ofi_log.h"
#include "nccl_ofi_pthread.h"
/*
* This is an ugly hack. The original implementation of
* nccl_ofi_param created inline functions to access each environment
* variable, using the macros found in nccl_ofi_param.h. However,
* this creates something of an ODR problem, as multiple compilation
* units can call the same param lookup function, and that results in
* naming conflicts. So instead, we have the header file act like a
* normal header file most of the time, and when included from
* nccl_ofi_param.c with OFI_NCCL_PARAM_DEFINE set to 1, stamps out
* the original implementations of the functions. So now we have one
* copy of each function that everyone can call.
*
* This is intended to be a transient state. We want to rewrite the
* entire param system once we've finished moving to C++, but need to
* solve the ODR problem before we move to C++. So here lies one of
* the more terrible pieces of code I've ever written.
*/
#ifndef OFI_NCCL_PARAM_DEFINE
/*
 * Header-mode expansion: emit only the accessor prototypes.  The single
 * definition of each accessor is stamped out by nccl_ofi_param.c, which
 * defines OFI_NCCL_PARAM_DEFINE before including this header (see the
 * #else branch below).
 */
#define OFI_NCCL_PARAM_UINT(name, env, default_value) \
uint64_t ofi_nccl_##name(void)
/* Signed-integer flavor of the accessor prototype. */
#define OFI_NCCL_PARAM_INT(name, env, default_value) \
int64_t ofi_nccl_##name(void)
/* String flavor; the returned pointer is cached by the accessor and must
 * not be freed by the caller (see the definition-mode strdup caching). */
#define OFI_NCCL_PARAM_STR(name, env, default_value) \
const char *ofi_nccl_##name(void)
#else
/*
 * Definition-mode expansion: stamps out the one shared implementation of
 * uint64_t ofi_nccl_<name>(void).
 *
 * First call: take the per-param mutex, read OFI_NCCL_<env>, and parse it
 * with strtoull (base 0, so decimal, 0x... hex, and 0... octal all work).
 * A malformed or empty value keeps the compiled-in default.  The result is
 * cached in a function-local static and returned on all later calls without
 * touching the environment again.
 *
 * Logging uses PRIu64 rather than %lu: uint64_t is only unsigned long on
 * LP64 platforms, so %lu is not portable.
 *
 * NOTE(review): the unlocked fast-path read of `initialized` is
 * double-checked locking and technically a data race under the C11 memory
 * model; left as-is since it matches the other param macros in this file.
 */
#define OFI_NCCL_PARAM_UINT(name, env, default_value) \
	uint64_t ofi_nccl_##name(void); \
	static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \
	uint64_t ofi_nccl_##name(void) \
	{ \
		static bool initialized = false; \
		static uint64_t value = default_value; \
		if (initialized) { \
			return value; \
		} \
		nccl_net_ofi_mutex_lock(&ofi_nccl_param_lock_##name); \
		uint64_t v; \
		char *str, *endptr; \
		if (!initialized) { \
			str = getenv("OFI_NCCL_" env); \
			if (str && strlen(str) > 0) { \
				errno = 0; \
				v = strtoull(str, &endptr, 0); \
				if (errno || str == endptr || *endptr != '\0') { \
					NCCL_OFI_INFO( \
						NCCL_INIT | NCCL_NET, \
						"Invalid value %s provided for %s environment variable, using default %" PRIu64, \
						str, \
						"OFI_NCCL_" env, \
						value); \
				} else { \
					value = v; \
					NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, \
						      "Setting %s environment variable to %" PRIu64, \
						      "OFI_NCCL_" env, \
						      value); \
				} \
			} \
			initialized = true; \
		} \
		nccl_net_ofi_mutex_unlock(&ofi_nccl_param_lock_##name); \
		return value; \
	}
/*
 * Definition-mode expansion: stamps out the one shared implementation of
 * int64_t ofi_nccl_<name>(void).
 *
 * Same lazy, mutex-guarded caching scheme as OFI_NCCL_PARAM_UINT, but parses
 * with strtoll so negative values (e.g. the -1 "unset" sentinels used below)
 * are accepted.
 *
 * Fix: the log format used %lu for an int64_t, which printed signed values
 * as unsigned and has the wrong width on non-LP64 platforms; use PRId64.
 * Also declare the accessor as (void) rather than the deprecated empty
 * parameter list, matching the UINT macro.
 */
#define OFI_NCCL_PARAM_INT(name, env, default_value) \
	int64_t ofi_nccl_##name(void); \
	static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \
	int64_t ofi_nccl_##name(void) \
	{ \
		static bool initialized = false; \
		static int64_t value = default_value; \
		if (initialized) { \
			return value; \
		} \
		nccl_net_ofi_mutex_lock(&ofi_nccl_param_lock_##name); \
		int64_t v; \
		char *str, *endptr; \
		if (!initialized) { \
			str = getenv("OFI_NCCL_" env); \
			if (str && strlen(str) > 0) { \
				errno = 0; \
				v = strtoll(str, &endptr, 0); \
				if (errno || str == endptr || *endptr != '\0') { \
					NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, \
						      "Invalid value %s provided for %s environment variable, using default %" PRId64, \
						      str, "OFI_NCCL_" env, value); \
				} else { \
					value = v; \
					NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, \
						      "Setting %s environment variable to %" PRId64, \
						      "OFI_NCCL_" env, value); \
				} \
			} \
			initialized = true; \
		} \
		nccl_net_ofi_mutex_unlock(&ofi_nccl_param_lock_##name); \
		return value; \
	}
/*
 * Definition-mode expansion: stamps out the one shared implementation of
 * const char *ofi_nccl_<name>(void).
 *
 * If OFI_NCCL_<env> is set (even to the empty string), the value is strdup'd
 * and cached for the lifetime of the process (intentionally never freed), so
 * callers must not free the returned pointer.  If strdup fails, the accessor
 * falls back to the compiled-in default.  May return NULL when the default
 * is NULL and the variable is unset.
 *
 * Fix: declare the accessor as (void) rather than the deprecated empty
 * parameter list, matching the UINT macro.
 */
#define OFI_NCCL_PARAM_STR(name, env, default_value) \
	const char *ofi_nccl_##name(void); \
	static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \
	const char *ofi_nccl_##name(void) \
	{ \
		static bool initialized = false; \
		static const char *value = default_value; \
		if (initialized) { \
			return value; \
		} \
		nccl_net_ofi_mutex_lock(&ofi_nccl_param_lock_##name); \
		char *str; \
		if (!initialized) { \
			str = getenv("OFI_NCCL_" env); \
			if (str) { \
				value = strdup(str); \
				if (value) { \
					NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Setting %s environment variable to %s", \
						      "OFI_NCCL_" env, value); \
				} else { \
					value = default_value; \
					NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, \
						      "Allocation error saving result for %s environment variable. Falling back to default %s", \
						      "OFI_NCCL_" env, value); \
				} \
			} \
			initialized = true; \
		} \
		nccl_net_ofi_mutex_unlock(&ofi_nccl_param_lock_##name); \
		return value; \
	}
#endif
/*
 * Enable using endpoints with IPv6 addressing format for TCP provider.
 * By default, we disable using endpoints having IPv6 addressing format.
 */
OFI_NCCL_PARAM_INT(use_ipv6_tcp, "USE_IPV6_TCP", 0);
/*
 * List of interface names (comma-separated) to be filtered out for TCP provider.
 * By default, it is set to eliminate lo and docker0 interfaces.
 *
 * TODO: Remove lo after https://github.com/ofiwg/libfabric/issues/6127 is fixed
 */
OFI_NCCL_PARAM_STR(exclude_tcp_if, "EXCLUDE_TCP_IF", "lo,docker0");
/*
 * Disable flush operation when using GPUDirect. Flush commands
 * are used to enforce data consistency at the receiving GPU. It should
 * only be disabled when underlying libfabric provider or hardware
 * ensures data consistency.
 * By default, plugin issues flush commands.
 */
OFI_NCCL_PARAM_INT(gdr_flush_disable, "GDR_FLUSH_DISABLE", 0);
/*
 * Specify the number of network connections created by
 * NIC_DUP_CONNS. Each chosen Libfabric provider will be duplicated N
 * times and exposed to NCCL as a unique endpoint.
 */
OFI_NCCL_PARAM_INT(nic_dup_conns, "NIC_DUP_CONNS", 0);
/*
 * When using GPUDirect use the cudaDeviceFlushGPUDirectRDMAWrites
 * to enforce data consistency at the receiving GPU. Requires CUDA 11.3 or
 * later. Note that this function only provides a GPU memory fence and requires
 * that data has already been delivered to GPU memory. Some networks and
 * PCIe configurations require an additional network-level flush that
 * is not provided by this option.
 */
OFI_NCCL_PARAM_INT(cuda_flush_enable, "CUDA_FLUSH_ENABLE", 0);
/*
 * Specify the memory registration key size in bytes when using a libfabric
 * provider that supports application-selected memory registration keys.
 */
OFI_NCCL_PARAM_UINT(mr_key_size, "MR_KEY_SIZE", 2);
/*
 * Disable the MR cache. The MR cache is used to keep track of registered
 * memory regions, so that calling regMr() on the same buffer (address and
 * size), will quickly return a previously globally registered MR on that
 * buffer, avoiding redundant (and expensive) registrations with the
 * underlying device.
 * Disabling the MR cache will make all calls to regMR() result in a
 * registration with the device, so it may cause a significant performance
 * degradation.
 */
OFI_NCCL_PARAM_INT(mr_cache_disable, "MR_CACHE_DISABLE",
#if HAVE_NEURON
/*
 * Default differs on Neuron platforms:
 * 1. NeuronRuntime maintains its own MR cache, making the aws-ofi-nccl
 * MR cache redundant.
 * 2. Neuron registers MRs that are smaller than system page size.
 * NeuronRuntime MR cache supports that, while aws-ofi-nccl MR
 * cache doesn't.
 */
1
#else
0
#endif
);
/*
 * Maximum number of cq entries to read in a single call to
 * fi_cq_read.
 */
OFI_NCCL_PARAM_INT(cq_read_count, "CQ_READ_COUNT", 4);
/*
 * Protocol to use for send/recv operations. Valid options are
 * SENDRECV and RDMA, with SENDRECV the default. Default param is
 * NULL so that we can determine if user set the option.
 */
OFI_NCCL_PARAM_STR(protocol, "PROTOCOL", NULL);
/*
 * Override the platform default for domain allocation, with
 * respect to the process or thread.
 *
 * -1 (unset default): use the platform-specific configuration.
 * 0: Allocate one domain per process
 * 1: Allocate one domain per thread
 */
OFI_NCCL_PARAM_INT(domain_per_thread, "DOMAIN_PER_THREAD", -1);
/*
 * Disable the native RDMA write support check when using the "RDMA" protocol
 * for send/recv operations on AWS platforms. When the check is disabled, the
 * "RDMA" protocol can be used even on platforms where native RDMA write is not
 * supported or cannot be verified to be supported. By default, the plugin
 * performs the native RDMA support checks.
 */
OFI_NCCL_PARAM_INT(disable_native_rdma_check, "DISABLE_NATIVE_RDMA_CHECK", 0);
/*
 * Disable the check for required GDR support on EC2 instances. When this check
 * is disabled, the plugin can be used without GDR support even on platforms
 * that support GDR (P4d and later). By default, the plugin performs the check.
 */
OFI_NCCL_PARAM_INT(disable_gdr_required_check, "DISABLE_GDR_REQUIRED_CHECK", 0);
/*
 * In cases where libfabric>=1.20 is available, and the provider has FI_HMEM
 * support, the only further stated requirement for a user application to use
 * dmabuf is to pass FI_MR_DMABUF in the flags on the call to fi_regattr(3).
 *
 * Unfortunately, the plugin needs to signal DMABUF support or lack thereof back
 * to NCCL prior to having an opportunity to make any memory registrations.
 * This ultimately means that the plugin will optimistically assume DMA-BUF is
 * viable on all FI_HMEM providers beyond libfabric 1.20, if not for this param.
 *
 * If dmabuf registrations fail, (ie: if ibv_reg_dmabuf_mr cannot be resolved),
 * the plugin has no freedom to renegotiate DMABUF support with NCCL, and so it
 * is fatal. Under those conditions, users should ensure that they have set this
 * environment variable to '1' to force NCCL to avoid providing dmabuf file
 * descriptors.
 */
OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0);
/*
 * Messages sized larger than this threshold will be striped across multiple rails
 */
OFI_NCCL_PARAM_UINT(min_stripe_size, "MIN_STRIPE_SIZE", (128 * 1024));
/*
 * The round robin scheduler has two round robin counts, for small (likely
 * control) and medium (likely data) messages. This parameter adjusts the
 * small-message size threshold.
 */
OFI_NCCL_PARAM_UINT(sched_max_small_msg_size, "SCHED_MAX_SMALL_RR_SIZE", 64);
/*
 * Deprecated value to control both eager and control bounce counts.
 */
OFI_NCCL_PARAM_INT(deprecated_rdma_min_posted_bounce_buffers, "RDMA_MIN_POSTED_BOUNCE_BUFFERS", -1);
/*
 * Deprecated value to control both eager and control bounce counts.
 */
OFI_NCCL_PARAM_INT(deprecated_rdma_max_posted_bounce_buffers, "RDMA_MAX_POSTED_BOUNCE_BUFFERS", -1);
/*
 * Minimum eager rx buffers posted per endpoint. The plugin will attempt to post
 * more rx buffers if we dip below this threshold, allocating new rx buffers if
 * needed.
 */
OFI_NCCL_PARAM_INT(rdma_min_posted_eager_buffers, "RDMA_MIN_POSTED_EAGER_BUFFERS", 64);
/*
 * Maximum rx buffers posted per endpoint. The plugin will not attempt to
 * post more rx buffers if we reach this threshold, returning available
 * buffers to the free list if needed
 */
OFI_NCCL_PARAM_INT(rdma_max_posted_eager_buffers, "RDMA_MAX_POSTED_EAGER_BUFFERS", 128);
/*
 * Minimum control rx buffers posted per endpoint. The plugin will attempt to post
 * more rx buffers if we dip below this threshold, allocating new rx buffers if
 * needed.
 */
OFI_NCCL_PARAM_INT(rdma_min_posted_control_buffers, "RDMA_MIN_POSTED_CONTROL_BUFFERS", 1920);
/*
 * Maximum rx buffers posted per endpoint. The plugin will not attempt to
 * post more rx buffers if we reach this threshold, returning available
 * buffers to the free list if needed
 */
OFI_NCCL_PARAM_INT(rdma_max_posted_control_buffers, "RDMA_MAX_POSTED_CONTROL_BUFFERS", 2048);
/*
 * Whether to spread the control message across multiple rails in round robin fashion or
 * send it consistently on one rail.
 */
OFI_NCCL_PARAM_INT(rdma_rr_ctrl_msg, "RR_CTRL_MSG", 1);
/*
 * Internode network latency reported to NCCL, unless the configured platform
 * sets a specific value.
 * NOTE(review): the param default is -1 while an earlier comment said
 * "Defaults to 0" -- presumably -1 means "unset, use the platform/plugin
 * default"; confirm against the call sites.
 */
OFI_NCCL_PARAM_INT(net_latency, "NET_LATENCY", -1);
/*
 * Eager message size limit when using RDMA protocol. Message sizes greater than
 * this limit will always be sent using RDMA write instead of eagerly.
 */
OFI_NCCL_PARAM_INT(eager_max_size, "EAGER_MAX_SIZE", 8192);
/*
 * Decide whether or not mutexes should default to errorcheck mode.
 * Defaults to no, unless debugging is enabled (NDEBUG unset or zero),
 * in which case it defaults to 1.
 */
#if defined(NDEBUG) && NDEBUG != 0
#define OFI_NCCL_PARAM_ERRORCHECK_MUTEX_DEFAULT 0
#else
#define OFI_NCCL_PARAM_ERRORCHECK_MUTEX_DEFAULT 1
#endif
OFI_NCCL_PARAM_INT(errorcheck_mutex, "ERRORCHECK_MUTEX",
OFI_NCCL_PARAM_ERRORCHECK_MUTEX_DEFAULT);
/*
 * If 0, create a Libfabric endpoint per domain, shared across all
 * communicators. If non-0, create a Libfabric endpoint per
 * communicator.
 */
OFI_NCCL_PARAM_INT(endpoint_per_communicator, "ENDPOINT_PER_COMM", 0);
/*
 * Some versions of NCCL (in particular, we know NCCL 2.21-2.23) will
 * not properly handle when the network plugin returns an error,
 * meaning that jobs can end up hanging if an asynchronous request
 * fails when calling test(). This is annoying for customers, so we
 * provide an environment variable to cause the plugin to abort the
 * job rather than returning an (ignored) error to NCCL.
 */
OFI_NCCL_PARAM_INT(abort_on_error, "ABORT_ON_ERROR", 0);
/*
 * Force using a specific tuner type.
 * "Internal" for NCCL internal tuner.
 * "Region" for NCCL OFI Region base tuner.
 * "Model" for NCCL OFI Model base tuner.
 */
OFI_NCCL_PARAM_STR(tuner_force_type, "TUNER_TYPE", NULL);
/*
 * The plugin interface lets us tune the number of channels as well, but that
 * can come later (once a proto+algo combination is chosen, we can compute the
 * cost with different channel count and optimize for it).
 */
OFI_NCCL_PARAM_INT(tuner_num_channels, "TUNER_NUM_CHANNELS", 8);
/*
 * Latency in µsecs. Note, this is currently different from the network plugin's param for
 * net latency by design. When we merge with the platform_data values, we will
 * need to do some additional testing on the base case where a tuner is not
 * loaded to make sure the same defaults make sense across both paths, and
 * combine the parameters. This parameter is meant for internal testing only and
 * is not meant to be documented for users.
 */
OFI_NCCL_PARAM_INT(tuner_net_latency, "TUNER_NET_LATENCY", 20);
/*
 * With EFA, we expect a ~2µsec cost in the device and ~1µsec cost to write that
 * completion up to the host stack.
 */
OFI_NCCL_PARAM_INT(tuner_net_comp_overhead, "TUNER_NET_COMP_OVERHEAD", 3);
/*
 * Do we want to set the LOW_LATENCY traffic class for control
 * messages? This generally improves performance for platforms that
 * support TCs, unless the prioritization over-reacts on the given network.
 */
OFI_NCCL_PARAM_INT(use_low_lat_tc, "USE_LOW_LATENCY_TC", 1);
/*
 * Number of rails that the rdma transport should build. If the
 * number of rails is more than the number of NICs, then the number of
 * rails must be a multiple of the number of NICs.
 */
OFI_NCCL_PARAM_INT(force_num_rails, "FORCE_NUM_RAILS", 0);
/*
 * 1 to enable early completion, 0 to disable it.
 * Default at -1 to follow the data progress model, given that the
 * early completion feature is contingent on the FI_PROGRESS_AUTO data progress
 * model, i.e. enabled when FI_PROGRESS_AUTO, otherwise disabled.
 */
OFI_NCCL_PARAM_INT(early_completion, "EARLY_COMPLETION", -1);
#endif // End NCCL_OFI_PARAM_H_