in src/platform-aws.cpp [596:754]
/*
 * Apply AWS/EFA specific configuration and sanity checks to a freshly
 * created endpoint.
 *
 * Steps, in order:
 *   1. Reject a NULL endpoint or missing provider information.
 *   2. Do nothing (return 0) when the provider is not "efa".
 *   3. Unless the check is disabled, fail if the platform data marks GDR
 *      as required but support_gdr is not GDR_SUPPORTED.
 *   4. For the RDMA protocol, unless the check is disabled, run
 *      validate_rdma_write() on the endpoint.
 *   5. (HAVE_CUDA builds only) Under a function-local mutex, try to
 *      enable the 128 byte in-order option matching the selected
 *      protocol, fall back to forcing NCCL_PROTO=simple when ordering is
 *      unavailable on the first endpoint, and for RDMA configure the
 *      maximum message size.
 *
 * @param info      fi_info describing the provider; only
 *                  info->fabric_attr->prov_name is read here.
 * @param endpoint  endpoint to configure.
 *
 * @return 0 on success; -EINVAL for invalid arguments, a GDR
 *         misconfiguration or an unknown protocol; -ENOTSUP when
 *         ordering that was available earlier can no longer be set or
 *         when NCCL_PROTO could not be forced to simple; otherwise the
 *         non-zero value returned by the failing helper.
 */
int platform_config_endpoint(struct fi_info *info, struct fid_ep* endpoint) {
	int ret = 0;
#if HAVE_CUDA
	const char *optname_name = "none";
	int optname = -1;
#endif

	if (endpoint == NULL) {
		NCCL_OFI_WARN("Unable to configure invalid endpoint");
		ret = -EINVAL;
		goto exit;
	}

	/* The provider name is dereferenced right below; validate the whole
	 * chain instead of crashing on incomplete provider information.
	 */
	if (info == NULL || info->fabric_attr == NULL ||
	    info->fabric_attr->prov_name == NULL) {
		NCCL_OFI_WARN("Unable to configure endpoint without provider information");
		ret = -EINVAL;
		goto exit;
	}

	/* short circuit when not using EFA */
	if (0 != strcmp(info->fabric_attr->prov_name, "efa")) {
		ret = 0;
		goto exit;
	}

	if (ofi_nccl_disable_gdr_required_check() == 0) {
		/* Ensure GDR is enabled on GDR-supported instances */
		struct ec2_platform_data *platform_data = get_platform_data();
		if (platform_data && platform_data->gdr_required && support_gdr != GDR_SUPPORTED) {
			NCCL_OFI_WARN("GDR disabled on GDR-supported instance type %s", platform_data->name);
			ret = -EINVAL;
			goto exit;
		}
	}

	/* If the selected communication protocol is RDMA write and the user did
	 * not disable the native RDMA support check, validate that the
	 * FI_OPT_EFA_EMULATED_WRITE endpoint option can be accessed, and that
	 * emulated writes are disabled.
	 */
	if (0 == strcasecmp("RDMA", nccl_ofi_selected_protocol) &&
	    ofi_nccl_disable_native_rdma_check() == 0) {
		ret = validate_rdma_write(endpoint);
		if (ret != 0) {
			goto exit;
		}
	}

#if HAVE_CUDA
	/* State shared by every call; only read or written while holding
	 * `mutex` below.
	 */
	static bool nccl_proto_configured = false;
	static bool need_ordering = false;
	static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

	/* During initialization, try to set
	 * FI_OPT_EFA_{SENDRECV,WRITE}_IN_ORDER_ALIGNED_128_BYTES to
	 * true to see if the LL/LL128 protocol is supported. After
	 * initialization, try to set the option to true again if it
	 * was previously set and error if we can't set them the same
	 * way later.
	 */
	if (0 == strcasecmp("SENDRECV", nccl_ofi_selected_protocol)) {
#if HAVE_DECL_FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES
		optname = FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES;
		optname_name = "FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES";
#endif
	} else if (0 == strcasecmp("RDMA", nccl_ofi_selected_protocol)) {
#if HAVE_DECL_FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES
		optname = FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES;
		optname_name = "FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES";
#endif
	} else {
		NCCL_OFI_WARN("unkonwn transport %s", nccl_ofi_selected_protocol);
		ret = -EINVAL;
		goto exit;
	}

	nccl_net_ofi_mutex_lock(&mutex);

	/* TODO: This is a temporary hack to disable setting
	 * NCCL_PROTO=simple on P5en when using the RDMA protocol. EFA
	 * on P5en does not currently report
	 * WRITE_IN_ORDER_ALIGNED_128_BYTES because it can deliver the
	 * (correct) payload twice. This violates the meaning of the
	 * WRITE_IN_ORDER_ALIGNED_128_BYTES flag in rdma-core, but
	 * does not violate any assumptions about buffer reuse in
	 * NCCL. We have confirmed that the EFA provider in Libfabric
	 * will not segment messages for fi_write(), so this is safe.
	 * Note that the SENDRECV protocol does have segmentation
	 * challenges that require us to obey the
	 * SENDRECV_IN_ORDER_ALIGNED_128_BYTES flag, so we only skip
	 * the check when using the RDMA protocol.
	 */
	if (!nccl_proto_configured &&
	    (NULL == getenv("NCCL_PROTO")) &&
	    (0 == strcasecmp("RDMA", nccl_ofi_selected_protocol))) {
		/* NOTE(review): guard against the helper returning NULL
		 * (its contract is not visible here); passing NULL to
		 * strcmp() would be undefined behavior. An unknown product
		 * name simply falls through to the regular ordering checks.
		 */
		const char *product_name = nccl_net_ofi_get_product_name();
		if (product_name != NULL &&
		    0 == strcmp(product_name, "p5en.48xlarge")) {
			NCCL_OFI_INFO(NCCL_INIT, "Skipping NCCL_PROTO checks on P5en + RDMA");
			need_ordering = false;
			nccl_proto_configured = true;
		}
	}

	/* If we know we need byte delivery ordering (need_ordering ==
	 * true) or this is the first time that we're configuring an
	 * endpoint (nccl_proto_configured == false), then try to
	 * configure ordering on the endpoint. The only time we care
	 * about ordering is if we don't set NCCL_PROTO=simple,
	 * because previous endpoints were able to be configured with
	 * ordering. If we're not expecting ordering, we don't really
	 * care if ordering is on or off for the endpoint.
	 */
	if (need_ordering || !nccl_proto_configured) {
		bool have_ordering = false;

		/* optname stays -1 when the matching option is not declared
		 * by the Libfabric headers; ordering is then unavailable.
		 */
		if (optname != -1) {
			ret = configure_ep_inorder(endpoint, optname, optname_name,
						   &have_ordering);
			if (ret != 0) {
				NCCL_OFI_WARN("Unexpected failure setting inorder %d", ret);
				goto unlock;
			}
		}

		if (need_ordering && !have_ordering) {
			NCCL_OFI_WARN("Setting %s option failed after succeeding during initialization",
				      optname_name);
			ret = -ENOTSUP;
			goto unlock;
		}

		if (!nccl_proto_configured) {
			if (!have_ordering) {
				/* When byte delivery ordering is not guaranteed, force
				 * the simple protocol as the LL/LL128 protocols can lead
				 * to data corruption without data delivery ordering.
				 */
				ret = nccl_net_ofi_configure_nccl_proto_simple("byte delivery ordering");
				if (ret != 0) {
					NCCL_OFI_WARN("Failed to set NCCL_PROTO: %d", ret);
					ret = -ENOTSUP;
					goto unlock;
				}
			}

			/* Commit the shared state only once the fallback above
			 * succeeded. Otherwise a failed attempt would be recorded
			 * as "configured, no ordering needed" and later endpoints
			 * would silently skip both ordering and the simple
			 * protocol fallback.
			 */
			need_ordering = have_ordering;
			nccl_proto_configured = true;
		}
	}

	if (0 == strcasecmp("RDMA", nccl_ofi_selected_protocol)) {
		ret = configure_ep_max_msg_size(endpoint);
		if (ret != 0) {
			NCCL_OFI_WARN("Unexpected failure setting max_msg_size %d", ret);
			goto unlock;
		}
	}

unlock:
	nccl_net_ofi_mutex_unlock(&mutex);
#endif // HAVE_CUDA

exit:
	return ret;
}