int platform_config_endpoint()

in src/platform-aws.cpp [596:754]


/*
 * Apply AWS/EFA-specific configuration to a freshly created Libfabric
 * endpoint.
 *
 * @param info     fi_info describing the endpoint's fabric; used to detect
 *                 the EFA provider. Must be non-NULL.
 * @param endpoint endpoint to configure. Must be non-NULL.
 * @return 0 on success (including the non-EFA short-circuit), negative
 *         errno-style code on failure.
 *
 * Side effects (HAVE_CUDA builds): the first endpoint configured decides,
 * via the static need_ordering/nccl_proto_configured state guarded by a
 * static mutex, whether 128-byte in-order delivery is available; later
 * endpoints must be configurable the same way or this fails with -ENOTSUP.
 * When ordering is unavailable, NCCL_PROTO is forced to "simple".
 */
int platform_config_endpoint(struct fi_info *info, struct fid_ep* endpoint) {
	int ret = 0;
#if HAVE_CUDA
	const char *optname_name = "none";
	int optname = -1;
#endif

	/* info->fabric_attr->prov_name is dereferenced below, so info must
	 * be validated here alongside endpoint. */
	if (info == NULL || endpoint == NULL) {
		NCCL_OFI_WARN("Unable to configure invalid endpoint");
		ret = -EINVAL;
		goto exit;
	}

	/* short circuit when not using EFA */
	if (0 != strcmp(info->fabric_attr->prov_name, "efa")) {
		ret = 0;
		goto exit;
	}

	if (ofi_nccl_disable_gdr_required_check() == 0) {
		/* Ensure GDR is enabled on GDR-supported instances */
		struct ec2_platform_data *platform_data = get_platform_data();
		if (platform_data && platform_data->gdr_required && support_gdr != GDR_SUPPORTED) {
			NCCL_OFI_WARN("GDR disabled on GDR-supported instance type %s", platform_data->name);
			ret = -EINVAL;
			goto exit;
		}
	}

	/* If the selected communication protocol is RDMA write and the user did
	 * not disable the native RDMA support check, validate that the
	 * FI_OPT_EFA_EMULATED_WRITE endpoint option can be accessed, and that
	 * emulated writes are disabled.
	 */

	if (0 == strcasecmp("RDMA", nccl_ofi_selected_protocol) &&
	    ofi_nccl_disable_native_rdma_check() == 0) {
		ret = validate_rdma_write(endpoint);
		if (ret != 0) {
			goto exit;
		}
	}

#if HAVE_CUDA
	/* Process-wide ordering state, shared by all endpoints and guarded
	 * by the mutex below. */
	static bool nccl_proto_configured = false;
	static bool need_ordering = false;
	static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

	/* During initialization, try to set
	 * FI_OPT_EFA_{SENDRECV,WRITE}_IN_ORDER_ALIGNED_128_BYTES to
	 * true to see if the LL/LL128 protocol is supported. After
	 * initialization, try to set the option to true again if it
	 * was previously set and error if we can't set them the same
	 * way later.
	 */
	if (0 == strcasecmp("SENDRECV", nccl_ofi_selected_protocol)) {
#if HAVE_DECL_FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES
		optname = FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES;
		optname_name = "FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES";
#endif
	} else if (0 == strcasecmp("RDMA", nccl_ofi_selected_protocol)) {
#if HAVE_DECL_FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES
		optname = FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES;
		optname_name = "FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES";
#endif
	} else {
		NCCL_OFI_WARN("unknown transport %s", nccl_ofi_selected_protocol);
		ret = -EINVAL;
		goto exit;
	}

	nccl_net_ofi_mutex_lock(&mutex);

	/* TODO: This is a temporary hack to disable setting
	 * NCCL_PROTO=simple on P5en when using the RDMA protocol.  EFA
	 * on P5en does not currently report
	 * WRITE_IN_ORDER_ALIGNED_128_BYTES because it can deliver the
	 * (correct) payload twice.  This violates the meaning of the
	 * WRITE_IN_ORDER_ALIGNED_128_BYTES flag in rdma-core, but
	 * does not violate any assumptions about buffer reuse in
	 * NCCL. We have confirmed that the EFA provider in Libfabric
	 * will not segment messages for fi_write(), so this is safe.
	 * Note that the SENDRECV protocol does have segmentation
	 * challenges that require us to obey the
	 * SENDRECV_IN_ORDER_ALIGNED_128_BYTES flag, so we only skip
	 * the check when using the RDMA protocol.
	 */
	if (!nccl_proto_configured) {
		if ((NULL == getenv("NCCL_PROTO")) &&
		    (0 == strcasecmp("RDMA", nccl_ofi_selected_protocol)) &&
		    (0 == strcmp(nccl_net_ofi_get_product_name(), "p5en.48xlarge"))) {
			NCCL_OFI_INFO(NCCL_INIT, "Skipping NCCL_PROTO checks on P5en + RDMA");
			need_ordering = false;
			nccl_proto_configured = true;
		}
	}

	/* If we know we need byte delivery ordering (need_ordering ==
	 * true) or this is the first time that we're configuring an
	 * endpoint (nccl_proto_configured == false), then try to
	 * configure ordering on the endpoint.  The only time we care
	 * about ordering is if we don't set NCCL_PROTO=simple,
	 * because previous endpoints were able to be configured with
	 * ordering.  If we're not expecting ordering, we don't really
	 * care if ordering is on or off for the endpoint.
	 */
	if (need_ordering || !nccl_proto_configured) {
		bool have_ordering = false;

		/* optname stays -1 when the relevant FI_OPT_EFA_* option was
		 * not declared at build time; in that case have_ordering
		 * remains false. */
		if (optname != -1) {
			ret = configure_ep_inorder(endpoint, optname, optname_name,
						   &have_ordering);
			if (ret != 0) {
				NCCL_OFI_WARN("Unexpected failure setting inorder %d", ret);
				goto unlock;
			}
		}

		/* A previous endpoint established ordering, but this one
		 * cannot provide it -- mixing the two would corrupt data, so
		 * fail hard. */
		if (need_ordering && !have_ordering) {
			NCCL_OFI_WARN("Setting %s option failed after succeeding during initialization",
				      optname_name);
			ret = -ENOTSUP;
			goto unlock;
		}

		if (!nccl_proto_configured) {
			/* First endpoint: record the ordering capability as
			 * the process-wide policy for all later endpoints. */
			need_ordering = have_ordering;
			nccl_proto_configured = true;

			if (!have_ordering) {
				/* When byte delivery ordering is not guaranteed, force
				 * the simple protocol as the LL/LL128 protocols can lead
				 * to data corruption without data delivery ordering.
				 */
				ret = nccl_net_ofi_configure_nccl_proto_simple("byte delivery ordering");
				if (ret != 0) {
					NCCL_OFI_WARN("Failed to set NCCL_PROTO: %d", ret);
					ret = -ENOTSUP;
					goto unlock;
				}
			}
		}
	}

	if (0 == strcasecmp("RDMA", nccl_ofi_selected_protocol)) {
		ret = configure_ep_max_msg_size(endpoint);
		if (ret != 0) {
			NCCL_OFI_WARN("Unexpected failure setting max_msg_size %d", ret);
			goto unlock;
		}
	}

unlock:
	nccl_net_ofi_mutex_unlock(&mutex);
#endif // HAVE_CUDA

exit:
	return ret;
}