static ncclResult_t create_nccl_ofi_component()

in src/nccl_ofi_net.c [659:772]


/*
 * Open the libfabric objects for one provider and store them in
 * nccl_ofi_comp: fabric, domain, endpoint, completion queue (CQ) and
 * address vector (AV). The CQ and AV are bound to the endpoint, which is
 * then enabled. Also derives nccl_ofi_comp->max_tag from the provider's
 * mem_tag_format.
 *
 * @param prov           Provider description; fabric_attr, ep_attr and the
 *                       structure itself are passed to the fi_* open calls.
 * @param nccl_ofi_comp  Component to populate.
 *                       NOTE(review): the error path tests the handle
 *                       fields against NULL, so this presumably must be
 *                       zero-initialized by the caller -- confirm.
 *
 * @return ncclSuccess on success. ncclSystemError if the provider does not
 *         expose enough tag bits or if any libfabric call fails; in the
 *         latter case every handle opened here is closed and reset to NULL.
 */
static ncclResult_t create_nccl_ofi_component(struct fi_info *prov,
				     nccl_ofi_t *nccl_ofi_comp)
{
	ncclResult_t ret = ncclSuccess;
	/* libfabric return code; kept apart from the ncclResult_t status */
	int rc = 0;
	struct fi_cq_attr cq_attr = {0};
	struct fi_av_attr av_attr = {0};
	int ofi_tag_leading_zeroes = 0, ofi_tag_bits_for_ring_id = 64;

	/*
	 * Determine if any tag bits are used by provider: shift the tag
	 * format left one bit at a time until OFI_HIGHEST_TAG_BIT is set.
	 * Every leading zero bit reduces the bits available for the ring ID.
	 * The lower bound check also terminates the loop when the format is 0.
	 */
	while (!((prov->ep_attr->mem_tag_format << ofi_tag_leading_zeroes++) &
		(uint64_t) OFI_HIGHEST_TAG_BIT) &&
		(ofi_tag_bits_for_ring_id >= MIN_TAG_BITS_FOR_RING_ID)) {
		ofi_tag_bits_for_ring_id--;
	}

	if (OFI_UNLIKELY(ofi_tag_bits_for_ring_id < MIN_TAG_BITS_FOR_RING_ID)) {
		NCCL_OFI_WARN("Provider %s does not provide enough tag bits %d for ring ID. Minimum required is %d",
			      prov->fabric_attr->prov_name,
			      ofi_tag_bits_for_ring_id,
			      MIN_TAG_BITS_FOR_RING_ID);
		ret = ncclSystemError;
		/* Nothing has been opened yet, so skip the teardown */
		goto exit;
	}

	/* Set maximum tag information; Reserving 1 bit for control information */
	nccl_ofi_comp->max_tag = (uint64_t)((1ULL <<
					    (ofi_tag_bits_for_ring_id - 1)) - 1);

	/* Create fabric */
	rc = fi_fabric(prov->fabric_attr, &(nccl_ofi_comp->fabric), NULL);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't open a fabric provider. RC: %d, ERROR: %s",
			     rc, fi_strerror(-rc));
		ret = ncclSystemError;
		goto error;
	}

	/* Create domain */
	rc = fi_domain(nccl_ofi_comp->fabric, prov,
		       &(nccl_ofi_comp->domain), NULL);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s",
			     rc, fi_strerror(-rc));
		ret = ncclSystemError;
		goto error;
	}

	/* Create transport level communication endpoint(s) */
	rc = fi_endpoint(nccl_ofi_comp->domain, prov, &(nccl_ofi_comp->ep), NULL);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't allocate endpoint. RC: %d, ERROR: %s",
			     rc, fi_strerror(-rc));
		ret = ncclSystemError;
		goto error;
	}

	/* Completion queue reporting tagged-format entries */
	cq_attr.format = FI_CQ_FORMAT_TAGGED;
	rc = fi_cq_open(nccl_ofi_comp->domain, &cq_attr, &nccl_ofi_comp->cq, NULL);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't open CQ. RC: %d, ERROR: %s",
			     rc, fi_strerror(-rc));
		ret = ncclSystemError;
		goto error;
	}

	/* Address vector; all attributes are left zero-initialized */
	rc = fi_av_open(nccl_ofi_comp->domain, &av_attr, &nccl_ofi_comp->av, NULL);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't open AV. RC: %d, ERROR: %s",
			     rc, fi_strerror(-rc));
		ret = ncclSystemError;
		goto error;
	}

	/* Bind CQ and AV to endpoint */
	rc = fi_ep_bind(nccl_ofi_comp->ep, (fid_t)nccl_ofi_comp->cq, FI_SEND | FI_RECV);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't bind EP-CQ. RC: %d, ERROR: %s",
			     rc, fi_strerror(-rc));
		ret = ncclSystemError;
		goto error;
	}

	rc = fi_ep_bind(nccl_ofi_comp->ep, (fid_t)nccl_ofi_comp->av, 0);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't bind EP-AV. RC: %d, ERROR: %s",
			     rc, fi_strerror(-rc));
		ret = ncclSystemError;
		goto error;
	}

	/* Enable endpoint for communication */
	rc = fi_enable(nccl_ofi_comp->ep);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't enable endpoint. RC: %d, ERROR: %s",
			     rc, fi_strerror(-rc));
		ret = ncclSystemError;
		goto error;
	}

	return ret;
error:
	/*
	 * Tear down children before parents: the endpoint, AV and CQ were
	 * opened on the domain, and the domain on the fabric. Closing a
	 * parent that still has open children fails and leaks the parent.
	 * Handles are reset to NULL so no dangling pointer is left behind.
	 */
	if (nccl_ofi_comp->ep) {
		fi_close((fid_t)nccl_ofi_comp->ep);
		nccl_ofi_comp->ep = NULL;
	}
	if (nccl_ofi_comp->av) {
		fi_close((fid_t)nccl_ofi_comp->av);
		nccl_ofi_comp->av = NULL;
	}
	if (nccl_ofi_comp->cq) {
		fi_close((fid_t)nccl_ofi_comp->cq);
		nccl_ofi_comp->cq = NULL;
	}
	if (nccl_ofi_comp->domain) {
		fi_close((fid_t)nccl_ofi_comp->domain);
		nccl_ofi_comp->domain = NULL;
	}
	if (nccl_ofi_comp->fabric) {
		fi_close((fid_t)nccl_ofi_comp->fabric);
		nccl_ofi_comp->fabric = NULL;
	}
exit:
	return ret;
}