in src/nccl_ofi_net.c [659:772]
/*
 * Open and wire together the libfabric objects for one provider and store
 * them in *nccl_ofi_comp: fabric, domain, endpoint, completion queue (CQ) and
 * address vector (AV). The CQ and AV are bound to the endpoint, which is then
 * enabled. Also derives nccl_ofi_comp->max_tag from the provider's
 * mem_tag_format.
 *
 * prov          - provider description used to open every object.
 * nccl_ofi_comp - receives the opened handles and max_tag.
 *
 * Returns ncclSuccess when the endpoint has been enabled, ncclSystemError
 * otherwise. On failure every handle that is non-NULL in *nccl_ofi_comp is
 * closed and reset to NULL.
 *
 * NOTE(review): the error path tests the handles for NULL, so it presumably
 * relies on the caller passing a zero-initialized nccl_ofi_t - confirm
 * against the call sites.
 */
static ncclResult_t create_nccl_ofi_component(struct fi_info *prov,
					      nccl_ofi_t *nccl_ofi_comp)
{
	ncclResult_t ret = ncclSuccess;
	/*
	 * libfabric reports failures as negative int codes. Keep them in a
	 * plain int rather than in the ncclResult_t enum so that negation and
	 * "%d" formatting are well defined.
	 */
	int rc = 0;
	struct fi_cq_attr cq_attr = {0};
	struct fi_av_attr av_attr = {0};
	int ofi_tag_leading_zeroes = 0, ofi_tag_bits_for_ring_id = 64;

	/*
	 * Determine if any tag bits are used by provider: walk mem_tag_format
	 * from its most significant bit and drop one usable bit for every
	 * leading zero, stopping at the first set bit or once too few bits
	 * remain.
	 */
	while (!((prov->ep_attr->mem_tag_format << ofi_tag_leading_zeroes++) &
		 (uint64_t) OFI_HIGHEST_TAG_BIT) &&
	       (ofi_tag_bits_for_ring_id >= MIN_TAG_BITS_FOR_RING_ID)) {
		ofi_tag_bits_for_ring_id--;
	}

	if (OFI_UNLIKELY(ofi_tag_bits_for_ring_id < MIN_TAG_BITS_FOR_RING_ID)) {
		NCCL_OFI_WARN("Provider %s does not provide enough tag bits %d for ring ID. Minimum required is %d",
			      prov->fabric_attr->prov_name,
			      ofi_tag_bits_for_ring_id,
			      MIN_TAG_BITS_FOR_RING_ID);
		/* Nothing has been opened yet, so skip the teardown. */
		ret = ncclSystemError;
		goto exit;
	}

	/* Set maximum tag information; Reserving 1 bit for control information */
	nccl_ofi_comp->max_tag = (uint64_t)((1ULL <<
					     (ofi_tag_bits_for_ring_id - 1)) - 1);

	/* Create fabric */
	rc = fi_fabric(prov->fabric_attr, &(nccl_ofi_comp->fabric), NULL);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't open a fabric provider. RC: %d, ERROR: %s",
			      rc, fi_strerror(-rc));
		goto error;
	}

	/* Create domain */
	rc = fi_domain(nccl_ofi_comp->fabric, prov,
		       &(nccl_ofi_comp->domain), NULL);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s",
			      rc, fi_strerror(-rc));
		goto error;
	}

	/* Create transport level communication endpoint(s) */
	rc = fi_endpoint(nccl_ofi_comp->domain, prov, &(nccl_ofi_comp->ep), NULL);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't allocate endpoint. RC: %d, ERROR: %s",
			      rc, fi_strerror(-rc));
		goto error;
	}

	/* Completion queue reporting tagged-message completions */
	cq_attr.format = FI_CQ_FORMAT_TAGGED;
	rc = fi_cq_open(nccl_ofi_comp->domain, &cq_attr, &nccl_ofi_comp->cq, NULL);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't open CQ. RC: %d, ERROR: %s",
			      rc, fi_strerror(-rc));
		goto error;
	}

	/* Address vector, opened with all-zero (default) attributes */
	rc = fi_av_open(nccl_ofi_comp->domain, &av_attr, &nccl_ofi_comp->av, NULL);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't open AV. RC: %d, ERROR: %s",
			      rc, fi_strerror(-rc));
		goto error;
	}

	/* Bind CQ and AV to endpoint */
	rc = fi_ep_bind(nccl_ofi_comp->ep, (fid_t)nccl_ofi_comp->cq, FI_SEND | FI_RECV);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't bind EP-CQ. RC: %d, ERROR: %s",
			      rc, fi_strerror(-rc));
		goto error;
	}

	rc = fi_ep_bind(nccl_ofi_comp->ep, (fid_t)nccl_ofi_comp->av, 0);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't bind EP-AV. RC: %d, ERROR: %s",
			      rc, fi_strerror(-rc));
		goto error;
	}

	/* Enable endpoint for communication */
	rc = fi_enable(nccl_ofi_comp->ep);
	if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Couldn't enable endpoint. RC: %d, ERROR: %s",
			      rc, fi_strerror(-rc));
		goto error;
	}

	return ret;

 error:
	ret = ncclSystemError;

	/*
	 * Tear down in reverse order of creation. The endpoint must go before
	 * the CQ/AV bound to it, and all three before the domain they were
	 * opened on; libfabric refuses to close an object that still has
	 * dependants, which would leak the domain and fabric. Close failures
	 * are ignored here: this is best-effort cleanup on an already failing
	 * path.
	 */
	if (nccl_ofi_comp->ep) {
		fi_close((fid_t)nccl_ofi_comp->ep);
		nccl_ofi_comp->ep = NULL;
	}
	if (nccl_ofi_comp->av) {
		fi_close((fid_t)nccl_ofi_comp->av);
		nccl_ofi_comp->av = NULL;
	}
	if (nccl_ofi_comp->cq) {
		fi_close((fid_t)nccl_ofi_comp->cq);
		nccl_ofi_comp->cq = NULL;
	}
	if (nccl_ofi_comp->domain) {
		fi_close((fid_t)nccl_ofi_comp->domain);
		nccl_ofi_comp->domain = NULL;
	}
	if (nccl_ofi_comp->fabric) {
		fi_close((fid_t)nccl_ofi_comp->fabric);
		nccl_ofi_comp->fabric = NULL;
	}
 exit:
	return ret;
}