in src/nccl_ofi_rdma.cpp [7255:7376]
/**
 * Create and initialize an RDMA endpoint on the given domain.
 *
 * Allocates an nccl_net_ofi_rdma_ep_t, wires up its base vtable
 * (listen/connect/release/free), sizes its data and control rails,
 * initializes per-rail OFI resources and rx buffers, and hands the
 * endpoint back through @base_ep.
 *
 * @param base_domain	Domain to create the endpoint on (must be a
 *			nccl_net_ofi_rdma_domain_t; NULL is rejected).
 * @param base_ep	Out: on success, set to the new endpoint's base.
 *
 * @return 0 on success; negative errno-style code on failure. On
 *	   failure, all partially-initialized endpoint state is torn
 *	   down before returning.
 */
static int nccl_net_ofi_rdma_domain_create_endpoint(nccl_net_ofi_domain_t *base_domain,
						    nccl_net_ofi_ep_t **base_ep)
{
	int ret = 0;
	nccl_net_ofi_rdma_ep_t *ep = NULL;
	nccl_net_ofi_rdma_domain_t *domain = NULL;
	nccl_net_ofi_rdma_device_t *device = NULL;

	domain = (nccl_net_ofi_rdma_domain_t *)base_domain;
	if (OFI_UNLIKELY(domain == NULL)) {
		NCCL_OFI_WARN("Invalid domain provided");
		return -EINVAL;
	}

	device = rdma_domain_get_device(domain);
	assert(device != NULL);

	/* Allocate endpoint. calloc zero-initializes every field,
	 * including the base function pointers — relevant for the early
	 * failure path below. */
	ep = (nccl_net_ofi_rdma_ep_t *)calloc(1, sizeof(nccl_net_ofi_rdma_ep_t));
	if (!ep) {
		NCCL_OFI_WARN("Unable to allocate rdma endpoint");
		return -ENOMEM;
	}

	ret = nccl_net_ofi_endpoint_init(&domain->base, &ep->base);
	if (ret != 0) {
		NCCL_OFI_WARN("Initializing endpoint base failed");
		/* BUG FIX: cannot take the generic `error` path here.
		 * ep->base.release_ep is assigned only below, so it is
		 * still NULL (calloc) — the shared error path would call
		 * through a NULL function pointer and also leak ep. Free
		 * the bare allocation directly instead. */
		free(ep);
		return ret;
	}

	/* Base vtable. From this point on, release_ep is valid and the
	 * shared `error` path may be used for cleanup. */
	ep->base.listen = listen;
	ep->base.connect = connect;
	ep->base.release_ep = nccl_net_ofi_rdma_endpoint_release;
	ep->base.free_ep = nccl_net_ofi_rdma_endpoint_free;

	ep->num_rails = domain->num_rails;
	if (ofi_nccl_rdma_rr_ctrl_msg()) {
		/*
		 * Round robin the control message across all rails by using dedicated
		 * endpoints with CQs shared with the data endpoints.
		 */
		ep->num_control_rails = domain->num_rails;
	} else {
		/*
		 * Use a single rail for control messages, with a dedicated
		 * endpoint and a CQ shared with the data endpoint.
		 */
		ep->num_control_rails = 1;
	}

	ep->use_long_rkeys = device->use_long_rkeys;

	ep->rails = (nccl_net_ofi_ep_rail_t *)calloc(ep->num_rails,
						     sizeof(nccl_net_ofi_ep_rail_t));
	if (!ep->rails) {
		NCCL_OFI_WARN("Unable to allocate rdma rails");
		ret = -ENOMEM;
		goto error;
	}

	ep->control_rails = (nccl_net_ofi_ep_rail_t *)calloc(ep->num_control_rails,
							     sizeof(nccl_net_ofi_ep_rail_t));
	if (!ep->control_rails) {
		NCCL_OFI_WARN("Unable to allocate rdma control rails");
		ret = -ENOMEM;
		goto error;
	}

	/* NOTE(review): plain `new` throws std::bad_alloc on OOM rather
	 * than following the -ENOMEM convention used elsewhere in this
	 * function — confirm whether the build expects exceptions here. */
	ep->pending_reqs_queue = new std::deque<nccl_net_ofi_rdma_req_t *>;

	ret = nccl_net_ofi_mutex_init(&ep->pending_reqs_lock, NULL);
	if (ret != 0) {
		NCCL_OFI_WARN("Mutex initialization failed: %s", strerror(ret));
		goto error;
	}

	/* Control rx buffers must be able to hold the largest of the
	 * control, connection, and close message layouts. */
	ep->ctrl_rx_buff_size = std::max({sizeof(nccl_net_ofi_rdma_ctrl_msg_t),
					  sizeof(nccl_ofi_rdma_connection_info_t),
					  sizeof(nccl_net_ofi_rdma_close_msg_t)});
	ep->eager_send_size = ofi_nccl_eager_max_size();
	/* Work around EFA provider bug around posting 0 byte rx buffers by not
	   posting 0 byte rx buffers. Note that if eager_send_size is -1
	   (disabled), eager_rx_buff_size will also be -1. */
	ep->eager_rx_buff_size = (ep->eager_send_size == 0) ?
		EAGER_RX_BUFFER_ALIGNMENT : ep->eager_send_size;

	ep->is_endpoint_per_communicator_ep = false;

	ret = init_rail_ofi_resources(device, domain, ep);
	if (ret != 0) {
		goto error;
	}

	ret = init_rx_buffers(ep);
	if (ret != 0) {
		NCCL_OFI_WARN("Preparation of rx buffers failed");
		goto error;
	}

	NCCL_OFI_TRACE(NCCL_NET, "RDMA endpoint %p for dev #%d is created",
		       ep,
		       device->base.dev_id);

	*base_ep = &ep->base;

	/* During plugin initialization, this function is invoked the
	 * first time. Consequently, initialization function of
	 * maximum write inline size is executed on initialization
	 * path the first time, avoiding data race on
	 * `max_write_inline_size` when `get_properties()` function
	 * reads the maximum write inline size variable.
	 * (ret is necessarily 0 on every path reaching this point, so
	 * no guard is needed before the call.) */
	ret = init_max_write_inline_size_if_not_initialized(device, ep);

 error:
	if (ret != 0) {
		/* release_ep is guaranteed non-NULL here: every goto to
		 * this label occurs after the vtable assignment above. */
		ep->base.release_ep(&(ep->base), false, false);
	}

	return ret;
}