in src/nccl_ofi_rdma.cpp [5181:5260]
/**
 * @brief	Create a listen communicator on an RDMA endpoint and fill in the
 *		connection handle that is handed to the connecting peer.
 *
 * Posts rx buffers on the endpoint, copies the local endpoint name of the
 * first control rail into the handle, allocates a listen communicator and a
 * communicator ID from the device's ID pool, registers the communicator in
 * the device's lookup table, and prepares the request used to receive
 * connect messages.
 *
 * @param	base_ep
 *		Endpoint to listen on; treated as a nccl_net_ofi_rdma_ep_t
 * @param	handle
 *		Output. Zeroed, then populated with the endpoint name and the
 *		communicator ID of the new listen communicator
 * @param	listen_comm
 *		Output. Set to the new listen communicator on success; left
 *		untouched on failure
 *
 * @return	0 on success
 *		-EINVAL if the local endpoint name does not fit into the handle
 *		-ENOMEM if the communicator or a communicator ID cannot be allocated
 *		error code of post_rx_buffs() or prepare_recv_conn_req() otherwise
 */
static int listen(nccl_net_ofi_ep_t *base_ep,
		  nccl_net_ofi_conn_handle_t *handle,
		  nccl_net_ofi_listen_comm_t **listen_comm)
{
	int ret = 0;
	nccl_net_ofi_rdma_listen_comm_t *l_comm = NULL;
	size_t comm_id = 0;
	/* Tracks whether l_comm was published in the device's lookup table,
	 * so that the error path knows to withdraw it again. */
	bool comm_registered = false;
	nccl_net_ofi_rdma_ep_t *ep =
		(nccl_net_ofi_rdma_ep_t *)base_ep;
	nccl_net_ofi_ep_rail_t *first_control_rail = rdma_endpoint_get_control_rail(ep, 0);

	/* Retrieve and validate device */
	nccl_net_ofi_rdma_device_t *device = rdma_endpoint_get_device(ep);
	assert(device != NULL);
	int dev_id = device->base.dev_id;

	/* The endpoint name is copied into a fixed-size field of the handle.
	 * Reject a length that would overflow it before any side effects. */
	assert(sizeof(handle->ep_name) == sizeof(first_control_rail->local_ep_name));
	if (OFI_UNLIKELY(first_control_rail->local_ep_name_len > sizeof(handle->ep_name))) {
		NCCL_OFI_WARN("Endpoint name length %zu exceeds handle capacity %zu",
			      (size_t)first_control_rail->local_ep_name_len,
			      sizeof(handle->ep_name));
		return -EINVAL;
	}

	ret = post_rx_buffs(ep);
	if (ret != 0) {
		NCCL_OFI_WARN("Error posting rx buffers: %d", ret);
		return ret;
	}

	/* Build handle.
	 *
	 * We don't copy the size of the endpoint name since the handle doesn't
	 * have a size field. The size will be distributed later by the connect
	 * response message. Zeroing the whole handle up front also zeroes the
	 * bytes of ep_name that the copy below leaves unused. */
	memset(handle, 0, sizeof(nccl_net_ofi_conn_handle_t));
	memcpy(handle->ep_name, first_control_rail->local_ep_name,
	       first_control_rail->local_ep_name_len);

	/* Build listen_comm */
	l_comm = (nccl_net_ofi_rdma_listen_comm_t *)calloc(1,
		sizeof(nccl_net_ofi_rdma_listen_comm_t));
	if (OFI_UNLIKELY(l_comm == NULL)) {
		NCCL_OFI_WARN("Couldn't allocate listen_comm for dev %d", dev_id);
		ret = -ENOMEM;
		goto error;
	}

	/* Initialize listen communicator */
	l_comm->base.base.type = NCCL_NET_OFI_LISTEN_COMM;
	l_comm->base.base.ep = base_ep;
	l_comm->base.base.dev_id = dev_id;
	l_comm->base.accept = accept;
	l_comm->base.close = listen_close;

	/* Allocate listen communicator ID */
	comm_id = device->comm_idpool->allocate_id();
	if (OFI_UNLIKELY(comm_id == FI_KEY_NOTAVAIL)) {
		NCCL_OFI_WARN("No communicator ID available for listen_comm on dev %d",
			      dev_id);
		/* Mark the ID as invalid so the error path does not free it */
		l_comm->comm_id = COMM_ID_INVALID;
		ret = -ENOMEM;
		goto error;
	}
	l_comm->comm_id = (uint32_t)comm_id;
	handle->comm_id = l_comm->comm_id;

	/* Add listen comm to the device's lookup table */
	rdma_device_set_comm(device, l_comm->comm_id, &l_comm->base.base);
	comm_registered = true;

	/* Prepare receive request to accept connections */
	ret = prepare_recv_conn_req(l_comm);
	if (ret != 0)
		goto error;

	*listen_comm = &l_comm->base;
	goto exit;

error:
	if (l_comm != NULL) {
		if (comm_registered) {
			/* Withdraw the communicator from the lookup table so
			 * that no entry is left pointing at freed memory.
			 * NOTE(review): assumes rdma_device_set_comm() accepts
			 * NULL to clear a slot -- confirm against its
			 * definition. */
			rdma_device_set_comm(device, l_comm->comm_id, NULL);
		}
		if (COMM_ID_INVALID != l_comm->comm_id) {
			device->comm_idpool->free_id(l_comm->comm_id);
		}
		free(l_comm);
	}
exit:
	return ret;
}