in src/nccl_ofi_sendrecv.cpp [2006:2112]
/*
 * @brief	Allocate and initialize a send communicator for the peer
 *		described by a connection handle.
 *
 * The function validates the tag carried in the handle, inserts the remote
 * endpoint address into the endpoint's address vector, allocates the send
 * communicator, fills a connection-info entry with the local endpoint name
 * and creates the freelist used for data-path requests.
 *
 * @param	handle
 *		Connection handle; `ep_name` supplies the remote endpoint
 *		address and `comm_id` supplies the tag
 * @param	ep
 *		Endpoint providing the address vector, the libfabric endpoint
 *		and the connection-message freelist
 * @param	s_comm
 *		Output. Set to NULL on entry and to the newly created send
 *		communicator on success
 *
 * @return	0, on success
 *		-EINVAL, if the device cannot be accessed, the tag is outside
 *		[1, max_tag], or the address vector insertion fails
 *		-ENOMEM, if the communicator or the connection-info entry
 *		cannot be allocated
 *		the non-zero return code of fi_getname() or
 *		nccl_ofi_freelist_init(), if one of them fails
 */
static inline int sendrecv_send_comm_create(nccl_net_ofi_conn_handle_t *handle,
					    nccl_net_ofi_sendrecv_ep_t *ep,
					    nccl_net_ofi_sendrecv_send_comm_t **s_comm)
{
	/*
	 * All locals are declared up front: the error paths below use
	 * `goto out`, which must not jump over an initialization.
	 */
	char remote_ep_addr[MAX_EP_ADDR] = {};
	uint64_t tag = 0ULL;
	uint64_t max_tag = 0;
	size_t req_size = sizeof(nccl_net_ofi_sendrecv_req_t);
	fi_addr_t remote_addr;
	nccl_net_ofi_sendrecv_send_comm_t *ret_s_comm = NULL;
	nccl_ofi_connection_info_t *conn_info = NULL;
	int ret = 0;

	/* Never leave a stale pointer in the output on failure */
	*s_comm = NULL;

	/* Retrieve and validate device */
	nccl_net_ofi_sendrecv_device_t *device = sendrecv_endpoint_get_device(ep);
	if (OFI_UNLIKELY(device == NULL)) {
		NCCL_OFI_WARN("Error accessing device.");
		return -EINVAL;
	}
	max_tag = device->max_tag;

	/* Get tag and remote name from handle */
	memcpy(&remote_ep_addr, handle->ep_name, MAX_EP_ADDR);
	memcpy(&tag, &handle->comm_id, sizeof(handle->comm_id));
	if (tag < 1 || tag > max_tag) {
		NCCL_OFI_WARN("Received an invalid tag %lu for device %d", tag,
			      device->base.dev_id);
		return -EINVAL;
	}

	/* Insert remote address into AV; exactly one address must be inserted */
	ret = fi_av_insert(ep->av,
			   (void *)remote_ep_addr, 1,
			   &remote_addr, 0, NULL);
	if (OFI_UNLIKELY(ret != 1)) {
		NCCL_OFI_WARN("Unable to insert remote address into address vector for device %d. RC: %d",
			      device->base.dev_id, ret);
		return -EINVAL;
	}

	/* Allocate and initialize send_comm (zeroed, so conn_info starts as NULL) */
	ret_s_comm = (nccl_net_ofi_sendrecv_send_comm_t *)
		calloc(1, sizeof(nccl_net_ofi_sendrecv_send_comm_t));
	if (OFI_UNLIKELY(ret_s_comm == NULL)) {
		NCCL_OFI_WARN("Couldn't allocate send_comm for dev %d", device->base.dev_id);
		return -ENOMEM;
	}

	ret_s_comm->base.base.type = NCCL_NET_OFI_SEND_COMM;
	ret_s_comm->base.base.ep = &ep->base;
	ret_s_comm->base.base.dev_id = device->base.dev_id;
	ret_s_comm->base.regMr = sendrecv_send_comm_reg_mr;
	ret_s_comm->base.deregMr = sendrecv_send_comm_dereg_mr;
	ret_s_comm->base.send = sendrecv_send_comm_send;
	ret_s_comm->base.close = sendrecv_send_comm_close;
	ret_s_comm->base.write = NULL;
	ret_s_comm->base.write_inline = NULL;
	ret_s_comm->tag = tag;
	ret_s_comm->local_ep = ep->ofi_ep;
	ret_s_comm->remote_ep = remote_addr;

	/* Connection-info message is carved out of the endpoint's freelist */
	ret_s_comm->conn_info = nccl_ofi_freelist_entry_alloc(ep->conn_msg_fl);
	if (ret_s_comm->conn_info == NULL) {
		NCCL_OFI_WARN("Could not allocate connect connection info");
		ret = -ENOMEM;
		goto out;
	}

	conn_info = (nccl_ofi_connection_info_t *)ret_s_comm->conn_info->ptr;

	/* Record the local endpoint name in the connection-info message */
	conn_info->ep_namelen = sizeof(conn_info->ep_name);
	ret = fi_getname(&(ep->ofi_ep->fid),
			 (void *)conn_info->ep_name,
			 &conn_info->ep_namelen);
	if (ret == -FI_ETOOSMALL) {
		NCCL_OFI_WARN("Endpoint's address length (%zu) is larger than supplied buffer length (%d)",
			      conn_info->ep_namelen, MAX_EP_ADDR);
		goto out;
	} else if (ret != 0) {
		NCCL_OFI_WARN("Call to fi_getname() failed with RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto out;
	}

	/* Flag the case where the local and remote endpoint names are identical */
	conn_info->connect_to_self =
		(0 == memcmp(conn_info->ep_name, remote_ep_addr, conn_info->ep_namelen)) ? 1 : 0;

	/* Pre-allocated buffers for data path */
	ret = nccl_ofi_freelist_init(req_size, 16, 16, NCCL_OFI_MAX_SEND_REQUESTS,
				     sendrecv_fl_req_entry_init, NULL,
				     &ret_s_comm->nccl_ofi_reqs_fl);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Could not allocate NCCL OFI requests free list for dev %d",
			      device->base.dev_id);
		goto out;
	}

	*s_comm = ret_s_comm;

 out:
	if (ret != 0) {
		/*
		 * Hand the connection-info entry back to the endpoint's
		 * freelist before dropping the communicator; otherwise the
		 * entry stays checked out with no remaining reference to it.
		 */
		if (ret_s_comm->conn_info != NULL) {
			nccl_ofi_freelist_entry_free(ep->conn_msg_fl, ret_s_comm->conn_info);
		}
		free(ret_s_comm);
	}

	return ret;
}