in src/nccl_ofi_net.c [1757:1829]
/*
 * Post a non-blocking tagged receive on a receive communicator.
 *
 * @param recvComm  Receive communicator (recvComm_t *); must not be NULL.
 * @param data      Destination buffer handed to fi_trecv().
 * @param size      Length of the buffer in bytes; must not be negative.
 * @param mhandle   Memory registration handle, or NULL. When non-NULL its
 *                  descriptor (fi_mr_desc()) is passed to fi_trecv().
 * @param request   Output; must not be NULL. Receives the request tracking
 *                  the posted receive, or NULL if nothing was posted.
 *
 * @return ncclSuccess with *request set to the new request when the receive
 *         was posted;
 *         ncclSuccess with *request == NULL when fi_trecv() returned
 *         -FI_EAGAIN (nothing was posted, the allocated request is released);
 *         ncclSystemError on invalid arguments, when NCCL_OFI_MAX_REQUESTS
 *         requests are already inflight, when no request could be allocated,
 *         or when fi_trecv() fails with any other error;
 *         otherwise the non-zero value returned by nccl_ofi_progress().
 *         On every failure *request is NULL (when request itself is valid).
 */
static ncclResult_t ofi_irecv(void* recvComm, void* data, int size,
			      void *mhandle, void** request)
{
	ncclResult_t ret = ncclSuccess;
	ssize_t rc = 0;
	nccl_ofi_req_t *req = NULL;
	recvComm_t *rComm = (recvComm_t *)recvComm;
	void *desc = NULL;

	/* Validate the output pointer before anything writes through it */
	if (OFI_UNLIKELY(request == NULL)) {
		ret = ncclSystemError;
		NCCL_OFI_WARN("Invalid request pointer provided");
		goto error;
	}

	/*
	 * Give the caller a defined value on every early-exit path; it is
	 * only overwritten once the receive has actually been posted.
	 */
	*request = NULL;

	/* Validate recvComm */
	if (OFI_UNLIKELY(rComm == NULL)) {
		ret = ncclSystemError;
		NCCL_OFI_WARN("Invalid recvComm provided");
		goto error;
	}

	/*
	 * fi_trecv() takes an unsigned length; a negative int would silently
	 * wrap to a huge value, so reject it here.
	 */
	if (OFI_UNLIKELY(size < 0)) {
		ret = ncclSystemError;
		NCCL_OFI_WARN("Invalid receive size provided: %d", size);
		goto error;
	}

	/* Support only NCCL_OFI_MAX_REQUESTS inflight requests. */
	if (OFI_UNLIKELY(rComm->num_inflight_reqs == NCCL_OFI_MAX_REQUESTS)) {
		ret = ncclSystemError;
		NCCL_OFI_WARN("Can not support more than %d inflight requests",
			      NCCL_OFI_MAX_REQUESTS);
		goto error;
	}

	/* Allocate NCCL OFI request */
	req = allocate_nccl_ofi_request(rComm->nccl_ofi_reqs_fl);
	if (OFI_UNLIKELY(req == NULL)) {
		ret = ncclSystemError;
		NCCL_OFI_WARN("Unable to get NCCL OFI request for device %d",
			      rComm->dev);
		goto error;
	}

	/* Progress NCCL OFI */
	ret = nccl_ofi_progress(nccl_ofi_component[rComm->dev]);
	if (OFI_UNLIKELY(ret != 0)) {
		goto error;
	}

	req->rComm = rComm;
	req->dev = rComm->dev;
	req->direction = NCCL_OFI_RECV;

	if (mhandle != NULL) {
		desc = fi_mr_desc(mhandle);
	}

	/* Try posting buffer to local EP */
	rc = fi_trecv(rComm->local_ep, data, (size_t)size, desc,
		      FI_ADDR_UNSPEC, rComm->tag, 0, &req->ctx);
	if (rc == -FI_EAGAIN) {
		/*
		 * Not an error: ret is still ncclSuccess and *request is
		 * already NULL. The error label is reused only to release
		 * the request that was never posted.
		 */
		goto error;
	}
	else if (rc != 0) {
		NCCL_OFI_WARN("Unable to post receive buffer for dev %d. RC: %zd, ERROR: %s",
			      rComm->dev, rc, fi_strerror(-rc));
		ret = ncclSystemError;
		goto error;
	}

	/* Count the request only once it has been posted */
	rComm->num_inflight_reqs++;

	/* Return request to NCCL */
	*request = req;

	goto exit;

error:
	/* Release the request if one was allocated but not handed out */
	if (req) {
		free_nccl_ofi_req(req, false);
	}
exit:
	return ret;
}