in src/nccl_ofi_net.c [1672:1755]
/*
 * Post a non-blocking tagged send on a send communicator.
 *
 * sendComm  Send communicator; cast to sendComm_t internally.
 * data      Buffer to send.
 * size      Number of bytes to send, passed through to fi_tsend().
 * mhandle   Memory registration handle, or NULL. When non-NULL its
 *           descriptor is obtained with fi_mr_desc() and handed to
 *           fi_tsend().
 * request   Output. Set to the newly allocated request when the send was
 *           posted, and to NULL in every other case.
 *
 * Returns ncclSuccess when the send was posted, and also when fi_tsend()
 * reported -FI_EAGAIN (in which case *request is NULL and nothing was
 * posted). Returns ncclSystemError on invalid arguments, when the
 * in-flight limit has been reached, when no request could be allocated,
 * when the device component is missing, or when fi_tsend() fails with any
 * other code. A non-zero result from nccl_ofi_progress() is returned
 * unchanged.
 */
static ncclResult_t ofi_isend(void *sendComm, void* data, int size,
			      void *mhandle, void** request)
{
	ncclResult_t ret = ncclSuccess;
	ssize_t rc = 0;
	nccl_ofi_req_t *req = NULL;
	sendComm_t *sComm = (sendComm_t *)sendComm;
	nccl_ofi_t *nccl_ofi_comp = NULL;
	void *desc = NULL;

	/* Validate the output pointer before it is ever written through */
	if (OFI_UNLIKELY(request == NULL)) {
		ret = ncclSystemError;
		NCCL_OFI_WARN("Invalid request pointer provided");
		goto error;
	}

	/*
	 * Give the caller a well-defined value on every early exit; it is
	 * overwritten only once the send has actually been posted.
	 */
	*request = NULL;

	/* Validate sendComm */
	if (OFI_UNLIKELY(sComm == NULL)) {
		ret = ncclSystemError;
		NCCL_OFI_WARN("Invalid sendComm provided");
		goto error;
	}

	/*
	 * Support only NCCL_OFI_MAX_REQUESTS inflight requests. Use >= so
	 * the guard still holds if the counter ever overshoots the limit.
	 */
	if (OFI_UNLIKELY(sComm->num_inflight_reqs >= NCCL_OFI_MAX_REQUESTS)) {
		ret = ncclSystemError;
		NCCL_OFI_WARN("Can not support more than %d inflight requests",
			      NCCL_OFI_MAX_REQUESTS);
		goto error;
	}

	/* Allocate NCCL OFI request */
	req = allocate_nccl_ofi_request(sComm->nccl_ofi_reqs_fl);
	if (OFI_UNLIKELY(req == NULL)) {
		ret = ncclSystemError;
		NCCL_OFI_WARN("Unable to get NCCL OFI request for device %d",
			      sComm->dev);
		goto error;
	}

	req->sComm = sComm;
	req->dev = sComm->dev;
	req->direction = NCCL_OFI_SEND;

	nccl_ofi_comp = nccl_ofi_component[sComm->dev];
	if (OFI_UNLIKELY(nccl_ofi_comp == NULL)) {
		ret = ncclSystemError;
		NCCL_OFI_WARN("NCCL OFI component for dev %d is not initialised",
			      sComm->dev);
		goto error;
	}

	/* Progress NCCL OFI */
	ret = nccl_ofi_progress(nccl_ofi_comp);
	if (OFI_UNLIKELY(ret != 0)) {
		goto error;
	}

	if (mhandle != NULL) {
		desc = fi_mr_desc(mhandle);
	}

	/*
	 * Try sending data to remote EP; Return NULL request
	 * if not able to send.
	 */
	rc = fi_tsend(sComm->local_ep, data, size, desc,
		      sComm->remote_ep, sComm->tag, &req->ctx);
	if (OFI_UNLIKELY(rc == -FI_EAGAIN)) {
		/*
		 * Not an error: ret is still ncclSuccess and *request is
		 * already NULL. The unused request is released below.
		 */
		goto error;
	}
	else if (OFI_UNLIKELY(rc != 0)) {
		NCCL_OFI_WARN("Could not send request for device %d. RC: %zd",
			      sComm->dev, rc);
		ret = ncclSystemError;
		goto error;
	}

	/* Count the request as in flight only once the send was posted */
	sComm->num_inflight_reqs++;

	/* Return request to NCCL */
	*request = req;

	goto exit;

 error:
	/*
	 * Release a request that was allocated but never posted. The
	 * in-flight counter has not been incremented on any path reaching
	 * here; NOTE(review): the `false` argument presumably tells the
	 * helper not to decrement it - confirm against free_nccl_ofi_req().
	 */
	if (req) {
		free_nccl_ofi_req(req, false);
	}
 exit:
	return ret;
}