in src/nccl_ofi_net.c [1981:2026]
/*
 * Blocking flush: posts a flush through ofi_iflush() and then polls
 * ofi_test() until the resulting request reports completion.
 *
 * No flush is issued (and ncclSuccess is returned) when size is 0, when
 * ofi_nccl_gdr_flush_disable() is non-zero, or when support_gdr is false.
 *
 * @param recvComm  Receive communicator; forwarded to ofi_iflush() and used
 *                  as a recvComm_t to adjust num_inflight_reqs on failure.
 * @param data      Buffer forwarded to ofi_iflush().
 * @param size      Size forwarded to ofi_iflush(); 0 short-circuits.
 * @param mhandle   Memory handle forwarded to ofi_iflush().
 *
 * @return ncclSuccess, or the error returned by ofi_iflush() / ofi_test().
 */
static ncclResult_t ofi_flush(void* recvComm, void* data, int size,
			      void *mhandle)
{
	ncclResult_t ret = ncclSuccess;
	recvComm_t *rComm = (recvComm_t *)recvComm;
	nccl_ofi_req_t *req = NULL;
	int done = 0;

	if (size == 0) {
		/*
		 * Flush is an expensive operation, so don't issue one for
		 * 0-sized messages. Since NCCL issues a flush for every
		 * irecv(), data is still guaranteed to be synced to the GPU
		 * even without it.
		 */
		goto exit;
	}

	if (ofi_nccl_gdr_flush_disable() || !support_gdr) {
		goto exit;
	}

	/*
	 * Keep the full return code in ret and apply the branch hint to the
	 * error check itself; wrapping the call's value in the hint macro
	 * does not annotate any branch and risks altering the stored code.
	 */
	ret = ofi_iflush(recvComm, data, size, mhandle, (void **)&req);
	if (OFI_UNLIKELY(ret != ncclSuccess)) {
		goto exit;
	}

	/* Ensure that the request completes */
	while (done == 0) {
		ret = ofi_test(req, &done, NULL);

		/*
		 * If testing request completion fails and the request is
		 * still reported as not completed, reduce the number of
		 * inflight requests and release the request here. A failure
		 * reported together with done != 0 just ends the loop and
		 * is returned as-is.
		 */
		if (OFI_UNLIKELY((ret != ncclSuccess) && (done == 0))) {
			rComm->num_inflight_reqs--;
			goto error;
		}
	}

	return ret;

error:
	if (req) {
		free_nccl_ofi_req(req, false);
	}
exit:
	return ret;
}