in src/nccl_ofi_net.cpp [1123:1165]
/*
 * Drop one reference on an endpoint and tear it down when appropriate.
 *
 * The endpoint's ref_cnt is decremented.  If it reaches zero, or if
 * force_cleanup is set, the owning domain's `endpoint` pointer is cleared
 * and the endpoint's free_ep() callback is invoked.
 *
 * @param ep             Endpoint to release; must not be NULL.
 * @param skip_lock      When true, domain->domain_lock is neither taken nor
 *                       released here (presumably the caller already holds
 *                       it -- confirm against callers).  The flag is also
 *                       forwarded to domain->release().
 * @param force_cleanup  When true, the endpoint is freed even if references
 *                       remain, and the domain release step is skipped.
 *
 * @return 0 on success; the non-zero value returned by free_ep() if freeing
 *         failed; otherwise whatever domain->release() returns.
 */
int nccl_net_ofi_endpoint_release(nccl_net_ofi_ep_t *ep, bool skip_lock, bool force_cleanup)
{
	int rc = 0;
	nccl_net_ofi_domain_t *owner;
	const bool take_lock = !skip_lock;

	assert(ep != NULL);

	/* Cache the domain now: ep must not be dereferenced once free_ep() ran. */
	owner = ep->domain;

	if (take_lock) {
		nccl_net_ofi_mutex_lock(&owner->domain_lock);
	}

	ep->ref_cnt--;

	if (force_cleanup || ep->ref_cnt == 0) {
		owner->endpoint = NULL;

		/* A forced teardown with outstanding references is worth a note. */
		if (force_cleanup && ep->ref_cnt != 0) {
			NCCL_OFI_INFO(NCCL_NET, "Endpoint %p still have ref count %d when released",
				      ep, ep->ref_cnt);
		}

		rc = ep->free_ep(ep);
		if (rc != 0) {
			NCCL_OFI_WARN("Freeing endpoint failed: %d", rc);
		}
	}

	/* The lock is dropped on both the success and the failure path. */
	if (take_lock) {
		nccl_net_ofi_mutex_unlock(&owner->domain_lock);
	}

	/*
	 * The domain release is skipped when the teardown is driven by
	 * device->release_all_domain_and_ep(), to avoid domain lock issues
	 * after the domain has been freed.  It is also skipped when freeing
	 * the endpoint failed, so that error is what the caller sees.
	 */
	if (force_cleanup || rc != 0) {
		return rc;
	}

	return owner->release(owner, skip_lock, false);
}