in src/nccl_ofi_rdma.cpp [7129:7186]
/*
 * Drop one reference on an RDMA endpoint and tear it down when appropriate.
 *
 * Endpoints flagged as endpoint-per-communicator are handled here: the
 * reference count is decremented and, if it reaches zero or force_cleanup
 * is set, the endpoint is removed from the domain's ep_addr_list and then
 * freed through its free_ep callback.  All other endpoints are handed to
 * nccl_net_ofi_endpoint_release().
 *
 * @param base_ep        Endpoint to release; cast to the RDMA endpoint type.
 * @param skip_lock      When true, the domain lock is neither taken nor
 *                       released by this function (presumably the caller
 *                       already holds it -- confirm against callers).
 * @param force_cleanup  When true, remove and free the endpoint even if
 *                       its reference count is still non-zero.
 *
 * @return 0 on success, -EINVAL if the endpoint or its domain is NULL,
 *         otherwise the non-zero value reported by the failing step.
 */
static int nccl_net_ofi_rdma_endpoint_release(nccl_net_ofi_ep_t *base_ep, bool skip_lock, bool force_cleanup)
{
	nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)base_ep;

	/* Validate endpoint */
	if (OFI_UNLIKELY(ep == NULL)) {
		NCCL_OFI_WARN("Invalid endpoint provided");
		return -EINVAL;
	}

	/* Endpoints that are not per-communicator go through the base
	 * release function; only per-communicator endpoints need the
	 * dedicated release path below.
	 */
	if (!ep->is_endpoint_per_communicator_ep) {
		return nccl_net_ofi_endpoint_release(&ep->base, skip_lock, force_cleanup);
	}

	nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
	if (OFI_UNLIKELY(domain == NULL)) {
		NCCL_OFI_WARN("Invalid domain provided");
		return -EINVAL;
	}

	const bool take_lock = !skip_lock;
	if (take_lock) {
		nccl_net_ofi_mutex_lock(&domain->base.domain_lock);
	}

	int ret = 0;

	/* The decrement happens unconditionally, before force_cleanup is
	 * even looked at.
	 */
	const bool last_reference = ((--ep->base.ref_cnt) == 0);

	if (last_reference || force_cleanup) {
		if (force_cleanup && ep->base.ref_cnt != 0) {
			NCCL_OFI_INFO(NCCL_NET, "Endpoint %p still have ref count %d when released",
				      ep, ep->base.ref_cnt);
		}

		/* Unlist the endpoint first; free it only if that succeeded. */
		ret = domain->ep_addr_list->remove(&ep->base);
		if (ret != 0) {
			NCCL_OFI_WARN("delete ep for addr failed: %d", ret);
		} else {
			ret = ep->base.free_ep(&ep->base);
			if (ret != 0) {
				NCCL_OFI_WARN("Freeing ep failed");
			}
		}
	}

	/* Every path after lock acquisition falls through to this unlock. */
	if (take_lock) {
		nccl_net_ofi_mutex_unlock(&domain->base.domain_lock);
	}

	return ret;
}