static int nccl_net_ofi_rdma_endpoint_release()

in src/nccl_ofi_rdma.cpp [7129:7186]


/*
 * Drop one reference on an RDMA endpoint and clean it up when appropriate.
 *
 * Endpoints flagged is_endpoint_per_communicator_ep are handled here:
 * the reference count is decremented and, if it reaches zero or
 * force_cleanup is set, the endpoint is removed from its domain's
 * ep_addr_list and then handed to its free_ep callback.  Every other
 * endpoint is forwarded to nccl_net_ofi_endpoint_release().
 *
 * base_ep:       endpoint to release (treated as nccl_net_ofi_rdma_ep_t)
 * skip_lock:     when true, this function neither takes nor drops the
 *                domain's domain_lock
 * force_cleanup: when true, cleanup runs even if references remain
 *
 * Returns 0 on success, -EINVAL if base_ep or its domain is NULL,
 * otherwise the non-zero code reported by the list removal, by
 * free_ep, or by the base release function.
 */
static int nccl_net_ofi_rdma_endpoint_release(nccl_net_ofi_ep_t *base_ep, bool skip_lock, bool force_cleanup)
{
	nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)base_ep;

	/* Reject a missing endpoint before touching any of its fields */
	if (OFI_UNLIKELY(ep == NULL)) {
		NCCL_OFI_WARN("Invalid endpoint provided");
		return -EINVAL;
	}

	/* Only endpoint-per-communicator endpoints need the special
	 * release path below; everything else goes through the base
	 * release function.
	 */
	if (!ep->is_endpoint_per_communicator_ep) {
		return nccl_net_ofi_endpoint_release(&ep->base, skip_lock, force_cleanup);
	}

	nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
	if (OFI_UNLIKELY(domain == NULL)) {
		NCCL_OFI_WARN("Invalid domain provided");
		return -EINVAL;
	}

	const bool take_lock = !skip_lock;
	int ret = 0;

	if (take_lock) {
		nccl_net_ofi_mutex_lock(&domain->base.domain_lock);
	}

	/* The decrement happens unconditionally, even on a forced cleanup */
	const bool last_ref = ((--ep->base.ref_cnt) == 0);

	if (last_ref || force_cleanup) {
		/* Reaching this point without being the last reference
		 * means the cleanup was forced; report the leftover count.
		 */
		if (!last_ref) {
			NCCL_OFI_INFO(NCCL_NET, "Endpoint %p still have ref count %d when released",
				      ep, ep->base.ref_cnt);
		}

		ret = domain->ep_addr_list->remove(&ep->base);
		if (ret != 0) {
			NCCL_OFI_WARN("delete ep for addr failed: %d", ret);
		} else {
			/* free_ep is only attempted once the endpoint is
			 * out of the address list.
			 */
			ret = ep->base.free_ep(&ep->base);
			if (ret != 0) {
				NCCL_OFI_WARN("Freeing ep failed");
			}
		}
	}

	if (take_lock) {
		nccl_net_ofi_mutex_unlock(&domain->base.domain_lock);
	}

	return ret;
}