ssize_t _gnix_sendv()

in prov/gni/src/gnix_msg.c [3562:3776]


ssize_t _gnix_sendv(struct gnix_fid_ep *ep, const struct iovec *iov,
		    void **mdesc, size_t count, uint64_t dest_addr,
		    void *context, uint64_t flags, uint64_t tag)
{
	int i, ret = FI_SUCCESS;
	unsigned long long cum_len = 0;
	void *tmp = NULL;
	struct gnix_vc *vc = NULL;
	struct gnix_fab_req *req = NULL;
	struct fid_mr *auto_mr;
	int connected;

	GNIX_DEBUG(FI_LOG_EP_DATA, "iov_count = %lu\n", count);

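	/* A send must be able to report completion: the endpoint needs a
	 * bound send CQ or send counter, and send (or tagged-send)
	 * capability. */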
	if (!ep->send_cq && !ep->send_cntr) {
		return -FI_ENOCQ;
	}

	if (!(flags & FI_TAGGED)) {
		if (!ep->ep_ops.msg_send_allowed)
			return -FI_EOPNOTSUPP;
	} else {
		if (!ep->ep_ops.tagged_send_allowed)
			return -FI_EOPNOTSUPP;
	}

	req = _gnix_fr_alloc(ep);
	if (req == NULL) {
		return -FI_ENOSPC;
	}

	GNIX_DEBUG(FI_LOG_EP_DATA, "Created req - %p\n", req);

	/* calculate cumulative size of the iovec buf lens */
	for (i = 0; i < count; i++) {
		/* TODO: handle possible overflow */
		cum_len += iov[i].iov_len;

		GNIX_DEBUG(FI_LOG_EP_DATA, "iov[%d].iov_len = %lu\n", i, iov[i].iov_len);
	}

	/* Fill out fabric request */
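	/* Tagged sends carry the caller's tag; untagged sends use a zero
	 * tag with a full wildcard ignore mask. */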
	if (flags & FI_TAGGED) {
		req->type = GNIX_FAB_RQ_TSENDV;
		req->msg.tag = tag;
		req->msg.ignore = 0;
	} else {
		req->type = GNIX_FAB_RQ_SENDV;
		req->msg.tag = 0;
		req->msg.ignore = ~0;
	}

	req->gnix_ep = ep;
	req->user_context = context;
	req->work_fn = _gnix_send_req;
	req->flags = flags;
	req->msg.send_flags = flags;
	req->msg.imm = 0;
	req->msg.parent = NULL;

	/*
	 * If cum_len >= ep->domain->params.msg_rendezvous_thresh,
	 * transfer the iovec entries individually.
	 *
	 * In that case, use CtPostFma for iovec entries smaller than the
	 * rendezvous threshold; for CtPostFma, the sum of the iov lens must
	 * be <= 1GB, or <= 1MB if the comm dom is configured with
	 * FmaSharing. Otherwise use PostRdma.
	 */
	if (cum_len >= ep->domain->params.msg_rendezvous_thresh) {
		if (!mdesc) {	/* Register the memory for the user */
			for (i = 0; i < count; i++) {
				auto_mr = NULL;

				ret = _gnix_mr_reg(&ep->domain->domain_fid.fid,
						   iov[i].iov_base,
						   iov[i].iov_len,
						   FI_READ | FI_WRITE, 0, 0, 0,
						   &auto_mr, NULL, ep->auth_key, GNIX_PROV_REG);

				if (ret != FI_SUCCESS) {
					GNIX_DEBUG(FI_LOG_EP_DATA,
						   "Failed to auto-register"
						   " local buffer: %s\n",
						   fi_strerror(-ret));

					/* Unwind the registrations completed
					 * so far; use a separate rc so the
					 * original error in ret is
					 * preserved. */
					for (i--; i >= 0; i--) {
						int rc;

						rc = fi_close(&req->msg.send_md[i]->mr_fid.fid);
						if (rc != FI_SUCCESS) {
							GNIX_FATAL(FI_LOG_DOMAIN,
								"failed to release auto-registered region, "
								"rc=%d\n", rc);
						}
					}

					goto err_mr_reg;
				}

				req->msg.send_md[i] = container_of(
					(void *) auto_mr,
					struct gnix_fid_mem_desc,
					mr_fid);

				req->msg.send_info[i].send_addr = (uint64_t) iov[i].iov_base;
				req->msg.send_info[i].send_len = iov[i].iov_len;
				req->msg.send_info[i].mem_hndl =
					req->msg.send_md[i]->mem_hndl;

				GNIX_DEBUG(FI_LOG_EP_DATA, "iov[%d].iov_len = %lu,"
					   " req->msg.send_info[%d].send_addr = "
					   "%p, req->msg.send_info[%d].send_len "
					   "= %lu\n", i, iov[i].iov_len, i,
					   (void *) req->msg.send_info[i].send_addr,
					   i, req->msg.send_info[i].send_len);

				GNIX_DEBUG(FI_LOG_EP_DATA, "req->msg.send_md[%d] "
					   "= %p\n", i,
					   req->msg.send_md[i]);

				GNIX_DEBUG(FI_LOG_EP_DATA, "auto-reg MR: %p\n",
					   req->msg.send_md[i]);

			}

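			/* The buffers were provider-registered above, so
			 * mark the request for MR cleanup on completion or
			 * failure. */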
			req->msg.send_flags |= FI_LOCAL_MR;
		} else {	/* User registered their memory */
			for (i = 0; i < count; i++) {
				if (!mdesc[i]) {
					GNIX_WARN(FI_LOG_EP_DATA,
						  "invalid memory reg"
						  "istration (%p).\n",
						  mdesc[i]);
					ret = -FI_EINVAL;
					goto err_mr_reg;
				}

				req->msg.send_md[i] =
					container_of(mdesc[i],
						     struct gnix_fid_mem_desc,
						     mr_fid);

				req->msg.send_info[i].send_addr = (uint64_t) iov[i].iov_base;
				req->msg.send_info[i].send_len = iov[i].iov_len;
				req->msg.send_info[i].mem_hndl =
					req->msg.send_md[i]->mem_hndl;
			}
		}

		req->msg.send_iov_cnt = count;
		req->msg.send_flags |= GNIX_MSG_RENDEZVOUS;
	} else {
		/*
		 * TODO: Use buddy allocator with max alloc lim of
		 * ep->domain->params.msg_rendezvous_thresh
		 */
		/* This is freed in __comp_eager_msg_w_data */
		tmp = malloc(cum_len);
		if (tmp == NULL) {
			ret = -FI_ENOMEM;
			goto err_mr_reg;
		}

		__gnix_msg_pack_data_from_iov((uint64_t) tmp, cum_len,
					      iov, count);
		req->msg.send_info[0].send_addr = (uint64_t) tmp;
		req->msg.send_info[0].send_len = cum_len;
	}

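	/* Honor selective completion: generate a completion event only if
	 * the caller requested one or the endpoint delivers them by
	 * default. */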
	if ((flags & GNIX_SUPPRESS_COMPLETION) ||
	    (ep->send_selective_completion &&
	     !(flags & FI_COMPLETION))) {
		req->msg.send_flags &= ~FI_COMPLETION;
	} else {
		req->msg.send_flags |= FI_COMPLETION;
	}

	req->msg.cum_send_len = (size_t) cum_len;

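	/* COND_ACQUIRE takes the VC lock only when the endpoint's threading
	 * model requires it. */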
	COND_ACQUIRE(ep->requires_lock, &ep->vc_lock);

	ret = _gnix_vc_ep_get_vc(ep, dest_addr, &vc);
	if (ret != FI_SUCCESS) {
		goto err_get_vc;
	}

	req->vc = vc;

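	/* Queue the TX request and snapshot the VC connection state while
	 * the lock is still held. */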
	ret = _gnix_vc_queue_tx_req(req);
	connected = (vc->conn_state == GNIX_VC_CONNECTED);

	COND_RELEASE(ep->requires_lock, &ep->vc_lock);

	/*
	 * If the VC is not yet connected, progress the CM before returning.
	 * If the VC is connected and there's a backlog, poke
	 * the NIC progress engine before returning.
	 */
	if (!connected) {
		_gnix_cm_nic_progress(ep->cm_nic);
	} else if (!dlist_empty(&vc->tx_queue)) {
		_gnix_nic_progress(vc->ep->nic);
	}

	return ret;

err_get_vc:
	COND_RELEASE(ep->requires_lock, &ep->vc_lock);
	if (req->msg.send_flags & FI_LOCAL_MR) {
		for (i = 0; i < count; i++) {
			fi_close(&req->msg.send_md[i]->mr_fid.fid);
		}
	}
err_mr_reg:
	_gnix_fr_free(ep, req);

	return ret;
}
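
Caller-side sketch (hypothetical, not part of gnix_msg.c): applications
reach _gnix_sendv() through libfabric's fi_sendv() call. The helper below
is an illustrative assumption showing a two-entry vectored send; it assumes
an endpoint already bound to a send CQ and a resolved destination address.
Whether the provider takes the eager (packed) path or the rendezvous
(per-entry) path depends on the cumulative iov length relative to
msg_rendezvous_thresh.

#include <sys/uio.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Hypothetical helper: send a header and a payload as one vectored
 * message. */
static ssize_t send_two_buffers(struct fid_ep *ep, fi_addr_t dest,
				void *hdr, size_t hdr_len,
				void *payload, size_t payload_len,
				void *context)
{
	struct iovec iov[2] = {
		{ .iov_base = hdr,     .iov_len = hdr_len },
		{ .iov_base = payload, .iov_len = payload_len },
	};

	/* desc == NULL asks the provider to auto-register the buffers if
	 * the rendezvous path is taken (the !mdesc branch above). */
	return fi_sendv(ep, iov, NULL, 2, dest, context);
}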