ssize_t _gnix_rma()

in prov/gni/src/gnix_rma.c [1340:1585]


/**
 * _gnix_rma - common entry point for RMA read/write operations.
 *
 * Validates the request, allocates and fills in a fabric request,
 * auto-registers the local buffer when no memory descriptor was supplied,
 * resolves the virtual channel (VC) for the target, and either queues the
 * request for transmission or (for FI_MORE) parks it on the endpoint's
 * deferred read/write chain.
 *
 * @param ep        local endpoint
 * @param fr_type   GNIX_FAB_RQ_RDMA_READ or GNIX_FAB_RQ_RDMA_WRITE
 * @param loc_addr  local buffer address
 * @param len       transfer length in bytes
 * @param mdesc     local memory descriptor (may be NULL; auto-registered)
 * @param dest_addr fabric address of the target endpoint
 * @param rem_addr  remote buffer address
 * @param mkey      remote memory registration key
 * @param context   user context returned in the completion (or a
 *                  struct fi_triggered_context when FI_TRIGGER is set)
 * @param flags     operation flags (FI_INJECT, FI_MORE, FI_FENCE, ...)
 * @param data      immediate data for FI_REMOTE_CQ_DATA
 *
 * @return FI_SUCCESS or the result of queuing the request on success,
 *         -FI_ENOCQ / -FI_EINVAL / -FI_ENOSPC or a negative fabric error
 *         code on failure.
 */
ssize_t _gnix_rma(struct gnix_fid_ep *ep, enum gnix_fab_req_type fr_type,
		  uint64_t loc_addr, size_t len, void *mdesc,
		  uint64_t dest_addr, uint64_t rem_addr, uint64_t mkey,
		  void *context, uint64_t flags, uint64_t data)
{
	struct gnix_vc *vc;
	struct gnix_fab_req *req;
	struct gnix_fid_mem_desc *md = NULL;
	int rc;
	int rdma;
	struct fid_mr *auto_mr = NULL;
	struct gnix_fab_req *more_req;
	struct slist_entry *sle;
	int connected;
	struct gnix_auth_key *info;

	/* A completion sink must exist unless the op is injected: either a
	 * send CQ, or the counter matching the operation direction. */
	if (!(flags & FI_INJECT) && !ep->send_cq &&
	    (((fr_type == GNIX_FAB_RQ_RDMA_WRITE) && !ep->write_cntr) ||
	     ((fr_type == GNIX_FAB_RQ_RDMA_READ) && !ep->read_cntr))) {
		return -FI_ENOCQ;
	}

	if (flags & FI_TRIGGER) {
		struct fi_triggered_context *trigger_context =
				(struct fi_triggered_context *)context;
		/* Only threshold triggers are supported, and a triggered op
		 * cannot also be injected (the buffer must persist). */
		if ((trigger_context->event_type != FI_TRIGGER_THRESHOLD) ||
		    (flags & FI_INJECT)) {
			return -FI_EINVAL;
		}
	}

	if ((flags & FI_INJECT) && (len > GNIX_INJECT_SIZE)) {
		/* Note: %zu for size_t len; %d was undefined behavior. */
		GNIX_INFO(FI_LOG_EP_DATA,
			  "RMA length %zu exceeds inject max size: %d\n",
			  len, GNIX_INJECT_SIZE);
		return -FI_EINVAL;
	}

	/* setup fabric request */
	req = _gnix_fr_alloc(ep);
	if (!req) {
		GNIX_INFO(FI_LOG_EP_DATA, "_gnix_fr_alloc() failed\n");
		return -FI_ENOSPC;
	}

	/* Large transfers go through the RDMA path; small ones use FMA. */
	rdma = len >= ep->domain->params.rma_rdma_thresh;

	req->type = fr_type;
	req->gnix_ep = ep;
	req->user_context = context;
	req->work_fn = _gnix_rma_post_req;
	req->rma.sle.next = NULL;
	ofi_atomic_initialize32(&req->rma.outstanding_txds, 0);

	/* GNI GETs require 4-byte alignment of the remote address and
	 * length.  Unaligned reads are handled either with a chained
	 * transaction (large) or a bounce buffer (small). */
	if (fr_type == GNIX_FAB_RQ_RDMA_READ &&
	    (rem_addr & GNI_READ_ALIGN_MASK || len & GNI_READ_ALIGN_MASK)) {
		if (len >= GNIX_RMA_UREAD_CHAINED_THRESH) {
			GNIX_INFO(FI_LOG_EP_DATA,
				  "Using CT for unaligned GET, req: %p\n",
				  req);
			flags |= GNIX_RMA_CHAINED;
		} else {
			GNIX_INFO(FI_LOG_EP_DATA,
				  "Using tmp buf for unaligned GET, req: %p\n",
				  req);
			flags |= GNIX_RMA_INDIRECT;
		}

		if (rdma)
			req->work_fn = _gnix_rma_post_rdma_chain_req;
	}

	/* RDMA and all reads need a registered local buffer.  If the caller
	 * did not provide a descriptor (and the data isn't injected or
	 * bounced through an internal buffer), register it on their behalf. */
	if (!(flags & (GNIX_RMA_INDIRECT | FI_INJECT)) && !mdesc &&
	    (rdma || fr_type == GNIX_FAB_RQ_RDMA_READ)) {
		uint64_t requested_key;

		info = ep->auth_key;
		assert(info);

		if (info->using_vmdh)
			requested_key = _gnix_get_next_reserved_key(info);
		else
			requested_key = 0;

		/* We need to auto-register the source buffer. */
		rc = _gnix_mr_reg(&ep->domain->domain_fid.fid, (void *)loc_addr,
				 len, FI_READ | FI_WRITE, 0, requested_key,
				 0, &auto_mr, NULL, ep->auth_key,
				 GNIX_PROV_REG);
		if (rc != FI_SUCCESS) {
			GNIX_INFO(FI_LOG_EP_DATA,
				  "Failed to auto-register local buffer: %d\n",
				  rc);
			goto err_auto_reg;
		}
		/* FI_LOCAL_MR marks the registration as provider-owned so it
		 * is released when the request completes. */
		flags |= FI_LOCAL_MR;
		mdesc = (void *)auto_mr;
		GNIX_INFO(FI_LOG_EP_DATA, "auto-reg MR: %p\n", auto_mr);
	}

	if (mdesc)
		md = container_of(mdesc, struct gnix_fid_mem_desc, mr_fid);
	req->rma.loc_md = (void *)md;

	req->rma.rem_addr = rem_addr;
	req->rma.rem_mr_key = mkey;
	req->rma.len = len;
	req->rma.imm = data;
	req->flags = flags;

	if (req->flags & FI_INJECT) {
		/* Injected data is copied into the request so the user's
		 * buffer may be reused immediately on return. */
		memcpy(req->inject_buf, (void *)loc_addr, len);
		req->rma.loc_addr = (uint64_t)req->inject_buf;
	} else {
		req->rma.loc_addr = loc_addr;
	}

	/* Inject interfaces always suppress completions.  If
	 * SELECTIVE_COMPLETION is set, honor any setting.  Otherwise, always
	 * deliver a completion. */
	if ((flags & GNIX_SUPPRESS_COMPLETION) ||
	    (ep->send_selective_completion && !(flags & FI_COMPLETION))) {
		req->flags &= ~FI_COMPLETION;
	} else {
		req->flags |= FI_COMPLETION;
	}

	if (rdma) {
		req->flags |= GNIX_RMA_RDMA;
	}

	COND_ACQUIRE(ep->requires_lock, &ep->vc_lock);

	/* find VC for target */
	rc = _gnix_vc_ep_get_vc(ep, dest_addr, &vc);
	if (rc) {
		GNIX_INFO(FI_LOG_EP_DATA,
			  "_gnix_vc_ep_get_vc() failed, addr: %lx, rc:%d\n",
			  dest_addr, rc);
		goto err_get_vc;
	}

	req->vc = vc;
	connected = (vc->conn_state == GNIX_VC_CONNECTED);

	/* Adding FI_FENCE to an FI_MORE list will break the FI_MORE Chain.
	 * Current FI_MORE implementation does not create remote CQ events.
	 * Remove FI_MORE flag when FI_FENCE or REMOTE EP requirements are
	 * present. We will also only allow FI_MORE if a prior connection has
	 * been established, so that the peer capabilities can be determined.*/
	if ((flags & FI_FENCE) || (flags & FI_REMOTE_CQ_DATA) ||
	    !connected || (req->vc->peer_caps & FI_RMA_EVENT)) {
		flags &= ~FI_MORE;
	}

	/* Add reads/writes to slist when FI_MORE is present, Or
	 * if this is the first message in the chain without FI_MORE.
	 * When FI_MORE is not present, if the slists are not empty
	 * it is the first message without FI_MORE.
	 * Do not add reqs with FI_FENCE or REMOTE EP requirements requirements
	 * to the fi_more list. */
	if ((flags & FI_MORE) ||
	    (!(flags & FI_MORE) && connected &&
	    (!slist_empty(&ep->more_write) || !slist_empty(&ep->more_read)) &&
	     !(flags & FI_FENCE || flags & FI_REMOTE_CQ_DATA ||
	       req->vc->peer_caps & FI_RMA_EVENT))) {
		if (fr_type == GNIX_FAB_RQ_RDMA_WRITE) {
			slist_insert_tail(&req->rma.sle, &ep->more_write);
			req->work_fn = _gnix_rma_more_post_req;
		} else if (fr_type == GNIX_FAB_RQ_RDMA_READ) {
			slist_insert_tail(&req->rma.sle, &ep->more_read);
			req->work_fn = _gnix_rma_more_post_req;
		}

		/* More data is coming; defer the actual post until the
		 * chain-terminating request (without FI_MORE) arrives. */
		if (flags & FI_MORE) {
			COND_RELEASE(ep->requires_lock, &ep->vc_lock);
			return FI_SUCCESS;
		}
	}

	/* Initiate read/write chains on first message without FI_MORE. */
	if (!(flags & FI_MORE) &&
	    (!(slist_empty(&ep->more_write)) ||
	     !(slist_empty(&ep->more_read)))) {
		if (!(slist_empty(&ep->more_write))) {
			sle = ep->more_write.head;
			more_req = container_of(sle, struct gnix_fab_req,
						rma.sle);
			GNIX_DEBUG(FI_LOG_EP_DATA,
				   "FI_MORE: got fab_request from more_write. Queuing Request\n");
			_gnix_vc_queue_tx_req(more_req);
			slist_init(&ep->more_write); /* For future reqs */
		}
		if (!(slist_empty(&ep->more_read))) {
			sle = ep->more_read.head;
			more_req = container_of(sle, struct gnix_fab_req,
						rma.sle);
			GNIX_DEBUG(FI_LOG_EP_DATA,
				   "FI_MORE: got fab_request from more_read. Queuing Request\n");
			_gnix_vc_queue_tx_req(more_req);
			slist_init(&ep->more_read);
		}

		/* Requests with FI_FENCE or REMOTE EP requirements are not
		 * added to the FI_MORE List. They must be queued separately. */
		if ((flags & FI_FENCE) || (flags & FI_REMOTE_CQ_DATA) ||
		    (req->vc->peer_caps & FI_RMA_EVENT)) {
			rc = _gnix_vc_queue_tx_req(req);
			COND_RELEASE(ep->requires_lock, &ep->vc_lock);
			return rc;
		}
		COND_RELEASE(ep->requires_lock, &ep->vc_lock);
		return FI_SUCCESS;
	}

	/* Note: %zu for size_t len; %d was undefined behavior. */
	GNIX_DEBUG(FI_LOG_EP_DATA, "Queuing (%p %p %zu)\n",
			(void *)loc_addr, (void *)rem_addr, len);

	rc = _gnix_vc_queue_tx_req(req);
	connected = (vc->conn_state == GNIX_VC_CONNECTED);

	COND_RELEASE(ep->requires_lock, &ep->vc_lock);

	/*
	 * If a new VC was allocated, progress CM before returning.
	 * If the VC is connected and there's a backlog, poke
	 * the nic progress engine befure returning.
	 */
	if (!connected) {
		_gnix_cm_nic_progress(ep->cm_nic);
	} else if (!dlist_empty(&vc->tx_queue)) {
		_gnix_nic_progress(vc->ep->nic);
	}

	return rc;

err_get_vc:
	COND_RELEASE(ep->requires_lock, &ep->vc_lock);
	if (flags & FI_LOCAL_MR) {
		fi_close(&auto_mr->fid);
		flags &= ~FI_LOCAL_MR;
	}
err_auto_reg:
	/* req->vc is NOT valid on these paths (it is only assigned after a
	 * successful _gnix_vc_ep_get_vc()); free against the known-good ep
	 * instead of dereferencing req->vc->ep. */
	_gnix_fr_free(ep, req);
	return rc;
}