in prov/gni/src/gnix_rma.c [1340:1585]
/**
 * _gnix_rma - common entry point for RMA reads and writes on a GNI endpoint.
 *
 * Builds a fabric request for the transfer, auto-registering the local
 * buffer when no memory descriptor was supplied, resolves the virtual
 * channel (VC) for @dest_addr, and either queues the request immediately
 * or chains it on the endpoint's FI_MORE list for deferred posting.
 *
 * @param ep         bound endpoint issuing the operation
 * @param fr_type    GNIX_FAB_RQ_RDMA_WRITE or GNIX_FAB_RQ_RDMA_READ
 * @param loc_addr   local buffer address
 * @param len        transfer length in bytes
 * @param mdesc      local memory descriptor (may be NULL -> auto-register)
 * @param dest_addr  fabric address of the target endpoint
 * @param rem_addr   remote buffer address
 * @param mkey       remote memory key
 * @param context    user context (or fi_triggered_context for FI_TRIGGER)
 * @param flags      operation flags (FI_INJECT, FI_MORE, FI_FENCE, ...)
 * @param data       immediate data for FI_REMOTE_CQ_DATA
 *
 * @return FI_SUCCESS on success, or a negative fabric error code
 *         (-FI_ENOCQ, -FI_EINVAL, -FI_ENOSPC, or a VC/queueing error).
 */
ssize_t _gnix_rma(struct gnix_fid_ep *ep, enum gnix_fab_req_type fr_type,
		  uint64_t loc_addr, size_t len, void *mdesc,
		  uint64_t dest_addr, uint64_t rem_addr, uint64_t mkey,
		  void *context, uint64_t flags, uint64_t data)
{
	struct gnix_vc *vc;
	struct gnix_fab_req *req;
	struct gnix_fid_mem_desc *md = NULL;
	int rc;
	int rdma;
	struct fid_mr *auto_mr = NULL;
	struct gnix_fab_req *more_req;
	struct slist_entry *sle;
	int connected;
	struct gnix_auth_key *info;

	/* Non-inject ops need somewhere to report completion: either a send
	 * CQ or the counter matching the operation direction. */
	if (!(flags & FI_INJECT) && !ep->send_cq &&
	    (((fr_type == GNIX_FAB_RQ_RDMA_WRITE) && !ep->write_cntr) ||
	     ((fr_type == GNIX_FAB_RQ_RDMA_READ) && !ep->read_cntr))) {
		return -FI_ENOCQ;
	}

	/* Triggered ops carry their trigger in the context; only threshold
	 * triggers are supported, and FI_INJECT cannot be deferred. */
	if (flags & FI_TRIGGER) {
		struct fi_triggered_context *trigger_context =
				(struct fi_triggered_context *)context;
		if ((trigger_context->event_type != FI_TRIGGER_THRESHOLD) ||
		    (flags & FI_INJECT)) {
			return -FI_EINVAL;
		}
	}

	if ((flags & FI_INJECT) && (len > GNIX_INJECT_SIZE)) {
		/* Note: %zu matches size_t 'len'; %d previously invoked UB. */
		GNIX_INFO(FI_LOG_EP_DATA,
			  "RMA length %zu exceeds inject max size: %d\n",
			  len, GNIX_INJECT_SIZE);
		return -FI_EINVAL;
	}

	/* setup fabric request */
	req = _gnix_fr_alloc(ep);
	if (!req) {
		GNIX_INFO(FI_LOG_EP_DATA, "_gnix_fr_alloc() failed\n");
		return -FI_ENOSPC;
	}

	/* Transfers at or above the domain threshold go through the RDMA
	 * path instead of FMA. */
	rdma = len >= ep->domain->params.rma_rdma_thresh;

	req->type = fr_type;
	req->gnix_ep = ep;
	req->user_context = context;
	req->work_fn = _gnix_rma_post_req;
	req->rma.sle.next = NULL;
	ofi_atomic_initialize32(&req->rma.outstanding_txds, 0);

	/* GNI GETs require 4-byte alignment of the remote address and the
	 * length.  Unaligned reads are handled either with a chained
	 * transaction (large) or an aligned bounce buffer (small). */
	if (fr_type == GNIX_FAB_RQ_RDMA_READ &&
	    (rem_addr & GNI_READ_ALIGN_MASK || len & GNI_READ_ALIGN_MASK)) {
		if (len >= GNIX_RMA_UREAD_CHAINED_THRESH) {
			GNIX_INFO(FI_LOG_EP_DATA,
				  "Using CT for unaligned GET, req: %p\n",
				  req);
			flags |= GNIX_RMA_CHAINED;
		} else {
			GNIX_INFO(FI_LOG_EP_DATA,
				  "Using tmp buf for unaligned GET, req: %p\n",
				  req);
			flags |= GNIX_RMA_INDIRECT;
		}

		if (rdma)
			req->work_fn = _gnix_rma_post_rdma_chain_req;
	}

	/* RDMA (and all reads) need a registered local buffer; register one
	 * on the caller's behalf when no descriptor was supplied.  Indirect
	 * and inject paths copy through internal buffers and skip this. */
	if (!(flags & (GNIX_RMA_INDIRECT | FI_INJECT)) && !mdesc &&
	    (rdma || fr_type == GNIX_FAB_RQ_RDMA_READ)) {
		uint64_t requested_key;

		info = ep->auth_key;
		assert(info);

		/* VMDH requires keys drawn from the reserved range. */
		if (info->using_vmdh)
			requested_key = _gnix_get_next_reserved_key(info);
		else
			requested_key = 0;

		/* We need to auto-register the source buffer. */
		rc = _gnix_mr_reg(&ep->domain->domain_fid.fid, (void *)loc_addr,
				  len, FI_READ | FI_WRITE, 0, requested_key,
				  0, &auto_mr, NULL, ep->auth_key,
				  GNIX_PROV_REG);
		if (rc != FI_SUCCESS) {
			GNIX_INFO(FI_LOG_EP_DATA,
				  "Failed to auto-register local buffer: %d\n",
				  rc);
			goto err_auto_reg;
		}
		flags |= FI_LOCAL_MR;
		mdesc = (void *)auto_mr;
		GNIX_INFO(FI_LOG_EP_DATA, "auto-reg MR: %p\n", auto_mr);
	}

	if (mdesc)
		md = container_of(mdesc, struct gnix_fid_mem_desc, mr_fid);
	req->rma.loc_md = (void *)md;
	req->rma.rem_addr = rem_addr;
	req->rma.rem_mr_key = mkey;
	req->rma.len = len;
	req->rma.imm = data;
	req->flags = flags;

	/* Inject data is copied into the request so the caller's buffer may
	 * be reused immediately on return. */
	if (req->flags & FI_INJECT) {
		memcpy(req->inject_buf, (void *)loc_addr, len);
		req->rma.loc_addr = (uint64_t)req->inject_buf;
	} else {
		req->rma.loc_addr = loc_addr;
	}

	/* Inject interfaces always suppress completions.  If
	 * SELECTIVE_COMPLETION is set, honor any setting.  Otherwise, always
	 * deliver a completion. */
	if ((flags & GNIX_SUPPRESS_COMPLETION) ||
	    (ep->send_selective_completion && !(flags & FI_COMPLETION))) {
		req->flags &= ~FI_COMPLETION;
	} else {
		req->flags |= FI_COMPLETION;
	}

	if (rdma) {
		req->flags |= GNIX_RMA_RDMA;
	}

	COND_ACQUIRE(ep->requires_lock, &ep->vc_lock);

	/* find VC for target */
	rc = _gnix_vc_ep_get_vc(ep, dest_addr, &vc);
	if (rc) {
		GNIX_INFO(FI_LOG_EP_DATA,
			  "_gnix_vc_ep_get_vc() failed, addr: %lx, rc:%d\n",
			  dest_addr, rc);
		goto err_get_vc;
	}

	req->vc = vc;
	connected = (vc->conn_state == GNIX_VC_CONNECTED);

	/* Adding FI_FENCE to an FI_MORE list will break the FI_MORE Chain.
	 * Current FI_MORE implementation does not create remote CQ events.
	 * Remove FI_MORE flag when FI_FENCE or REMOTE EP requirements are
	 * present. We will also only allow FI_MORE if a prior connection has
	 * been established, so that the peer capabilities can be determined.*/
	if ((flags & FI_FENCE) || (flags & FI_REMOTE_CQ_DATA) ||
	    !connected || (req->vc->peer_caps & FI_RMA_EVENT)) {
		flags &= ~FI_MORE;
	}

	/* Add reads/writes to slist when FI_MORE is present, Or
	 * if this is the first message in the chain without FI_MORE.
	 * When FI_MORE is not present, if the slists are not empty
	 * it is the first message without FI_MORE.
	 * Do not add reqs with FI_FENCE or REMOTE EP requirements requirements
	 * to the fi_more list. */
	if ((flags & FI_MORE) ||
	    (!(flags & FI_MORE) && connected &&
	     (!slist_empty(&ep->more_write) || !slist_empty(&ep->more_read)) &&
	     !(flags & FI_FENCE || flags & FI_REMOTE_CQ_DATA ||
	       req->vc->peer_caps & FI_RMA_EVENT))) {
		if (fr_type == GNIX_FAB_RQ_RDMA_WRITE) {
			slist_insert_tail(&req->rma.sle, &ep->more_write);
			req->work_fn = _gnix_rma_more_post_req;
		} else if (fr_type == GNIX_FAB_RQ_RDMA_READ) {
			slist_insert_tail(&req->rma.sle, &ep->more_read);
			req->work_fn = _gnix_rma_more_post_req;
		}

		/* With FI_MORE still set we only enqueue; posting happens
		 * when a later op arrives without FI_MORE. */
		if (flags & FI_MORE) {
			COND_RELEASE(ep->requires_lock, &ep->vc_lock);
			return FI_SUCCESS;
		}
	}

	/* Initiate read/write chains on first message without FI_MORE. */
	if (!(flags & FI_MORE) &&
	    (!(slist_empty(&ep->more_write)) ||
	     !(slist_empty(&ep->more_read)))) {
		if (!(slist_empty(&ep->more_write))) {
			sle = ep->more_write.head;
			more_req = container_of(sle, struct gnix_fab_req,
						rma.sle);
			GNIX_DEBUG(FI_LOG_EP_DATA,
				   "FI_MORE: got fab_request from more_write. Queuing Request\n");
			_gnix_vc_queue_tx_req(more_req);
			slist_init(&ep->more_write); /* For future reqs */
		}
		if (!(slist_empty(&ep->more_read))) {
			sle = ep->more_read.head;
			more_req = container_of(sle, struct gnix_fab_req,
						rma.sle);
			GNIX_DEBUG(FI_LOG_EP_DATA,
				   "FI_MORE: got fab_request from more_read. Queuing Request\n");
			_gnix_vc_queue_tx_req(more_req);
			slist_init(&ep->more_read);
		}

		/* Requests with FI_FENCE or REMOTE EP requirements are not
		 * added to the FI_MORE List. They must be queued separately. */
		if ((flags & FI_FENCE) || (flags & FI_REMOTE_CQ_DATA) ||
		    (req->vc->peer_caps & FI_RMA_EVENT)) {
			rc = _gnix_vc_queue_tx_req(req);
			COND_RELEASE(ep->requires_lock, &ep->vc_lock);
			return rc;
		}
		COND_RELEASE(ep->requires_lock, &ep->vc_lock);
		return FI_SUCCESS;
	}

	/* Note: %zu matches size_t 'len'; %d previously invoked UB. */
	GNIX_DEBUG(FI_LOG_EP_DATA, "Queuing (%p %p %zu)\n",
		   (void *)loc_addr, (void *)rem_addr, len);

	rc = _gnix_vc_queue_tx_req(req);
	connected = (vc->conn_state == GNIX_VC_CONNECTED);

	COND_RELEASE(ep->requires_lock, &ep->vc_lock);

	/*
	 * If a new VC was allocated, progress CM before returning.
	 * If the VC is connected and there's a backlog, poke
	 * the nic progress engine befure returning.
	 */
	if (!connected) {
		_gnix_cm_nic_progress(ep->cm_nic);
	} else if (!dlist_empty(&vc->tx_queue)) {
		_gnix_nic_progress(vc->ep->nic);
	}

	return rc;

err_get_vc:
	COND_RELEASE(ep->requires_lock, &ep->vc_lock);
	if (flags & FI_LOCAL_MR) {
		fi_close(&auto_mr->fid);
		flags &= ~FI_LOCAL_MR;
	}
err_auto_reg:
	/* req->vc has NOT been assigned on either error path above, so
	 * req->vc->ep would dereference stale/uninitialized freelist data.
	 * The request was allocated from 'ep', so free it back to 'ep'. */
	_gnix_fr_free(ep, req);
	return rc;
}