in prov/gni/src/gnix_nic.c [938:1398]
int gnix_nic_alloc(struct gnix_fid_domain *domain,
struct gnix_nic_attr *attr,
struct gnix_nic **nic_ptr)
{
int ret = FI_SUCCESS;
struct gnix_nic *nic = NULL;
uint32_t device_addr;
gni_return_t status;
uint32_t fake_cdm_id = GNIX_CREATE_CDM_ID;
gni_smsg_attr_t smsg_mbox_attr;
struct gnix_nic_attr *nic_attr = &default_attr;
uint32_t num_corespec_cpus = 0;
bool must_alloc_nic = false;
bool free_list_inited = false;
struct gnix_auth_key *auth_key;
GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
*nic_ptr = NULL;
nic_attr->gni_cdm_modes = gnix_cdm_modes;
if (attr) {
ret = __gnix_nic_check_attr_sanity(attr);
if (ret != FI_SUCCESS)
return ret;
nic_attr = attr;
must_alloc_nic = nic_attr->must_alloc;
}
auth_key = nic_attr->auth_key;
	/*
	 * If we've maxed out the number of nics for this domain/ptag,
	 * search the list of existing nics.  Take the gnix_nic_list_lock
	 * here since the gnix_nic_list will be manipulated whether we
	 * attach to an existing nic or create a new one.
	 *
	 * It should not matter much that this is a fairly fat critical
	 * section: endpoint setup for the RDM type typically occurs near
	 * app startup, likely in a single-threaded region, and for the
	 * MSG case, where there may be many hundreds of EPs, after a few
	 * initial slow passes through this section while nics are being
	 * created, the max nic count for the ptag will be reached and
	 * only the first part of the critical section - iteration over
	 * existing nics - will execute.
	 */
pthread_mutex_lock(&gnix_nic_list_lock);
	/*
	 * We can reuse a previously allocated nic as long as
	 * must_alloc is not specified in the nic_attr arg.
	 */
if ((must_alloc_nic == false) &&
(gnix_nics_per_ptag[auth_key->ptag] >= gnix_max_nics_per_ptag)) {
assert(!dlist_empty(&gnix_nic_list_ptag[auth_key->ptag]));
nic = dlist_first_entry(&gnix_nic_list_ptag[auth_key->ptag],
struct gnix_nic, ptag_nic_list);
dlist_remove(&nic->ptag_nic_list);
dlist_insert_tail(&nic->ptag_nic_list,
&gnix_nic_list_ptag[auth_key->ptag]);
_gnix_ref_get(nic);
GNIX_INFO(FI_LOG_EP_CTRL, "Reusing NIC:%p\n", nic);
}
	/*
	 * no nic found - create a cdm and attach
	 */
if (!nic) {
nic = calloc(1, sizeof(struct gnix_nic));
if (nic == NULL) {
ret = -FI_ENOMEM;
goto err;
}
nic->using_vmdh = domain->using_vmdh;
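		/*
		 * Unless the caller supplied a cdm_id via nic_attr,
		 * generate a unique id for this nic's communication
		 * domain.
		 */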
if (nic_attr->use_cdm_id == false) {
ret = _gnix_cm_nic_create_cdm_id(domain, &fake_cdm_id);
if (ret != FI_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"_gnix_cm_nic_create_cdm_id returned %s\n",
fi_strerror(-ret));
goto err;
}
} else
fake_cdm_id = nic_attr->cdm_id;
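		/*
		 * Create the GNI communication domain (CDM) unless the
		 * caller passed in an existing handle; note ownership in
		 * allocd_gni_res so the error/teardown paths only destroy
		 * a CDM we created here.
		 */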
if (nic_attr->gni_cdm_hndl == NULL) {
status = GNI_CdmCreate(fake_cdm_id,
auth_key->ptag,
auth_key->cookie,
gnix_cdm_modes,
&nic->gni_cdm_hndl);
if (status != GNI_RC_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL, "GNI_CdmCreate returned %s\n",
gni_err_str[status]);
ret = gnixu_to_fi_errno(status);
goto err1;
}
nic->allocd_gni_res |= GNIX_NIC_CDM_ALLOCD;
} else {
nic->gni_cdm_hndl = nic_attr->gni_cdm_hndl;
}
/*
* Okay, now go for the attach
*/
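		/*
		 * GNI_CdmAttach binds the CDM to local device 0 and
		 * returns both the nic handle and the device (physical)
		 * address recorded in nic->device_addr below.
		 */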
if (nic_attr->gni_nic_hndl == NULL) {
status = GNI_CdmAttach(nic->gni_cdm_hndl,
0,
&device_addr,
&nic->gni_nic_hndl);
if (status != GNI_RC_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL, "GNI_CdmAttach returned %s\n",
gni_err_str[status]);
_gnix_dump_gni_res(auth_key->ptag);
ret = gnixu_to_fi_errno(status);
goto err1;
}
} else
nic->gni_nic_hndl = nic_attr->gni_nic_hndl;
		/*
		 * create TX CQs - first blocking, then polling
		 */
		status = GNI_CqCreate(nic->gni_nic_hndl,
				      domain->params.tx_cq_size,
				      0,	/* no delay count */
				      GNI_CQ_BLOCKING | domain->gni_cq_modes,
				      NULL,	/* useless handler */
				      NULL,	/* useless handler context */
				      &nic->tx_cq_blk);
if (status != GNI_RC_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"GNI_CqCreate returned %s\n",
gni_err_str[status]);
_gnix_dump_gni_res(auth_key->ptag);
ret = gnixu_to_fi_errno(status);
goto err1;
}
/* Use blocking CQs for all operations if eager_auto_progress
* is used. */
if (domain->params.eager_auto_progress) {
nic->tx_cq = nic->tx_cq_blk;
} else {
status = GNI_CqCreate(nic->gni_nic_hndl,
domain->params.tx_cq_size,
0, /* no delay count */
domain->gni_cq_modes,
NULL, /* useless handler */
NULL, /* useless handler ctx */
&nic->tx_cq);
if (status != GNI_RC_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"GNI_CqCreate returned %s\n",
gni_err_str[status]);
_gnix_dump_gni_res(auth_key->ptag);
ret = gnixu_to_fi_errno(status);
goto err1;
}
}
		/*
		 * create RX CQs - first blocking, then polling
		 */
status = GNI_CqCreate(nic->gni_nic_hndl,
domain->params.rx_cq_size,
0,
GNI_CQ_BLOCKING |
domain->gni_cq_modes,
NULL,
NULL,
&nic->rx_cq_blk);
if (status != GNI_RC_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"GNI_CqCreate returned %s\n",
gni_err_str[status]);
_gnix_dump_gni_res(auth_key->ptag);
ret = gnixu_to_fi_errno(status);
goto err1;
}
/* Use blocking CQs for all operations if eager_auto_progress
* is used. */
if (domain->params.eager_auto_progress) {
nic->rx_cq = nic->rx_cq_blk;
} else {
status = GNI_CqCreate(nic->gni_nic_hndl,
domain->params.rx_cq_size,
0,
domain->gni_cq_modes,
NULL,
NULL,
&nic->rx_cq);
if (status != GNI_RC_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"GNI_CqCreate returned %s\n",
gni_err_str[status]);
_gnix_dump_gni_res(auth_key->ptag);
ret = gnixu_to_fi_errno(status);
goto err1;
}
}
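		/*
		 * Record addressing info for this nic and size the VC id
		 * table (void * entries used to map vc ids back to their
		 * VCs) from the domain parameters.
		 */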
nic->device_addr = device_addr;
nic->ptag = auth_key->ptag;
nic->cookie = auth_key->cookie;
nic->vc_id_table_capacity = domain->params.vc_id_table_capacity;
nic->vc_id_table = malloc(sizeof(void *) *
nic->vc_id_table_capacity);
if (nic->vc_id_table == NULL) {
GNIX_WARN(FI_LOG_EP_CTRL,
"malloc of vc_id_table failed\n");
ret = -FI_ENOMEM;
goto err1;
}
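		/* bitmap tracks which vc id slots are currently in use */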
ret = _gnix_alloc_bitmap(&nic->vc_id_bitmap,
nic->vc_id_table_capacity, NULL);
if (ret != FI_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"alloc_bitmap returned %d\n", ret);
goto err1;
}
fastlock_init(&nic->vc_id_lock);
		/*
		 * initialize free list for VCs
		 * In addition to hopefully allowing for a more compact
		 * allocation of VC structs, the free list is also
		 * important because, when using auto progress, there is
		 * a window of time in which a thread may be going through
		 * the progress engine while one of the application threads
		 * is actively tearing down an endpoint (and hence its
		 * associated VCs) before the rem_id for the vc is removed
		 * from the vector.  As a consequence, it is important that
		 * the memory allocated within the freelist allocator not
		 * be returned to the system prior to the freelist being
		 * destroyed as part of the nic destructor procedure.  The
		 * freelist is destroyed in that procedure after the
		 * progress thread has been joined.
		 */
ret = _gnix_fl_init_ts(sizeof(struct gnix_vc),
offsetof(struct gnix_vc, fr_list),
GNIX_VC_FL_MIN_SIZE,
GNIX_VC_FL_INIT_REFILL_SIZE,
0,
0,
&nic->vc_freelist);
if (ret == FI_SUCCESS) {
free_list_inited = true;
} else {
			GNIX_DEBUG(FI_LOG_EP_DATA,
				   "_gnix_fl_init_ts returned: %s\n",
				   fi_strerror(-ret));
goto err1;
}
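		/*
		 * general nic lock, plus a freelist of TX descriptors
		 * sized from the TX CQ depth.
		 */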
fastlock_init(&nic->lock);
ret = __gnix_nic_tx_freelist_init(nic,
domain->params.tx_cq_size);
if (ret != FI_SUCCESS)
goto err1;
fastlock_init(&nic->prog_vcs_lock);
dlist_init(&nic->prog_vcs);
_gnix_ref_init(&nic->ref_cnt, 1, __nic_destruct);
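		/*
		 * ask uGNI how much memory each SMSG auto-retransmit
		 * mailbox requires, given the domain's credit and
		 * max-message-size parameters; this sizes the per-mailbox
		 * slabs allocated below.
		 */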
smsg_mbox_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
smsg_mbox_attr.mbox_maxcredit = domain->params.mbox_maxcredit;
smsg_mbox_attr.msg_maxsize = domain->params.mbox_msg_maxsize;
status = GNI_SmsgBufferSizeNeeded(&smsg_mbox_attr,
&nic->mem_per_mbox);
if (status != GNI_RC_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"GNI_SmsgBufferSizeNeeded returned %s\n",
gni_err_str[status]);
ret = gnixu_to_fi_errno(status);
goto err1;
}
/*
* set up mailbox allocator for SMSG mailboxes
*/
ret = _gnix_mbox_allocator_create(nic,
nic->rx_cq,
domain->params.mbox_page_size,
(size_t)nic->mem_per_mbox,
domain->params.mbox_num_per_slab,
&nic->mbox_hndl);
if (ret != FI_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"_gnix_mbox_alloc returned %s\n",
fi_strerror(-ret));
goto err1;
}
		/*
		 * Use the mailbox allocator system to set up pre-pinned
		 * RDMA bounce buffers for longer eager messages and other
		 * cases where zero-copy can't be safely used.
		 *
		 * One set of blocks is used for the send side.
		 * A second set of blocks is used for the receive side.
		 * Both sets of blocks are registered against the blocking
		 * RX CQ for this nic.
		 *
		 * TODO: hardwired constants, uff
		 * TODO: better to use a buddy allocator or some other
		 * allocator
		 * Disabled for now as we're not using them and they chew
		 * up a lot of IOMMU space per nic.
		 */
#if 0
ret = _gnix_mbox_allocator_create(nic,
NULL,
GNIX_PAGE_2MB,
65536,
512,
&nic->s_rdma_buf_hndl);
if (ret != FI_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"_gnix_mbox_alloc returned %s\n",
fi_strerror(-ret));
			_gnix_dump_gni_res(auth_key->ptag);
goto err1;
}
ret = _gnix_mbox_allocator_create(nic,
NULL,
GNIX_PAGE_2MB,
65536,
512,
&nic->r_rdma_buf_hndl);
if (ret != FI_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"_gnix_mbox_alloc returned %s\n",
fi_strerror(-ret));
			_gnix_dump_gni_res(auth_key->ptag);
goto err1;
}
#endif
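		/*
		 * set up the irq CQ (presumably used to kick progress out
		 * of blocking CQ waits); torn down via
		 * __nic_teardown_irq_cq on the error path.
		 */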
ret = __nic_setup_irq_cq(nic);
if (ret != FI_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"__nic_setup_irq_cq returned %s\n",
fi_strerror(-ret));
_gnix_dump_gni_res(auth_key->ptag);
goto err1;
}
/*
* if the domain is using PROGRESS_AUTO for data, set up
* a progress thread.
*/
if (domain->data_progress == FI_PROGRESS_AUTO) {
			/*
			 * tell the CLE job container that the next thread
			 * should be runnable anywhere in the cpuset; don't
			 * treat a failure here as an error, though there
			 * may be perf issues if it occurs.
			 */
ret = _gnix_get_num_corespec_cpus(&num_corespec_cpus);
if (ret != FI_SUCCESS) {
GNIX_WARN(FI_LOG_EP_CTRL,
"failed to get num corespec cpus\n");
}
if (num_corespec_cpus > 0) {
ret = _gnix_job_disable_affinity_apply();
} else {
ret = _gnix_job_enable_unassigned_cpus();
}
if (ret != 0)
GNIX_WARN(FI_LOG_EP_CTRL,
"job_disable/unassigned cpus returned %d\n",
ret);
ret = pthread_create(&nic->progress_thread,
NULL,
__gnix_nic_prog_thread_fn,
(void *)nic);
if (ret)
GNIX_WARN(FI_LOG_EP_CTRL,
"pthread_create call returned %d\n", ret);
}
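		/*
		 * publish the new nic: add it to the global nic list and
		 * the per-ptag list, install the SMSG callbacks, and bump
		 * the per-ptag nic count checked at the top of this
		 * function.
		 */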
dlist_insert_tail(&nic->gnix_nic_list, &gnix_nic_list);
dlist_insert_tail(&nic->ptag_nic_list,
&gnix_nic_list_ptag[auth_key->ptag]);
nic->smsg_callbacks = gnix_ep_smsg_callbacks;
++gnix_nics_per_ptag[auth_key->ptag];
GNIX_INFO(FI_LOG_EP_CTRL, "Allocated NIC:%p\n", nic);
}
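	/*
	 * whether reused or freshly allocated, refresh the fields that
	 * depend on this domain's settings.
	 */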
if (nic) {
nic->requires_lock = domain->thread_model != FI_THREAD_COMPLETION;
nic->using_vmdh = domain->using_vmdh;
}
*nic_ptr = nic;
goto out;
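	/*
	 * error paths: err1 undoes the cdm id reservation (the
	 * gnix_id_counter increment presumably taken in
	 * _gnix_cm_nic_create_cdm_id), then falls through to generic
	 * teardown, which only releases resources this call allocated.
	 */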
err1:
ofi_atomic_dec32(&gnix_id_counter);
err:
if (nic != NULL) {
__nic_teardown_irq_cq(nic);
if (nic->r_rdma_buf_hndl != NULL)
_gnix_mbox_allocator_destroy(nic->r_rdma_buf_hndl);
if (nic->s_rdma_buf_hndl != NULL)
_gnix_mbox_allocator_destroy(nic->s_rdma_buf_hndl);
if (nic->mbox_hndl != NULL)
_gnix_mbox_allocator_destroy(nic->mbox_hndl);
if (nic->rx_cq != NULL && nic->rx_cq != nic->rx_cq_blk)
GNI_CqDestroy(nic->rx_cq);
if (nic->rx_cq_blk != NULL)
GNI_CqDestroy(nic->rx_cq_blk);
if (nic->tx_cq != NULL && nic->tx_cq != nic->tx_cq_blk)
GNI_CqDestroy(nic->tx_cq);
if (nic->tx_cq_blk != NULL)
GNI_CqDestroy(nic->tx_cq_blk);
if ((nic->gni_cdm_hndl != NULL) && (nic->allocd_gni_res &
GNIX_NIC_CDM_ALLOCD))
GNI_CdmDestroy(nic->gni_cdm_hndl);
if (free_list_inited == true)
_gnix_fl_destroy(&nic->vc_freelist);
free(nic);
}
out:
pthread_mutex_unlock(&gnix_nic_list_lock);
return ret;
}