int gnix_nic_alloc()

in prov/gni/src/gnix_nic.c [938:1398]


/*
 * gnix_nic_alloc - allocate a new gnix_nic, or reuse an existing one.
 *
 * @param[in]  domain   gnix domain the nic will be associated with
 * @param[in]  attr     optional nic attributes; if NULL, default_attr is used
 * @param[out] nic_ptr  on success, set to the allocated (or reused) nic
 *
 * @return FI_SUCCESS on success, negative fi_errno value on failure.
 *
 * Once gnix_max_nics_per_ptag nics exist for the auth key's ptag, an
 * existing nic is reused (its refcount is bumped and it is rotated to
 * the tail of the per-ptag list for round-robin reuse), unless the
 * caller requested a dedicated nic via attr->must_alloc.
 */
int gnix_nic_alloc(struct gnix_fid_domain *domain,
		   struct gnix_nic_attr *attr,
		   struct gnix_nic **nic_ptr)
{
	int ret = FI_SUCCESS;
	struct gnix_nic *nic = NULL;
	uint32_t device_addr;
	gni_return_t status;
	uint32_t fake_cdm_id = GNIX_CREATE_CDM_ID;
	gni_smsg_attr_t smsg_mbox_attr;
	struct gnix_nic_attr *nic_attr = &default_attr;
	uint32_t num_corespec_cpus = 0;
	bool must_alloc_nic = false;
	bool free_list_inited = false;
	bool vc_id_bitmap_allocd = false;	/* tracks bitmap for error cleanup */
	struct gnix_auth_key *auth_key;

	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	*nic_ptr = NULL;
	nic_attr->gni_cdm_modes = gnix_cdm_modes;

	if (attr) {
		ret = __gnix_nic_check_attr_sanity(attr);
		if (ret != FI_SUCCESS)
			return ret;
		nic_attr = attr;
		must_alloc_nic = nic_attr->must_alloc;
	}

	/* NOTE(review): auth_key is dereferenced unconditionally below;
	 * assumes callers always supply a non-NULL auth_key - confirm. */
	auth_key = nic_attr->auth_key;

	/*
	 * If we've maxed out the number of nics for this domain/ptag,
	 * search the list of existing nics.  Take the gnix_nic_list_lock
	 * here since the gnix_nic_list will be manipulated whether or
	 * not we attach to an existing nic or create a new one.
	 *
	 * Should not matter much that this is a pretty fat critical section
	 * since endpoint setup for RDM type will typically occur near
	 * app startup, likely in a single threaded region, and for the
	 * case of MSG, where there will likely be many 100s of EPs, after
	 * a few initial slow times through this section when nics are created,
	 * max nic count for the ptag will be reached and only the first part
	 * of the critical section - iteration over existing nics - will be
	 * happening.
	 */

	pthread_mutex_lock(&gnix_nic_list_lock);

	/*
	 * we can reuse previously allocated nics as long as a
	 * must_alloc is not specified in the nic_attr arg.
	 */

	if ((must_alloc_nic == false) &&
	    (gnix_nics_per_ptag[auth_key->ptag] >= gnix_max_nics_per_ptag)) {
		assert(!dlist_empty(&gnix_nic_list_ptag[auth_key->ptag]));

		/* rotate the reused nic to the tail so subsequent callers
		 * pick a different nic - simple round-robin */
		nic = dlist_first_entry(&gnix_nic_list_ptag[auth_key->ptag],
					struct gnix_nic, ptag_nic_list);
		dlist_remove(&nic->ptag_nic_list);
		dlist_insert_tail(&nic->ptag_nic_list,
				  &gnix_nic_list_ptag[auth_key->ptag]);
		_gnix_ref_get(nic);

		GNIX_INFO(FI_LOG_EP_CTRL, "Reusing NIC:%p\n", nic);
	}

	/*
	 * no nic found create a cdm and attach
	 */

	if (!nic) {

		nic = calloc(1, sizeof(struct gnix_nic));
		if (nic == NULL) {
			ret = -FI_ENOMEM;
			goto err;
		}

		nic->using_vmdh = domain->using_vmdh;

		/* either create a fresh cdm id or take the caller-supplied one */
		if (nic_attr->use_cdm_id == false) {
			ret = _gnix_cm_nic_create_cdm_id(domain, &fake_cdm_id);
			if (ret != FI_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "_gnix_cm_nic_create_cdm_id returned %s\n",
					  fi_strerror(-ret));
				goto err;
			}
		} else
			fake_cdm_id = nic_attr->cdm_id;

		if (nic_attr->gni_cdm_hndl == NULL) {
			status = GNI_CdmCreate(fake_cdm_id,
						auth_key->ptag,
						auth_key->cookie,
						gnix_cdm_modes,
						&nic->gni_cdm_hndl);
			if (status != GNI_RC_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL, "GNI_CdmCreate returned %s\n",
					 gni_err_str[status]);
				ret = gnixu_to_fi_errno(status);
				goto err1;
			}
			/* remember we own the cdm so teardown destroys it */
			nic->allocd_gni_res |= GNIX_NIC_CDM_ALLOCD;
		} else {
			nic->gni_cdm_hndl = nic_attr->gni_cdm_hndl;
		}

		/*
		 * Okay, now go for the attach
		*/

		if (nic_attr->gni_nic_hndl == NULL) {
			status = GNI_CdmAttach(nic->gni_cdm_hndl,
						0,
						&device_addr,
						&nic->gni_nic_hndl);
			if (status != GNI_RC_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL, "GNI_CdmAttach returned %s\n",
					 gni_err_str[status]);
				_gnix_dump_gni_res(auth_key->ptag);
				ret = gnixu_to_fi_errno(status);
				goto err1;
			}
		} else
			nic->gni_nic_hndl = nic_attr->gni_nic_hndl;

		/*
		 * create TX CQs - first polling, then blocking
		 */

		status = GNI_CqCreate(nic->gni_nic_hndl,
					domain->params.tx_cq_size,
					0,                  /* no delay count */
					GNI_CQ_BLOCKING |
						domain->gni_cq_modes,
					NULL,              /* useless handler */
					NULL,               /* useless handler
								context */
					&nic->tx_cq_blk);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_CqCreate returned %s\n",
				  gni_err_str[status]);
			_gnix_dump_gni_res(auth_key->ptag);
			ret = gnixu_to_fi_errno(status);
			goto err1;
		}

		/* Use blocking CQs for all operations if eager_auto_progress
		 * is used.  */
		if (domain->params.eager_auto_progress) {
			nic->tx_cq = nic->tx_cq_blk;
		} else {
			status = GNI_CqCreate(nic->gni_nic_hndl,
						domain->params.tx_cq_size,
						0, /* no delay count */
						domain->gni_cq_modes,
						NULL, /* useless handler */
						NULL, /* useless handler ctx */
						&nic->tx_cq);
			if (status != GNI_RC_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "GNI_CqCreate returned %s\n",
					  gni_err_str[status]);
				_gnix_dump_gni_res(auth_key->ptag);
				ret = gnixu_to_fi_errno(status);
				goto err1;
			}
		}


		/*
		 * create RX CQs - first polling, then blocking
		 */

		status = GNI_CqCreate(nic->gni_nic_hndl,
					domain->params.rx_cq_size,
					0,
					GNI_CQ_BLOCKING |
						domain->gni_cq_modes,
					NULL,
					NULL,
					&nic->rx_cq_blk);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_CqCreate returned %s\n",
				  gni_err_str[status]);
			_gnix_dump_gni_res(auth_key->ptag);
			ret = gnixu_to_fi_errno(status);
			goto err1;
		}

		/* Use blocking CQs for all operations if eager_auto_progress
		 * is used.  */
		if (domain->params.eager_auto_progress) {
			nic->rx_cq = nic->rx_cq_blk;
		} else {
			status = GNI_CqCreate(nic->gni_nic_hndl,
						domain->params.rx_cq_size,
						0,
						domain->gni_cq_modes,
						NULL,
						NULL,
						&nic->rx_cq);
			if (status != GNI_RC_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL,
					  "GNI_CqCreate returned %s\n",
					  gni_err_str[status]);
				_gnix_dump_gni_res(auth_key->ptag);
				ret = gnixu_to_fi_errno(status);
				goto err1;
			}
		}

		nic->device_addr = device_addr;
		nic->ptag = auth_key->ptag;
		nic->cookie = auth_key->cookie;

		nic->vc_id_table_capacity = domain->params.vc_id_table_capacity;
		nic->vc_id_table = malloc(sizeof(void *) *
					       nic->vc_id_table_capacity);
		if (nic->vc_id_table == NULL) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "malloc of vc_id_table failed\n");
			ret = -FI_ENOMEM;
			goto err1;
		}

		ret = _gnix_alloc_bitmap(&nic->vc_id_bitmap,
					 nic->vc_id_table_capacity, NULL);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "alloc_bitmap returned %d\n", ret);
			goto err1;
		}
		vc_id_bitmap_allocd = true;
		fastlock_init(&nic->vc_id_lock);

		/*
		 * initialize free list for VC's
		 * In addition to hopefully allowing for a more compact
		 * allocation of VC structs, the free list is also import
		 * because there is a window of time when using auto progress
		 * that a thread may be going through the progress engine
		 * while one of the application threads is actively tearing
		 * down an endpoint (and hence its associated VCs) before the
		 * rem_id for the vc is removed from the vector.
		 * As a consequence, it is important that
		 * the memory allocated within the freelist allocator not be
		 * returned to the system prior to the freelist being destroyed
		 * as part of the nic destructor procedure.  The freelist is
		 * destroyed in that procedure after the progress thread
		 * has been joined.
		 */

		ret = _gnix_fl_init_ts(sizeof(struct gnix_vc),
				       offsetof(struct gnix_vc, fr_list),
				       GNIX_VC_FL_MIN_SIZE,
				       GNIX_VC_FL_INIT_REFILL_SIZE,
				       0,
				       0,
				       &nic->vc_freelist);
		if (ret == FI_SUCCESS) {
			free_list_inited = true;
		} else {
			GNIX_DEBUG(FI_LOG_EP_DATA, "_gnix_fl_init returned: %s\n",
				   fi_strerror(-ret));
			goto err1;
		}

		fastlock_init(&nic->lock);

		ret = __gnix_nic_tx_freelist_init(nic,
						  domain->params.tx_cq_size);
		if (ret != FI_SUCCESS)
			goto err1;

		fastlock_init(&nic->prog_vcs_lock);
		dlist_init(&nic->prog_vcs);

		_gnix_ref_init(&nic->ref_cnt, 1, __nic_destruct);

		smsg_mbox_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
		smsg_mbox_attr.mbox_maxcredit = domain->params.mbox_maxcredit;
		smsg_mbox_attr.msg_maxsize =  domain->params.mbox_msg_maxsize;

		status = GNI_SmsgBufferSizeNeeded(&smsg_mbox_attr,
						  &nic->mem_per_mbox);
		if (status != GNI_RC_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "GNI_SmsgBufferSizeNeeded returned %s\n",
				  gni_err_str[status]);
			ret = gnixu_to_fi_errno(status);
			goto err1;
		}

		/*
		 * set up mailbox allocator for SMSG mailboxes
		 */

		ret = _gnix_mbox_allocator_create(nic,
					  nic->rx_cq,
					  domain->params.mbox_page_size,
					  (size_t)nic->mem_per_mbox,
					  domain->params.mbox_num_per_slab,
					  &nic->mbox_hndl);

		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "_gnix_mbox_alloc returned %s\n",
				  fi_strerror(-ret));
			goto err1;
		}

		/*
		 * use the mailbox allocator system to set up an
		 * pre-pinned RDMA bounce buffers for longer eager
		 * messages and other cases where zero-copy
		 * can't be safely used.
		 *
		 * One set of blocks is used for the send side.
		 * A second set of blocks is used for the receive
		 * side.  Both sets of blocks are registered against
		 * the blocking RX CQ for this nic.
		 *
		 * TODO: hardwired constants, uff
		 * TODO: better to use a buddy allocator or some other
		 * allocator
		 * Disable these for now as we're not using and they
		 * chew up a lot of IOMMU space per nic.
		 */

#if 0
		ret = _gnix_mbox_allocator_create(nic,
						  NULL,
						  GNIX_PAGE_2MB,
						  65536,
						  512,
						  &nic->s_rdma_buf_hndl);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "_gnix_mbox_alloc returned %s\n",
				  fi_strerror(-ret));
			_gnix_dump_gni_res(domain->ptag);
			goto err1;
		}

		ret = _gnix_mbox_allocator_create(nic,
						  NULL,
						  GNIX_PAGE_2MB,
						  65536,
						  512,
						  &nic->r_rdma_buf_hndl);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "_gnix_mbox_alloc returned %s\n",
				  fi_strerror(-ret));
			_gnix_dump_gni_res(domain->ptag);
			goto err1;
		}
#endif

		ret =  __nic_setup_irq_cq(nic);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "__nic_setup_irq_cq returned %s\n",
				  fi_strerror(-ret));
			_gnix_dump_gni_res(auth_key->ptag);
			goto err1;
		}

		/*
 		 * if the domain is using PROGRESS_AUTO for data, set up
 		 * a progress thread.
 		 */

		if (domain->data_progress == FI_PROGRESS_AUTO) {

			/*
			 * tell CLE job container that next thread should be
			 * runnable anywhere in the cpuset, don't treat as
			 * an error if one is returned, may have perf issues
			 * though...
			 */

			ret = _gnix_get_num_corespec_cpus(&num_corespec_cpus);
			if (ret != FI_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_CTRL,
				  "failed to get num corespec cpus\n");
			}
			if (num_corespec_cpus > 0) {
				ret = _gnix_job_disable_affinity_apply();
			} else {
				ret = _gnix_job_enable_unassigned_cpus();
			}
			if (ret != 0)
				GNIX_WARN(FI_LOG_EP_CTRL,
				"job_disable/unassigned cpus returned %d\n",
					 ret);

			ret = pthread_create(&nic->progress_thread,
					     NULL,
					     __gnix_nic_prog_thread_fn,
					     (void *)nic);
			if (ret)
				GNIX_WARN(FI_LOG_EP_CTRL,
				"pthread_create call returned %d\n", ret);
		}

		dlist_insert_tail(&nic->gnix_nic_list, &gnix_nic_list);
		dlist_insert_tail(&nic->ptag_nic_list,
				  &gnix_nic_list_ptag[auth_key->ptag]);

		nic->smsg_callbacks = gnix_ep_smsg_callbacks;

		++gnix_nics_per_ptag[auth_key->ptag];

		GNIX_INFO(FI_LOG_EP_CTRL, "Allocated NIC:%p\n", nic);
	}

	if (nic) {
		nic->requires_lock = domain->thread_model != FI_THREAD_COMPLETION;
		nic->using_vmdh = domain->using_vmdh;
	}

	*nic_ptr = nic;
	goto out;

err1:
	/* NOTE(review): this decrement appears unconditional even when
	 * nic_attr->use_cdm_id was set and no cdm id was created via
	 * _gnix_cm_nic_create_cdm_id - confirm the counter pairing. */
	ofi_atomic_dec32(&gnix_id_counter);
err:
	if (nic != NULL) {
		/* tear down in roughly reverse order of construction */
		__nic_teardown_irq_cq(nic);
		if (nic->r_rdma_buf_hndl != NULL)
			_gnix_mbox_allocator_destroy(nic->r_rdma_buf_hndl);
		if (nic->s_rdma_buf_hndl != NULL)
			_gnix_mbox_allocator_destroy(nic->s_rdma_buf_hndl);
		if (nic->mbox_hndl != NULL)
			_gnix_mbox_allocator_destroy(nic->mbox_hndl);
		if (nic->rx_cq != NULL && nic->rx_cq != nic->rx_cq_blk)
			GNI_CqDestroy(nic->rx_cq);
		if (nic->rx_cq_blk != NULL)
			GNI_CqDestroy(nic->rx_cq_blk);
		if (nic->tx_cq != NULL && nic->tx_cq != nic->tx_cq_blk)
			GNI_CqDestroy(nic->tx_cq);
		if (nic->tx_cq_blk != NULL)
			GNI_CqDestroy(nic->tx_cq_blk);
		if ((nic->gni_cdm_hndl != NULL) && (nic->allocd_gni_res &
		    GNIX_NIC_CDM_ALLOCD))
			GNI_CdmDestroy(nic->gni_cdm_hndl);
		if (free_list_inited == true)
			_gnix_fl_destroy(&nic->vc_freelist);
		if (vc_id_bitmap_allocd == true)
			_gnix_free_bitmap(&nic->vc_id_bitmap);
		/* nic was calloc'd, so vc_id_table is NULL unless the
		 * malloc above succeeded; free(NULL) is a no-op */
		free(nic->vc_id_table);
		free(nic);
	}

out:
	pthread_mutex_unlock(&gnix_nic_list_lock);
	return ret;
}