in src/nccl_ofi_rdma.cpp [6213:6311]
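/*
 * Initialize the receive-side buffer freelists for an RDMA endpoint and set
 * the per-rail posted-RX-buffer bounds consumed by the progress engine.
 */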
static inline int init_rx_buffers(nccl_net_ofi_rdma_ep_t *ep)
{
int ret = 0;
nccl_net_ofi_ep_rail_t *rail;
nccl_net_ofi_rdma_domain_t *domain = rdma_endpoint_get_domain(ep);
/* This is a little bit of a heuristic, but we need as many requests as
we have posted control messages, so that's as reasonable a starting
point as any. */
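	/* Request objects used to track posted RX buffers are drawn from this
	 * freelist. */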
ret = nccl_ofi_freelist_init(sizeof(nccl_net_ofi_rdma_req_t),
ofi_nccl_rdma_min_posted_control_buffers(), 16, 0,
rdma_fl_req_entry_init, rdma_fl_req_entry_fini,
&ep->rx_buff_reqs_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init rx_buff_reqs_fl");
return ret;
}
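	/* Memory-registered buffers backing posted control-message receives;
	 * each entry is ctrl_rx_buff_size bytes and is registered against the
	 * domain through freelist_regmr_host_fn. */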
ret = nccl_ofi_freelist_init_mr(ep->ctrl_rx_buff_size,
ofi_nccl_rdma_min_posted_control_buffers(), 16, 0,
NULL, NULL,
freelist_regmr_host_fn, freelist_deregmr_host_fn,
domain, 1, &ep->ctrl_rx_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init ctrl_rx_buff_fl");
if (nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl))
NCCL_OFI_WARN("Also failed to freelist_fini rx_buff_reqs_fl");
return ret;
}
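	/* The eager payload freelist is only created for a positive
	 * eager_rx_buff_size. A size of exactly 0 still posts RX buffers (see
	 * the rail setup below) but needs no registered payload memory, and a
	 * negative size leaves the rails with zero posting bounds. */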
if (ep->eager_rx_buff_size > 0) {
ret = nccl_ofi_freelist_init_mr(ep->eager_rx_buff_size,
ofi_nccl_rdma_min_posted_eager_buffers(), 16, 0,
NULL, NULL,
freelist_regmr_host_fn, freelist_deregmr_host_fn,
domain, EAGER_RX_BUFFER_ALIGNMENT, &ep->eager_rx_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init eager_rx_buff_size");
nccl_ofi_freelist_fini(ep->ctrl_rx_buff_fl);
nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl);
return ret;
}
} else {
ep->eager_rx_buff_fl = NULL;
}
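	/* Small registered freelist for connection handshake messages
	 * (nccl_ofi_rdma_connection_info_t), aligned to sizeof(void *). */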
ret = nccl_ofi_freelist_init_mr(sizeof(nccl_ofi_rdma_connection_info_t),
4, 4, 0, NULL, NULL,
freelist_regmr_host_fn, freelist_deregmr_host_fn,
domain, sizeof(void *), &ep->conn_msg_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init conn_msg freelist");
if (ep->eager_rx_buff_fl != NULL) {
nccl_ofi_freelist_fini(ep->eager_rx_buff_fl);
}
nccl_ofi_freelist_fini(ep->ctrl_rx_buff_fl);
nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl);
return ret;
}
/*
* The *_rx_buff_posted limits are used in the progress engine to
* determine if the receive queue is hydrated with sufficient buffers.
* The parameters account for all the rails, so scale down bounds to
* what a single rail would need.
*/
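	/* Illustrative example (values are not the actual defaults): with a
	 * minimum of 64 posted control buffers and 2 control rails, each rail
	 * keeps at least NCCL_OFI_DIV_CEIL(64, 2) == 32 buffers posted. */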
for (uint16_t rail_id = 0; rail_id < ep->num_control_rails; ++rail_id) {
rail = rdma_endpoint_get_control_rail(ep, rail_id);
rail->min_rx_buff_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_min_posted_control_buffers(), ep->num_control_rails
);
rail->max_rx_buff_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_max_posted_control_buffers(), ep->num_control_rails
);
rail->num_rx_buff_posted = 0;
nccl_net_ofi_mutex_init(&rail->rx_buff_mutex, NULL);
rail->rx_buff_req_alloc = ctrl_rx_buff_req_alloc;
}
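	/* Data rails post eager RX buffers. When eager messaging is disabled
	 * (eager_rx_buff_size < 0) the bounds stay zero, so nothing is posted
	 * on these rails. */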
for (uint16_t rail_id = 0; rail_id < ep->num_rails; ++rail_id) {
rail = rdma_endpoint_get_rail(ep, rail_id);
if (ep->eager_rx_buff_size >= 0) {
rail->min_rx_buff_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_min_posted_eager_buffers(), ep->num_rails
);
rail->max_rx_buff_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_max_posted_eager_buffers(), ep->num_rails
);
} else {
rail->min_rx_buff_posted = 0;
rail->max_rx_buff_posted = 0;
}
rail->num_rx_buff_posted = 0;
nccl_net_ofi_mutex_init(&rail->rx_buff_mutex, NULL);
rail->rx_buff_req_alloc = eager_rx_buff_req_alloc;
}
return ret;
}