in src/nccl_ofi_sendrecv.cpp [632:732]
/*
 * Register the memory described by `ckey` with libfabric and return a newly
 * allocated MR handle for it.
 *
 * Steps, in order:
 *   1. Allocate a handle initialized to {MR_KEY_INIT_VALUE, nullptr}.
 *   2. Fill the registration attributes from `ckey` and from the buffer type.
 *   3. If `key_pool` has a non-zero size, allocate an MR key from it and
 *      request that key for the registration.
 *   4. Register the memory with fi_mr_regattr().
 *   5. If `endpoint_mr` is set, bind the MR to `ep` and enable it.
 *
 * @param domain    Libfabric domain passed to fi_mr_regattr()
 * @param ep        Endpoint the MR is bound to when `endpoint_mr` is set
 * @param key_pool  ID pool MR keys are taken from; no key is requested when
 *                  its size is 0
 * @param dev_id    Device id; only used in warning messages
 * @param ckey      Cache key describing the memory to register
 * @param type      Buffer type: NCCL_PTR_HOST, or NCCL_PTR_CUDA /
 *                  NCCL_PTR_NEURON when the corresponding support is compiled in
 * @param mr_handle Output. Set to the new handle on success and to nullptr on
 *                  any failure.
 *
 * @return 0 on success,
 *         -EINVAL if `type` is not a supported buffer type,
 *         -ENOMEM if no MR key could be allocated,
 *         otherwise the non-zero code returned by the failing call.
 *
 * On failure the partially initialized handle is handed to
 * sendrecv_comm_mr_base_dereg() for cleanup, so the caller owns nothing.
 */
static int sendrecv_mr_buffers_register(struct fid_domain *domain,
					struct fid_ep *ep,
					nccl_ofi_idpool_t *key_pool,
					int dev_id,
					nccl_ofi_mr_ckey_ref ckey,
					int type,
					nccl_net_ofi_sendrecv_mr_handle_t **mr_handle)
{
	int ret = 0;
	struct fi_mr_attr mr_attr = {};
	uint64_t regattr_flags = 0;
	auto *ret_handle = new nccl_net_ofi_sendrecv_mr_handle_t{MR_KEY_INIT_VALUE, nullptr};

	/* Every registration is usable for send and receive operations. */
	mr_attr.access = FI_SEND | FI_RECV;

	/* Let the cache key fill in the attributes and registration flags it owns. */
	nccl_ofi_mr_ckey_fill_mr_attrs(ckey, &mr_attr, &regattr_flags);

	/*
	 * Per-type access bits and HMEM interface.
	 * NOTE(review): host buffers get FI_READ while device buffers get
	 * FI_REMOTE_READ -- presumably host memory is the local side of an RMA
	 * read and device memory the remote side; confirm against the callers.
	 */
	switch (type) {
	case NCCL_PTR_HOST:
		if (support_fi_rma) {
			mr_attr.access |= FI_READ;
		}
		mr_attr.iface = FI_HMEM_SYSTEM;
		break;
#if HAVE_CUDA
	case NCCL_PTR_CUDA:
		if (support_fi_rma) {
			mr_attr.access |= FI_REMOTE_READ;
		}
		mr_attr.iface = FI_HMEM_CUDA;

		/* Get CUDA device ID */
		ret = nccl_net_ofi_get_cuda_device_for_addr((void *)nccl_ofi_mr_ckey_baseaddr(ckey),
							    &mr_attr.device.cuda);
		if (OFI_UNLIKELY(ret != 0)) {
			goto exit;
		}
		break;
#endif
#if HAVE_NEURON
	case NCCL_PTR_NEURON:
		mr_attr.access |= FI_REMOTE_READ;
		mr_attr.iface = FI_HMEM_NEURON;
		/*
		 * Libfabric requires the device.neuron field to be set for
		 * Neuron HMEM, but the EFA provider does not use the value.
		 * Store an invalid device id sentinel to both follow the
		 * Libfabric spec and cause an error if a provider uses the
		 * value in the future.
		 */
		mr_attr.device.neuron = -1;
		break;
#endif
	default:
		ret = -EINVAL;
		goto exit;
	}

	/* A key is only requested when the pool actually holds ids. */
	if (key_pool->get_size() != 0) {
		size_t key = key_pool->allocate_id();
		if (OFI_UNLIKELY(key == FI_KEY_NOTAVAIL)) {
			NCCL_OFI_WARN("MR key allocation failed");
			ret = -ENOMEM;
			goto exit;
		}
		/* Record the key in the handle so the exit path can see it. */
		ret_handle->mr_key = static_cast<uint64_t>(key);
		mr_attr.requested_key = ret_handle->mr_key;
	}

	ret = fi_mr_regattr(domain, &mr_attr, regattr_flags, &ret_handle->mr);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Unable to register memory (type = %d) for device %d. RC: %d, Error: %s",
			      type, dev_id, ret, fi_strerror(-ret));
		goto exit;
	}

	/* With endpoint MRs the registration must be bound to the endpoint and enabled. */
	if (endpoint_mr) {
		ret = fi_mr_bind(ret_handle->mr, &ep->fid, 0);
		if (OFI_UNLIKELY(ret != 0)) {
			NCCL_OFI_WARN("Unable to bind MR to EP (type = %d) for device %d. RC: %d, Error: %s",
				      type, dev_id, ret, fi_strerror(-ret));
			goto exit;
		}

		ret = fi_mr_enable(ret_handle->mr);
		if (OFI_UNLIKELY(ret != 0)) {
			NCCL_OFI_WARN("Unable to enable MR (type = %d) for device %d. RC: %d, Error: %s",
				      type, dev_id, ret, fi_strerror(-ret));
			goto exit;
		}
	}

	*mr_handle = ret_handle;
	return 0;

exit:
	/* Failure: hand the handle to the common deregistration path for cleanup. */
	if (ret_handle != nullptr) {
		sendrecv_comm_mr_base_dereg(ret_handle, key_pool, nullptr);
		ret_handle = nullptr;
	}

	*mr_handle = nullptr;
	return ret;
}