in src/nccl_ofi_freelist.cpp [238:366]
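/* Grow the freelist by up to num_entries: allocate one backing buffer rounded
 * up to whole pages, optionally register it through the caller-supplied
 * regmr_fn, carve it into redzone-guarded entries, and push them onto the
 * free list. */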
int nccl_ofi_freelist_add(nccl_ofi_freelist_t *freelist,
                          size_t num_entries)
{
        int ret;
        size_t allocation_count = num_entries;
        size_t block_mem_size = 0;
        char *buffer = NULL;
        struct nccl_ofi_freelist_block_t *block = NULL;
        char *b_end = NULL;
        char *b_end_aligned = NULL;
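
        /* If a maximum entry count is configured, clamp the request to the
         * remaining capacity; a freelist that is already full cannot grow. */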
        if (freelist->max_entry_count > 0 &&
            freelist->max_entry_count - freelist->num_allocated_entries < allocation_count) {
                allocation_count = freelist->max_entry_count - freelist->num_allocated_entries;
        }

        if (allocation_count == 0) {
                NCCL_OFI_WARN("freelist %p is full", freelist);
                return -ENOMEM;
        }
        /* init guarantees that entry_size is a multiple of the
           pointer size, so we know that each entry will be pointer
           aligned. The block tracking structure is allocated
           separately rather than carved out of this buffer, so large
           buffers are more likely to be page aligned (or aligned to
           their size, as the case may be). */
        block_mem_size = freelist_buffer_mem_size_full_pages(freelist->entry_size, allocation_count);

        ret = nccl_net_ofi_alloc_mr_buffer(block_mem_size, (void **)&buffer);
        if (OFI_UNLIKELY(ret != 0)) {
                NCCL_OFI_WARN("freelist extension allocation failed (%d)", ret);
                return ret;
        }
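
        /* The block structure records the buffer, its size, and (optionally)
         * the MR handle so freelist teardown can deregister and free it. */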
        block = (struct nccl_ofi_freelist_block_t *)
                calloc(1, sizeof(struct nccl_ofi_freelist_block_t));
        if (block == NULL) {
                NCCL_OFI_WARN("Failed to allocate freelist block metadata");
                ret = -ENOMEM;
                goto error;
        }

        block->memory = buffer;
        block->memory_size = block_mem_size;
        block->next = freelist->blocks;

        /* Mark unused memory at the end of the block allocation as noaccess */
        b_end = (char *)((uintptr_t)buffer + block_mem_size);
        b_end_aligned = (char *)NCCL_OFI_ROUND_DOWN((uintptr_t)b_end,
                                                    (uintptr_t)MEMCHECK_GRANULARITY);
        nccl_net_ofi_mem_noaccess(b_end_aligned,
                                  block_mem_size - (b_end_aligned - buffer));
        nccl_net_ofi_mem_undefined(b_end_aligned, b_end - b_end_aligned);
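
        /* Register the whole block up front when a registration callback was
         * provided, so every entry carved from it shares a single MR handle. */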
        if (freelist->regmr_fn) {
                ret = freelist->regmr_fn(freelist->regmr_opaque, buffer,
                                         block_mem_size,
                                         &block->mr_handle);
                if (ret != 0) {
                        NCCL_OFI_WARN("freelist extension registration failed: %d", ret);
                        goto error;
                }
        } else {
                block->mr_handle = NULL;
        }
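
        /* One descriptor per entry; the descriptor array lives outside the
         * registered buffer. */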
        block->entries = (nccl_ofi_freelist_elem_t *)
                calloc(allocation_count, sizeof(*(block->entries)));
        if (block->entries == NULL) {
                NCCL_OFI_WARN("Failed to allocate entries");
                ret = -ENOMEM;
                goto error;
        }

        block->num_entries = allocation_count;
        freelist->blocks = block;
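
        /* Carve the buffer into entries: each entry is preceded by a redzone,
         * marked inaccessible until it is handed out, and pushed onto the
         * freelist's free stack. */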
        for (size_t i = 0 ; i < allocation_count ; ++i) {
                nccl_ofi_freelist_elem_t *entry = &block->entries[i];
                size_t user_entry_size = freelist->entry_size - freelist->memcheck_redzone_size;

                /* Add redzone before entry */
                nccl_net_ofi_mem_noaccess(buffer, freelist->memcheck_redzone_size);
                buffer += freelist->memcheck_redzone_size;

                if (freelist->have_reginfo) {
                        entry->mr_handle = block->mr_handle;
                } else {
                        entry->mr_handle = NULL;
                }

                entry->ptr = buffer;
                entry->next = freelist->entries;

                freelist->entries = entry;
                freelist->num_allocated_entries++;

                nccl_net_ofi_mem_noaccess(entry->ptr, user_entry_size);

                if (freelist->entry_init_fn) {
                        ret = freelist->entry_init_fn(entry->ptr);
                        if (ret != 0) {
                                goto error;
                        }
                }

                buffer += user_entry_size;
        }

        /* Block structure will not be accessed until freelist is destroyed */
        nccl_net_ofi_mem_noaccess(block, sizeof(struct nccl_ofi_freelist_block_t));

        return 0;

error:
        if (block != NULL) {
                free(block);
                block = NULL;
        }

        if (buffer != NULL) {
                /* Reset memcheck guards of block memory. This step
                 * needs to be performed manually since reallocation
                 * of the same memory via mmap() is invisible to
                 * ASAN. */
                nccl_net_ofi_mem_undefined(buffer, block_mem_size);
                nccl_net_ofi_dealloc_mr_buffer(buffer, block_mem_size);
                buffer = NULL;
        }

        return ret;
}
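
A minimal caller-side sketch (hypothetical variable names; it assumes `fl` points to a freelist already set up by the library's init routine, which is outside this excerpt). The function returns 0 once the new block is linked in and a negative errno-style value otherwise:

        /* Grow the freelist `fl` by up to 16 entries; hypothetical caller code. */
        int rc = nccl_ofi_freelist_add(fl, 16);
        if (rc != 0) {
                NCCL_OFI_WARN("Failed to grow freelist (%d)", rc);
        }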