include/nccl_ofi_freelist.h (104 lines of code) (raw):
/*
* Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
*/
#ifndef NCCL_OFI_FREELIST_H
#define NCCL_OFI_FREELIST_H
#include <assert.h>
#include <stdlib.h>
#include <pthread.h>
#include "nccl_ofi_log.h"
#include "nccl_ofi_memcheck.h"
#include "nccl_ofi_pthread.h"
/*
 * Freelist element structure
 *
 * Handle type returned by nccl_ofi_freelist_entry_alloc() and handed
 * back to nccl_ofi_freelist_entry_free().
 */
typedef struct nccl_ofi_freelist_elem {
/* Pointer to the entry's user data. This is the region whose memcheck
 * state is changed by the alloc and free functions. */
void *ptr;
/* Memory registration handle. Per the nccl_ofi_freelist_init_mr()
 * documentation, this is the handle returned from regmr_fn() for the
 * allocation block the entry belongs to.
 * NOTE(review): value for simple (unregistered) freelists is not
 * visible in this header -- presumably NULL; confirm. */
void *mr_handle;
/* Next element in the singly linked list of free entries */
struct nccl_ofi_freelist_elem *next;
} nccl_ofi_freelist_elem_t;
/*
 * Internal: tracking data for blocks of allocated memory
 *
 * NOTE(review): the code that fills in and consumes this structure is
 * not part of this header; the per-field notes below are inferred from
 * the field names and types and should be confirmed against the
 * implementation.
 */
struct nccl_ofi_freelist_block_t {
/* Next block in the freelist's list of blocks */
struct nccl_ofi_freelist_block_t *next;
/* Presumably the base address of the block's backing memory */
void *memory;
/* Presumably the size in bytes of the region at `memory` */
size_t memory_size;
/* Presumably the registration handle for the block, if registered */
void *mr_handle;
/* Presumably the element descriptors belonging to this block */
nccl_ofi_freelist_elem_t *entries;
/* Presumably the number of element descriptors in `entries` */
size_t num_entries;
};
/*
 * Function pointer to call when registering memory
 *
 * When nccl_ofi_freelist_init_mr() is used to create the freelist, an
 * optional memory registration function will be called on any newly
 * allocated regions of memory. The entire region will be registered
 * in one call. The opaque field will contain the value passed as the
 * regmr_opaque field to nccl_ofi_freelist_init_mr.
 *
 * Note that the freelist lock will be held during this function. The
 * caller must avoid a deadlock situation with this behavior.
 *
 * The registered memory region must cover full memory pages. For more
 * information, see function reg_internal_mr_ep().
 *
 * @param opaque
 *	Value passed as regmr_opaque to nccl_ofi_freelist_init_mr().
 * @param data
 *	Pointer to MR. MR must be aligned to system memory page size.
 * @param size
 *	Size of MR. Size must be a multiple of system memory page size.
 * @param handle
 *	Output: registration handle for the region. This is the value
 *	that ends up in the mr_handle field of the freelist elements.
 * @return
 *	NOTE(review): presumably 0 on success and non-zero on error;
 *	confirm against the freelist implementation.
 */
typedef int (*nccl_ofi_freelist_regmr_fn)(void *opaque, void *data, size_t size,
void **handle);
/*
 * Function pointer to call when releasing memory
 *
 * Similar to nccl_ofi_freelist_regmr_fn, but will be called before
 * releasing registered memory.
 *
 * Note that the freelist lock may be held during this function. The
 * caller must avoid a deadlock situation with this behavior.
 *
 * @param handle
 *	NOTE(review): presumably the handle previously produced by the
 *	matching nccl_ofi_freelist_regmr_fn call; confirm.
 * @return
 *	NOTE(review): presumably 0 on success and non-zero on error;
 *	confirm against the freelist implementation.
 */
typedef int (*nccl_ofi_freelist_deregmr_fn)(void *handle);
/*
 * Function pointer to call to initialize newly allocated entries
 *
 * Optional callback supplied to nccl_ofi_freelist_init() or
 * nccl_ofi_freelist_init_mr(); may be NULL if not required.
 *
 * @param entry
 *	The entry to initialize.
 * @return
 *	NOTE(review): presumably 0 on success and non-zero on error;
 *	confirm against the freelist implementation.
 */
typedef int (*nccl_ofi_freelist_entry_init_fn)(void *entry);
/*
 * Function pointer to call to finalize entries before deallocating
 *
 * Optional callback supplied to nccl_ofi_freelist_init() or
 * nccl_ofi_freelist_init_mr(); may be NULL if not required. It is
 * intended to undo whatever the matching init callback set up.
 *
 * @param entry
 *	The entry to finalize.
 */
typedef void (*nccl_ofi_freelist_entry_fini_fn)(void *entry);
/*
 * Freelist structure
 *
 * Core freelist structure. This should be considered opaque to users
 * of the freelist interface
 */
struct nccl_ofi_freelist_t {
/* Size of one entry. The inline helpers in this header subtract
 * MEMCHECK_REDZONE_SIZE from it to obtain the user-visible size, so
 * this value includes the redzone. */
size_t entry_size;
/* NOTE(review): presumably the number of entries allocated so far
 * across all blocks; confirm against the implementation. */
size_t num_allocated_entries;
/* Upper bound on allocated entries; 0 means grow until memory
 * exhaustion (see nccl_ofi_freelist_init()). */
size_t max_entry_count;
/* Number of entries requested from nccl_ofi_freelist_add() when the
 * freelist runs empty during an allocation. */
size_t increase_entry_count;
/* Head of the singly linked list of free entries */
nccl_ofi_freelist_elem_t *entries;
/* List of block tracking structures for the allocated memory */
struct nccl_ofi_freelist_block_t *blocks;
/* NOTE(review): presumably true when the freelist was created with
 * memory registration callbacks; confirm. */
bool have_reginfo;
/* Memory registration callback (see nccl_ofi_freelist_init_mr()) */
nccl_ofi_freelist_regmr_fn regmr_fn;
/* Memory deregistration callback (see nccl_ofi_freelist_init_mr()) */
nccl_ofi_freelist_deregmr_fn deregmr_fn;
/* Opaque value handed to regmr_fn as its first argument */
void *regmr_opaque;
/* NOTE(review): not read by the inline helpers in this header, which
 * use MEMCHECK_REDZONE_SIZE directly; presumably the per-entry redzone
 * size used by the implementation -- confirm. */
size_t memcheck_redzone_size;
/* Optional callback to initialize newly allocated entries */
nccl_ofi_freelist_entry_init_fn entry_init_fn;
/* Optional callback to finalize entries before deallocation */
nccl_ofi_freelist_entry_fini_fn entry_fini_fn;
/* Protects the freelist; taken by nccl_ofi_freelist_entry_alloc() and
 * nccl_ofi_freelist_entry_free(). */
pthread_mutex_t lock;
};
typedef struct nccl_ofi_freelist_t nccl_ofi_freelist_t;
/*
 * Initialize "simple" freelist structure.
 *
 * With simple freelists, there is no memory registration of freelist
 * items, but also no requirement for any data structure embedded in
 * the freelist item.
 *
 * The freelist will allocate initial_entry_count entries in the
 * freelist during initialization. Any further growth in the freelist
 * will be on-demand in units of increase_entry_count items.
 *
 * The freelist will grow until there are at most max_entry_count
 * entries allocated as part of the freelist. If max_entry_count is
 * 0, the freelist will grow until memory exhaustion.
 *
 * The caller can provide optional callbacks to be called during entry
 * allocation and deallocation. The init callback function is intended to
 * initialize the entry, so it is in a known state when returned from
 * nccl_ofi_freelist_entry_alloc. The fini callback is intended to handle
 * any cleanup associated with the init callback, and will be called before
 * the backing memory is deallocated by the freelist. Either of these
 * callbacks can be set to NULL if not required.
 *
 * @param entry_size
 *	Size of each freelist entry as requested by the caller.
 * @param initial_entry_count
 *	Number of entries allocated during initialization.
 * @param increase_entry_count
 *	Number of entries added each time the freelist grows on demand.
 * @param max_entry_count
 *	Maximum number of entries; 0 for no limit.
 * @param entry_init_fn
 *	Optional entry initialization callback; may be NULL.
 * @param entry_fini_fn
 *	Optional entry finalization callback; may be NULL.
 * @param freelist_p
 *	Output: NOTE(review): presumably receives the newly created
 *	freelist on success; confirm against the implementation.
 * @return
 *	NOTE(review): presumably 0 on success and non-zero on error;
 *	confirm against the implementation.
 */
int nccl_ofi_freelist_init(size_t entry_size,
size_t initial_entry_count,
size_t increase_entry_count,
size_t max_entry_count,
nccl_ofi_freelist_entry_init_fn entry_init_fn,
nccl_ofi_freelist_entry_fini_fn entry_fini_fn,
nccl_ofi_freelist_t **freelist_p);
/* Initialize "complex" freelist structure
 *
 * A complex freelist can require registration of memory as part of
 * freelist expansion. Each block of allocated entries will have its
 * own memory registration, allowing the freelist to grow over time
 * similar to the simple freelist.
 *
 * The mr_handle field of the elem structure will contain the handle
 * returned from regmr_fn() being called for the allocation block.
 *
 * The entry_size, initial_entry_count, increase_entry_count,
 * max_entry_count, entry_init_fn, entry_fini_fn and freelist_p
 * parameters have the same meaning as for nccl_ofi_freelist_init().
 *
 * @param regmr_fn
 *	Callback invoked to register each newly allocated memory region.
 * @param deregmr_fn
 *	Callback invoked before registered memory is released.
 * @param regmr_opaque
 *	Opaque value passed through to regmr_fn as its first argument.
 * @param entry_alignment
 *	NOTE(review): presumably the required alignment of each entry in
 *	bytes; not described elsewhere in this header -- confirm against
 *	the implementation.
 * @return
 *	NOTE(review): presumably 0 on success and non-zero on error;
 *	confirm against the implementation.
 */
int nccl_ofi_freelist_init_mr(size_t entry_size,
size_t initial_entry_count,
size_t increase_entry_count,
size_t max_entry_count,
nccl_ofi_freelist_entry_init_fn entry_init_fn,
nccl_ofi_freelist_entry_fini_fn entry_fini_fn,
nccl_ofi_freelist_regmr_fn regmr_fn,
nccl_ofi_freelist_deregmr_fn deregmr_fn,
void *regmr_opaque,
size_t entry_alignment,
nccl_ofi_freelist_t **freelist_p);
/*
 * Finalize (free) a freelist
 *
 * Free a freelist, releasing all memory associated with the
 * freelist. All memory will be released, even if there are allocated
 * entries in the freelist that have not been returned. This may
 * cause crashes in your application if you call this function while
 * freelist items are still in use.
 *
 * @param freelist
 *	The freelist to finalize.
 * @return
 *	NOTE(review): presumably 0 on success and non-zero on error;
 *	confirm against the implementation.
 */
int nccl_ofi_freelist_fini(nccl_ofi_freelist_t *freelist);
/* Internal function, which grows the freelist
 *
 * Called by nccl_ofi_freelist_entry_alloc() with the freelist lock held
 * when no free entries remain. Not intended to be called directly by
 * users of the freelist interface.
 *
 * @param freelist
 *	The freelist to grow.
 * @param num_entries
 *	Number of entries requested to be added.
 * @return
 *	0 on success; any non-zero value is treated as failure by
 *	nccl_ofi_freelist_entry_alloc().
 */
int nccl_ofi_freelist_add(nccl_ofi_freelist_t *freelist,
size_t num_entries);
/*
 * Mark the user data of a freelist entry as accessible but undefined
 * for the memory checker.
 *
 * The marked region starts at entry_p and spans the freelist's entry
 * size minus MEMCHECK_REDZONE_SIZE bytes, i.e. the user-visible part of
 * the entry without its trailing redzone.
 *
 * @param freelist
 *	Freelist the entry belongs to; supplies the entry size.
 * @param entry_p
 *	Start of the entry's user data.
 */
static inline void nccl_ofi_freelist_entry_set_undefined(nccl_ofi_freelist_t *freelist, void *entry_p)
{
	/* A freshly handed-out entry may be written by the user but holds
	 * no defined contents yet. This deliberately lets the user
	 * overwrite an nccl_ofi_freelist_elem_t structure stored there. */
	nccl_net_ofi_mem_undefined(entry_p,
				   freelist->entry_size - MEMCHECK_REDZONE_SIZE);
}
/* Allocate a new freelist item
 *
 * Return pointer to memory of size entry_size (provided to init) from
 * the given freelist. If required, the freelist will grow during the
 * call. Locking to protect the freelist is not required by the
 * caller.
 *
 * If the function returns NULL, that means that all allocated buffers
 * have previously been allocated and either the freelist has reached
 * maximum size or the allocation to grow the freelist has failed.
 *
 * The pointer returned will be to a nccl_ofi_freelist_elem_t structure that
 * contains the pointer and memory registration. For complex freelists,
 * the elem_t structure will contain valid information for the mr_handle. The
 * caller should not write into the bytes covered by the elem_t structure.
 *
 * @param freelist
 *	Freelist to allocate from; must not be NULL.
 * @return
 *	Element describing the allocated entry, or NULL on failure.
 */
static inline nccl_ofi_freelist_elem_t *nccl_ofi_freelist_entry_alloc
(nccl_ofi_freelist_t *freelist)
{
	nccl_ofi_freelist_elem_t *entry = NULL;

	assert(freelist);

	nccl_net_ofi_mutex_lock(&freelist->lock);

	/* No free entries left: try to grow the freelist by one increment */
	if (!freelist->entries) {
		int ret = nccl_ofi_freelist_add(freelist, freelist->increase_entry_count);
		if (ret != 0) {
			NCCL_OFI_WARN("Could not extend freelist: %d", ret);
			goto cleanup;
		}

		/* Defensive: a successful return that added no entries would
		 * otherwise lead to a NULL dereference below. Report it as an
		 * allocation failure instead. */
		if (!freelist->entries) {
			NCCL_OFI_WARN("Freelist %p is still empty after extension",
				      (void *)freelist);
			goto cleanup;
		}
	}

	/* Pop the head of the free list. The element descriptor may live in
	 * memory currently marked inaccessible for the memory checker, so
	 * mark it defined before reading it. */
	entry = freelist->entries;
	nccl_net_ofi_mem_defined_unaligned(entry, sizeof(*entry));

	freelist->entries = entry->next;

	/* Hand the user data out as writable but uninitialized */
	nccl_ofi_freelist_entry_set_undefined(freelist, entry->ptr);

cleanup:
	nccl_net_ofi_mutex_unlock(&freelist->lock);
	return entry;
}
/* Release a freelist item
 *
 * Return a freelist item to the freelist. After calling this
 * function, the user should not read from or write to memory in
 * entry_p, as corruption may result. Locking to protect the freelist
 * is not required by the caller.
 *
 * @param freelist
 *	Freelist the entry was allocated from; must not be NULL.
 * @param entry
 *	Element previously returned by nccl_ofi_freelist_entry_alloc();
 *	must not be NULL.
 */
static inline void nccl_ofi_freelist_entry_free(nccl_ofi_freelist_t *freelist,
						nccl_ofi_freelist_elem_t *entry)
{
	size_t user_entry_size;

	/* Validate the arguments before touching them; reading
	 * freelist->entry_size ahead of the assert would make the assert
	 * unable to catch a NULL freelist. */
	assert(freelist);
	assert(entry);

	user_entry_size = freelist->entry_size - MEMCHECK_REDZONE_SIZE;

	nccl_net_ofi_mutex_lock(&freelist->lock);

	/* Push the element onto the head of the free list */
	entry->next = freelist->entries;
	freelist->entries = entry;

	/* The user data must not be touched until it is allocated again */
	nccl_net_ofi_mem_noaccess(entry->ptr, user_entry_size);

	nccl_net_ofi_mutex_unlock(&freelist->lock);
}
#endif // End NCCL_OFI_FREELIST_H