include/nccl_ofi_ep_addr_list.h (38 lines of code) (raw):
/*
* Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved.
*/
#ifndef NCCL_OFI_EP_ADDR_LIST_H
#define NCCL_OFI_EP_ADDR_LIST_H
#include <cstdint>
#include <mutex>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <vector>
/*
 * Endpoint structure used by plugin code. It is only forward-declared here:
 * this header refers to endpoints exclusively through pointers, so the
 * complete type is not required.
 */
struct nccl_net_ofi_ep;
using nccl_net_ofi_ep_t = nccl_net_ofi_ep;
class nccl_ofi_ep_addr_list_t {
public:
    /**
     * Look up an endpoint that is not yet connected to the given address.
     *
     * Searches the tracked endpoints for one that is not already connected
     * to addr_in. If such an endpoint is found, the address is added to
     * that endpoint's connection list and the endpoint is returned through
     * ep. If every endpoint is already connected to the address, *ep is
     * set to NULL.
     *
     * @param addr_in    Libfabric address
     * @param addr_size  Size of address
     * @param ep         Output ep
     *                   NULL if no ep found
     *
     * @return 0, on success
     */
    int get(const void *addr_in, size_t addr_size, nccl_net_ofi_ep_t **ep);

    /**
     * Start tracking ep, with a single connection to the given address.
     *
     * The address data is copied, so the caller remains free to modify the
     * memory at addr_in once this call returns.
     *
     * NOTE(review): the address was previously documented as being "of size
     * MAX_EP_ADDR"; confirm against the implementation whether addr_size is
     * still expected to match or be bounded by that value.
     *
     * @param ep         pointer to endpoint
     * @param addr_in    Libfabric address
     * @param addr_size  Size of address
     *
     * @return 0, on success
     *         -EINVAL, invalid argument
     */
    int insert(nccl_net_ofi_ep_t *ep, const void *addr_in, size_t addr_size);

    /**
     * Stop tracking ep, if it is currently present.
     *
     * @param ep  pointer to endpoint
     *
     * @return 0, on success
     *         -ENOENT, ep not found
     */
    int remove(nccl_net_ofi_ep_t *ep);

private:
    // Opaque Libfabric addresses have to be represented in two different
    // ways. While searching an endpoint's address_set we only want a
    // lightweight overlay of the address handed to get() or insert(). The
    // ep_addr_list interface, however, allows the caller to modify the
    // remote address memory after insert() or get() returns, so the key
    // that is actually stored in an address_set must be a deep copy.
    //
    // address_storage covers both cases: it can be initialized from a
    // memory region as a lightweight string_view overlay, and its copy
    // constructor (which should only run on insertion into an address_set)
    // makes a deep copy to keep the API guarantee.
    //
    // As a consequence, data may be an empty vector, whereas view always
    // points to useful data.
    class address_storage {
    public:
        // Lightweight form: overlays the caller's memory region.
        address_storage(const void *addr, size_t addr_len);

        // Deep-copying form: keeps its own copy of the address bytes.
        address_storage(const address_storage &other);

        bool operator==(const address_storage &rhs) const;

        // Accessor for the address bytes; this is what the hash functor
        // below operates on.
        const std::string_view &get_view() const
        {
            return view;
        }

    private:
        // Declaration order matters. In the copy constructor view is an
        // overlay of data, so data has to be initialized before view is
        // created.
        const std::vector<char> data;
        const std::string_view view;
    };

    // Hash functor for address_set: hashes the string_view exposed by
    // address_storage::get_view().
    struct address_storage_hash {
        std::size_t operator()(const address_storage &addr) const
        {
            return std::hash<std::string_view>()(addr.get_view());
        }
    };

    // Set of addresses associated with a single endpoint.
    using address_set = std::unordered_set<address_storage, address_storage_hash>;

    // Maps each tracked endpoint to its address_set.
    using endpoint_map = std::unordered_map<nccl_net_ofi_ep_t *, address_set>;

    // NOTE(review): presumably serializes access to endpoints; the locking
    // itself happens in the implementation, which is not part of this
    // header.
    std::mutex lock;

    endpoint_map endpoints;
};
#endif