include/nccl_ofi.h (231 lines of code) (raw):

/* * Copyright (c) 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef NCCL_OFI_H_ #define NCCL_OFI_H_ #include <unordered_map> #include <rdma/fabric.h> #include <rdma/fi_errno.h> #include <rdma/fi_domain.h> #include <rdma/fi_endpoint.h> #include <rdma/fi_cm.h> #include <rdma/fi_tagged.h> #include <rdma/fi_rma.h> #include <nccl/net.h> #include "nccl_ofi_log.h" #include "nccl_ofi_topo.h" #include "nccl_ofi_idpool.h" #include "nccl_ofi_mr.h" /* * NCCL_NET_HANDLE_MAXSIZE is a limited resource (and defined in NCCL). * An endpoint address buffer of 56 bytes *should* be large enough to hold * all libfabric providers. In case the requirement changes, NCCL v2.12 * provides enough room to increase this size but we would need to maintain * backwards compatiblity with all NCCL versions. * * We also store tags and communicator stage information in remaining * part of the handle. */ #define MAX_EP_ADDR (56) /* * For each tag, we use MSB as control bit and remaining * for identifying different rings. We look at mem_tag_format for * an endpoint to determine if provider is reserving any MSBs. */ #define OFI_HIGHEST_TAG_BIT (0x1UL << 63) /* * We are supporting minimum 2^32 rings per endpoint and reserving 1 bit * for marking control sends/recvs. */ #define MIN_TAG_BITS_FOR_RING_ID (32 + 1) /* Maximum number of grouped receives */ #define NCCL_OFI_MAX_RECVS 1 /* * This defines a higher value than maximum inflight requests supported by NCCL * while not putting a lot of memory pressure. This higher number ensures that * we are able to support more number of outstanding requests with dynamic buffer * depth changes in NCCL and Neuron. */ #define NCCL_OFI_MAX_REQUESTS (128) /* * Number of send requests that can be active at any given time. In * the case of supporting NCCL_OFI_MAX_RECVS grouped receives for each * receive request, which means the number of send requests that must * be supported is actually larger than the number of receive * requests. */ #define NCCL_OFI_MAX_SEND_REQUESTS (NCCL_OFI_MAX_REQUESTS * NCCL_OFI_MAX_RECVS) /* Flush read size (bytes) */ #define NCCL_OFI_FLUSH_SIZE (4ULL) /* CPU cache line size (bytes) */ #define NCCL_OFI_DEFAULT_CPU_CACHE_LINE_SIZE (64ULL) /* Initial number of entries in the MR cache of a device */ #define NCCL_OFI_MR_CACHE_INIT_SIZE 128 /* Indicates if GPUDirect is supported by libfabric provider */ enum gdr_support_level_t {GDR_UNKNOWN, GDR_SUPPORTED, GDR_UNSUPPORTED}; extern enum gdr_support_level_t support_gdr; /* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used * to flush data to the GPU. Note, CUDA flush support is not supported on all * platforms and should be disabled by default */ extern bool cuda_flush; /* number of duplicate providers to create for each discovered * provider, including renaming to cause NCCL to create additional * rings to use the connections */ extern int nic_dup_conns; /* number of cq entries to read in a single call to fi_cq_read. This variable will be updated during init (hence, can not be const), but will not change during execution. Therefore, it may be read in the polling loop without protection of a lock. */ extern size_t cq_read_count; /* Indicates if endpoint memory registration is required */ extern bool endpoint_mr; /* Indicates if remote virtual addressing is used */ extern bool virt_addr_mr; /* Indicates if provider's data progress model is FI_PROGRESS_AUTO */ extern bool data_progress_auto; /* Selected communication protocol. * * Until the protocol environment variable is checked in init(), this * is the protocol that the plugin will try to initialize; it can be * overridden by platform_init(). After init(), this is the protocol * that was selected. * * Valid values are SENDRECV and RDMA; default is SENDRECV (set by the * param OFI_NCCL_PROTOCOL) */ extern const char *nccl_ofi_selected_protocol; /* Internode network latency reported to NCCL. */ extern float net_latency; /* Size of system memory pages */ extern size_t system_page_size; struct nccl_net_ofi_plugin; struct nccl_net_ofi_device; struct nccl_net_ofi_domain; struct nccl_net_ofi_ep; struct nccl_net_ofi_req; struct nccl_net_ofi_mr_handle; struct nccl_net_ofi_comm; struct nccl_net_ofi_listen_comm; struct nccl_net_ofi_send_comm; struct nccl_net_ofi_recv_comm; typedef struct nccl_net_ofi_plugin nccl_net_ofi_plugin_t; typedef struct nccl_net_ofi_device nccl_net_ofi_device_t; typedef struct nccl_net_ofi_domain nccl_net_ofi_domain_t; typedef struct nccl_net_ofi_ep nccl_net_ofi_ep_t; typedef struct nccl_net_ofi_req nccl_net_ofi_req_t; typedef struct nccl_net_ofi_mr_handle nccl_net_ofi_mr_handle_t; typedef struct nccl_net_ofi_comm nccl_net_ofi_comm_t; typedef struct nccl_net_ofi_listen_comm nccl_net_ofi_listen_comm_t; typedef struct nccl_net_ofi_send_comm nccl_net_ofi_send_comm_t; typedef struct nccl_net_ofi_recv_comm nccl_net_ofi_recv_comm_t; /** * Request - handle for an outstanding non-blocking communication * * A request will be allocated and returned for every call to send, * recv, or flush. Memory is allocated by the callee to send, recv, * or flush, and will be freed by the callee of test when the request * is complete. */ struct nccl_net_ofi_req { int (*test)(nccl_net_ofi_req_t *req, int *done, int *size); }; /** * Struct enclosing the context parameter we pass to every Libfabric operation. * Contains callback function members to be invoked upon completion of the * corresponding request. */ struct nccl_net_ofi_context { /** * Libfabric context object. A pointer to this context is passed to all * Libfabric operations */ struct fi_context2 ofi_ctx; /** * Callback to be invoked upon completion of the request * * @param ctx: ptr to this context object * @param cq_entry: cq entry from Libfabric * @param rail_id: the rail on which the cq entry arrived. * Ignored in SENDRECV protocol */ int (*handle_cq_entry)(struct nccl_net_ofi_context *ctx, struct fi_cq_entry *cq_entry, uint16_t rail_id); /** * Callback to be invoked upon completion-with-error of the request * * @param ctx: ptr to this context object * @param cq: Libfabric completion queue * @param err_entry: err entry from Libfabric * @param rail_id: the rail on which the cq err entry arrived. * Ignored in SENDRECV protocol */ int (*handle_error_entry)(struct nccl_net_ofi_context *ctx, struct fid_cq *cq, struct fi_cq_err_entry *err_entry, uint16_t rail_id); }; typedef struct nccl_net_ofi_context nccl_net_ofi_context_t; /* Various stages of connection establishment */ typedef enum nccl_ofi_comm_stage { COMM_CREATE_START = 0, COMM_SEND_CONN, COMM_RECV_CONN, COMM_CONN_REQ_PENDING, COMM_CONN_RESP_REQ_PENDING, COMM_CONNECTED, } nccl_ofi_comm_stage_t; typedef struct save_comm_state { nccl_net_ofi_comm_t *comm; nccl_net_ofi_req_t *req; nccl_ofi_comm_stage_t stage; } save_comm_state_t; typedef struct nccl_ofi_connection_info { char ep_name[MAX_EP_ADDR]; uint64_t ep_namelen; uint64_t connect_to_self; nccl_net_ofi_req_t* req; } nccl_ofi_connection_info_t; /* Since this is a message on the wire, check that it has the expected size */ static_assert(sizeof(nccl_ofi_connection_info_t) == 80, "Wrong size for SENDRECV connect message"); typedef struct nccl_net_ofi_conn_handle { char ep_name[MAX_EP_ADDR]; uint32_t comm_id; /* Save temporary communicator state when creating send communicator */ save_comm_state_t state; } nccl_net_ofi_conn_handle_t; /** * Properties structure */ typedef struct nccl_ofi_properties { char *name; /** Path to the device in /sys */ char *pci_path; /** globally unique identifier for NIC */ uint64_t guid; /** support device memory */ bool hmem_support; /** support dmabuf interface */ bool dmabuf_support; /** Port number */ int port_number; /** Port speed in Mbps */ int port_speed; /** Port latency */ float latency; /** Maximum number of comms supported */ unsigned int max_communicators; /** Maximum number of grouped receives */ unsigned int max_group_receives; /** regMr is global if is not tied to a particular comm **/ int regIsGlobal; /** Maximum size of buffer supported to be transfered via * RMA write inline operation **/ size_t max_write_inline_size; /** Maximum size of the memory region remote access key in bytes **/ size_t max_mr_key_size; /** Indicator whether RMA operations of NCCL Net API are supported **/ int rma_supported; /** Max transfer size for point-to-point operations **/ size_t max_p2p_bytes; /** Max transfer size for collective operations **/ size_t max_coll_bytes; } nccl_ofi_properties_t; /** * Device Data * * A device is roughly a NIC (or a port on a NIC) or a multi-rail * group. The device is the unit of bandwidth sharing and general NIC * propoeries, and accessing domains (ie, groups of NIC resources). */ struct nccl_net_ofi_device { struct nccl_net_ofi_plugin *plugin; /* this device's index in the plugin's devices array */ int dev_id; /* * name of the device - should include the provider name, but may be * augmented (in the case of mrail). Set during the transport's * initialization, and should be read-only from that point. */ char *name; /* do we need to use an mr rkey pool? This is a * provider-specific behavior determined when providers are * selected. */ bool need_mr_rkey_pool; int (*get_properties)(nccl_net_ofi_device_t *base_dev, nccl_ofi_properties_t *props); /* Retrieve a domain associated with this device. There may * be more than one domain per device, depending on a number * of performance tradeoffs (be sure to read the domain * description below). */ nccl_net_ofi_domain_t *(*get_domain)(nccl_net_ofi_device_t *dev); int (*get_ep)(nccl_net_ofi_device_t *base_dev, nccl_net_ofi_ep_t **ep); int (*get_mr_key)(nccl_net_ofi_device_t *base_dev, void* mhandle, uint64_t* mr_key); /** * destructor - releases resources associated with device */ int (*release)(nccl_net_ofi_device_t *device); /* Lock for concurrency since domains can be shared by * multiple entities. */ pthread_mutex_t device_lock; /* private */ /* * create a new domain. This funcion is a private pure * virtual function, which is called from the base * implementation of get_domain() and should not be called * from the more general case. */ nccl_net_ofi_domain_t *(*create_domain)(nccl_net_ofi_device_t *dev); /* * release all domains and endpoints. This function is a private * function, which is called only during release() to free allocated * domains and endpoints. */ int (*release_all_domain_and_ep)(nccl_net_ofi_device_t *dev); /* * hash table indexed by thread id of active domains. * * TODO: When the device class is made a proper C++ class, this should * be changed from a pointer to a map to a map. We can't do that right * now, because that leaves us with no good way to invoke the map * constructor. */ std::unordered_map<long, nccl_net_ofi_domain_t *> *domain_table; }; /** * Domain Object - Represents a protection and thread safety domain * * A domain is a weird combination of a Libfabric domain (and related * resources like an AV and CQ) as well as a general thread boundary. * Transports are free to implement fine grained threads, but * generally it is expected that calls into resources that share the * same domain will share the same lock. */ struct nccl_net_ofi_domain { /* Backpointer to the device associated with this domain. */ nccl_net_ofi_device_t *device; /* * Retrieve an endpoint for this domain. If a suitable * endpoint does not exist, call create_endpoint() to create * one and return that endpoint. This function is a pure * virtual function that must be implemented by inheriting * classes. */ int (*get_ep)(nccl_net_ofi_domain_t *domain, nccl_net_ofi_ep_t **endpoint); /* * Destructor - release resources associated with the domain * @param domain * The domain (itself) to be released. * @param skip_device_lock * false, taking device lock by default. * ture, not taking device lock when caller takes it. * @param force_cleanup * false, not release when endpoint exists. * true, release no matter endpoint exists nor not. */ int (*release)(nccl_net_ofi_domain_t *domain, bool skip_device_lock, bool force_cleanup); /* * Protocol-agnostic MR cache for this device. */ nccl_ofi_mr_cache_t *mr_cache; /* Memory registration key pool */ nccl_ofi_idpool_t *mr_rkey_pool; pthread_mutex_t domain_lock; /* Private */ /* pure virtual function called when resources associated with * the ep should be destroyed. Device lock will be held when * this function is called. */ int (*free)(nccl_net_ofi_domain_t *domain); /* Create a new endpoint * * Pure virtual function to allocate a new endpoint structure */ int (*create_endpoint)(nccl_net_ofi_domain_t *domain, nccl_net_ofi_ep_t **ep); /* endpoint used for (at a minimum) receiving connection messages. Send/Recv protocol uses this for all communication. The rdma protocol uses this for all tx requests and all connection-establishment requests, but may have additional endpoints for the rx side of rdma writes. */ nccl_net_ofi_ep_t *endpoint; /* thread id of the thread that called get_domain(). Used as the hash key for the domain hash */ long creating_thread_id; }; /** * Endpoint - A per-Proxy Thread device abstraction * * The device structure is shared across potentially multiple proxy * threads (depending on NCCL configuration). The Endpoint abstracts * a unique address (assuming an RDM provider), allowing for the * possibility that the underlying transport uses an endpoint per * thread (or per thread calling listen/connect) to drive traffic * across multiple Libfabric endpoints and completion queues. * * Endpoints are implicitly created as part of the get_ep() call * in the device interface. Whether they are created during the first * call to get_ep() or during initialization is left to the * implementation. */ struct nccl_net_ofi_ep { /* Backpointer to the domain associated with this ep. */ nccl_net_ofi_domain_t *domain; /* Create a receiving object and provide a handle to it. * * The callee can expect that the handle provides * NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged across * the wire through an out of band mechanism. The callee must * allocate memory for listen_comm. * * The callee has to guarantee that the state stage of the * handle is set to COMM_CREATE_START. */ int (*listen)(nccl_net_ofi_ep_t *ep, nccl_net_ofi_conn_handle_t *handle, nccl_net_ofi_listen_comm_t **listen_comm); /* Create a connection to a process that has called * listen(). * * The callee has to guarantee the following invariants when * this function returns 0 and no send * communicator has been returned * 1) The state stage of the handle is set to a value * different from COMM_CREATE_START. * 2) The communicator state of the handle stores a pointer to * a communicator. Also, the endpoint pointer member variable * of that communicator points to the endpoint passed to * this connect() function. * * The callee must allocate memory for send_comm. */ int (*connect)(nccl_net_ofi_ep_t *ep, nccl_net_ofi_conn_handle_t *handle, nccl_net_ofi_send_comm_t **send_comm); /* * @brief Release nccl_ofi_ep. * * Decrease reference counter. Release resources and free * endpoint if reference counter becomes zero. Must be * protected by lock stored in base_dev. * * @param ep * The endpoint (itself) to be released. * @param skip_lock * false, taking domain lock by default. * ture, not taking domain lock when caller takes it. * @param force_cleanup * false, not release when endpoint has ref count. * true, release no matter endpoint has ref count or not. */ int (*release_ep)(nccl_net_ofi_ep_t *ep, bool skip_lock, bool force_cleanup); /* private */ /* pure virtual function called when resources associated with * the ep should be destroyed. Device lock will be held when * this function is called. */ int (*free_ep)(nccl_net_ofi_ep_t *ep); /* Endpoint reference counter for resource management. * sendrecv_get_ep()/sendrecv_release_ep() must be called in * pair when an object is acquired to use and * released. sendrecv_get_ep() allocates a new object when it * is called for the first time. sendrecv_get_ep() creates the * endpoint libfabric resources if the reference counter was * zero. sendrecv_release_ep() releases the resources if the * reference counter is decreased down to zero. */ int ref_cnt; }; enum nccl_net_ofi_comm_type_t { NCCL_NET_OFI_BASE_COMM, NCCL_NET_OFI_LISTEN_COMM, NCCL_NET_OFI_SEND_COMM, NCCL_NET_OFI_RECV_COMM, }; /** * Communicator - base class for communicator structures * * This is the base class for the listen, send, and recv * communicators. It should not be directly extended by transports, * but instead underlying transports should extend the listen, send, * and recv communicators. */ struct nccl_net_ofi_comm { enum nccl_net_ofi_comm_type_t type; nccl_net_ofi_ep_t *ep; int dev_id; }; /** * Listen Communicator - Communicator for a listen/accept pairing */ struct nccl_net_ofi_listen_comm { nccl_net_ofi_comm_t base; int (*accept)(nccl_net_ofi_listen_comm_t *listen_comm, nccl_net_ofi_recv_comm_t **recv_comm); int (*close)(nccl_net_ofi_listen_comm_t *listen_comm); }; struct nccl_net_ofi_send_comm { nccl_net_ofi_comm_t base; /* * @brief Register memory region on send communicator (both Host and CUDA) * * @return Memory handle for data send operations * @return 0 on success * non-zero on error */ int (*regMr)(nccl_net_ofi_send_comm_t *send_comm, nccl_ofi_mr_ckey_ref ckey, int type, void **mhandle); /* * @brief Deregister memory region on send communicator (both Host and CUDA) * * @return Memory handle for data send operations * @return 0 on success * non-zero on error */ int (*deregMr)(nccl_net_ofi_send_comm_t *send_comm, nccl_net_ofi_mr_handle_t *mhandle); int (*send)(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int tag, nccl_net_ofi_mr_handle_t *mhandle, nccl_net_ofi_req_t **req); int (*close)(nccl_net_ofi_send_comm_t *send_comm); int (*write)(nccl_net_ofi_send_comm_t *send_comm, void* src, size_t size, void* src_mhandle, uint64_t dest, uint64_t mr_key, nccl_net_ofi_req_t **req); int (*write_inline)(nccl_net_ofi_send_comm_t *, void* src, size_t size, uint64_t dest, uint64_t mr_key, nccl_net_ofi_req_t **request); }; struct nccl_net_ofi_recv_comm { nccl_net_ofi_comm_t base; /* * @brief Register memory region on recv communicator (both Host and CUDA) * * @return Memory handle for data recv operations * @return 0 on success * non-zero on error */ int (*regMr)(nccl_net_ofi_recv_comm_t *recv_comm, nccl_ofi_mr_ckey_ref ckey, int type, void **mhandle); /* * @brief Deregister memory region on recv communicator (both Host and CUDA) * * @return Memory handle for data recv operations * @return 0 on success * non-zero on error */ int (*deregMr)(nccl_net_ofi_recv_comm_t *recv_comm, nccl_net_ofi_mr_handle_t *mhandle); int (*recv)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, size_t *sizes, int *tags, nccl_net_ofi_mr_handle_t **mhandles, nccl_net_ofi_req_t **req); int (*flush)(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **data, int *sizes, nccl_net_ofi_mr_handle_t **mhandles, nccl_net_ofi_req_t **req); int (*close)(nccl_net_ofi_recv_comm_t *recv_comm); int (*read)(nccl_net_ofi_recv_comm_t *recv_comm, void* dest, size_t size, void* dest_mhandle, uint64_t src, uint64_t mr_key, nccl_net_ofi_req_t **req); }; /** * Top-level plugin data * * Data associated with an instance of the plugin (which may involve * multiple proxy threads and multiple devices). There will be a * single instance of this structure, exposed as a global variable * named nccl_net_ofi_plugin, which is valid after NCCL calls init() * on the plugin. */ struct nccl_net_ofi_plugin { /* public */ /** * Complete initialization of plugin * * When a plugin is first created, it should not create any * network resources -- create is called to understand the * configuration of the network and see which transports can * run. The base code will pick one and call complete_init, * at which point devices and network resources can be * created. */ int (*complete_init)(nccl_net_ofi_plugin_t *plugin); int (*assign_device)(nccl_net_ofi_plugin_t *plugin, size_t device_index, nccl_net_ofi_device_t *device); nccl_net_ofi_device_t *(*get_device)(nccl_net_ofi_plugin_t *plugin, size_t device_index); size_t (*get_num_devices)(nccl_net_ofi_plugin_t *plugin); int (*release_plugin)(nccl_net_ofi_plugin_t *plugin); /* * Determine whether to allocate the domain per process or per * thread. * false: allocate domain per process * true: allocate domain per thread */ bool domain_per_thread; /* private */ /* Array of devices */ nccl_net_ofi_device_t **p_devs; /* Number of devices in devs array */ size_t p_num_devs; }; /* * Create a plugin object * * Create a plugin object and initialize all the resources, * including devices, required for operation. This function will pick * the correct transport and call its create function to actually * create the plugin (which is a little hacky, but it works). */ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p); /* base implementation of endpoint release. endpoint_init() will set * the release pointer to this function, although transports can * override that function pointer and later call this function * directly. */ int nccl_net_ofi_endpoint_release(nccl_net_ofi_ep_t *ep, bool skip_lock, bool force_cleanup); /* initialize resources associated with the endpoint base class. * Expectation is that this will be called by a transport's endpoint * creation function */ int nccl_net_ofi_endpoint_init(nccl_net_ofi_domain_t *domain, nccl_net_ofi_ep_t *ep); /* free resources associated with the endpoint base class. * Expectation is that this will be called by a transport's endpoint * free function. */ int nccl_net_ofi_endpoint_fini(nccl_net_ofi_ep_t *ep); /* initialize resources associated with the domain base class. * Expectation is that this will be called by a transport's domain * creation routine */ int nccl_net_ofi_domain_init(nccl_net_ofi_device_t *device, nccl_net_ofi_domain_t *domain); /* free resources associated with the domain base class. Expectation * is that this will be called by a transport's domain free * function. */ int nccl_net_ofi_domain_fini(nccl_net_ofi_domain_t *domain); /** * Constructor for a device object */ int nccl_net_ofi_device_init(nccl_net_ofi_device_t *device, nccl_net_ofi_plugin_t *plugin, int device_index, struct fi_info *ofi_info); /** * Destructor for a device object */ int nccl_net_ofi_device_fini(nccl_net_ofi_device_t *device); /* release all domains and their enpoints of a device. This is called * only by device->release() during plugin release to free all fabric * domain and QPs. */ int nccl_net_ofi_device_release_all_domain_and_ep(nccl_net_ofi_device_t *device); /* * Constructor for the nccl_net_ofi_plugin class * * Construct a nccl_net_ofi_plugin object. This is expected to be * called from the transport-specific plugin creation function, which * is called from nccl_net_ofi_create_plugin(). */ int nccl_net_ofi_plugin_init(nccl_net_ofi_plugin_t *plugin, size_t num_devices); /* * Destructor for the nccl_net_ofi_plugin class * * Destruct a nccl_net_ofi_plugin object. This is expected to be * called from the transport-specific plugin destructor. */ int nccl_net_ofi_plugin_fini(nccl_net_ofi_plugin_t *plugin); /* * @brief Set properties obtained from libfabric NIC Info. * * @return Populated props structure */ int nccl_net_ofi_info_properties(nccl_net_ofi_plugin_t *plugin, struct fi_info *nic_prov, int dev_id, int num_devices, nccl_ofi_properties_t *props); /* * @brief Allocate memory region for memory registration * * This function allocates memory that covers full page aligned. * * Internally allocated memory that is registered is required to cover * full memory pages. For more information, see functions * `register_internal_mr_buffers()` and `reg_internal_mr_ep()`. * * To free deallocate the memory region, function * nccl_net_ofi_dealloc_mr_buffer() must be used. * * @param size * Size of the memory region. Must be a multiple of system memory page size. * @return Pointer to memory region. Memory region is aligned to system memory page size. * @return 0, on success * error, on others */ int nccl_net_ofi_alloc_mr_buffer(size_t size, void **ptr); /* * @brief Deallocate memory region allocated by function nccl_net_ofi_alloc_mr_buffer() * * @return Pointer to memory region * @param size * Size of the memory region * @return 0, on success * error, on others */ int nccl_net_ofi_dealloc_mr_buffer(void *ptr, size_t size); /* * @brief Parse selected provider for required behavior flags * @return 0 (Success) * * Set required behavior flags (and print debugging information) for * virt_addr_mr, endpoint_mr and data_progress_auto. */ int nccl_net_ofi_query_provider_capabilities(const struct fi_info *selected_provider, unsigned int num_providers); /* * @brief Retrieve maximum size of inject RMA operations of ofi endpoint * * @return 0, on success * -FI_ENOPROTOOPT, in case option to retrieve size is not available * error, on others */ int get_inject_rma_size_opt(struct fid_ep *ofi_ep, size_t *max_write_inline_size); /* * @brief gettid() wrapper * return thread id of the current thread (always succeeds) */ long nccl_net_ofi_gettid(void); /* * @brief Configures NCCL_PROTO environment variable to "simple". * * @details If NCCL_PROTO is not set, configures it to "simple" protocol. * If NCCL_PROTO is already set, skip the configuration. * * @input log reason string * * @return 0 on success or when warning is issued * -errno in case of any failure */ int nccl_net_ofi_configure_nccl_proto_simple(const char *log_reason); #endif // End NCCL_OFI_H_