include/nccl_ofi_rdma.h

/*
 * Copyright (c) 2023-2024 Amazon.com, Inc. or its affiliates. All rights reserved.
 */

#ifndef NCCL_OFI_RDMA_H_
#define NCCL_OFI_RDMA_H_

#include "config.h"

#include <rdma/fabric.h>

#include <deque>

#include "nccl_ofi.h"
#include "nccl_ofi_ep_addr_list.h"
#include "nccl_ofi_freelist.h"
#include "nccl_ofi_idpool.h"
#include "nccl_ofi_log.h"
#include "nccl_ofi_msgbuff.h"
#include "nccl_ofi_scheduler.h"
#include "nccl_ofi_topo.h"

#if HAVE_NVTX_TRACING
#include <nvtx3/nvToolsExt.h>
#endif

/* Maximum number of rails supported. This defines the size of
 * messages exchanged during connection establishment (linear
 * scaling). The default is set to 4 to support 4 different rails per
 * NCCL comm structure. */
#define MAX_NUM_RAILS (4)
static_assert(MAX_NUM_RAILS <= UINT16_MAX);

#define NCCL_OFI_RDMA_CTRL_TYPE_BITS (4)

/*
 * @brief	Number of bits used for the communicator ID
 */
#define NCCL_OFI_RDMA_COMM_ID_BITS (18)

/*
 * @brief	Number of bits used for message sequence number
 *
 * The immediate data associated with an RDMA write operation is 32
 * bits and is divided into three parts, the segment count, the
 * communicator ID, and the message sequence number (msg_seq_num).
 * The data is encoded as follows:
 *
 * | 4-bit segment count | 18-bit comm ID | 10-bit msg_seq_num |
 *
 * - Segment count: number of RDMA writes that will be delivered as part of this message
 * - Comm ID: the ID for this communicator
 * - Message sequence number: message identifier
 */
#define NCCL_OFI_RDMA_SEQ_BITS (10)

typedef enum nccl_net_ofi_rdma_req_state {
	NCCL_OFI_RDMA_REQ_CREATED = 0,
	NCCL_OFI_RDMA_REQ_PENDING,
	NCCL_OFI_RDMA_REQ_COMPLETED,
	NCCL_OFI_RDMA_REQ_ERROR,
	NCCL_OFI_RDMA_REQ_INVALID_STATE,
} nccl_net_ofi_rdma_req_state_t;

typedef enum nccl_net_ofi_rdma_req_type {
	/* Write request */
	NCCL_OFI_RDMA_WRITE,
	/* Read request */
	NCCL_OFI_RDMA_READ,
	/* Send request */
	NCCL_OFI_RDMA_SEND,
	/* Receive request */
	NCCL_OFI_RDMA_RECV,
	/* Send control request. Subrequest of NCCL_OFI_RDMA_RECV */
	NCCL_OFI_RDMA_SEND_CTRL,
	/* Send close request. */
	NCCL_OFI_RDMA_SEND_CLOSE,
	/* Receive segments request. Subrequest of NCCL_OFI_RDMA_RECV */
	NCCL_OFI_RDMA_RECV_SEGMS,
	/* Eager local copy request. Subrequest of NCCL_OFI_RDMA_RECV */
	NCCL_OFI_RDMA_EAGER_COPY,
	/* Ctrl rx buff post request */
	NCCL_OFI_RDMA_CTRL_RX_BUFF,
	/* Eager rx buff post request */
	NCCL_OFI_RDMA_EAGER_RX_BUFF,
	/* Flush request */
	NCCL_OFI_RDMA_FLUSH,
	/* Connect message send request */
	NCCL_OFI_RDMA_SEND_CONN,
	/* Connect message receive request */
	NCCL_OFI_RDMA_RECV_CONN,
	/* Connect response message receive request */
	NCCL_OFI_RDMA_RECV_CONN_RESP,
	/* Connect response message send request */
	NCCL_OFI_RDMA_SEND_CONN_RESP,
	/* Invalid type */
	NCCL_OFI_RDMA_INVALID_TYPE,
} nccl_net_ofi_rdma_req_type_t;

enum nccl_ofi_rdma_msg_type {
	NCCL_OFI_RDMA_MSG_CONN = 0,
	NCCL_OFI_RDMA_MSG_CONN_RESP,
	NCCL_OFI_RDMA_MSG_CTRL,
	NCCL_OFI_RDMA_MSG_EAGER,
	NCCL_OFI_RDMA_MSG_CLOSE,
	NCCL_OFI_RDMA_MSG_CTRL_NO_COMPLETION,
	NCCL_OFI_RDMA_MSG_INVALID = 15,
	NCCL_OFI_RDMA_MSG_MAX = NCCL_OFI_RDMA_MSG_INVALID,
};

static_assert(NCCL_OFI_RDMA_MSG_MAX <= (0x10),
	      "Out of space in nccl_ofi_rdma_msg_type; must fit in a nibble");

/* This goes on the wire, so we want the datatype
 * size to be fixed. */
typedef uint16_t nccl_ofi_rdma_msg_type_t;
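/*
 * Illustrative sketch (not part of the plugin API): one way the 32-bit
 * immediate data described above could be packed and unpacked, assuming the
 * | 4-bit segment count | 18-bit comm ID | 10-bit msg_seq_num | layout.
 * The helper names are hypothetical and exist only to make the bit layout
 * concrete; the plugin's actual encoding helpers live in the source files.
 */
static inline uint32_t rdma_example_pack_wdata(uint32_t num_segments,
					       uint32_t comm_id,
					       uint32_t msg_seq_num)
{
	return (num_segments << (NCCL_OFI_RDMA_COMM_ID_BITS + NCCL_OFI_RDMA_SEQ_BITS)) |
	       (comm_id << NCCL_OFI_RDMA_SEQ_BITS) |
	       (msg_seq_num & ((1u << NCCL_OFI_RDMA_SEQ_BITS) - 1));
}

static inline void rdma_example_unpack_wdata(uint32_t wdata,
					     uint32_t *num_segments,
					     uint32_t *comm_id,
					     uint32_t *msg_seq_num)
{
	*msg_seq_num  = wdata & ((1u << NCCL_OFI_RDMA_SEQ_BITS) - 1);
	*comm_id      = (wdata >> NCCL_OFI_RDMA_SEQ_BITS) &
			((1u << NCCL_OFI_RDMA_COMM_ID_BITS) - 1);
	*num_segments = wdata >> (NCCL_OFI_RDMA_COMM_ID_BITS + NCCL_OFI_RDMA_SEQ_BITS);
}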
/*
 * @brief	Rdma memory registration handle
 *
 * Use function `calloc_rdma_mr_handle(int num_rails, int num_control_rails)' to
 * allocate an RDMA memory registration handle with `num_rails'+`num_control_rails' rails.
 */
typedef struct nccl_net_ofi_rdma_mr_handle {
	uint16_t num_rails;

	/* value of mr key id, if keys must be requested */
	uint64_t mr_key;

	/* Array of size `num_rails' */
	struct fid_mr **mr;
} nccl_net_ofi_rdma_mr_handle_t;

/* Contents of ctrl message sent from receiver to sender to advertise
 * destination buffer */
typedef struct nccl_net_ofi_rdma_ctrl_msg {
	/* Message type, must be NCCL_OFI_RDMA_MSG_CTRL */
	uint32_t type:NCCL_OFI_RDMA_CTRL_TYPE_BITS;

	/* Message sequence number */
	uint32_t msg_seq_num:NCCL_OFI_RDMA_SEQ_BITS;

	/* A comm identifier that uniquely identifies the comm
	 * on the receiver side */
	uint32_t remote_comm_id:NCCL_OFI_RDMA_COMM_ID_BITS;

	uint32_t buff_len;

	uint64_t buff_addr;

	union {
		uint32_t short_buff_mr_key[MAX_NUM_RAILS];
		uint64_t long_buff_mr_key[MAX_NUM_RAILS];
	};
} nccl_net_ofi_rdma_ctrl_msg_t;

/* Since this is a message on the wire, check that it has the expected size */
static_assert(sizeof(nccl_net_ofi_rdma_ctrl_msg_t) == 48,
	      "Wrong size for RDMA Control message");
static_assert(offsetof(nccl_net_ofi_rdma_ctrl_msg_t, short_buff_mr_key) +
	      sizeof(((nccl_net_ofi_rdma_ctrl_msg_t *)0)->short_buff_mr_key) <= 32,
	      "Short RDMA Control message larger than 32 bytes (EFA inline size)");

#define NCCL_NET_OFI_CTRL_MSG_SHORT_KEY_SIZE (sizeof( ((nccl_net_ofi_rdma_ctrl_msg_t *)0)->short_buff_mr_key[0] ))
#define NCCL_NET_OFI_CTRL_MSG_LONG_KEY_SIZE (sizeof( ((nccl_net_ofi_rdma_ctrl_msg_t *)0)->long_buff_mr_key[0] ))

static inline size_t nccl_net_ofi_rdma_ctrl_msg_size(uint16_t num_rails, bool use_long_rkeys)
{
	size_t rkey_len = (use_long_rkeys) ? NCCL_NET_OFI_CTRL_MSG_LONG_KEY_SIZE
					   : NCCL_NET_OFI_CTRL_MSG_SHORT_KEY_SIZE;
	return offsetof(nccl_net_ofi_rdma_ctrl_msg_t, short_buff_mr_key) + num_rails * rkey_len;
}
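/*
 * Worked example (illustrative, derived from the layout above): the bitfield
 * header packs into 4 bytes, followed by 4 bytes of buff_len and 8 bytes of
 * buff_addr, so offsetof(short_buff_mr_key) is 16. With MAX_NUM_RAILS == 4
 * this yields, consistent with the static_asserts above:
 *
 *   nccl_net_ofi_rdma_ctrl_msg_size(4, false) == 16 + 4 * 4 == 32 bytes
 *   nccl_net_ofi_rdma_ctrl_msg_size(4, true)  == 16 + 4 * 8 == 48 bytes
 *
 * i.e. the short-key form fits within the 32-byte EFA inline size while the
 * long-key form uses the full 48-byte struct.
 */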
/* Message from receiver to sender indicating sender can close resources */
typedef struct nccl_net_ofi_rdma_close_msg {
	/* Message type, must be NCCL_OFI_RDMA_MSG_CLOSE */
	uint16_t type:NCCL_OFI_RDMA_CTRL_TYPE_BITS;

	/* Count of number of ctrl messages sent by the r_comm */
	uint64_t ctrl_counter;

	/* Comm ID provided by the sender */
	uint32_t send_comm_id;
} nccl_net_ofi_rdma_close_msg_t;

/* For LL/LL128 protocols, eager rx buffers (source of RDMA read operations)
 * need to be 128B aligned */
#define EAGER_RX_BUFFER_ALIGNMENT 128

struct nccl_net_ofi_rdma_req;
struct nccl_net_ofi_rdma_ep;
struct nccl_net_ofi_ep_rail;

typedef struct nccl_net_ofi_rdma_req nccl_net_ofi_rdma_req_t;
typedef struct nccl_net_ofi_rdma_ep nccl_net_ofi_rdma_ep_t;
typedef struct nccl_net_ofi_ep_rail nccl_net_ofi_ep_rail_t;

typedef struct {
	/* Rx buffer freelist item */
	nccl_ofi_freelist_elem_t *rx_buff_fl_elem;
	/* Length of rx buffer */
	size_t buff_len;
	/* Length of received data */
	size_t recv_len;
	/*
	 * Keeps track of the rail ID which is used to post the rx buffer.
	 * This is useful for re-posting the buffer on the same rail
	 * when it gets completed.
	 */
	nccl_net_ofi_ep_rail_t *rail;
	/*
	 * Back-pointer to associated endpoint
	 */
	nccl_net_ofi_rdma_ep_t *ep;
} rdma_req_rx_buff_data_t;

typedef struct {
	/* Remote destination buffer address */
	uint64_t remote_buff;
	/* Remote MR key */
	uint64_t remote_mr_key;
	/* Application-provided local src/dst buffer */
	void *buff;
	/* Length of application-provided buffer */
	size_t buff_len;
	/* First rail descriptor from memory registration of `buff' */
	void *desc;
	/* Additional flags */
	uint64_t flags;
	/* Total number of completions. Expect one completion for receiving the
	 * control message and one completion for each send segment. */
	int total_num_compls;
	/* Number of rails where we have successfully posted the network xfer.
	 * Used mostly when the network xfer is sliced across multiple rails */
	uint16_t xferred_rail_id;
} rdma_req_rma_op_data_t;

typedef struct {
	/* True for eager messages */
	bool eager;
	/* Remote destination buffer address */
	uint64_t remote_buff;
	/* Remote buffer length */
	uint64_t remote_len;
	/* Remote MR key */
	uint64_t remote_mr_key[MAX_NUM_RAILS];
	/* Write immediate data */
	uint64_t wdata;
	/* Application-provided local src/dst buffer */
	void *buff;
	/* Length of application-provided buffer */
	size_t buff_len;
	/* Memory region descriptors associated with `buff' */
	nccl_net_ofi_rdma_mr_handle_t *buff_mr_handle;
	/* Schedule used to transfer this request. We save the pointer to
	 * reference it when transferring the request over network. */
	nccl_net_ofi_schedule_t *schedule;
	/* Total number of completions. Expect one completion for receiving the
	 * control message and one completion for each send segment. */
	int total_num_compls;
	/* Number of rails where we have successfully posted the network xfer.
	 * Used mostly when the network xfer is sliced across multiple rails */
	uint16_t xferred_rail_id;
	/*
	 * Flag to indicate target side early completion, so that sender side
	 * uses the corresponding RMA write operation.
	 * True to use fi_write instead of fi_writedata in send()
	 */
	bool no_target_completion;

#if HAVE_NVTX_TRACING
	nvtxRangeId_t trace_id;
	nvtxRangeId_t seg_trace_id[MAX_NUM_RAILS];
#endif
} rdma_req_send_data_t;
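/*
 * Illustrative sketch (hypothetical helper, not part of the plugin API): the
 * completion-counting rule stated in the struct comments above -- a non-eager
 * send expects one completion for the received control message plus one
 * completion per RDMA write segment in its schedule. Eager sends and sends
 * with no_target_completion may be counted differently in the actual code.
 */
static inline int rdma_example_send_total_num_compls(int num_write_segments)
{
	/* 1 control-message completion + 1 completion per posted segment */
	return 1 + num_write_segments;
}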
/*
 * @brief	Data of request responsible for sending the control message
 */
typedef struct {
	/* Pointer to the allocated control buffer from freelist */
	nccl_ofi_freelist_elem_t *ctrl_fl_elem;
	/* Schedule used to transfer the control buffer. We save the
	 * pointer to reference it when transferring the buffer over
	 * network. */
	nccl_net_ofi_schedule_t *ctrl_schedule;
	/* Pointer to recv parent request */
	nccl_net_ofi_rdma_req_t *recv_req;
#if HAVE_NVTX_TRACING
	nvtxRangeId_t trace_id;
#endif
} rdma_req_send_ctrl_data_t;

/*
 * @brief	Data of request responsible for sending the close message
 */
typedef struct {
	/* Pointer to the allocated control buffer from freelist */
	nccl_ofi_freelist_elem_t *ctrl_fl_elem;
	/* Schedule used to transfer the close buffer. We save the
	 * pointer to reference it when transferring the buffer over
	 * network. */
	nccl_net_ofi_schedule_t *ctrl_schedule;
} rdma_req_send_close_data_t;

typedef struct {
	/* Pointer to rx buffer containing eager data */
	nccl_net_ofi_rdma_req_t *eager_rx_buff_req;
	/* Pointer to recv parent request */
	nccl_net_ofi_rdma_req_t *recv_req;
} rdma_req_eager_copy_data_t;

/*
 * @brief	Data of request responsible for receiving segments
 */
typedef struct {
	/* Pointer to recv parent request */
	nccl_net_ofi_rdma_req_t *recv_req;
} rdma_req_recv_segms_data_t;

/*
 * @brief	Data of request responsible for receive operation
 */
typedef struct {
	/* Destination buffer */
	void *dst_buff;
	/* Destination length */
	size_t dst_len;
	/* Mr handle for destination buffer */
	nccl_net_ofi_rdma_mr_handle_t *dest_mr_handle;
	/* Pointer to send control message child request */
	nccl_net_ofi_rdma_req_t *send_ctrl_req;
	/* Pointer to receive segments child request */
	nccl_net_ofi_rdma_req_t *recv_segms_req;
	/* (Eager messages) pointer to eager local copy request */
	nccl_net_ofi_rdma_req_t *eager_copy_req;
	/* Total number of completions. Expect one send ctrl
	 * completion and one completion that indicates that all
	 * segments have arrived.
	 *
	 * For eager messages, the second completion will be received
	 * when the local read into the destination buffer is complete */
	int total_num_compls;
#if HAVE_NVTX_TRACING
	nvtxRangeId_t trace_id;
#endif
} rdma_req_recv_data_t;

/*
 * @brief	Data of request responsible for flush operation
 */
typedef struct {
	/* Buffer to read flush data from */
	void *data;
	/* MR handles for the data buffer */
	nccl_net_ofi_rdma_mr_handle_t *mr_handle;
	/* Total number of completions. Expect completions from all NIC rails */
	int total_num_compls;
} rdma_req_flush_data_t;

/*
 * @brief	RDMA request
 */
typedef struct nccl_net_ofi_rdma_req {
	nccl_net_ofi_req_t base;

	nccl_net_ofi_context_t ctx[MAX_NUM_RAILS];

	/* Associated Comm object */
	nccl_net_ofi_comm_t *comm;

	/* Associated Device ID */
	int dev_id;

	/* Message sequence number */
	uint16_t msg_seq_num;

	/* Number of arrived request completions */
	int ncompls;

	union {
		rdma_req_rma_op_data_t rma_op_data;
		rdma_req_send_data_t send_data;
		rdma_req_recv_data_t recv_data;
		rdma_req_send_ctrl_data_t send_ctrl_data;
		rdma_req_send_close_data_t send_close_data;
		rdma_req_eager_copy_data_t eager_copy_data;
		rdma_req_recv_segms_data_t recv_segms_data;
		rdma_req_flush_data_t flush_data;
		rdma_req_rx_buff_data_t rx_buff_data;
	};

	/* Size of completed request */
	size_t size;

	/*
	 * Protect updating critical fields such as size and ncompls when
	 * network xfer happened over multiple rails
	 */
	pthread_mutex_t req_lock;

	/* State of request */
	nccl_net_ofi_rdma_req_state_t state;

	/* Type of request */
	nccl_net_ofi_rdma_req_type_t type;

	/* Backpointer to freelist element */
	nccl_ofi_freelist_elem_t *elem;

	/* Deinitialize and free request. This function returns an error
	 * in cases where cleanup fails. This function may also return an
	 * error if the owner of the request has to deallocate the
	 * request on its own. */
	int (*free)(nccl_net_ofi_rdma_req_t *req, bool dec_inflight_reqs);
} nccl_net_ofi_rdma_req_t;

/*
 * Rdma endpoint name
 *
 * Length of the name is limited to `MAX_EP_ADDR`.
 */
typedef struct nccl_ofi_rdma_ep_name {
	char ep_name[MAX_EP_ADDR];
	size_t ep_name_len;
} nccl_ofi_rdma_ep_name_t;

/*
 * @brief	Message storing rail endpoint addresses for connection establishment
 *
 * Connect message is sent from the sender to the receiver side to provide
 * connection information.
 */
typedef struct nccl_ofi_rdma_connection_info {
	/* Message type
	 * either NCCL_OFI_RDMA_MSG_CONN or NCCL_OFI_RDMA_MSG_CONN_RESP
	 */
	uint16_t type:NCCL_OFI_RDMA_CTRL_TYPE_BITS;
	uint16_t pad:(16 - NCCL_OFI_RDMA_CTRL_TYPE_BITS);

	/* Number of rails */
	uint16_t num_rails;
	uint16_t num_control_rails;

	/* A comm identifier that uniquely identifies the comm on the sender
	 * side. The receiver must use this ID when sending messages to sender */
	uint32_t local_comm_id;

	/* A comm identifier that uniquely identifies the comm
	 * on the receiver side */
	uint32_t remote_comm_id;

	/* Arrays of `MAX_NUM_RAILS` `nccl_ofi_rdma_ep_name_t`
	 * structs. The members `num_rails` and `num_control_rails` indicate
	 * the number of entries that are in use. */
	nccl_ofi_rdma_ep_name_t control_ep_names[MAX_NUM_RAILS];
	nccl_ofi_rdma_ep_name_t ep_names[MAX_NUM_RAILS];
} nccl_ofi_rdma_connection_info_t;
/* Since this is a message on the wire, check that it has the expected size */
static_assert(sizeof(nccl_ofi_rdma_connection_info_t) == 528,
	      "Wrong size for RDMA connect message");
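/*
 * Worked size check (illustrative): the 528 bytes asserted above follow from
 * the layout, assuming MAX_EP_ADDR is 56 bytes and size_t is 8 bytes:
 *
 *   fixed header (type/pad, num_rails, num_control_rails,
 *                 local_comm_id, remote_comm_id, alignment padding) =  16 bytes
 *   sizeof(nccl_ofi_rdma_ep_name_t)                     = 56 + 8    =  64 bytes
 *   2 * MAX_NUM_RAILS * 64                                          = 512 bytes
 *                                                             total = 528 bytes
 */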
/*
 * @brief	Send communicator rail
 *
 * Communicator rail encapsulates data of a communicator for a
 * specific rail.
 */
typedef struct nccl_net_ofi_rdma_send_comm_rail {
	/* Fabric address of remote endpoint */
	fi_addr_t remote_addr;

	/* Pointer to libfabric endpoint of corresponding rdma
	 * endpoint rail */
	struct fid_ep *local_ep;
} nccl_net_ofi_rdma_send_comm_rail_t;

/*
 * @brief	RDMA send communicator
 *
 * Use function `calloc_rdma_send_comm(int num_rails, int num_control_rails)' to
 * allocate an RDMA send communicator with `num_rails'+`num_control_rails' rails.
 */
typedef struct nccl_net_ofi_rdma_send_comm {
	/* This base send communicator must be the first member of this
	 * struct. This allows casting between pointers of this struct
	 * and its base struct. */
	nccl_net_ofi_send_comm_t base;

	uint64_t num_inflight_reqs;
	uint64_t num_inflight_writes;

	nccl_ofi_freelist_t *nccl_ofi_reqs_fl;

	/* Comm ID provided by the local endpoint */
	uint32_t local_comm_id;

	/* Comm ID provided by remote endpoint */
	uint32_t remote_comm_id;

	/* Request to receive connect response message to finalize
	 * connection establishment */
	nccl_net_ofi_rdma_req_t *conn_resp_req;

	/* free list item containing a nccl_ofi_rdma_connection_info_t */
	nccl_ofi_freelist_elem_t *conn_msg;

	uint16_t next_msg_seq_num;

	nccl_ofi_msgbuff_t *msgbuff;

	/* Number of rails */
	uint16_t num_rails;

	/* Number of control rails */
	uint16_t num_control_rails;

	/* Number of initialized control rails. The function
	 * `create_send_comm()' creates a send communicator with one
	 * initialized control rail and sets `num_init_control_rails=1' after
	 * the out-of-band message is received. After the connect
	 * response message has been received, the remaining rails
	 * will be initialized via function `init_send_comm_rails()'
	 * and `num_init_control_rails' is adjusted. */
	int num_init_control_rails;

#if HAVE_NVTX_TRACING
	nvtxDomainHandle_t nvtx_domain[NCCL_OFI_N_NVTX_DOMAIN_PER_COMM];
#endif

	pthread_mutex_t ctrl_recv_lock;
	bool received_close_message;
	/* Counters for total sent and received control messages */
	uint64_t n_ctrl_received;
	uint64_t n_ctrl_expected;

	bool comm_active;

	/* Array of `num_rails` communicator rails */
	nccl_net_ofi_rdma_send_comm_rail_t *rails;
	/* Array of `num_control_rails` communicator rails */
	nccl_net_ofi_rdma_send_comm_rail_t *control_rails;
} nccl_net_ofi_rdma_send_comm_t;
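/*
 * Illustrative sketch (hypothetical helper, not the plugin's actual close
 * logic): the shutdown handshake suggested by the fields above. The
 * receiver's close message advertises how many control messages it sent
 * (ctrl_counter), which the sender can record as n_ctrl_expected; teardown is
 * then deferred until every advertised control message has been received.
 * Callers would be expected to hold ctrl_recv_lock; the real close path in
 * the source files may differ.
 */
static inline bool rdma_example_send_comm_ready_to_close(nccl_net_ofi_rdma_send_comm_t *s_comm)
{
	return s_comm->received_close_message &&
	       s_comm->n_ctrl_received == s_comm->n_ctrl_expected;
}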
/*
 * @brief	Receive communicator rail
 *
 * Communicator rail encapsulates data of a communicator for a
 * specific rail.
 */
typedef struct nccl_net_ofi_rdma_recv_comm_rail {
	/* Fabric address of remote endpoint */
	fi_addr_t remote_addr;

	/* Pointer to libfabric endpoint of corresponding rdma
	 * endpoint rail */
	struct fid_ep *local_ep;

	/* Libfabric address of local endpoint used for flushing */
	fi_addr_t local_addr;
} nccl_net_ofi_rdma_recv_comm_rail_t;

/* Metadata about dummy flush buffer */
typedef struct nccl_net_ofi_rdma_flush_buffer {
	void *host_buffer;
	size_t size;
	/* Memory registration handle of the local buffer */
	nccl_net_ofi_rdma_mr_handle_t *mr_handle;
} nccl_net_ofi_rdma_flush_buffer_t;

/*
 * @brief	RDMA receive communicator
 *
 * Use function `calloc_rdma_recv_comm(int num_rails, int num_control_rails)' to
 * allocate an RDMA receive communicator with `num_rails'+`num_control_rails' rails.
 */
typedef struct nccl_net_ofi_rdma_recv_comm {
	/* This base receive communicator must be the first member of
	 * this struct. This allows casting between pointers of this
	 * struct and its base struct. */
	nccl_net_ofi_recv_comm_t base;

	uint64_t num_inflight_reqs;
	nccl_ofi_freelist_t *nccl_ofi_reqs_fl;

	/* Comm ID provided by the local endpoint */
	uint32_t local_comm_id;

	/* Comm ID provided by remote endpoint */
	uint32_t remote_comm_id;

	uint16_t next_msg_seq_num;

	nccl_ofi_msgbuff_t *msgbuff;

	/* Free list to track control buffers, for sending RDMA control messages */
	nccl_ofi_freelist_t *ctrl_buff_fl;

#if HAVE_NVTX_TRACING
	nvtxDomainHandle_t nvtx_domain[NCCL_OFI_N_NVTX_DOMAIN_PER_COMM];
#endif

	nccl_net_ofi_rdma_req_t *send_close_req;

	/* Counters for total sent and received control messages */
	pthread_mutex_t ctrl_counter_lock;
	uint64_t n_ctrl_sent;
	uint64_t n_ctrl_delivered;

	/* Number of rails */
	uint16_t num_rails;
	/* Number of control rails */
	uint16_t num_control_rails;

	bool comm_active;

	/* free list item containing a nccl_ofi_rdma_connection_info_t */
	nccl_ofi_freelist_elem_t *conn_msg;

	/* Array of `num_rails` communicator rails */
	nccl_net_ofi_rdma_recv_comm_rail_t *rails;
	/* Array of `num_control_rails` communicator rails */
	nccl_net_ofi_rdma_recv_comm_rail_t *control_rails;
} nccl_net_ofi_rdma_recv_comm_t;

typedef struct nccl_net_ofi_rdma_listen_comm {
	/* This base listen communicator must be the first member of
	 * this struct. This allows casting between pointers of this
	 * struct and its base struct. */
	nccl_net_ofi_listen_comm_t base;

	/* Comm ID provided by local endpoint */
	uint32_t comm_id;

	/* Communicator created while accept routine is executed */
	nccl_net_ofi_rdma_recv_comm_t *r_comm;

	/* Reusable request for connect and connect response message */
	nccl_net_ofi_rdma_req_t req;

	/* Stage of connection establishment on listen side */
	nccl_ofi_comm_stage_t stage;

	/* Message struct to send the connect message and receive the connect
	 * response message
	 *
	 * TODO: This should really be a list of outstanding connect
	 * messages to allow multiple connects per listen communicator. */
	nccl_ofi_rdma_connection_info_t conn_msg;
} nccl_net_ofi_rdma_listen_comm_t;

/*
 * @brief	Endpoint rail
 *
 * Endpoint rail encapsulates data of an endpoint for a
 * specific rail.
 */
struct nccl_net_ofi_ep_rail {
	uint16_t rail_id;

	/* Local libfabric endpoint handle */
	struct fid_ep *ofi_ep;

	/* Name of local libfabric endpoint */
	char local_ep_name[MAX_EP_ADDR];

	/* Length of local_ep_name */
	size_t local_ep_name_len;

	/* Address vector handle */
	struct fid_av *av;

	/* Completion Queue handle */
	struct fid_cq *cq;

	/*
	 * Rx buffer management
	 */
	/* Number of rx buffers posted */
	size_t num_rx_buff_posted;
	/* Minimum posted rx buffers (see RDMA_MIN_POSTED_BOUNCE_BUFFERS) */
	size_t min_rx_buff_posted;
	/* Maximum posted rx buffers (see RDMA_MAX_POSTED_BOUNCE_BUFFERS) */
	size_t max_rx_buff_posted;
	/* Mutex for rx buffer operations */
	pthread_mutex_t rx_buff_mutex;

	/* Allocate a receive buffer request for this rail (eager or ctrl) */
	nccl_net_ofi_rdma_req_t* (*rx_buff_req_alloc)(nccl_net_ofi_rdma_ep_t *ep,
						      nccl_net_ofi_ep_rail_t *rail);
};
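/*
 * Illustrative sketch (hypothetical helper, not part of the plugin API): how
 * the per-rail rx-buffer accounting above is meant to be used. Each rail
 * keeps num_rx_buff_posted between min_rx_buff_posted and max_rx_buff_posted,
 * and a completed rx buffer is re-posted on the same rail (see
 * rdma_req_rx_buff_data_t). Actual allocation and posting go through
 * rx_buff_req_alloc() and the rail's libfabric endpoint; error handling is
 * omitted here.
 */
static inline bool rdma_example_rail_needs_rx_buffers(nccl_net_ofi_ep_rail_t *rail)
{
	bool below_min;

	pthread_mutex_lock(&rail->rx_buff_mutex);
	below_min = rail->num_rx_buff_posted < rail->min_rx_buff_posted;
	pthread_mutex_unlock(&rail->rx_buff_mutex);

	return below_min;
}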
/*
 * @brief	RDMA Endpoint
 *
 * RDMA endpoint implements the nccl_net_ofi_ep_t interface
 * for the rdma protocol that uses libfabric's fi_tsend and
 * fi_trecv for communication.
 */
struct nccl_net_ofi_rdma_ep {
	/* This base endpoint interface struct provides access to the
	 * rdma endpoint's functions such as rdma_listen() and
	 * rdma_connect(). At construction time of this endpoint,
	 * the constructor assigns these functions to the member
	 * functions of abstract nccl_net_ofi_ep_t endpoint 'base'.
	 *
	 * This base endpoint must be the first member of this
	 * struct. This allows casting between pointers of this struct
	 * and its base struct. */
	nccl_net_ofi_ep_t base;

	/* Number of rails */
	uint16_t num_rails;

	/* Number of control rails */
	uint16_t num_control_rails;

	/* Array of `num_rails` endpoint rails */
	nccl_net_ofi_ep_rail_t *rails;

	/* Array of `num_control_rails` endpoint rails */
	nccl_net_ofi_ep_rail_t *control_rails;

	bool use_long_rkeys;

	/* Pending requests queue */
	std::deque<nccl_net_ofi_rdma_req_t *> *pending_reqs_queue;
	/* Lock for `pending_reqs_queue` */
	pthread_mutex_t pending_reqs_lock;

	/* Free list of ctrl rx buffers */
	nccl_ofi_freelist_t *ctrl_rx_buff_fl;
	/* Free list of eager rx buffers */
	nccl_ofi_freelist_t *eager_rx_buff_fl;
	/* Free list of rx buffer requests */
	nccl_ofi_freelist_t *rx_buff_reqs_fl;
	/* Free list for connection messages */
	nccl_ofi_freelist_t *conn_msg_fl;
	/* Size of ctrl rx buffers */
	size_t ctrl_rx_buff_size;
	/* Size of eager rx buffers. Will be -1 if eager is entirely
	 * disabled. */
	ssize_t eager_rx_buff_size;
	/* Max size of eager messages. This is only separate from
	 * eager_rx_buff_size because the EFA provider incorrectly throws an
	 * EINVAL when posting 0 byte rx buffers. To work around that,
	 * eager_rx_buff_size will either be -1 or positive (but not zero) and
	 * eager_send_size is the comparison that should be used for deciding
	 * whether a message is eligible for eager. eager_send_size will never
	 * be larger than eager_rx_buff_size. Will be -1 if eager is entirely
	 * disabled. */
	ssize_t eager_send_size;

	/* true if the current endpoint is an endpoint_per_communicator
	 * receive communicator */
	bool is_endpoint_per_communicator_ep;

	/* thread id of the thread that called get_ep(). Used as the hash key
	 * for the endpoint hash */
	long creating_thread_id;
};
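/*
 * Illustrative sketch (hypothetical helper, not part of the plugin API): the
 * eligibility check described in the eager_send_size comment above. A message
 * may go out eagerly only when eager is enabled (eager_send_size is not -1)
 * and the message fits within eager_send_size; eager_rx_buff_size is
 * deliberately not used for this comparison.
 */
static inline bool rdma_example_eager_eligible(nccl_net_ofi_rdma_ep_t *ep, size_t msg_size)
{
	return ep->eager_send_size >= 0 &&
	       msg_size <= (size_t)ep->eager_send_size;
}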
/*
 * @brief	Device rail
 *
 * Device rail encapsulates data of an endpoint for a
 * specific rail.
 */
typedef struct nccl_net_ofi_rdma_device_rail {
	/* NIC info */
	struct fi_info *info;

	/* Fabric handle */
	struct fid_fabric *fabric;
} nccl_net_ofi_rdma_device_rail_t;

/*
 * @brief	RDMA Device
 *
 * Device implementation of the RDMA protocol
 *
 * RDMA device implements the nccl_net_ofi_device_t interface for
 * the rdma protocol that uses libfabric's fi_tsend and fi_trecv
 * for communication. Internally, the rdma device maintains
 * rdma endpoints that are per thread to avoid contention over the
 * endpoint's libfabric resources. Access to endpoints is protected via
 * locks and the lifetime of resources is maintained with a reference
 * counter.
 */
typedef struct nccl_net_ofi_rdma_device {
	/* This base device interface struct provides access to the
	 * rdma endpoint's functions such as
	 * rdma_get_properties(), rdma_get_ep(), and
	 * rdma_release_ep(). At construction time of this device,
	 * the constructor assigns these functions to the member
	 * functions of abstract nccl_net_ofi_device_t device
	 * 'device'.
	 *
	 * This base device must be the first member of this
	 * struct. This allows casting between pointers of this struct
	 * and its base struct. */
	nccl_net_ofi_device_t base;

	/* Number of rails */
	uint16_t num_rails;

	/* Array of 'num_rails' device rails */
	nccl_net_ofi_rdma_device_rail_t *device_rails;

	/* Maximum number of supported communicator IDs */
	uint32_t num_comm_ids;

	/* ID pool */
	nccl_ofi_idpool_t *comm_idpool;

	/* Array of open comms associated with this endpoint. This is needed
	 * for fast lookup of comms in the RDMA protocol. */
	nccl_net_ofi_comm_t **comms;

	bool use_long_rkeys;

#if HAVE_NVTX_TRACING
	nvtxDomainHandle_t nvtx_domain[MAX_NUM_RAILS];
#endif
} nccl_net_ofi_rdma_device_t;

typedef struct nccl_net_ofi_rdma_domain_rail {
	uint16_t rail_id;

	/* Access domain handles */
	struct fid_domain *domain;

	struct fid_cq *cq;
} nccl_net_ofi_rdma_domain_rail_t;

typedef struct nccl_net_ofi_rdma_domain {
	nccl_net_ofi_domain_t base;

	uint16_t num_rails;
	nccl_net_ofi_rdma_domain_rail_t *domain_rails;

	/* The flush buffer */
	nccl_net_ofi_rdma_flush_buffer_t flush_buff;

	/* List of endpoints and set of addresses they have connections to */
	nccl_ofi_ep_addr_list_t *ep_addr_list;

	/* Message scheduler */
	nccl_net_ofi_scheduler_t *scheduler;
} nccl_net_ofi_rdma_domain_t;

struct nccl_net_ofi_rdma_plugin {
	nccl_net_ofi_plugin_t base;

	nccl_ofi_topo_t *topo;
};
typedef struct nccl_net_ofi_rdma_plugin nccl_net_ofi_rdma_plugin_t;

/*
 * @brief	Initialize plugin with rdma protocol structures
 */
int nccl_net_ofi_rdma_init(const char *provider_filter,
			   nccl_net_ofi_plugin_t **plugin_p,
			   bool *found_multi_rail);

#endif // End NCCL_OFI_RDMA_H_