include/nccl_ofi_platform.h (9 lines of code) (raw):
/*
* Copyright (c) 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved.
*/
#ifndef NCCL_OFI_PLATFORM_H_
#define NCCL_OFI_PLATFORM_H_
#include <stdbool.h>
#include <stddef.h>

#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>
/* Declare platform-specific hooks that can be provided by platform-specific
* source files (such as the optionally compiled platform_aws.c). The functions
* here are declared as weak symbols so that linkage will not break if no
* platform specific hook is provided; in that case the hook will be NULL at
* runtime.
*/
/* Platform-specific initialization hook.
 *
 * Declared as a weak symbol: when no platform-specific source file
 * defines it, the symbol is NULL at runtime, so callers must compare the
 * function against NULL before calling it.
 *
 * @param provider_filter: pointer to a string pointer; presumably carries
 *        a Libfabric provider-name filter between the plugin and the
 *        platform code -- TODO confirm direction (in/out) and ownership
 *        of the string against the implementation
 * @return int status code; presumably 0 on success and non-zero on
 *         failure -- TODO confirm
 */
int platform_init(const char **provider_filter) __attribute__((weak));
/* Platform-specific endpoint configuration hook.
 *
 * Declared as a weak symbol: when no platform-specific source file
 * defines it, the symbol is NULL at runtime, so callers must compare the
 * function against NULL before calling it.
 *
 * @param info: Libfabric info object; presumably the one describing the
 *        provider/NIC that `ep` belongs to -- TODO confirm
 * @param ep: Libfabric endpoint to apply platform-specific settings to
 * @return int status code; presumably 0 on success and non-zero on
 *         failure -- TODO confirm
 */
int platform_config_endpoint(struct fi_info *info, struct fid_ep *ep) __attribute__((weak));
/* Platform-specific hook to sort in the multi-rail protocol of the
 * plugin
 *
 * Rail-oriented networks or traffic flows are a common performance
 * optimization for ML networks. Generally, Libfabric providers sort
 * their provider list by BDFs, which are indicative of physical
 * ordering and good enough. However, on some platforms (especially
 * virtualized platforms), this might not actually be sufficient and
 * another sorting mechanism may be required to properly group NICs.
 *
 * This interface is called in the topology initialization code to
 * order NICs that are behind the same PCIe root complex / switch.
 * The info_list will have num_rails providers listed, and will later
 * be split into num_groups groups (based on the number of
 * accelerators that are also behind the PCIe switch).
 *
 * Providers of this interface should sort the provided info_list such
 * that the Nth provider on this node will be assumed to talk to the
 * Nth provider on remote nodes (ie, identify the "rail id" and sort
 * by that). The list is reordered in place; nothing is returned.
 *
 * Declared as a weak symbol: when no platform-specific source file
 * defines it, the symbol is NULL at runtime, so callers must compare the
 * function against NULL before calling it.
 *
 * @param info_list: pointer to list of `num_rails` info objects
 * @param num_rails: number of rails
 * @param num_groups: number of groups the sorted list will later be
 *        split into
 */
void platform_sort_rails(struct fi_info **info_list, size_t num_rails, size_t num_groups) __attribute__((weak));
/*
 * Does the platform have an opinion on the domain_per_thread
 * configuration?
 *
 * Declared as a weak symbol: when no platform-specific source file
 * defines it, the symbol is NULL at runtime, so callers must compare the
 * function against NULL before calling it.
 *
 * The return type is `bool`, which in C requires <stdbool.h> to be
 * included before this declaration.
 *
 * @return presumably the platform's preferred default for
 *         domain_per_thread (true = enabled) -- TODO confirm against the
 *         implementation
 */
bool platform_default_domain_per_thread(void) __attribute__((weak));
#endif // End NCCL_OFI_PLATFORM_H_