astra-sim-alibabacloud/astra-sim/system/MockNccl.h (154 lines of code) (raw):

/* *Copyright (c) 2024, Alibaba Group; *Licensed under the Apache License, Version 2.0 (the "License"); *you may not use this file except in compliance with the License. *You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 *Unless required by applicable law or agreed to in writing, software *distributed under the License is distributed on an "AS IS" BASIS, *WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *See the License for the specific language governing permissions and *limitations under the License. */ #ifndef ASTRA_SIM_MOCKNCCL_MOCKNCCL_H #define ASTRA_SIM_MOCKNCCL_MOCKNCCL_H #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* #define NCCL_ALGO_UNDEF -1 #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 #define NCCL_ALGO_COLLNET_DIRECT 2 #define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_NVLS 4 #define NCCL_ALGO_NVLS_TREE 5 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_PROTO_UNDEF -1 #define NCCL_PROTO_LL 0 #define NCCL_PROTO_LL128 1 #define NCCL_PROTO_SIMPLE 2 #define NCCL_WORK_SIZE 512 /* Array indexes used below */ #define VOLTA_COMPCAP_IDX 0 #define AMPERE_COMPCAP_IDX 1 #define HOPPER_COMPCAP_IDX 2 #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_CPU_ARCH_X86 1 #define NCCL_TOPO_CPU_ARCH_POWER 2 #define NCCL_TOPO_CPU_VENDOR_INTEL 1 #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // 目前正在使用的 Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU) #define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU) #define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU #define NCCL_TOPO_PATTERN_RING 4 // Ring #define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree // Latencies in us, Bandwidths in GB/s #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; // LL128 max BW per channel static const double llMaxBws[3][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}, /* Hopper-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} }; // Latencies in us, Bandwidths in GB/s // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 6.8, 14.0, 0 }, { 6.6, 14.0, 8.4 }, // Tree, Ring { 6.8, 14.0, 0 }, { 6.8, 14.0, 0 }, // Collnet Direct, Chain { 0, 0, 23.0 }, { 0, 0, 23.0 }}; // NVLS, NVLS Tree static const double perChMaxRingLL128Bws[3][3] = { /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7}, }; static const double perChMaxTreeLL128Bws[3][3] = { /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0}, }; static const double perChMaxTreeBws[3][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8}, /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0}, }; // NVLink, PCI, Network 硬件时延 #define NCCL_HW_NVLINK 0 #define NCCL_HW_PCI 1 #define NCCL_HW_NET 2 static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { /* NVLINK */ { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 4 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 }, /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 4.75 }, /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } }, /* PCI */ { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 6 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } }, /* NET */ { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 }, /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 14 }, /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 19 } } }; // We want link types and path types to match as much as possible #define LINK_LOC 0 #define LINK_NVL 1 // Skipping 2 for PATH_NVB #define LINK_PCI 3 // Skipping 4 for PATH_PXB // Skipping 5 for PATH_PXN // Skipping 6 for PATH_PHB #define LINK_SYS 7 #define LINK_NET 8 #define PCI_BW 12.0 // PCI Gen3 x16 // Local (myself) #define PATH_LOC 0 // Connection traversing NVLink #define PATH_NVL 1 // Connection through NVLink using an intermediate GPU #define PATH_NVB 2 // Connection traversing at most a single PCIe bridge #define PATH_PIX 3 // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) #define PATH_PXB 4 // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. #define PATH_PXN 5 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) #define PATH_PHB 6 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) #define PATH_SYS 7 // Connection through the network #define PATH_NET 8 // Disconnected #define PATH_DIS 9 #define DIVUP(x, y) \ (((x)+(y)-1)/(y)) template<typename X, typename Z = decltype(X()+int())> constexpr Z alignUp(X x, int a) { return (x+a-1) & Z(-a); } enum ncclWorkType : uint8_t { ncclWorkTypeUnused=0, ncclWorkTypeColl=1, ncclWorkTypeP2p=2, ncclWorkTypeRegColl=3 }; struct ncclWorkHeader { union { int32_t workNext; // when isLast=0: Offset from kernel argument workHead uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back. }; uint16_t funcIndex; uint8_t isLast:1; // last work for this kernel uint8_t inFifo:1; // is this work in the fifo enum ncclWorkType type; }; struct ncclWorkElem { union { uint8_t flagBits; struct { uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1; }; }; uint8_t nWarps; uint8_t direct; const void * sendbuff; void * recvbuff; size_t count; size_t lastChunkSize; uint32_t root; uint8_t bid; uint8_t nChannels; uint64_t redOpArg; }; #define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem)) // Trees are not perfectly sticking to the model for medium sizes. Applying a static correction // factor is not ideal but works quite well. Powers of two, 64 B to 256MB. static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = { { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .4, .4, .5, .6, .7, .8, .9, 1.0, 1.0, 1.0, 1.0 }, { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .7, .6, .6, .6, .6, .6, .6, .8, .9, .9, .9, .9, 1.0, 1.0 }, { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 } }; #define MAXCHANNELS 10 #define NCCL_TOPO_MAX_NODES 10 #define NCCL_MAX_TREE_ARITY 2 struct ncclTopoGraph { // Input / output int id; // ring : 0, tree : 1, collnet : 2 int pattern; int crossNic; int collNet; int minChannels; int maxChannels; // Output int nChannels; //搜索到的Chanel数 float bwIntra; //节点内单个chnnel带宽 float bwInter; //节点间单个channel带宽 float latencyInter; int typeIntra; //节点内的channel路径类型 int typeInter; //节点间的channel路径类型 int sameChannels; //channel是否一样 int nHops; int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; //节点内每个channel路径 int inter[MAXCHANNELS*2]; // 节点间每个channel路径 }; #endif // ASTRA_SIM_MOCKNCCL_MOCKNCCL_H