csrc/kernels/configs.cuh

// Compile-time configuration macros plus the CUDA / NVSHMEM headers shared by the kernels.
#pragma once

// ---- Size and capacity constants ----
#define NUM_MAX_NVL_PEERS 8
#define NUM_MAX_RDMA_PEERS 20
#define NUM_MAX_FIFO_SLOTS 32768
#define NUM_WORKSPACE_BYTES (32 * 1024 * 1024)
#define NUM_MAX_LOCAL_EXPERTS 1024
#define NUM_BUFFER_ALIGNMENT_BYTES 128

// ---- Tags, timeouts and wait intervals ----
#define FINISHED_SUM_TAG 1024
#define NUM_CPU_TIMEOUT_SECS 100
#define NUM_TIMEOUT_CYCLES 200000000000ull  // 200G cycles ~= 100s
#define NUM_WAIT_NANOSECONDS 500

// ---- Low-latency phase identifiers ----
#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2

// ---- Make CLion CUDA indexing work ----
// Only active when __CLION_IDE__ is defined: supplies the architecture / RDC macros
// and redirects `printf` to a stub.
#if defined(__CLION_IDE__)
#define __CUDA_ARCH__ 900  // NOLINT(*-reserved-identifier)
#define __CUDACC_RDC__     // NOLINT(*-reserved-identifier)

// Stand-in for `printf` under CLion; the body just issues the PTX `trap` instruction.
__host__ __device__ __forceinline__ void host_device_printf(const char* format, ...) {
    asm volatile("trap;");
}

#define printf host_device_printf
#endif  // defined(__CLION_IDE__)

// ---- Remove Torch restrictions ----
// Clear the __CUDA_NO_* macros before the CUDA headers below are included.
// `#undef` on a name that is not currently defined is ignored, so no `#ifdef` guard is needed.
#undef __CUDA_NO_HALF_CONVERSIONS__
#undef __CUDA_NO_HALF_OPERATORS__
#undef __CUDA_NO_HALF2_OPERATORS__
#undef __CUDA_NO_BFLOAT16_CONVERSIONS__
#undef __CUDA_NO_BFLOAT162_OPERATORS__

// ---- CUDA runtime and numeric-type headers ----
#include <cuda_bf16.h>
#include <cuda_fp8.h>
#include <cuda_runtime.h>

// ---- NVSHMEM and InfiniBand (mlx5) headers ----
#include <nvshmem.h>
#include <nvshmemx.h>
#include <infiniband/mlx5dv.h>
#include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
#include <device_host_transport/nvshmem_common_ibgda.h>

csrc/kernels/configs.cuh (44 lines of code) (raw):