maga_transformer/cpp/cuda/cuda

/* * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include "maga_transformer/cpp/utils/AssertUtils.h" #include "maga_transformer/cpp/utils/Logger.h" #include "maga_transformer/cpp/core/Types.h" #include <cstddef> #include <cublasLt.h> #include <cublas_v2.h> #include <cuda_runtime.h> #include <nvml.h> #include <fstream> #include <iostream> #include <string> #include <vector> #include <functional> #ifdef ENABLE_BF16 #include <cuda_bf16.h> #endif #ifdef ENABLE_FP8 #include <cuda_fp8.h> #endif #ifndef _WIN32 // Linux #include <sys/sysinfo.h> #endif // not WIN32 #include <vector> #ifdef _WIN32 // Windows #include <windows.h> #undef ERROR // A Windows header file defines ERROR as 0, but it's used in our logger.h enum. Logging breaks without // this undef. #endif // WIN32 namespace rtp_llm { // workspace for cublas gemm : 32MB #define CUBLAS_WORKSPACE_SIZE 33554432 typedef struct __align__(4) { half x, y, z, w; } half4; //////////////////////////////////////////////////////////////////////////////////////////////////// struct __align__(16) Float4_ { float2 x; float2 y; }; //////////////////////////////////////////////////////////////////////////////////////////////////// struct __align__(32) Float8_ { float2 x; float2 y; float2 z; float2 w; }; //////////////////////////////////////////////////////////////////////////////////////////////////// #ifdef ENABLE_BF16 struct __align__(8) bf16_4_t { __nv_bfloat162 x; __nv_bfloat162 y; }; //////////////////////////////////////////////////////////////////////////////////////////////////// struct __align__(16) bf16_8_t { __nv_bfloat162 x; __nv_bfloat162 y; __nv_bfloat162 z; __nv_bfloat162 w; }; #endif #ifdef ENABLE_FP8 using fp8_2_t = __nv_fp8x2_e4m3; using fp8_4_t = __nv_fp8x4_e4m3; struct __align__(8) fp8_8_t { __nv_fp8x2_e4m3 x; __nv_fp8x2_e4m3 y; __nv_fp8x2_e4m3 z; __nv_fp8x2_e4m3 w; }; #endif /* **************************** type definition ***************************** */ enum CublasDataType { FLOAT_DATATYPE = 0, HALF_DATATYPE = 1, BFLOAT16_DATATYPE = 2, INT8_DATATYPE = 3, FP8_DATATYPE = 4 }; enum FtCudaDataType { FP32 = 0, FP16 = 1, BF16 = 2, INT8 = 3, FP8 = 4 }; enum class OperationType { FP32, FP16, BF16, INT8, FP8 }; inline int div_up(int a, int n) { return (a + n - 1) / n; } template <typename T, typename U, typename = std::enable_if_t<std::is_integral<T>::value>, typename = std::enable_if_t<std::is_integral<U>::value>> auto constexpr ceilDiv(T numerator, U denominator) { return (numerator + denominator - 1) / denominator; } inline size_t pad_to_multiple_of_16(const size_t& input) { static constexpr int ALIGNMENT = 16; return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT); } inline size_t pad_to_multiple_of_128(const size_t& input) { static constexpr int ALIGNMENT = 128; return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT); } template<typename T> void check(T result, const char* const file, int const line); #define check_cuda_error(val) rtp_llm::check((val), __FILE__, __LINE__) void syncAndCheck(const char* const file, int const line); #define sync_check_cuda_error() rtp_llm::syncAndCheck(__FILE__, __LINE__) int get_sm(); bool is_sm70(); bool is_sm8x(); bool is_sm90(); float timing_function(const std::function<void(cudaStream_t)>& operation, int64_t timing_iterations, cudaStream_t stream); int getDevice(); int getDeviceCount(); int currentDeviceId(); void priorityRange(int *low_priority, int *high_priority, int device_id = -1); std::tuple<size_t, size_t> getDeviceMemoryInfo(bool const useUvm); bool shared_mem_sufficient(int smem_size); std::string getDriverVersion(); int getCudaVersion(); bool checkAllNVLinks(std::vector<size_t> device_ids); bool checkOnSameNumaNodes(std::vector<size_t> device_ids); int getVisibleDeviceNum(); bool checkP2PAvailable(const std::vector<size_t>& tp_ranks, size_t rank); template<typename T> T getCudaValue(const T* ptr, int index) { T tmp; check_cuda_error(cudaMemcpy(&tmp, ptr + index, sizeof(T), cudaMemcpyDeviceToHost)); return tmp; } template<typename T> void setCudaValue(T* ptr, int index, T value) { check_cuda_error(cudaMemcpy(ptr + index, &value, sizeof(T), cudaMemcpyHostToDevice)); } template<typename T> CublasDataType getCublasDataType() { if (std::is_same<T, half>::value) { return HALF_DATATYPE; } #ifdef ENABLE_BF16 else if (std::is_same<T, __nv_bfloat16>::value) { return BFLOAT16_DATATYPE; } #endif else if (std::is_same<T, float>::value) { return FLOAT_DATATYPE; } else { RTP_LLM_CHECK(false); return FLOAT_DATATYPE; } } template<typename T> cudaDataType_t getCudaDataType() { if (std::is_same<T, half>::value) { return CUDA_R_16F; } #ifdef ENABLE_BF16 else if (std::is_same<T, __nv_bfloat16>::value) { return CUDA_R_16BF; } #endif else if (std::is_same<T, float>::value) { return CUDA_R_32F; } else { RTP_LLM_CHECK(false); return CUDA_R_32F; } } template<CublasDataType T> struct getTypeFromCudaDataType { using Type = float; }; template<> struct getTypeFromCudaDataType<HALF_DATATYPE> { using Type = half; }; #ifdef ENABLE_BF16 template<> struct getTypeFromCudaDataType<BFLOAT16_DATATYPE> { using Type = __nv_bfloat16; }; #endif // clang-format off template<typename T> struct packed_type_2; template <> struct packed_type_2<float> { using type = float; }; // we don't need to pack float by default template <> struct packed_type_2<half> { using type = half2; }; #ifdef ENABLE_BF16 template<> struct packed_type_2<__nv_bfloat16> { using type = __nv_bfloat162; }; #endif template <typename T, int N> struct packed_type; template <typename T> struct packed_type<T, 1> { using type = T; }; template <> struct packed_type<int8_t, 1> { using type = int8_t; }; template <> struct packed_type<int8_t, 2> { using type = int16_t; }; template <> struct packed_type<int8_t, 4> { using type = int32_t; }; template <> struct packed_type<int8_t, 8> { using type = int64_t; }; #ifdef ENABLE_FP8 template <> struct packed_type<__nv_fp8_e4m3, 1> { using type = __nv_fp8_e4m3; }; template <> struct packed_type<__nv_fp8_e4m3, 2> { using type = fp8_2_t; }; template <> struct packed_type<__nv_fp8_e4m3, 4> { using type = fp8_4_t; }; template <> struct packed_type<__nv_fp8_e4m3, 8> { using type = fp8_8_t; }; #endif // ENABLE_FP8 template <> struct packed_type<uint16_t, 2> { using type = uint32_t; }; template <> struct packed_type<uint16_t, 4> { using type = uint2; }; template <> struct packed_type<uint16_t, 8> { using type = uint4; }; template <> struct packed_type<half, 2> { using type = uint32_t; }; template <> struct packed_type<half, 4> { using type = uint2; }; template <> struct packed_type<half, 8> { using type = uint4; }; #ifdef ENABLE_BF16 template <> struct packed_type<__nv_bfloat16, 2> { using type = __nv_bfloat162; }; template <> struct packed_type<__nv_bfloat16, 4> { using type = bf16_4_t; }; template <> struct packed_type<__nv_bfloat16, 8> { using type = bf16_8_t; }; #endif template <> struct packed_type<float, 2> { using type = float2; }; template <> struct packed_type<float, 4> { using type = float4; }; template <> struct packed_type<float, 8> { using type = Float8_; }; template<typename T> struct num_elems; template <> struct num_elems<float> { static constexpr int value = 1; }; template <> struct num_elems<float2> { static constexpr int value = 2; }; template <> struct num_elems<float4> { static constexpr int value = 4; }; template <> struct num_elems<Float4_> { static constexpr int value = 4; }; template <> struct num_elems<Float8_> { static constexpr int value = 8; }; template <> struct num_elems<half> { static constexpr int value = 1; }; template <> struct num_elems<half2> { static constexpr int value = 2; }; template <> struct num_elems<uint32_t> { static constexpr int value = 2; }; template <> struct num_elems<int32_t> { static constexpr int value = 2; }; template <> struct num_elems<int64_t> { static constexpr int value = 4; }; template <> struct num_elems<uint2> { static constexpr int value = 4; }; template <> struct num_elems<uint4> { static constexpr int value = 8; }; #ifdef ENABLE_BF16 template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; }; template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; }; template <> struct num_elems<bf16_4_t> { static constexpr int value = 4; }; template <> struct num_elems<bf16_8_t> { static constexpr int value = 8; }; #endif #ifdef ENABLE_FP8 template <> struct num_elems<__nv_fp8_e4m3> { static constexpr int value = 1; }; template <> struct num_elems<fp8_2_t> { static constexpr int value = 2; }; template <> struct num_elems<fp8_4_t> { static constexpr int value = 4; }; template <> struct num_elems<fp8_8_t> { static constexpr int value = 8; }; #endif template<typename T, int num> struct packed_as; template<typename T> struct packed_as<T, 1> { using type = T; }; template<> struct packed_as<half, 2> { using type = half2; }; template<> struct packed_as<float, 2> { using type = float2; }; template<> struct packed_as<int8_t, 2> { using type = int16_t; }; template<> struct packed_as<int32_t, 2> { using type = int2; }; template<> struct packed_as<half2, 1> { using type = half; }; template<> struct packed_as<float2, 1> { using type = float; }; #ifdef ENABLE_BF16 template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; }; template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; }; #endif #ifdef ENABLE_FP8 template<> struct packed_as<__nv_fp8_e4m3, 2> { using type = __nv_fp8x2_e4m3; }; template<> struct packed_as<__nv_fp8x2_e4m3, 1> { using type = __nv_fp8_e4m3; }; template<> struct packed_as<__nv_fp8_e5m2, 2> { using type = __nv_fp8x2_e5m2; }; template<> struct packed_as<__nv_fp8x2_e5m2, 1> { using type = __nv_fp8_e5m2; }; #endif inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } inline __device__ float2 operator+(float2 a, float2 b) { return make_float2(a.x + b.x, a.y + b.y); } inline __device__ float2 operator-(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); } inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); } inline __device__ float2 operator+(float2 a, float b) { return make_float2(a.x + b, a.y + b); } inline __device__ float2 operator-(float2 a, float b) { return make_float2(a.x - b, a.y - b); } bool should_print(); template<typename T> void print_bshd(const int layer_id, const char* name, const T* ptr, int batch_size, int seq_len, int num_heads, int hidden_size_per_head, int total_num_heads = 0, int heads_offset = 0, bool is_device_ptr = true); template<typename T> void print_bhsd(const int layer_id, const char* name, const T* ptr, int batch_size, int num_heads, int seq_len, int hidden_size_per_head, bool is_device_ptr = true); template<typename T> void print_bhss(const int layer_id, const char* name, const T* ptr, int batch_size, int num_heads, int seq_len, int seq_len2, bool is_device_ptr = true); template<typename T> void print_bsd(const int layer_id, const char* name, const T* ptr, int batch_size, int seq_len, int hidden_size, int start = 0, int end = 20, bool is_device_ptr = true); template<typename T> void print_bsd_sum_and_square(const int layer_id, const char* name, const T* ptr, int batch_size, int seq_len, int hidden_size, int start = 0, int end = 20, bool is_device_ptr = true); template<typename T> void print_kv_cache(const int layer_id, const char* name, const T* ptr, int dim1, int dim2, int dim3, int dim4, int dim5, int dim6, bool print_all = true, bool is_device_ptr = true); } // namespace rtp_llm

maga_transformer/cpp/cuda/cuda_utils.h (416 lines of code) (raw):