source/backend/cuda/execution/cutlass_common/tune/CutlassGemmParamTune.hpp (1,435 lines of code) (raw):
#ifdef ENABLE_CUDA_TUNE_PARAM
#include "../../CutlassGemmParam.hpp"
namespace MNN {
namespace CUDA {
// Sm80 tensor-op GEMM: half A/B, half output, 64x64x64 threadblock tile, 3 stages, EpilogueCudaOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x64x64 threadblock tile, 2 stages, EpilogueCudaOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    2,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x64x32 threadblock tile, 6 stages, EpilogueCudaOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    6,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x64x32 threadblock tile, 4 stages, EpilogueCudaOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x128x32 threadblock tile, 4 stages, EpilogueCudaOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 256x64x32 threadblock tile, 3 stages, EpilogueCudaOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x128x32 threadblock tile, 3 stages, EpilogueCudaOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x64x64 threadblock tile, 3 stages, EpilogueTensorOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x64x64 threadblock tile, 2 stages, EpilogueTensorOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    2,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x64x32 threadblock tile, 6 stages, EpilogueTensorOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    6,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x64x32 threadblock tile, 4 stages, EpilogueTensorOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x128x32 threadblock tile, 4 stages, EpilogueTensorOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 256x64x32 threadblock tile, 3 stages, EpilogueTensorOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x128x32 threadblock tile, 3 stages, EpilogueTensorOp_F16_Linear.
using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x64x64 threadblock tile, 3 stages, EpilogueCudaOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x64x64 threadblock tile, 2 stages, EpilogueCudaOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    2,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x64x32 threadblock tile, 6 stages, EpilogueCudaOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    6,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x64x32 threadblock tile, 4 stages, EpilogueCudaOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x128x32 threadblock tile, 4 stages, EpilogueCudaOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 256x64x32 threadblock tile, 3 stages, EpilogueCudaOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x128x32 threadblock tile, 3 stages, EpilogueCudaOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x64x64 threadblock tile, 3 stages, EpilogueTensorOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x64x64 threadblock tile, 2 stages, EpilogueTensorOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    2,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x64x32 threadblock tile, 6 stages, EpilogueTensorOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    6,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x64x32 threadblock tile, 4 stages, EpilogueTensorOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x128x32 threadblock tile, 4 stages, EpilogueTensorOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 256x64x32 threadblock tile, 3 stages, EpilogueTensorOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x128x32 threadblock tile, 3 stages, EpilogueTensorOp_F32_Linear.
using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Linear,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x64x64 threadblock tile, 3 stages, EpilogueCudaOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x64x64 threadblock tile, 2 stages, EpilogueCudaOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    2,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x64x32 threadblock tile, 6 stages, EpilogueCudaOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    6,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x64x32 threadblock tile, 4 stages, EpilogueCudaOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x128x32 threadblock tile, 4 stages, EpilogueCudaOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 256x64x32 threadblock tile, 3 stages, EpilogueCudaOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x128x32 threadblock tile, 3 stages, EpilogueCudaOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x64x64 threadblock tile, 3 stages, EpilogueTensorOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x64x64 threadblock tile, 2 stages, EpilogueTensorOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    2,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x64x32 threadblock tile, 6 stages, EpilogueTensorOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    6,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x64x32 threadblock tile, 4 stages, EpilogueTensorOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x128x32 threadblock tile, 4 stages, EpilogueTensorOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 256x64x32 threadblock tile, 3 stages, EpilogueTensorOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x128x32 threadblock tile, 3 stages, EpilogueTensorOp_F16_Relu.
using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F16_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x64x64 threadblock tile, 3 stages, EpilogueCudaOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x64x64 threadblock tile, 2 stages, EpilogueCudaOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    2,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x64x32 threadblock tile, 6 stages, EpilogueCudaOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    6,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x64x32 threadblock tile, 4 stages, EpilogueCudaOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x128x32 threadblock tile, 4 stages, EpilogueCudaOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 256x64x32 threadblock tile, 3 stages, EpilogueCudaOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x128x32 threadblock tile, 3 stages, EpilogueCudaOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x64x64 threadblock tile, 3 stages, EpilogueTensorOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x64x64 threadblock tile, 2 stages, EpilogueTensorOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    2,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x64x32 threadblock tile, 6 stages, EpilogueTensorOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    6,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x64x32 threadblock tile, 4 stages, EpilogueTensorOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 64x128x32 threadblock tile, 4 stages, EpilogueTensorOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 256x64x32 threadblock tile, 3 stages, EpilogueTensorOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, float output, 128x128x32 threadblock tile, 3 stages, EpilogueTensorOp_F32_Relu.
using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    float, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 64, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueTensorOp_F32_Relu,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x64x64 threadblock tile, 3 stages, EpilogueCudaOp_F16_Relu6.
using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu6,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    3,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x64x64 threadblock tile, 2 stages, EpilogueCudaOp_F16_Relu6.
using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 64>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu6,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    2,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 64x64x32 threadblock tile, 6 stages, EpilogueCudaOp_F16_Relu6.
using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<32, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu6,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    6,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// Sm80 tensor-op GEMM: half A/B, half output, 128x64x32 threadblock tile, 4 stages, EpilogueCudaOp_F16_Relu6.
using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t, LayoutInputA,  // A: element type, layout
    cutlass::half_t, LayoutInputB,  // B: element type, layout
    cutlass::half_t, LayoutOutput,  // C/D: element type, layout
    ElementAccumulator,  // accumulator element type
    cutlass::arch::OpClassTensorOp,  // operator class
    cutlass::arch::Sm80,  // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile (M, N, K)
    cutlass::gemm::GemmShape<64, 32, 32>,  // warp tile (M, N, K)
    cutlass::gemm::GemmShape<16, 8, 16>,  // instruction tile (M, N, K)
    EpilogueCudaOp_F16_Relu6,  // epilogue output functor
    SwizzleThreadBlock,  // threadblock swizzle
    4,  // pipeline stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // A alignment: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // B alignment: elements per 128 bits
    true>;  // serial split-K
// CUTLASS device GEMM type: half_t A/B operands, half_t output, 64x128x32 threadblock tile,
// 4 stages, EpilogueCudaOp_F16_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    cutlass::half_t,                        // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<32, 64, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueCudaOp_F16_Relu6,               // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    4,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, half_t output, 256x64x32 threadblock tile,
// 3 stages, EpilogueCudaOp_F16_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    cutlass::half_t,                        // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueCudaOp_F16_Relu6,               // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    3,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, half_t output, 128x128x32 threadblock tile,
// 3 stages, EpilogueCudaOp_F16_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                         // ElementA
    LayoutInputA,                            // LayoutA
    cutlass::half_t,                         // ElementB
    LayoutInputB,                            // LayoutB
    cutlass::half_t,                         // ElementC (output)
    LayoutOutput,                            // LayoutC
    ElementAccumulator,                      // accumulator element
    cutlass::arch::OpClassTensorOp,          // operator class
    cutlass::arch::Sm80,                     // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,    // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,     // instruction shape
    EpilogueCudaOp_F16_Relu6,                // epilogue op
    SwizzleThreadBlock,                      // threadblock swizzle
    3,                                       // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                   // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, half_t output, 64x64x64 threadblock tile,
// 3 stages, EpilogueTensorOp_F16_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    cutlass::half_t,                        // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,   // threadblock tile
    cutlass::gemm::GemmShape<32, 32, 64>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F16_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    3,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, half_t output, 128x64x64 threadblock tile,
// 2 stages, EpilogueTensorOp_F16_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    cutlass::half_t,                        // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 32, 64>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F16_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    2,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, half_t output, 64x64x32 threadblock tile,
// 6 stages, EpilogueTensorOp_F16_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    cutlass::half_t,                        // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,   // threadblock tile
    cutlass::gemm::GemmShape<32, 32, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F16_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    6,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, half_t output, 128x64x32 threadblock tile,
// 4 stages, EpilogueTensorOp_F16_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    cutlass::half_t,                        // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 32, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F16_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    4,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, half_t output, 64x128x32 threadblock tile,
// 4 stages, EpilogueTensorOp_F16_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    cutlass::half_t,                        // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<32, 64, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F16_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    4,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, half_t output, 256x64x32 threadblock tile,
// 3 stages, EpilogueTensorOp_F16_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    cutlass::half_t,                        // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F16_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    3,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, half_t output, 128x128x32 threadblock tile,
// 3 stages, EpilogueTensorOp_F16_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                         // ElementA
    LayoutInputA,                            // LayoutA
    cutlass::half_t,                         // ElementB
    LayoutInputB,                            // LayoutB
    cutlass::half_t,                         // ElementC (output)
    LayoutOutput,                            // LayoutC
    ElementAccumulator,                      // accumulator element
    cutlass::arch::OpClassTensorOp,          // operator class
    cutlass::arch::Sm80,                     // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,    // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,     // instruction shape
    EpilogueTensorOp_F16_Relu6,              // epilogue op
    SwizzleThreadBlock,                      // threadblock swizzle
    3,                                       // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                   // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 64x64x64 threadblock tile,
// 3 stages, EpilogueCudaOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,   // threadblock tile
    cutlass::gemm::GemmShape<32, 32, 64>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueCudaOp_F32_Relu6,               // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    3,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 128x64x64 threadblock tile,
// 2 stages, EpilogueCudaOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 32, 64>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueCudaOp_F32_Relu6,               // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    2,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 64x64x32 threadblock tile,
// 6 stages, EpilogueCudaOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,   // threadblock tile
    cutlass::gemm::GemmShape<32, 32, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueCudaOp_F32_Relu6,               // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    6,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 128x64x32 threadblock tile,
// 4 stages, EpilogueCudaOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 32, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueCudaOp_F32_Relu6,               // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    4,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 64x128x32 threadblock tile,
// 4 stages, EpilogueCudaOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<32, 64, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueCudaOp_F32_Relu6,               // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    4,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 256x64x32 threadblock tile,
// 3 stages, EpilogueCudaOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueCudaOp_F32_Relu6,               // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    3,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 128x128x32 threadblock tile,
// 3 stages, EpilogueCudaOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                         // ElementA
    LayoutInputA,                            // LayoutA
    cutlass::half_t,                         // ElementB
    LayoutInputB,                            // LayoutB
    float,                                   // ElementC (output)
    LayoutOutput,                            // LayoutC
    ElementAccumulator,                      // accumulator element
    cutlass::arch::OpClassTensorOp,          // operator class
    cutlass::arch::Sm80,                     // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,    // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,     // instruction shape
    EpilogueCudaOp_F32_Relu6,                // epilogue op
    SwizzleThreadBlock,                      // threadblock swizzle
    3,                                       // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                   // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 64x64x64 threadblock tile,
// 3 stages, EpilogueTensorOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<64, 64, 64>,   // threadblock tile
    cutlass::gemm::GemmShape<32, 32, 64>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F32_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    3,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 128x64x64 threadblock tile,
// 2 stages, EpilogueTensorOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<128, 64, 64>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 32, 64>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F32_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    2,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 64x64x32 threadblock tile,
// 6 stages, EpilogueTensorOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<64, 64, 32>,   // threadblock tile
    cutlass::gemm::GemmShape<32, 32, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F32_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    6,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 128x64x32 threadblock tile,
// 4 stages, EpilogueTensorOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<128, 64, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 32, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F32_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    4,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 64x128x32 threadblock tile,
// 4 stages, EpilogueTensorOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<64, 128, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<32, 64, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F32_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    4,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 256x64x32 threadblock tile,
// 3 stages, EpilogueTensorOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                        // ElementA
    LayoutInputA,                           // LayoutA
    cutlass::half_t,                        // ElementB
    LayoutInputB,                           // LayoutB
    float,                                  // ElementC (output)
    LayoutOutput,                           // LayoutC
    ElementAccumulator,                     // accumulator element
    cutlass::arch::OpClassTensorOp,         // operator class
    cutlass::arch::Sm80,                    // architecture tag
    cutlass::gemm::GemmShape<256, 64, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,   // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,    // instruction shape
    EpilogueTensorOp_F32_Relu6,             // epilogue op
    SwizzleThreadBlock,                     // threadblock swizzle
    3,                                      // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                  // SplitKSerial
// CUTLASS device GEMM type: half_t A/B operands, float output, 128x128x32 threadblock tile,
// 3 stages, EpilogueTensorOp_F32_Relu6 epilogue. Per-line notes name the positional
// template arguments of cutlass::gemm::device::Gemm.
using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,                         // ElementA
    LayoutInputA,                            // LayoutA
    cutlass::half_t,                         // ElementB
    LayoutInputB,                            // LayoutB
    float,                                   // ElementC (output)
    LayoutOutput,                            // LayoutC
    ElementAccumulator,                      // accumulator element
    cutlass::arch::OpClassTensorOp,          // operator class
    cutlass::arch::Sm80,                     // architecture tag
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,    // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,     // instruction shape
    EpilogueTensorOp_F32_Relu6,              // epilogue op
    SwizzleThreadBlock,                      // threadblock swizzle
    3,                                       // stages
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentA: elements per 128 bits
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // AlignmentB: elements per 128 bits
    true>;                                   // SplitKSerial
}
}
#endif