source/backend/opencl/execution/cl/conv_2d_buf_mnn_cl.cpp (1,499 lines of code) (raw):
#include "opencl_source_map.hpp"
namespace MNN {
#ifndef MNN_OPENCL_BUFFER_CLOSED
const char* conv_2d_buf =
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0,__private const int global_size_dim1,\n"
"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n"
"#ifdef CONV_LOCAL_SIZE\n"
"__kernel\n"
"void conv_2d_1x1_local(__private const int out_w_blocks,\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *kernel_ptr,\n"
" __global const FLOAT *bias_ptr,\n"
" __global FLOAT *output,\n"
" __private const int in_c_block,\n"
" __private const int batch,\n"
" __private const int out_h,\n"
" __private const int out_w,\n"
" __private const int out_c_block,\n"
" __private const int out_c_pack) {\n"
" const int lid=get_local_id(0);\n"
" const int out_c_w_idx=get_global_id(1); //c/4 w\n"
" const int out_b_h_idx=get_global_id(2); //b h\n"
" \n"
" COMPUTE_FLOAT4 local sum_mnn[CONV_LOCAL_SIZE];\n"
" \n"
" const int out_c_idx=out_c_w_idx/out_w_blocks;\n"
" const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
" const int out_b_idx=out_b_h_idx/out_h; // equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_h; // equal to in_h_idx\n"
" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias_ptr));\n"
" COMPUTE_FLOAT4 out0=(COMPUTE_FLOAT4)0;\n"
" int offset=out_c_idx*4;\n"
" int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+out_w_idx) << 2;\n"
" \n"
" const int inp_add=batch*out_h*out_w*4;\n"
" for (ushort in_channel_block_idx=lid; in_channel_block_idx<in_c_block; in_channel_block_idx+=CONV_LOCAL_SIZE) {\n"
" \n"
" int offset=mad24(in_channel_block_idx*4,out_c_pack,out_c_idx*4);\n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset+in_channel_block_idx*inp_add));\n"
" COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n"
" COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n"
" COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n"
" COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n"
" out0=mad(in0.x,weights0,out0);\n"
" out0=mad(in0.y,weights1,out0);\n"
" out0=mad(in0.z,weights2,out0);\n"
" out0=mad(in0.w,weights3,out0);\n"
" }\n"
" \n"
" sum_mnn[lid]=out0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=CONV_LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" out0=sum_mnn[0]+bias0;\n"
" if(lid == 0){\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*batch)*out_h+out_h_idx)* out_w+out_w_idx)*4;\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" }\n"
"}\n"
"#endif\n"
"__kernel\n"
"void conv_2d_1x1_c4h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *kernel_ptr,\n"
" __global const FLOAT *bias_ptr,\n"
" __global FLOAT *output,\n"
" __private const int in_c_block,\n"
" __private const int out_h,\n"
" __private const int out_w,\n"
" __private const int out_b,\n"
" __private const int out_c_block,\n"
" __private const int out_c_pack) {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx=out_c_w_idx/out_w_blocks;\n"
" const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
" const int out_b_idx=out_b_h_idx/out_h; // equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_h; // equal to in_h_idx\n"
" const int out_w4_idx=mul24(out_w_idx,4);\n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias_ptr));\n"
" COMPUTE_FLOAT4 out1=out0;\n"
" COMPUTE_FLOAT4 out2=out0;\n"
" COMPUTE_FLOAT4 out3=out0;\n"
" const int intput_width_idx0=out_w4_idx;\n"
" int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0) << 2;\n"
" int offset=out_c_idx*4;\n"
" const int inp_add=out_b*out_h*out_w*4;\n"
" for (ushort in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n"
" \n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(1,input+inp_offset));\n"
" COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4(vload4(2,input+inp_offset));\n"
" COMPUTE_FLOAT4 in3=CONVERT_COMPUTE_FLOAT4(vload4(3,input+inp_offset));\n"
" COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n"
" COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n"
" COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n"
" COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n"
" out0=mad(in0.x,weights0,out0);\n"
" out0=mad(in0.y,weights1,out0);\n"
" out0=mad(in0.z,weights2,out0);\n"
" out0=mad(in0.w,weights3,out0);\n"
" \n"
" out1=mad(in1.x,weights0,out1);\n"
" out1=mad(in1.y,weights1,out1);\n"
" out1=mad(in1.z,weights2,out1);\n"
" out1=mad(in1.w,weights3,out1);\n"
" \n"
" out2=mad(in2.x,weights0,out2);\n"
" out2=mad(in2.y,weights1,out2);\n"
" out2=mad(in2.z,weights2,out2);\n"
" out2=mad(in2.w,weights3,out2);\n"
" \n"
" out3=mad(in3.x,weights0,out3);\n"
" out3=mad(in3.y,weights1,out3);\n"
" out3=mad(in3.z,weights2,out3);\n"
" out3=mad(in3.w,weights3,out3);\n"
" \n"
" offset += 4*out_c_pack;\n"
" inp_offset += inp_add;\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT4)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*out_b)*out_h+out_h_idx)* out_w+out_w4_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_w-out_w4_idx;\n"
" if (remain >= 4) {\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
" } else if (remain == 3) {\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2,output+out_offset);\n"
" } else if (remain == 2) {\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" } else if (remain == 1) {\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" }\n"
"#else\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *kernel_ptr,\n"
" __global const FLOAT *bias_ptr,\n"
" __global FLOAT *output,\n"
" __private const int in_c_block,\n"
" __private const int out_h,\n"
" __private const int out_w,\n"
" __private const int out_b,\n"
" __private const int out_c_block,\n"
" __private const int out_c_pack) {\n"
" const int out_c_w_idx=get_global_id(0); //c/8 w/4\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
" const int out_c_idx_1=out_c_idx_0+1;\n"
" const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
" const int out_b_idx=out_b_h_idx/out_h;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n"
" const int out_w4_idx=mul24(out_w_idx,4);\n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias_ptr));\n"
" COMPUTE_FLOAT4 out1=out0;\n"
" COMPUTE_FLOAT4 out2=out0;\n"
" COMPUTE_FLOAT4 out3=out0;\n"
" \n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_block ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n"
" COMPUTE_FLOAT4 out5=out4;\n"
" COMPUTE_FLOAT4 out6=out4;\n"
" COMPUTE_FLOAT4 out7=out4;\n"
" #else\n"
" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n"
" COMPUTE_FLOAT4 out5=out4;\n"
" COMPUTE_FLOAT4 out6=out4;\n"
" COMPUTE_FLOAT4 out7=out4;\n"
" #endif\n"
" const int intput_width_idx0=out_w4_idx;\n"
" int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0)<<2;\n"
" int offset=out_c_idx_0*4;\n"
" const int inp_add=out_b*out_h*out_w*4;\n"
" for (int in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n"
" \n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(1,input+inp_offset));\n"
" COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4(vload4(2,input+inp_offset));\n"
" COMPUTE_FLOAT4 in3=CONVERT_COMPUTE_FLOAT4(vload4(3,input+inp_offset));\n"
" \n"
" // output_channel at least pack to 8,no need boundry protect\n"
" COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n"
" COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset));\n"
" COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n"
" COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack));\n"
" COMPUTE_FLOAT4 weights4=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n"
" COMPUTE_FLOAT4 weights5=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack+out_c_pack));\n"
" COMPUTE_FLOAT4 weights6=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n"
" COMPUTE_FLOAT4 weights7=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n"
" out0=mad(in0.x,weights0,out0);\n"
" out0=mad(in0.y,weights2,out0);\n"
" out0=mad(in0.z,weights4,out0);\n"
" out0=mad(in0.w,weights6,out0);\n"
" \n"
" out1=mad(in1.x,weights0,out1);\n"
" out1=mad(in1.y,weights2,out1);\n"
" out1=mad(in1.z,weights4,out1);\n"
" out1=mad(in1.w,weights6,out1);\n"
" \n"
" out2=mad(in2.x,weights0,out2);\n"
" out2=mad(in2.y,weights2,out2);\n"
" out2=mad(in2.z,weights4,out2);\n"
" out2=mad(in2.w,weights6,out2);\n"
" \n"
" out3=mad(in3.x,weights0,out3);\n"
" out3=mad(in3.y,weights2,out3);\n"
" out3=mad(in3.z,weights4,out3);\n"
" out3=mad(in3.w,weights6,out3);\n"
" \n"
" out4=mad(in0.x,weights1,out4);\n"
" out4=mad(in0.y,weights3,out4);\n"
" out4=mad(in0.z,weights5,out4);\n"
" out4=mad(in0.w,weights7,out4);\n"
" \n"
" out5=mad(in1.x,weights1,out5);\n"
" out5=mad(in1.y,weights3,out5);\n"
" out5=mad(in1.z,weights5,out5);\n"
" out5=mad(in1.w,weights7,out5);\n"
" \n"
" out6=mad(in2.x,weights1,out6);\n"
" out6=mad(in2.y,weights3,out6);\n"
" out6=mad(in2.z,weights5,out6);\n"
" out6=mad(in2.w,weights7,out6);\n"
" \n"
" out7=mad(in3.x,weights1,out7);\n"
" out7=mad(in3.y,weights3,out7);\n"
" out7=mad(in3.z,weights5,out7);\n"
" out7=mad(in3.w,weights7,out7);\n"
" \n"
" offset += 4*out_c_pack;\n"
" inp_offset += inp_add;\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT4)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT4)0);\n"
" \n"
" out4=fmax(out4,(COMPUTE_FLOAT4)0);\n"
" out5=fmax(out5,(COMPUTE_FLOAT4)0);\n"
" out6=fmax(out6,(COMPUTE_FLOAT4)0);\n"
" out7=fmax(out7,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" \n"
" out4=clamp(out4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out5=clamp(out5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx_0*out_b)*out_h+out_h_idx)* out_w+out_w4_idx)*4;\n"
" __global FLOAT*_tempoutput=output+out_offset;\n"
" __global FLOAT*_tempoutput1=_tempoutput+4*out_h*out_w*out_b;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_w-out_w4_idx;\n"
" if (remain >= 4) {\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,_tempoutput);\n"
" } else if (remain == 3) {\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,_tempoutput);\n"
" vstore4(CONVERT_FLOAT4(out2),2,_tempoutput);\n"
" } else if (remain == 2) {\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,_tempoutput);\n"
" } else if (remain == 1) {\n"
" vstore4(CONVERT_FLOAT4(out0),0,_tempoutput);\n"
" }\n"
"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_block) {\n"
" return;\n"
" }\n"
"#endif\n"
" if (remain >= 4) {\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,_tempoutput1);\n"
" } else if (remain == 3) {\n"
" vstore8(CONVERT_FLOAT8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5))),0,_tempoutput1);\n"
" vstore4(CONVERT_FLOAT4(out6),2,_tempoutput1);\n"
" } else if (remain == 2) {\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,_tempoutput1);\n"
" } else if (remain == 1) {\n"
" vstore4(CONVERT_FLOAT4(out4),0,_tempoutput1);\n"
" }\n"
"#else\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,_tempoutput);\n"
"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_block) {\n"
" return;\n"
" }\n"
"#endif\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,_tempoutput1);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *kernel_ptr,\n"
" __global const FLOAT *bias_ptr,\n"
" __global FLOAT *output,\n"
" __private const int in_c_block,\n"
" __private const int out_h,\n"
" __private const int out_w,\n"
" __private const int out_b,\n"
" __private const int out_c_block,\n"
" __private const int out_c_pack) {\n"
" const int out_c_w_idx=get_global_id(0); //c/8 w/4\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
" const int out_c_idx_1=out_c_idx_0+1;\n"
" const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
" const int out_b_idx=out_b_h_idx/out_h;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n"
" \n"
" const int out_w2_idx=mul24(out_w_idx,2);\n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias_ptr));\n"
" COMPUTE_FLOAT4 out1=out0;\n"
" \n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_block ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n"
" #else\n"
" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n"
" #endif\n"
" COMPUTE_FLOAT4 out5=out4;\n"
" const int intput_width_idx0=out_w2_idx;\n"
" int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0)<<2;\n"
" int offset=out_c_idx_0*4;\n"
" const int inp_add=out_b*out_h*out_w*4;\n"
" for (int in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n"
" \n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(1,input+inp_offset));\n"
" COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n"
" COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset));\n"
" COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n"
" COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack));\n"
" COMPUTE_FLOAT4 weights4=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n"
" COMPUTE_FLOAT4 weights5=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack+out_c_pack));\n"
" COMPUTE_FLOAT4 weights6=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n"
" COMPUTE_FLOAT4 weights7=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n"
" out0=mad(in0.x,weights0,out0);\n"
" out0=mad(in0.y,weights2,out0);\n"
" out0=mad(in0.z,weights4,out0);\n"
" out0=mad(in0.w,weights6,out0);\n"
" \n"
" out1=mad(in1.x,weights0,out1);\n"
" out1=mad(in1.y,weights2,out1);\n"
" out1=mad(in1.z,weights4,out1);\n"
" out1=mad(in1.w,weights6,out1);\n"
" \n"
" out4=mad(in0.x,weights1,out4);\n"
" out4=mad(in0.y,weights3,out4);\n"
" out4=mad(in0.z,weights5,out4);\n"
" out4=mad(in0.w,weights7,out4);\n"
" \n"
" out5=mad(in1.x,weights1,out5);\n"
" out5=mad(in1.y,weights3,out5);\n"
" out5=mad(in1.z,weights5,out5);\n"
" out5=mad(in1.w,weights7,out5);\n"
" \n"
" offset += 4*out_c_pack;\n"
" inp_offset += inp_add;\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
" out4=fmax(out4,(COMPUTE_FLOAT4)0);\n"
" out5=fmax(out5,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out4=clamp(out4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out5=clamp(out5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx_0*out_b)*out_h+out_h_idx)* out_w+out_w2_idx)*4;\n"
" __global FLOAT*_tempoutput=output+out_offset;\n"
" __global FLOAT*_tempoutput1=_tempoutput+4*out_h*out_w*out_b;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_w-out_w2_idx;\n"
" if (remain >= 2) {\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,_tempoutput);\n"
" } else if (remain == 1) {\n"
" vstore4(CONVERT_FLOAT4(out0),0,_tempoutput);\n"
" }\n"
"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_block) {\n"
" return;\n"
" }\n"
"#endif\n"
" if (remain >= 2) {\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,_tempoutput1);\n"
" } else if (remain == 1) {\n"
" vstore4(CONVERT_FLOAT4(out4),0,_tempoutput1);\n"
" }\n"
"#else\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,_tempoutput);\n"
"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_block) {\n"
" return;\n"
" }\n"
"#endif\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,_tempoutput1);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_1x1_c4h1w1(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *kernel_ptr,\n"
" __global const FLOAT *bias_ptr,\n"
" __global FLOAT *output,\n"
" __private const int in_c_block,\n"
" __private const int out_h,\n"
" __private const int out_w,\n"
" __private const int out_b,\n"
" __private const int out_c_block,\n"
" __private const int out_c_pack) {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx=out_c_w_idx/out_w;\n"
" const int out_w_idx=out_c_w_idx % out_w;\n"
" const int out_b_idx=out_b_h_idx/out_h;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias_ptr));\n"
" const int intput_width_idx0=out_w_idx;\n"
" int offset=out_c_idx*4;\n"
" int inp_offset=((out_b_idx*out_h+out_h_idx)*out_w+intput_width_idx0)*4;\n"
" const int inp_add=out_b*out_h*out_w*4;\n"
" \n"
" for (int in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n"
" \n"
" \n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n"
" COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n"
" COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n"
" COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n"
" COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n"
" out0=mad(in0.x,weights0,out0);\n"
" out0=mad(in0.y,weights1,out0);\n"
" out0=mad(in0.z,weights2,out0);\n"
" out0=mad(in0.w,weights3,out0);\n"
" \n"
" offset += 4*out_c_pack;\n"
" inp_offset += inp_add;\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*out_b)*out_h+out_h_idx)* out_w+out_w_idx)*4;\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
"}\n"
"__kernel\n"
"void conv_2d_1x1_c4h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *kernel_ptr,\n"
" __global const FLOAT *bias_ptr,\n"
" __global FLOAT *output,\n"
" __private const int in_c_block,\n"
" __private const int out_h,\n"
" __private const int out_w,\n"
" __private const int out_b,\n"
" __private const int out_c_block,\n"
" __private const int out_c_pack) {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx=out_c_w_idx/out_w_blocks;\n"
" const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
" const int out_b_idx=out_b_h_idx/out_h;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n"
" const int out_w2_idx=mul24(out_w_idx,2);\n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias_ptr));\n"
" COMPUTE_FLOAT4 out1=out0;\n"
" const int intput_width_idx0=out_w2_idx;\n"
" int offset=out_c_idx*4;\n"
" int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0)*4;\n"
" const int inp_add=out_b*out_h*out_w*4;\n"
" \n"
" for (int in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n"
" \n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(1,input+inp_offset));\n"
" COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n"
" COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n"
" COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n"
" COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n"
" out0=mad(in0.x,weights0,out0);\n"
" out0=mad(in0.y,weights1,out0);\n"
" out0=mad(in0.z,weights2,out0);\n"
" out0=mad(in0.w,weights3,out0);\n"
" \n"
" out1=mad(in1.x,weights0,out1);\n"
" out1=mad(in1.y,weights1,out1);\n"
" out1=mad(in1.z,weights2,out1);\n"
" out1=mad(in1.w,weights3,out1);\n"
" \n"
" offset += 4*out_c_pack;\n"
" inp_offset += inp_add;\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*out_b)*out_h+out_h_idx)* out_w+out_w2_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_w-out_w2_idx;\n"
" if (remain >= 2) {\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" } else if (remain == 1) {\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" }\n"
"#else\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_c4h1w1(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *weight,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int out_c_base_index\n"
") {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx=out_c_w_idx/out_hw.y+out_c_base_index;\n"
" if(out_c_idx >= out_c_blocks) return;\n"
" const int out_w_idx=out_c_w_idx % out_hw.y;\n"
" const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_hw.x;\n"
" \n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
" \n"
" const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" \n"
" const int kw_start=select(0,(-in_w_idx_base+dilate_hw.y-1)/dilate_hw.y,in_w_idx_base<0);\n"
" const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n"
" const int in_w_idx_start=mad24(kw_start,dilate_hw.y,in_w_idx_base);\n"
" const int in_w_idx_end=min(mad24(filter_hw.y,dilate_hw.y,in_w_idx_base),in_hw.y);\n"
" \n"
" const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n"
" const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n"
" \n"
" const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+kw_start)*4;\n"
" for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n"
" for(int ix=in_w_idx_start; ix<in_w_idx_end; ix += dilate_hw.y) {\n"
" int inp_offset=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+ix)*4;\n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n"
" \n"
" const int filter_w_inc=(ix-in_w_idx_start)/dilate_hw.y;\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc,weight+weight_offset));\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc,weight+weight_offset+weight_oc_offset));\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc,weight+weight_offset+weight_oc_offset*2));\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc,weight+weight_offset+weight_oc_offset*3));\n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" }\n"
" weight_offset += 4*filter_hw.y;\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" \n"
"}\n"
"__kernel\n"
"void conv_2d_c4h1w2(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *weight,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,//generate width's num\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int out_c_base_index\n"
") {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx=out_c_w_idx/out_w_blocks+out_c_base_index;\n"
" if(out_c_idx >= out_c_blocks) return;\n"
" const int out_w_idx=(out_c_w_idx % out_w_blocks) << 1;\n"
" const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_hw.x;\n"
" \n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
" COMPUTE_FLOAT4 out1=out0;\n"
" \n"
" const int in_w0_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_w1_idx_base=in_w0_idx_base+stride_hw.y;\n"
" const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" \n"
" const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n"
" const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n"
" const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n"
" \n"
" const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n"
" for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n"
" const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n"
" for(int fw=0; fw<filter_hw.y; fw++) {\n"
" const int in_w0_idx=fw*dilate_hw.y+in_w0_idx_base;\n"
" const int in_w1_idx=fw*dilate_hw.y+in_w1_idx_base;\n"
" COMPUTE_FLOAT4 in0=(in_w0_idx<0 || in_w0_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w0_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in1=(in_w1_idx<0 || in_w1_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w1_idx,input+inp_offset_base));\n"
" \n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*2));\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*3));\n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" \n"
" out1=mad(in1.x,weight0,out1);\n"
" out1=mad(in1.y,weight1,out1);\n"
" out1=mad(in1.z,weight2,out1);\n"
" out1=mad(in1.w,weight3,out1);\n"
" \n"
" weight_offset += 4;\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" if(out_w_idx+1 >= out_hw.y) return;\n"
" vstore4(CONVERT_FLOAT4(out1),1,output+out_offset);\n"
"#else\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_c4h1w4(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *weight,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int out_c_base_index\n"
") {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx=out_c_w_idx/out_w_blocks+out_c_base_index;\n"
" if(out_c_idx >= out_c_blocks) return;\n"
" const int out_w_idx=(out_c_w_idx % out_w_blocks) << 2;\n"
" const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_hw.x;\n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
" COMPUTE_FLOAT4 out1=out0;\n"
" COMPUTE_FLOAT4 out2=out0;\n"
" COMPUTE_FLOAT4 out3=out0;\n"
" const int in_w0_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_w1_idx_base=in_w0_idx_base+stride_hw.y;\n"
" const int in_w2_idx_base=in_w1_idx_base+stride_hw.y;\n"
" const int in_w3_idx_base=in_w2_idx_base+stride_hw.y;\n"
" const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" \n"
" const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n"
" const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n"
" const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n"
" \n"
" const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n"
" for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n"
" const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n"
" for(int fw=0; fw<filter_hw.y; fw++) {\n"
" const int in_w0_idx=fw*dilate_hw.y+in_w0_idx_base;\n"
" const int in_w1_idx=fw*dilate_hw.y+in_w1_idx_base;\n"
" const int in_w2_idx=fw*dilate_hw.y+in_w2_idx_base;\n"
" const int in_w3_idx=fw*dilate_hw.y+in_w3_idx_base;\n"
" COMPUTE_FLOAT4 in0=(in_w0_idx<0 || in_w0_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w0_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in1=(in_w1_idx<0 || in_w1_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w1_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in2=(in_w2_idx<0 || in_w2_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w2_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in3=(in_w3_idx<0 || in_w3_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w3_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*2));\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*3));\n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" \n"
" out1=mad(in1.x,weight0,out1);\n"
" out1=mad(in1.y,weight1,out1);\n"
" out1=mad(in1.z,weight2,out1);\n"
" out1=mad(in1.w,weight3,out1);\n"
" \n"
" out2=mad(in2.x,weight0,out2);\n"
" out2=mad(in2.y,weight1,out2);\n"
" out2=mad(in2.z,weight2,out2);\n"
" out2=mad(in2.w,weight3,out2);\n"
" \n"
" out3=mad(in3.x,weight0,out3);\n"
" out3=mad(in3.y,weight1,out3);\n"
" out3=mad(in3.z,weight2,out3);\n"
" out3=mad(in3.w,weight3,out3);\n"
" \n"
" weight_offset += 4;\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT4)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_hw.y-out_w_idx;\n"
" if (remain >= 4) {\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
" }else if(remain == 3){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2,output+out_offset);\n"
" }else if(remain == 2){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" }\n"
"#else\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_c4h4w1(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *weight,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int out_c_base_index\n"
") {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx=out_c_w_idx/out_w_blocks+out_c_base_index;\n"
" if(out_c_idx >= out_c_blocks) return;\n"
" const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
" const int out_b_idx=out_b_h_idx/out_h_blocks;//equal to in_b_idx\n"
" const int out_h_idx=(out_b_h_idx % out_h_blocks) << 2;\n"
" \n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
" COMPUTE_FLOAT4 out1=out0;\n"
" COMPUTE_FLOAT4 out2=out0;\n"
" COMPUTE_FLOAT4 out3=out0;\n"
" const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_h0_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" const int in_h1_idx_base=in_h0_idx_base+stride_hw.x;\n"
" const int in_h2_idx_base=in_h1_idx_base+stride_hw.x;\n"
" const int in_h3_idx_base=in_h2_idx_base+stride_hw.x;\n"
" \n"
" const int kw_start=select(0,(-in_w_idx_base+dilate_hw.y-1)/dilate_hw.y,in_w_idx_base<0);\n"
" const int in_w_idx_start=mad24(kw_start,dilate_hw.y,in_w_idx_base);\n"
" const int in_w_idx_end=min(mad24(filter_hw.y,dilate_hw.y,in_w_idx_base),in_hw.y);\n"
" \n"
" const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n"
" const int in_hw_size=in_hw.x*in_hw.y;\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
" const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n"
" for(int iy=0; iy<filter_hw.x; iy++) {\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
" const int in_h0_idx=(iy*dilate_hw.x+in_h0_idx_base)*in_hw.y;\n"
" const int in_h1_idx=(iy*dilate_hw.x+in_h1_idx_base)*in_hw.y;\n"
" const int in_h2_idx=(iy*dilate_hw.x+in_h2_idx_base)*in_hw.y;\n"
" const int in_h3_idx=(iy*dilate_hw.x+in_h3_idx_base)*in_hw.y;\n"
" for(int fw=in_w_idx_start; fw<in_w_idx_end; fw += dilate_hw.y) {\n"
" COMPUTE_FLOAT4 in0=(in_h0_idx<0 || in_h0_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h0_idx+fw,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in1=(in_h1_idx<0 || in_h1_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h1_idx+fw,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in2=(in_h2_idx<0 || in_h2_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h2_idx+fw,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in3=(in_h3_idx<0 || in_h3_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h3_idx+fw,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*2));\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*3));\n"
" \n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" \n"
" out1=mad(in1.x,weight0,out1);\n"
" out1=mad(in1.y,weight1,out1);\n"
" out1=mad(in1.z,weight2,out1);\n"
" out1=mad(in1.w,weight3,out1);\n"
" \n"
" out2=mad(in2.x,weight0,out2);\n"
" out2=mad(in2.y,weight1,out2);\n"
" out2=mad(in2.z,weight2,out2);\n"
" out2=mad(in2.w,weight3,out2);\n"
" \n"
" out3=mad(in3.x,weight0,out3);\n"
" out3=mad(in3.y,weight1,out3);\n"
" out3=mad(in3.z,weight2,out3);\n"
" out3=mad(in3.w,weight3,out3);\n"
" \n"
" weight_offset += 4;\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT4)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_hw.x-out_h_idx;\n"
" if(remain >= 4){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n"
" }else if(remain == 3){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n"
" }else if(remain == 2){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" }\n"
"#else\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *weight,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int out_c_base_index\n"
") {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx_0=((out_c_w_idx/out_w_blocks+out_c_base_index) << 1);\n"
" if(out_c_idx_0 >= out_c_blocks) return;\n"
" const int out_c_idx_1=out_c_idx_0+1;\n"
" const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
" const int out_b_idx=out_b_h_idx/out_h_blocks;//equal to in_b_idx\n"
" const int out_h_idx=(out_b_h_idx % out_h_blocks) << 2;\n"
" \n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n"
" COMPUTE_FLOAT4 out1=out0;\n"
" COMPUTE_FLOAT4 out2=out0;\n"
" COMPUTE_FLOAT4 out3=out0;\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
" #else\n"
" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
" #endif\n"
" COMPUTE_FLOAT4 out5=out4;\n"
" COMPUTE_FLOAT4 out6=out4;\n"
" COMPUTE_FLOAT4 out7=out4;\n"
" const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_h0_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" const int in_h1_idx_base=in_h0_idx_base+stride_hw.x;\n"
" const int in_h2_idx_base=in_h1_idx_base+stride_hw.x;\n"
" const int in_h3_idx_base=in_h2_idx_base+stride_hw.x;\n"
" \n"
" const int kw_start=select(0,(-in_w_idx_base+dilate_hw.y-1)/dilate_hw.y,in_w_idx_base<0);\n"
" const int in_w_idx_start=mad24(kw_start,dilate_hw.y,in_w_idx_base);\n"
" const int in_w_idx_end=min(mad24(filter_hw.y,dilate_hw.y,in_w_idx_base),in_hw.y);\n"
" \n"
" const int weight_oc_offset=filter_hw.x*filter_hw.y*4;\n"
" const int weight_ic_offset=out_c_blocks*weight_oc_offset;\n"
" const int in_hw_size=in_hw.x*in_hw.y;\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" //weights NC4HW4 [ic/4,ic_4,oc/4,kh*kw,oc_4]\n"
" //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n"
" const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n"
" for(int iy=0; iy<filter_hw.x; iy++) {\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
" const int in_h0_idx=(iy*dilate_hw.x+in_h0_idx_base)*in_hw.y;\n"
" const int in_h1_idx=(iy*dilate_hw.x+in_h1_idx_base)*in_hw.y;\n"
" const int in_h2_idx=(iy*dilate_hw.x+in_h2_idx_base)*in_hw.y;\n"
" const int in_h3_idx=(iy*dilate_hw.x+in_h3_idx_base)*in_hw.y;\n"
" for(int fw=in_w_idx_start; fw<in_w_idx_end; fw += dilate_hw.y) {\n"
" COMPUTE_FLOAT4 in0=(in_h0_idx<0 || in_h0_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h0_idx+fw,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in1=(in_h1_idx<0 || in_h1_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h1_idx+fw,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in2=(in_h2_idx<0 || in_h2_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h2_idx+fw,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in3=(in_h3_idx<0 || in_h3_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h3_idx+fw,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset));\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*2));\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*3));\n"
" \n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" \n"
" out1=mad(in1.x,weight0,out1);\n"
" out1=mad(in1.y,weight1,out1);\n"
" out1=mad(in1.z,weight2,out1);\n"
" out1=mad(in1.w,weight3,out1);\n"
" \n"
" out2=mad(in2.x,weight0,out2);\n"
" out2=mad(in2.y,weight1,out2);\n"
" out2=mad(in2.z,weight2,out2);\n"
" out2=mad(in2.w,weight3,out2);\n"
" \n"
" out3=mad(in3.x,weight0,out3);\n"
" out3=mad(in3.y,weight1,out3);\n"
" out3=mad(in3.z,weight2,out3);\n"
" out3=mad(in3.w,weight3,out3);\n"
" // weight: [ic/4,ic_4,oc/4,kh*kw,oc_4]\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" weight0=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
" weight1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
" weight2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
" weight3=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
" #else\n"
" weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
" weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
" weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
" weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
" #endif\n"
" out4=mad(in0.x,weight0,out4);\n"
" out4=mad(in0.y,weight1,out4);\n"
" out4=mad(in0.z,weight2,out4);\n"
" out4=mad(in0.w,weight3,out4);\n"
" \n"
" out5=mad(in1.x,weight0,out5);\n"
" out5=mad(in1.y,weight1,out5);\n"
" out5=mad(in1.z,weight2,out5);\n"
" out5=mad(in1.w,weight3,out5);\n"
" \n"
" out6=mad(in2.x,weight0,out6);\n"
" out6=mad(in2.y,weight1,out6);\n"
" out6=mad(in2.z,weight2,out6);\n"
" out6=mad(in2.w,weight3,out6);\n"
" \n"
" out7=mad(in3.x,weight0,out7);\n"
" out7=mad(in3.y,weight1,out7);\n"
" out7=mad(in3.z,weight2,out7);\n"
" out7=mad(in3.w,weight3,out7);\n"
" \n"
" weight_offset += 4;\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT4)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT4)0);\n"
" out4=fmax(out4,(COMPUTE_FLOAT4)0);\n"
" out5=fmax(out5,(COMPUTE_FLOAT4)0);\n"
" out6=fmax(out6,(COMPUTE_FLOAT4)0);\n"
" out7=fmax(out7,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out4=clamp(out4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out5=clamp(out5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_hw.x-out_h_idx;\n"
" if(remain >= 4){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n"
" }else if(remain == 3){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n"
" }else if(remain == 2){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" }\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_blocks){\n"
" return;\n"
" }\n"
" #endif\n"
" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
" if(remain >= 4){\n"
" vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out6),2*out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out7),3*out_hw.y,output+out_offset);\n"
" }else if(remain == 3){\n"
" vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out6),2*out_hw.y,output+out_offset);\n"
" }else if(remain == 2){\n"
" vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
" }\n"
"#else\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_blocks){\n"
" return;\n"
" }\n"
" #endif\n"
" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
" vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out6),2*out_hw.y,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out7),3*out_hw.y,output+out_offset);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *weight,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int out_c_base_index\n"
") {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx_0=(out_c_w_idx/out_w_blocks+out_c_base_index) << 1;\n"
" if(out_c_idx_0 >= out_c_blocks) return;\n"
" const int out_c_idx_1=out_c_idx_0+1;\n"
" const int out_w_idx=out_c_w_idx % out_w_blocks;\n"
" const int out_b_idx=out_b_h_idx/out_h_blocks;//equal to in_b_idx\n"
" const int out_h_idx=(out_b_h_idx % out_h_blocks) << 1;\n"
" \n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n"
" COMPUTE_FLOAT4 out1=out0;\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" COMPUTE_FLOAT4 out2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
" #else\n"
" COMPUTE_FLOAT4 out2=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
" #endif\n"
" COMPUTE_FLOAT4 out3=out2;\n"
" \n"
" const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_h0_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" const int in_h1_idx_base=in_h0_idx_base+stride_hw.x;\n"
" \n"
" const int kw_start=select(0,(-in_w_idx_base+dilate_hw.y-1)/dilate_hw.y,in_w_idx_base<0);\n"
" const int in_w_idx_start=mad24(kw_start,dilate_hw.y,in_w_idx_base);\n"
" const int in_w_idx_end=min(mad24(filter_hw.y,dilate_hw.y,in_w_idx_base),in_hw.y);\n"
" \n"
" const int weight_oc_offset=filter_hw.x*filter_hw.y*4;\n"
" const int weight_ic_offset=out_c_blocks*weight_oc_offset;\n"
" const int in_hw_size=in_hw.x*in_hw.y;\n"
" // weight: [ic/4,oc,4],loop: ic/4\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n"
" const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n"
" for(int iy=0; iy<filter_hw.x; iy++) {\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n"
" const int in_h0_idx=(iy*dilate_hw.x+in_h0_idx_base)*in_hw.y;\n"
" const int in_h1_idx=(iy*dilate_hw.x+in_h1_idx_base)*in_hw.y;\n"
" for(int fw=in_w_idx_start; fw<in_w_idx_end; fw += dilate_hw.y) {\n"
" COMPUTE_FLOAT4 in0=(in_h0_idx<0 || in_h0_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h0_idx+fw,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in1=(in_h1_idx<0 || in_h1_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h1_idx+fw,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset));\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*2));\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*3));\n"
" \n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" \n"
" out1=mad(in1.x,weight0,out1);\n"
" out1=mad(in1.y,weight1,out1);\n"
" out1=mad(in1.z,weight2,out1);\n"
" out1=mad(in1.w,weight3,out1);\n"
" \n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" weight0=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
" weight1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
" weight2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
" weight3=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
" #else\n"
" weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
" weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
" weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
" weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
" #endif\n"
" out2=mad(in0.x,weight0,out2);\n"
" out2=mad(in0.y,weight1,out2);\n"
" out2=mad(in0.z,weight2,out2);\n"
" out2=mad(in0.w,weight3,out2);\n"
" \n"
" out3=mad(in1.x,weight0,out3);\n"
" out3=mad(in1.y,weight1,out3);\n"
" out3=mad(in1.z,weight2,out3);\n"
" out3=mad(in1.w,weight3,out3);\n"
" \n"
" weight_offset += 4;\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT4)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_hw.x-out_h_idx;\n"
" if(remain >= 2){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" }\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_blocks){\n"
" return;\n"
" }\n"
" #endif\n"
" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
" if(remain >= 2){\n"
" vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n"
" }\n"
"#else\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_blocks){\n"
" return;\n"
" }\n"
" #endif\n"
" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
" vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
" __global const FLOAT *weight,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int out_c_base_index\n"
") {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx_0=(out_c_w_idx/out_w_blocks+out_c_base_index) << 1;\n"
" if(out_c_idx_0 >= out_c_blocks) return;\n"
" const int out_c_idx_1=out_c_idx_0+1;\n"
" const int out_w_idx=(out_c_w_idx % out_w_blocks) << 2;\n"
" const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_hw.x;\n"
" \n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n"
" COMPUTE_FLOAT4 out1=out0;\n"
" COMPUTE_FLOAT4 out2=out0;\n"
" COMPUTE_FLOAT4 out3=out0;\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
" #else\n"
" COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
" #endif\n"
" COMPUTE_FLOAT4 out5=out4;\n"
" COMPUTE_FLOAT4 out6=out4;\n"
" COMPUTE_FLOAT4 out7=out4;\n"
" const int in_w0_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_w1_idx_base=in_w0_idx_base+stride_hw.y;\n"
" const int in_w2_idx_base=in_w1_idx_base+stride_hw.y;\n"
" const int in_w3_idx_base=in_w2_idx_base+stride_hw.y;\n"
" const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" \n"
" const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n"
" const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n"
" const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n"
" \n"
" const int weight_oc_offset=filter_hw.x*filter_hw.y*4;\n"
" const int weight_ic_offset=out_c_blocks*weight_oc_offset;\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n"
" for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n"
" const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n"
" for(int fw=0; fw<filter_hw.y; fw++) {\n"
" const int in_w0_idx=fw*dilate_hw.y+in_w0_idx_base;\n"
" const int in_w1_idx=fw*dilate_hw.y+in_w1_idx_base;\n"
" const int in_w2_idx=fw*dilate_hw.y+in_w2_idx_base;\n"
" const int in_w3_idx=fw*dilate_hw.y+in_w3_idx_base;\n"
" COMPUTE_FLOAT4 in0=(in_w0_idx<0 || in_w0_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w0_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in1=(in_w1_idx<0 || in_w1_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w1_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in2=(in_w2_idx<0 || in_w2_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w2_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in3=(in_w3_idx<0 || in_w3_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w3_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset));\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*2));\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*3));\n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" \n"
" out1=mad(in1.x,weight0,out1);\n"
" out1=mad(in1.y,weight1,out1);\n"
" out1=mad(in1.z,weight2,out1);\n"
" out1=mad(in1.w,weight3,out1);\n"
" \n"
" out2=mad(in2.x,weight0,out2);\n"
" out2=mad(in2.y,weight1,out2);\n"
" out2=mad(in2.z,weight2,out2);\n"
" out2=mad(in2.w,weight3,out2);\n"
" \n"
" out3=mad(in3.x,weight0,out3);\n"
" out3=mad(in3.y,weight1,out3);\n"
" out3=mad(in3.z,weight2,out3);\n"
" out3=mad(in3.w,weight3,out3);\n"
" \n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" weight0=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
" weight1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
" weight2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
" weight3=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
" #else\n"
" weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n"
" weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n"
" weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n"
" weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n"
" #endif\n"
" out4=mad(in0.x,weight0,out4);\n"
" out4=mad(in0.y,weight1,out4);\n"
" out4=mad(in0.z,weight2,out4);\n"
" out4=mad(in0.w,weight3,out4);\n"
" \n"
" out5=mad(in1.x,weight0,out5);\n"
" out5=mad(in1.y,weight1,out5);\n"
" out5=mad(in1.z,weight2,out5);\n"
" out5=mad(in1.w,weight3,out5);\n"
" \n"
" out6=mad(in2.x,weight0,out6);\n"
" out6=mad(in2.y,weight1,out6);\n"
" out6=mad(in2.z,weight2,out6);\n"
" out6=mad(in2.w,weight3,out6);\n"
" \n"
" out7=mad(in3.x,weight0,out7);\n"
" out7=mad(in3.y,weight1,out7);\n"
" out7=mad(in3.z,weight2,out7);\n"
" out7=mad(in3.w,weight3,out7);\n"
" \n"
" weight_offset += 4;\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT4)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT4)0);\n"
" out4=fmax(out4,(COMPUTE_FLOAT4)0);\n"
" out5=fmax(out5,(COMPUTE_FLOAT4)0);\n"
" out6=fmax(out6,(COMPUTE_FLOAT4)0);\n"
" out7=fmax(out7,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out4=clamp(out4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out5=clamp(out5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_hw.y-out_w_idx;\n"
" if(remain >= 4){\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
" }else if(remain == 3){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2,output+out_offset);\n"
" }else if(remain == 2){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" }\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_blocks)return;\n"
" #endif\n"
" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
" if(remain >= 4){\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n"
" }else if(remain == 3){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out6),2,output+out_offset);\n"
" }else if(remain == 2){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
" }\n"
"#else\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_blocks)return;\n"
" #endif\n"
" out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n"
"#endif\n"
"}\n"
;
#endif
}