source/backend/opencl/execution/cl/conv_2d_buf_mnn

#include "opencl_source_map.hpp" namespace MNN { #ifndef MNN_OPENCL_BUFFER_CLOSED const char* conv_2d_buf = "#ifdef MNN_SUPPORT_FP16\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" "#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0,__private const int global_size_dim1,\n" "#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n" "#ifdef CONV_LOCAL_SIZE\n" "__kernel\n" "void conv_2d_1x1_local(__private const int out_w_blocks,\n" " __global const FLOAT *input,\n" " __global const FLOAT *kernel_ptr,\n" " __global const FLOAT *bias_ptr,\n" " __global FLOAT *output,\n" " __private const int in_c_block,\n" " __private const int batch,\n" " __private const int out_h,\n" " __private const int out_w,\n" " __private const int out_c_block,\n" " __private const int out_c_pack) {\n" " const int lid=get_local_id(0);\n" " const int out_c_w_idx=get_global_id(1); //c/4 w\n" " const int out_b_h_idx=get_global_id(2); //b h\n" " \n" " COMPUTE_FLOAT4 local sum_mnn[CONV_LOCAL_SIZE];\n" " \n" " const int out_c_idx=out_c_w_idx/out_w_blocks;\n" " const int out_w_idx=out_c_w_idx % out_w_blocks;\n" " const int out_b_idx=out_b_h_idx/out_h; // equal to in_b_idx\n" " const int out_h_idx=out_b_h_idx % out_h; // equal to in_h_idx\n" " COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias_ptr));\n" " COMPUTE_FLOAT4 out0=(COMPUTE_FLOAT4)0;\n" " int offset=out_c_idx*4;\n" " int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+out_w_idx) << 2;\n" " \n" " const int inp_add=batch*out_h*out_w*4;\n" " for (ushort in_channel_block_idx=lid; in_channel_block_idx<in_c_block; in_channel_block_idx+=CONV_LOCAL_SIZE) {\n" " \n" " int offset=mad24(in_channel_block_idx*4,out_c_pack,out_c_idx*4);\n" " COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset+in_channel_block_idx*inp_add));\n" " COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n" " COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n" " COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n" " COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n" " out0=mad(in0.x,weights0,out0);\n" " out0=mad(in0.y,weights1,out0);\n" " out0=mad(in0.z,weights2,out0);\n" " out0=mad(in0.w,weights3,out0);\n" " }\n" " \n" " sum_mnn[lid]=out0;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " for(int i=CONV_LOCAL_SIZE/2; i>0; i /= 2){\n" " if (lid<i)\n" " sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " }\n" " out0=sum_mnn[0]+bias0;\n" " if(lid == 0){\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " const int out_offset=(((out_b_idx+out_c_idx*batch)*out_h+out_h_idx)* out_w+out_w_idx)*4;\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" "}\n" "#endif\n" "__kernel\n" "void conv_2d_1x1_c4h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,\n" " __global const FLOAT *input,\n" " __global const FLOAT *kernel_ptr,\n" " __global const FLOAT *bias_ptr,\n" " __global FLOAT *output,\n" " __private const int in_c_block,\n" " __private const int out_h,\n" " __private const int out_w,\n" " __private const int out_b,\n" " __private const int out_c_block,\n" " __private const int out_c_pack) {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx=out_c_w_idx/out_w_blocks;\n" " const int out_w_idx=out_c_w_idx % out_w_blocks;\n" " const int out_b_idx=out_b_h_idx/out_h; // equal to in_b_idx\n" " const int out_h_idx=out_b_h_idx % out_h; // equal to in_h_idx\n" " const int out_w4_idx=mul24(out_w_idx,4);\n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias_ptr));\n" " COMPUTE_FLOAT4 out1=out0;\n" " COMPUTE_FLOAT4 out2=out0;\n" " COMPUTE_FLOAT4 out3=out0;\n" " const int intput_width_idx0=out_w4_idx;\n" " int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0) << 2;\n" " int offset=out_c_idx*4;\n" " const int inp_add=out_b*out_h*out_w*4;\n" " for (ushort in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n" " \n" " COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n" " COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(1,input+inp_offset));\n" " COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4(vload4(2,input+inp_offset));\n" " COMPUTE_FLOAT4 in3=CONVERT_COMPUTE_FLOAT4(vload4(3,input+inp_offset));\n" " COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n" " COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n" " COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n" " COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n" " out0=mad(in0.x,weights0,out0);\n" " out0=mad(in0.y,weights1,out0);\n" " out0=mad(in0.z,weights2,out0);\n" " out0=mad(in0.w,weights3,out0);\n" " \n" " out1=mad(in1.x,weights0,out1);\n" " out1=mad(in1.y,weights1,out1);\n" " out1=mad(in1.z,weights2,out1);\n" " out1=mad(in1.w,weights3,out1);\n" " \n" " out2=mad(in2.x,weights0,out2);\n" " out2=mad(in2.y,weights1,out2);\n" " out2=mad(in2.z,weights2,out2);\n" " out2=mad(in2.w,weights3,out2);\n" " \n" " out3=mad(in3.x,weights0,out3);\n" " out3=mad(in3.y,weights1,out3);\n" " out3=mad(in3.z,weights2,out3);\n" " out3=mad(in3.w,weights3,out3);\n" " \n" " offset += 4*out_c_pack;\n" " inp_offset += inp_add;\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" " out1=fmax(out1,(COMPUTE_FLOAT4)0);\n" " out2=fmax(out2,(COMPUTE_FLOAT4)0);\n" " out3=fmax(out3,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " const int out_offset=(((out_b_idx+out_c_idx*out_b)*out_h+out_h_idx)* out_w+out_w4_idx)*4;\n" "#ifdef BLOCK_LEAVE\n" " const int remain=out_w-out_w4_idx;\n" " if (remain >= 4) {\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n" " } else if (remain == 3) {\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out2),2,output+out_offset);\n" " } else if (remain == 2) {\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n" " } else if (remain == 1) {\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" "#else\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n" "#endif\n" "}\n" "__kernel\n" "void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,\n" " __global const FLOAT *input,\n" " __global const FLOAT *kernel_ptr,\n" " __global const FLOAT *bias_ptr,\n" " __global FLOAT *output,\n" " __private const int in_c_block,\n" " __private const int out_h,\n" " __private const int out_w,\n" " __private const int out_b,\n" " __private const int out_c_block,\n" " __private const int out_c_pack) {\n" " const int out_c_w_idx=get_global_id(0); //c/8 w/4\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n" " const int out_c_idx_1=out_c_idx_0+1;\n" " const int out_w_idx=out_c_w_idx % out_w_blocks;\n" " const int out_b_idx=out_b_h_idx/out_h;//equal to in_b_idx\n" " const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n" " const int out_w4_idx=mul24(out_w_idx,4);\n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias_ptr));\n" " COMPUTE_FLOAT4 out1=out0;\n" " COMPUTE_FLOAT4 out2=out0;\n" " COMPUTE_FLOAT4 out3=out0;\n" " \n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_block ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n" " COMPUTE_FLOAT4 out5=out4;\n" " COMPUTE_FLOAT4 out6=out4;\n" " COMPUTE_FLOAT4 out7=out4;\n" " #else\n" " COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n" " COMPUTE_FLOAT4 out5=out4;\n" " COMPUTE_FLOAT4 out6=out4;\n" " COMPUTE_FLOAT4 out7=out4;\n" " #endif\n" " const int intput_width_idx0=out_w4_idx;\n" " int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0)<<2;\n" " int offset=out_c_idx_0*4;\n" " const int inp_add=out_b*out_h*out_w*4;\n" " for (int in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n" " \n" " COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n" " COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(1,input+inp_offset));\n" " COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4(vload4(2,input+inp_offset));\n" " COMPUTE_FLOAT4 in3=CONVERT_COMPUTE_FLOAT4(vload4(3,input+inp_offset));\n" " \n" " // output_channel at least pack to 8,no need boundry protect\n" " COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n" " COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset));\n" " COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n" " COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack));\n" " COMPUTE_FLOAT4 weights4=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n" " COMPUTE_FLOAT4 weights5=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack+out_c_pack));\n" " COMPUTE_FLOAT4 weights6=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n" " COMPUTE_FLOAT4 weights7=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n" " out0=mad(in0.x,weights0,out0);\n" " out0=mad(in0.y,weights2,out0);\n" " out0=mad(in0.z,weights4,out0);\n" " out0=mad(in0.w,weights6,out0);\n" " \n" " out1=mad(in1.x,weights0,out1);\n" " out1=mad(in1.y,weights2,out1);\n" " out1=mad(in1.z,weights4,out1);\n" " out1=mad(in1.w,weights6,out1);\n" " \n" " out2=mad(in2.x,weights0,out2);\n" " out2=mad(in2.y,weights2,out2);\n" " out2=mad(in2.z,weights4,out2);\n" " out2=mad(in2.w,weights6,out2);\n" " \n" " out3=mad(in3.x,weights0,out3);\n" " out3=mad(in3.y,weights2,out3);\n" " out3=mad(in3.z,weights4,out3);\n" " out3=mad(in3.w,weights6,out3);\n" " \n" " out4=mad(in0.x,weights1,out4);\n" " out4=mad(in0.y,weights3,out4);\n" " out4=mad(in0.z,weights5,out4);\n" " out4=mad(in0.w,weights7,out4);\n" " \n" " out5=mad(in1.x,weights1,out5);\n" " out5=mad(in1.y,weights3,out5);\n" " out5=mad(in1.z,weights5,out5);\n" " out5=mad(in1.w,weights7,out5);\n" " \n" " out6=mad(in2.x,weights1,out6);\n" " out6=mad(in2.y,weights3,out6);\n" " out6=mad(in2.z,weights5,out6);\n" " out6=mad(in2.w,weights7,out6);\n" " \n" " out7=mad(in3.x,weights1,out7);\n" " out7=mad(in3.y,weights3,out7);\n" " out7=mad(in3.z,weights5,out7);\n" " out7=mad(in3.w,weights7,out7);\n" " \n" " offset += 4*out_c_pack;\n" " inp_offset += inp_add;\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" " out1=fmax(out1,(COMPUTE_FLOAT4)0);\n" " out2=fmax(out2,(COMPUTE_FLOAT4)0);\n" " out3=fmax(out3,(COMPUTE_FLOAT4)0);\n" " \n" " out4=fmax(out4,(COMPUTE_FLOAT4)0);\n" " out5=fmax(out5,(COMPUTE_FLOAT4)0);\n" " out6=fmax(out6,(COMPUTE_FLOAT4)0);\n" " out7=fmax(out7,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " \n" " out4=clamp(out4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out5=clamp(out5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " const int out_offset=(((out_b_idx+out_c_idx_0*out_b)*out_h+out_h_idx)* out_w+out_w4_idx)*4;\n" " __global FLOAT*_tempoutput=output+out_offset;\n" " __global FLOAT*_tempoutput1=_tempoutput+4*out_h*out_w*out_b;\n" "#ifdef BLOCK_LEAVE\n" " const int remain=out_w-out_w4_idx;\n" " if (remain >= 4) {\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,_tempoutput);\n" " } else if (remain == 3) {\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,_tempoutput);\n" " vstore4(CONVERT_FLOAT4(out2),2,_tempoutput);\n" " } else if (remain == 2) {\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,_tempoutput);\n" " } else if (remain == 1) {\n" " vstore4(CONVERT_FLOAT4(out0),0,_tempoutput);\n" " }\n" "#ifdef CHANNEL_BOUNDARY_PROTECT\n" " if(out_c_idx_1 >= out_c_block) {\n" " return;\n" " }\n" "#endif\n" " if (remain >= 4) {\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,_tempoutput1);\n" " } else if (remain == 3) {\n" " vstore8(CONVERT_FLOAT8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5))),0,_tempoutput1);\n" " vstore4(CONVERT_FLOAT4(out6),2,_tempoutput1);\n" " } else if (remain == 2) {\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,_tempoutput1);\n" " } else if (remain == 1) {\n" " vstore4(CONVERT_FLOAT4(out4),0,_tempoutput1);\n" " }\n" "#else\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,_tempoutput);\n" "#ifdef CHANNEL_BOUNDARY_PROTECT\n" " if(out_c_idx_1 >= out_c_block) {\n" " return;\n" " }\n" "#endif\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,_tempoutput1);\n" "#endif\n" "}\n" "__kernel\n" "void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,\n" " __global const FLOAT *input,\n" " __global const FLOAT *kernel_ptr,\n" " __global const FLOAT *bias_ptr,\n" " __global FLOAT *output,\n" " __private const int in_c_block,\n" " __private const int out_h,\n" " __private const int out_w,\n" " __private const int out_b,\n" " __private const int out_c_block,\n" " __private const int out_c_pack) {\n" " const int out_c_w_idx=get_global_id(0); //c/8 w/4\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n" " const int out_c_idx_1=out_c_idx_0+1;\n" " const int out_w_idx=out_c_w_idx % out_w_blocks;\n" " const int out_b_idx=out_b_h_idx/out_h;//equal to in_b_idx\n" " const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n" " \n" " const int out_w2_idx=mul24(out_w_idx,2);\n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias_ptr));\n" " COMPUTE_FLOAT4 out1=out0;\n" " \n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_block ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n" " #else\n" " COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias_ptr));\n" " #endif\n" " COMPUTE_FLOAT4 out5=out4;\n" " const int intput_width_idx0=out_w2_idx;\n" " int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0)<<2;\n" " int offset=out_c_idx_0*4;\n" " const int inp_add=out_b*out_h*out_w*4;\n" " for (int in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n" " \n" " COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n" " COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(1,input+inp_offset));\n" " COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n" " COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset));\n" " COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n" " COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack));\n" " COMPUTE_FLOAT4 weights4=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n" " COMPUTE_FLOAT4 weights5=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack+out_c_pack));\n" " COMPUTE_FLOAT4 weights6=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n" " COMPUTE_FLOAT4 weights7=CONVERT_COMPUTE_FLOAT4(vload4(1,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n" " out0=mad(in0.x,weights0,out0);\n" " out0=mad(in0.y,weights2,out0);\n" " out0=mad(in0.z,weights4,out0);\n" " out0=mad(in0.w,weights6,out0);\n" " \n" " out1=mad(in1.x,weights0,out1);\n" " out1=mad(in1.y,weights2,out1);\n" " out1=mad(in1.z,weights4,out1);\n" " out1=mad(in1.w,weights6,out1);\n" " \n" " out4=mad(in0.x,weights1,out4);\n" " out4=mad(in0.y,weights3,out4);\n" " out4=mad(in0.z,weights5,out4);\n" " out4=mad(in0.w,weights7,out4);\n" " \n" " out5=mad(in1.x,weights1,out5);\n" " out5=mad(in1.y,weights3,out5);\n" " out5=mad(in1.z,weights5,out5);\n" " out5=mad(in1.w,weights7,out5);\n" " \n" " offset += 4*out_c_pack;\n" " inp_offset += inp_add;\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" " out1=fmax(out1,(COMPUTE_FLOAT4)0);\n" " out4=fmax(out4,(COMPUTE_FLOAT4)0);\n" " out5=fmax(out5,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out4=clamp(out4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out5=clamp(out5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " const int out_offset=(((out_b_idx+out_c_idx_0*out_b)*out_h+out_h_idx)* out_w+out_w2_idx)*4;\n" " __global FLOAT*_tempoutput=output+out_offset;\n" " __global FLOAT*_tempoutput1=_tempoutput+4*out_h*out_w*out_b;\n" "#ifdef BLOCK_LEAVE\n" " const int remain=out_w-out_w2_idx;\n" " if (remain >= 2) {\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,_tempoutput);\n" " } else if (remain == 1) {\n" " vstore4(CONVERT_FLOAT4(out0),0,_tempoutput);\n" " }\n" "#ifdef CHANNEL_BOUNDARY_PROTECT\n" " if(out_c_idx_1 >= out_c_block) {\n" " return;\n" " }\n" "#endif\n" " if (remain >= 2) {\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,_tempoutput1);\n" " } else if (remain == 1) {\n" " vstore4(CONVERT_FLOAT4(out4),0,_tempoutput1);\n" " }\n" "#else\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,_tempoutput);\n" "#ifdef CHANNEL_BOUNDARY_PROTECT\n" " if(out_c_idx_1 >= out_c_block) {\n" " return;\n" " }\n" "#endif\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,_tempoutput1);\n" "#endif\n" "}\n" "__kernel\n" "void conv_2d_1x1_c4h1w1(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,\n" " __global const FLOAT *input,\n" " __global const FLOAT *kernel_ptr,\n" " __global const FLOAT *bias_ptr,\n" " __global FLOAT *output,\n" " __private const int in_c_block,\n" " __private const int out_h,\n" " __private const int out_w,\n" " __private const int out_b,\n" " __private const int out_c_block,\n" " __private const int out_c_pack) {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx=out_c_w_idx/out_w;\n" " const int out_w_idx=out_c_w_idx % out_w;\n" " const int out_b_idx=out_b_h_idx/out_h;//equal to in_b_idx\n" " const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias_ptr));\n" " const int intput_width_idx0=out_w_idx;\n" " int offset=out_c_idx*4;\n" " int inp_offset=((out_b_idx*out_h+out_h_idx)*out_w+intput_width_idx0)*4;\n" " const int inp_add=out_b*out_h*out_w*4;\n" " \n" " for (int in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n" " \n" " \n" " COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n" " COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n" " COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n" " COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n" " COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n" " out0=mad(in0.x,weights0,out0);\n" " out0=mad(in0.y,weights1,out0);\n" " out0=mad(in0.z,weights2,out0);\n" " out0=mad(in0.w,weights3,out0);\n" " \n" " offset += 4*out_c_pack;\n" " inp_offset += inp_add;\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " const int out_offset=(((out_b_idx+out_c_idx*out_b)*out_h+out_h_idx)* out_w+out_w_idx)*4;\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" "}\n" "__kernel\n" "void conv_2d_1x1_c4h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,\n" " __global const FLOAT *input,\n" " __global const FLOAT *kernel_ptr,\n" " __global const FLOAT *bias_ptr,\n" " __global FLOAT *output,\n" " __private const int in_c_block,\n" " __private const int out_h,\n" " __private const int out_w,\n" " __private const int out_b,\n" " __private const int out_c_block,\n" " __private const int out_c_pack) {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx=out_c_w_idx/out_w_blocks;\n" " const int out_w_idx=out_c_w_idx % out_w_blocks;\n" " const int out_b_idx=out_b_h_idx/out_h;//equal to in_b_idx\n" " const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n" " const int out_w2_idx=mul24(out_w_idx,2);\n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias_ptr));\n" " COMPUTE_FLOAT4 out1=out0;\n" " const int intput_width_idx0=out_w2_idx;\n" " int offset=out_c_idx*4;\n" " int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0)*4;\n" " const int inp_add=out_b*out_h*out_w*4;\n" " \n" " for (int in_channel_block_idx=0; in_channel_block_idx<in_c_block; ++in_channel_block_idx) {\n" " \n" " COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n" " COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(1,input+inp_offset));\n" " COMPUTE_FLOAT4 weights0=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset));\n" " COMPUTE_FLOAT4 weights1=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack));\n" " COMPUTE_FLOAT4 weights2=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack));\n" " COMPUTE_FLOAT4 weights3=CONVERT_COMPUTE_FLOAT4(vload4(0,kernel_ptr+offset+out_c_pack+out_c_pack+out_c_pack));\n" " out0=mad(in0.x,weights0,out0);\n" " out0=mad(in0.y,weights1,out0);\n" " out0=mad(in0.z,weights2,out0);\n" " out0=mad(in0.w,weights3,out0);\n" " \n" " out1=mad(in1.x,weights0,out1);\n" " out1=mad(in1.y,weights1,out1);\n" " out1=mad(in1.z,weights2,out1);\n" " out1=mad(in1.w,weights3,out1);\n" " \n" " offset += 4*out_c_pack;\n" " inp_offset += inp_add;\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" " out1=fmax(out1,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " const int out_offset=(((out_b_idx+out_c_idx*out_b)*out_h+out_h_idx)* out_w+out_w2_idx)*4;\n" "#ifdef BLOCK_LEAVE\n" " const int remain=out_w-out_w2_idx;\n" " if (remain >= 2) {\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n" " } else if (remain == 1) {\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" "#else\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n" "#endif\n" "}\n" "__kernel\n" "void conv_2d_c4h1w1(GLOBAL_SIZE_2_DIMS\n" " __global const FLOAT *input,\n" " __global const FLOAT *weight,\n" " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" " __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" " __private const int2 pad_hw,\n" " __private const int2 dilate_hw,\n" " __private const int out_w_blocks,\n" " __private const int out_c_blocks,\n" " __private const int out_h_blocks,\n" " __private const int out_c_base_index\n" ") {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx=out_c_w_idx/out_hw.y+out_c_base_index;\n" " if(out_c_idx >= out_c_blocks) return;\n" " const int out_w_idx=out_c_w_idx % out_hw.y;\n" " const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n" " const int out_h_idx=out_b_h_idx % out_hw.x;\n" " \n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n" " \n" " const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n" " const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n" " \n" " const int kw_start=select(0,(-in_w_idx_base+dilate_hw.y-1)/dilate_hw.y,in_w_idx_base<0);\n" " const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n" " const int in_w_idx_start=mad24(kw_start,dilate_hw.y,in_w_idx_base);\n" " const int in_w_idx_end=min(mad24(filter_hw.y,dilate_hw.y,in_w_idx_base),in_hw.y);\n" " \n" " const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n" " const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n" " \n" " const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n" " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n" " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n" " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+kw_start)*4;\n" " for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n" " for(int ix=in_w_idx_start; ix<in_w_idx_end; ix += dilate_hw.y) {\n" " int inp_offset=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+ix)*4;\n" " COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n" " \n" " const int filter_w_inc=(ix-in_w_idx_start)/dilate_hw.y;\n" " COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc,weight+weight_offset));\n" " COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc,weight+weight_offset+weight_oc_offset));\n" " COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc,weight+weight_offset+weight_oc_offset*2));\n" " COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc,weight+weight_offset+weight_oc_offset*3));\n" " out0=mad(in0.x,weight0,out0);\n" " out0=mad(in0.y,weight1,out0);\n" " out0=mad(in0.z,weight2,out0);\n" " out0=mad(in0.w,weight3,out0);\n" " }\n" " weight_offset += 4*filter_hw.y;\n" " }\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " \n" "}\n" "__kernel\n" "void conv_2d_c4h1w2(GLOBAL_SIZE_2_DIMS\n" " __global const FLOAT *input,\n" " __global const FLOAT *weight,\n" " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" " __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" " __private const int2 pad_hw,\n" " __private const int2 dilate_hw,\n" " __private const int out_w_blocks,//generate width's num\n" " __private const int out_c_blocks,\n" " __private const int out_h_blocks,\n" " __private const int out_c_base_index\n" ") {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx=out_c_w_idx/out_w_blocks+out_c_base_index;\n" " if(out_c_idx >= out_c_blocks) return;\n" " const int out_w_idx=(out_c_w_idx % out_w_blocks) << 1;\n" " const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n" " const int out_h_idx=out_b_h_idx % out_hw.x;\n" " \n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n" " COMPUTE_FLOAT4 out1=out0;\n" " \n" " const int in_w0_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n" " const int in_w1_idx_base=in_w0_idx_base+stride_hw.y;\n" " const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n" " \n" " const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n" " const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n" " const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n" " \n" " const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n" " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n" " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n" " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n" " for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n" " const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n" " for(int fw=0; fw<filter_hw.y; fw++) {\n" " const int in_w0_idx=fw*dilate_hw.y+in_w0_idx_base;\n" " const int in_w1_idx=fw*dilate_hw.y+in_w1_idx_base;\n" " COMPUTE_FLOAT4 in0=(in_w0_idx<0 || in_w0_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w0_idx,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in1=(in_w1_idx<0 || in_w1_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w1_idx,input+inp_offset_base));\n" " \n" " COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n" " COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n" " COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*2));\n" " COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*3));\n" " out0=mad(in0.x,weight0,out0);\n" " out0=mad(in0.y,weight1,out0);\n" " out0=mad(in0.z,weight2,out0);\n" " out0=mad(in0.w,weight3,out0);\n" " \n" " out1=mad(in1.x,weight0,out1);\n" " out1=mad(in1.y,weight1,out1);\n" " out1=mad(in1.z,weight2,out1);\n" " out1=mad(in1.w,weight3,out1);\n" " \n" " weight_offset += 4;\n" " }\n" " }\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" " out1=fmax(out1,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" "#ifdef BLOCK_LEAVE\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " if(out_w_idx+1 >= out_hw.y) return;\n" " vstore4(CONVERT_FLOAT4(out1),1,output+out_offset);\n" "#else\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n" "#endif\n" "}\n" "__kernel\n" "void conv_2d_c4h1w4(GLOBAL_SIZE_2_DIMS\n" " __global const FLOAT *input,\n" " __global const FLOAT *weight,\n" " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" " __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" " __private const int2 pad_hw,\n" " __private const int2 dilate_hw,\n" " __private const int out_w_blocks,\n" " __private const int out_c_blocks,\n" " __private const int out_h_blocks,\n" " __private const int out_c_base_index\n" ") {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx=out_c_w_idx/out_w_blocks+out_c_base_index;\n" " if(out_c_idx >= out_c_blocks) return;\n" " const int out_w_idx=(out_c_w_idx % out_w_blocks) << 2;\n" " const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n" " const int out_h_idx=out_b_h_idx % out_hw.x;\n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n" " COMPUTE_FLOAT4 out1=out0;\n" " COMPUTE_FLOAT4 out2=out0;\n" " COMPUTE_FLOAT4 out3=out0;\n" " const int in_w0_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n" " const int in_w1_idx_base=in_w0_idx_base+stride_hw.y;\n" " const int in_w2_idx_base=in_w1_idx_base+stride_hw.y;\n" " const int in_w3_idx_base=in_w2_idx_base+stride_hw.y;\n" " const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n" " \n" " const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n" " const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n" " const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n" " \n" " const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n" " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n" " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n" " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n" " for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n" " const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n" " for(int fw=0; fw<filter_hw.y; fw++) {\n" " const int in_w0_idx=fw*dilate_hw.y+in_w0_idx_base;\n" " const int in_w1_idx=fw*dilate_hw.y+in_w1_idx_base;\n" " const int in_w2_idx=fw*dilate_hw.y+in_w2_idx_base;\n" " const int in_w3_idx=fw*dilate_hw.y+in_w3_idx_base;\n" " COMPUTE_FLOAT4 in0=(in_w0_idx<0 || in_w0_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w0_idx,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in1=(in_w1_idx<0 || in_w1_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w1_idx,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in2=(in_w2_idx<0 || in_w2_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w2_idx,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in3=(in_w3_idx<0 || in_w3_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w3_idx,input+inp_offset_base));\n" " COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n" " COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n" " COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*2));\n" " COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*3));\n" " out0=mad(in0.x,weight0,out0);\n" " out0=mad(in0.y,weight1,out0);\n" " out0=mad(in0.z,weight2,out0);\n" " out0=mad(in0.w,weight3,out0);\n" " \n" " out1=mad(in1.x,weight0,out1);\n" " out1=mad(in1.y,weight1,out1);\n" " out1=mad(in1.z,weight2,out1);\n" " out1=mad(in1.w,weight3,out1);\n" " \n" " out2=mad(in2.x,weight0,out2);\n" " out2=mad(in2.y,weight1,out2);\n" " out2=mad(in2.z,weight2,out2);\n" " out2=mad(in2.w,weight3,out2);\n" " \n" " out3=mad(in3.x,weight0,out3);\n" " out3=mad(in3.y,weight1,out3);\n" " out3=mad(in3.z,weight2,out3);\n" " out3=mad(in3.w,weight3,out3);\n" " \n" " weight_offset += 4;\n" " }\n" " }\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" " out1=fmax(out1,(COMPUTE_FLOAT4)0);\n" " out2=fmax(out2,(COMPUTE_FLOAT4)0);\n" " out3=fmax(out3,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" "#ifdef BLOCK_LEAVE\n" " const int remain=out_hw.y-out_w_idx;\n" " if (remain >= 4) {\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n" " }else if(remain == 3){\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out2),2,output+out_offset);\n" " }else if(remain == 2){\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n" " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" "#else\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n" "#endif\n" "}\n" "__kernel\n" "void conv_2d_c4h4w1(GLOBAL_SIZE_2_DIMS\n" " __global const FLOAT *input,\n" " __global const FLOAT *weight,\n" " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" " __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" " __private const int2 pad_hw,\n" " __private const int2 dilate_hw,\n" " __private const int out_w_blocks,\n" " __private const int out_c_blocks,\n" " __private const int out_h_blocks,\n" " __private const int out_c_base_index\n" ") {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx=out_c_w_idx/out_w_blocks+out_c_base_index;\n" " if(out_c_idx >= out_c_blocks) return;\n" " const int out_w_idx=out_c_w_idx % out_w_blocks;\n" " const int out_b_idx=out_b_h_idx/out_h_blocks;//equal to in_b_idx\n" " const int out_h_idx=(out_b_h_idx % out_h_blocks) << 2;\n" " \n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n" " COMPUTE_FLOAT4 out1=out0;\n" " COMPUTE_FLOAT4 out2=out0;\n" " COMPUTE_FLOAT4 out3=out0;\n" " const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n" " const int in_h0_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n" " const int in_h1_idx_base=in_h0_idx_base+stride_hw.x;\n" " const int in_h2_idx_base=in_h1_idx_base+stride_hw.x;\n" " const int in_h3_idx_base=in_h2_idx_base+stride_hw.x;\n" " \n" " const int kw_start=select(0,(-in_w_idx_base+dilate_hw.y-1)/dilate_hw.y,in_w_idx_base<0);\n" " const int in_w_idx_start=mad24(kw_start,dilate_hw.y,in_w_idx_base);\n" " const int in_w_idx_end=min(mad24(filter_hw.y,dilate_hw.y,in_w_idx_base),in_hw.y);\n" " \n" " const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n" " const int in_hw_size=in_hw.x*in_hw.y;\n" " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n" " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n" " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" " const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n" " for(int iy=0; iy<filter_hw.x; iy++) {\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n" " const int in_h0_idx=(iy*dilate_hw.x+in_h0_idx_base)*in_hw.y;\n" " const int in_h1_idx=(iy*dilate_hw.x+in_h1_idx_base)*in_hw.y;\n" " const int in_h2_idx=(iy*dilate_hw.x+in_h2_idx_base)*in_hw.y;\n" " const int in_h3_idx=(iy*dilate_hw.x+in_h3_idx_base)*in_hw.y;\n" " for(int fw=in_w_idx_start; fw<in_w_idx_end; fw += dilate_hw.y) {\n" " COMPUTE_FLOAT4 in0=(in_h0_idx<0 || in_h0_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h0_idx+fw,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in1=(in_h1_idx<0 || in_h1_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h1_idx+fw,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in2=(in_h2_idx<0 || in_h2_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h2_idx+fw,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in3=(in_h3_idx<0 || in_h3_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h3_idx+fw,input+inp_offset_base));\n" " COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n" " COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n" " COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*2));\n" " COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset*3));\n" " \n" " out0=mad(in0.x,weight0,out0);\n" " out0=mad(in0.y,weight1,out0);\n" " out0=mad(in0.z,weight2,out0);\n" " out0=mad(in0.w,weight3,out0);\n" " \n" " out1=mad(in1.x,weight0,out1);\n" " out1=mad(in1.y,weight1,out1);\n" " out1=mad(in1.z,weight2,out1);\n" " out1=mad(in1.w,weight3,out1);\n" " \n" " out2=mad(in2.x,weight0,out2);\n" " out2=mad(in2.y,weight1,out2);\n" " out2=mad(in2.z,weight2,out2);\n" " out2=mad(in2.w,weight3,out2);\n" " \n" " out3=mad(in3.x,weight0,out3);\n" " out3=mad(in3.y,weight1,out3);\n" " out3=mad(in3.z,weight2,out3);\n" " out3=mad(in3.w,weight3,out3);\n" " \n" " weight_offset += 4;\n" " }\n" " }\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" " out1=fmax(out1,(COMPUTE_FLOAT4)0);\n" " out2=fmax(out2,(COMPUTE_FLOAT4)0);\n" " out3=fmax(out3,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" "#ifdef BLOCK_LEAVE\n" " const int remain=out_hw.x-out_h_idx;\n" " if(remain >= 4){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n" " }else if(remain == 3){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n" " }else if(remain == 2){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" "#else\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n" "#endif\n" "}\n" "__kernel\n" "void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS\n" " __global const FLOAT *input,\n" " __global const FLOAT *weight,\n" " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" " __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" " __private const int2 pad_hw,\n" " __private const int2 dilate_hw,\n" " __private const int out_w_blocks,\n" " __private const int out_c_blocks,\n" " __private const int out_h_blocks,\n" " __private const int out_c_base_index\n" ") {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx_0=((out_c_w_idx/out_w_blocks+out_c_base_index) << 1);\n" " if(out_c_idx_0 >= out_c_blocks) return;\n" " const int out_c_idx_1=out_c_idx_0+1;\n" " const int out_w_idx=out_c_w_idx % out_w_blocks;\n" " const int out_b_idx=out_b_h_idx/out_h_blocks;//equal to in_b_idx\n" " const int out_h_idx=(out_b_h_idx % out_h_blocks) << 2;\n" " \n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n" " COMPUTE_FLOAT4 out1=out0;\n" " COMPUTE_FLOAT4 out2=out0;\n" " COMPUTE_FLOAT4 out3=out0;\n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n" " #else\n" " COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n" " #endif\n" " COMPUTE_FLOAT4 out5=out4;\n" " COMPUTE_FLOAT4 out6=out4;\n" " COMPUTE_FLOAT4 out7=out4;\n" " const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n" " const int in_h0_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n" " const int in_h1_idx_base=in_h0_idx_base+stride_hw.x;\n" " const int in_h2_idx_base=in_h1_idx_base+stride_hw.x;\n" " const int in_h3_idx_base=in_h2_idx_base+stride_hw.x;\n" " \n" " const int kw_start=select(0,(-in_w_idx_base+dilate_hw.y-1)/dilate_hw.y,in_w_idx_base<0);\n" " const int in_w_idx_start=mad24(kw_start,dilate_hw.y,in_w_idx_base);\n" " const int in_w_idx_end=min(mad24(filter_hw.y,dilate_hw.y,in_w_idx_base),in_hw.y);\n" " \n" " const int weight_oc_offset=filter_hw.x*filter_hw.y*4;\n" " const int weight_ic_offset=out_c_blocks*weight_oc_offset;\n" " const int in_hw_size=in_hw.x*in_hw.y;\n" " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n" " //weights NC4HW4 [ic/4,ic_4,oc/4,kh*kw,oc_4]\n" " //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n" " const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n" " for(int iy=0; iy<filter_hw.x; iy++) {\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n" " const int in_h0_idx=(iy*dilate_hw.x+in_h0_idx_base)*in_hw.y;\n" " const int in_h1_idx=(iy*dilate_hw.x+in_h1_idx_base)*in_hw.y;\n" " const int in_h2_idx=(iy*dilate_hw.x+in_h2_idx_base)*in_hw.y;\n" " const int in_h3_idx=(iy*dilate_hw.x+in_h3_idx_base)*in_hw.y;\n" " for(int fw=in_w_idx_start; fw<in_w_idx_end; fw += dilate_hw.y) {\n" " COMPUTE_FLOAT4 in0=(in_h0_idx<0 || in_h0_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h0_idx+fw,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in1=(in_h1_idx<0 || in_h1_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h1_idx+fw,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in2=(in_h2_idx<0 || in_h2_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h2_idx+fw,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in3=(in_h3_idx<0 || in_h3_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h3_idx+fw,input+inp_offset_base));\n" " COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n" " COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset));\n" " COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*2));\n" " COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*3));\n" " \n" " out0=mad(in0.x,weight0,out0);\n" " out0=mad(in0.y,weight1,out0);\n" " out0=mad(in0.z,weight2,out0);\n" " out0=mad(in0.w,weight3,out0);\n" " \n" " out1=mad(in1.x,weight0,out1);\n" " out1=mad(in1.y,weight1,out1);\n" " out1=mad(in1.z,weight2,out1);\n" " out1=mad(in1.w,weight3,out1);\n" " \n" " out2=mad(in2.x,weight0,out2);\n" " out2=mad(in2.y,weight1,out2);\n" " out2=mad(in2.z,weight2,out2);\n" " out2=mad(in2.w,weight3,out2);\n" " \n" " out3=mad(in3.x,weight0,out3);\n" " out3=mad(in3.y,weight1,out3);\n" " out3=mad(in3.z,weight2,out3);\n" " out3=mad(in3.w,weight3,out3);\n" " // weight: [ic/4,ic_4,oc/4,kh*kw,oc_4]\n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " weight0=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n" " weight1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n" " weight2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n" " weight3=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n" " #else\n" " weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n" " weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n" " weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n" " weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n" " #endif\n" " out4=mad(in0.x,weight0,out4);\n" " out4=mad(in0.y,weight1,out4);\n" " out4=mad(in0.z,weight2,out4);\n" " out4=mad(in0.w,weight3,out4);\n" " \n" " out5=mad(in1.x,weight0,out5);\n" " out5=mad(in1.y,weight1,out5);\n" " out5=mad(in1.z,weight2,out5);\n" " out5=mad(in1.w,weight3,out5);\n" " \n" " out6=mad(in2.x,weight0,out6);\n" " out6=mad(in2.y,weight1,out6);\n" " out6=mad(in2.z,weight2,out6);\n" " out6=mad(in2.w,weight3,out6);\n" " \n" " out7=mad(in3.x,weight0,out7);\n" " out7=mad(in3.y,weight1,out7);\n" " out7=mad(in3.z,weight2,out7);\n" " out7=mad(in3.w,weight3,out7);\n" " \n" " weight_offset += 4;\n" " }\n" " }\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" " out1=fmax(out1,(COMPUTE_FLOAT4)0);\n" " out2=fmax(out2,(COMPUTE_FLOAT4)0);\n" " out3=fmax(out3,(COMPUTE_FLOAT4)0);\n" " out4=fmax(out4,(COMPUTE_FLOAT4)0);\n" " out5=fmax(out5,(COMPUTE_FLOAT4)0);\n" " out6=fmax(out6,(COMPUTE_FLOAT4)0);\n" " out7=fmax(out7,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out4=clamp(out4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out5=clamp(out5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" "#ifdef BLOCK_LEAVE\n" " const int remain=out_hw.x-out_h_idx;\n" " if(remain >= 4){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n" " }else if(remain == 3){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n" " }else if(remain == 2){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " if(out_c_idx_1 >= out_c_blocks){\n" " return;\n" " }\n" " #endif\n" " out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " if(remain >= 4){\n" " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out6),2*out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out7),3*out_hw.y,output+out_offset);\n" " }else if(remain == 3){\n" " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out6),2*out_hw.y,output+out_offset);\n" " }else if(remain == 2){\n" " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n" " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n" " }\n" "#else\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " if(out_c_idx_1 >= out_c_blocks){\n" " return;\n" " }\n" " #endif\n" " out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out6),2*out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out7),3*out_hw.y,output+out_offset);\n" "#endif\n" "}\n" "__kernel\n" "void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS\n" " __global const FLOAT *input,\n" " __global const FLOAT *weight,\n" " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" " __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" " __private const int2 pad_hw,\n" " __private const int2 dilate_hw,\n" " __private const int out_w_blocks,\n" " __private const int out_c_blocks,\n" " __private const int out_h_blocks,\n" " __private const int out_c_base_index\n" ") {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx_0=(out_c_w_idx/out_w_blocks+out_c_base_index) << 1;\n" " if(out_c_idx_0 >= out_c_blocks) return;\n" " const int out_c_idx_1=out_c_idx_0+1;\n" " const int out_w_idx=out_c_w_idx % out_w_blocks;\n" " const int out_b_idx=out_b_h_idx/out_h_blocks;//equal to in_b_idx\n" " const int out_h_idx=(out_b_h_idx % out_h_blocks) << 1;\n" " \n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n" " COMPUTE_FLOAT4 out1=out0;\n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " COMPUTE_FLOAT4 out2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n" " #else\n" " COMPUTE_FLOAT4 out2=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n" " #endif\n" " COMPUTE_FLOAT4 out3=out2;\n" " \n" " const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n" " const int in_h0_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n" " const int in_h1_idx_base=in_h0_idx_base+stride_hw.x;\n" " \n" " const int kw_start=select(0,(-in_w_idx_base+dilate_hw.y-1)/dilate_hw.y,in_w_idx_base<0);\n" " const int in_w_idx_start=mad24(kw_start,dilate_hw.y,in_w_idx_base);\n" " const int in_w_idx_end=min(mad24(filter_hw.y,dilate_hw.y,in_w_idx_base),in_hw.y);\n" " \n" " const int weight_oc_offset=filter_hw.x*filter_hw.y*4;\n" " const int weight_ic_offset=out_c_blocks*weight_oc_offset;\n" " const int in_hw_size=in_hw.x*in_hw.y;\n" " // weight: [ic/4,oc,4],loop: ic/4\n" " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n" " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n" " //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n" " const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n" " for(int iy=0; iy<filter_hw.x; iy++) {\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+iy)*filter_hw.y+kw_start)*4;\n" " const int in_h0_idx=(iy*dilate_hw.x+in_h0_idx_base)*in_hw.y;\n" " const int in_h1_idx=(iy*dilate_hw.x+in_h1_idx_base)*in_hw.y;\n" " for(int fw=in_w_idx_start; fw<in_w_idx_end; fw += dilate_hw.y) {\n" " COMPUTE_FLOAT4 in0=(in_h0_idx<0 || in_h0_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h0_idx+fw,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in1=(in_h1_idx<0 || in_h1_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h1_idx+fw,input+inp_offset_base));\n" " COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n" " COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset));\n" " COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*2));\n" " COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*3));\n" " \n" " out0=mad(in0.x,weight0,out0);\n" " out0=mad(in0.y,weight1,out0);\n" " out0=mad(in0.z,weight2,out0);\n" " out0=mad(in0.w,weight3,out0);\n" " \n" " out1=mad(in1.x,weight0,out1);\n" " out1=mad(in1.y,weight1,out1);\n" " out1=mad(in1.z,weight2,out1);\n" " out1=mad(in1.w,weight3,out1);\n" " \n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " weight0=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n" " weight1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n" " weight2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n" " weight3=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n" " #else\n" " weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n" " weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n" " weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n" " weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n" " #endif\n" " out2=mad(in0.x,weight0,out2);\n" " out2=mad(in0.y,weight1,out2);\n" " out2=mad(in0.z,weight2,out2);\n" " out2=mad(in0.w,weight3,out2);\n" " \n" " out3=mad(in1.x,weight0,out3);\n" " out3=mad(in1.y,weight1,out3);\n" " out3=mad(in1.z,weight2,out3);\n" " out3=mad(in1.w,weight3,out3);\n" " \n" " weight_offset += 4;\n" " }\n" " }\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" " out1=fmax(out1,(COMPUTE_FLOAT4)0);\n" " out2=fmax(out2,(COMPUTE_FLOAT4)0);\n" " out3=fmax(out3,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" "#ifdef BLOCK_LEAVE\n" " const int remain=out_hw.x-out_h_idx;\n" " if(remain >= 2){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " if(out_c_idx_1 >= out_c_blocks){\n" " return;\n" " }\n" " #endif\n" " out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " if(remain >= 2){\n" " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n" " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n" " }\n" "#else\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " if(out_c_idx_1 >= out_c_blocks){\n" " return;\n" " }\n" " #endif\n" " out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n" "#endif\n" "}\n" "__kernel\n" "void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS\n" " __global const FLOAT *input,\n" " __global const FLOAT *weight,\n" " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" " __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" " __private const int2 pad_hw,\n" " __private const int2 dilate_hw,\n" " __private const int out_w_blocks,\n" " __private const int out_c_blocks,\n" " __private const int out_h_blocks,\n" " __private const int out_c_base_index\n" ") {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" " const int out_b_h_idx=get_global_id(1); //b h\n" " DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n" " const int out_c_idx_0=(out_c_w_idx/out_w_blocks+out_c_base_index) << 1;\n" " if(out_c_idx_0 >= out_c_blocks) return;\n" " const int out_c_idx_1=out_c_idx_0+1;\n" " const int out_w_idx=(out_c_w_idx % out_w_blocks) << 2;\n" " const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n" " const int out_h_idx=out_b_h_idx % out_hw.x;\n" " \n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n" " COMPUTE_FLOAT4 out1=out0;\n" " COMPUTE_FLOAT4 out2=out0;\n" " COMPUTE_FLOAT4 out3=out0;\n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " COMPUTE_FLOAT4 out4=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n" " #else\n" " COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n" " #endif\n" " COMPUTE_FLOAT4 out5=out4;\n" " COMPUTE_FLOAT4 out6=out4;\n" " COMPUTE_FLOAT4 out7=out4;\n" " const int in_w0_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n" " const int in_w1_idx_base=in_w0_idx_base+stride_hw.y;\n" " const int in_w2_idx_base=in_w1_idx_base+stride_hw.y;\n" " const int in_w3_idx_base=in_w2_idx_base+stride_hw.y;\n" " const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n" " \n" " const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n" " const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n" " const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n" " \n" " const int weight_oc_offset=filter_hw.x*filter_hw.y*4;\n" " const int weight_ic_offset=out_c_blocks*weight_oc_offset;\n" " for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n" " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n" " //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n" " for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n" " const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n" " for(int fw=0; fw<filter_hw.y; fw++) {\n" " const int in_w0_idx=fw*dilate_hw.y+in_w0_idx_base;\n" " const int in_w1_idx=fw*dilate_hw.y+in_w1_idx_base;\n" " const int in_w2_idx=fw*dilate_hw.y+in_w2_idx_base;\n" " const int in_w3_idx=fw*dilate_hw.y+in_w3_idx_base;\n" " COMPUTE_FLOAT4 in0=(in_w0_idx<0 || in_w0_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w0_idx,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in1=(in_w1_idx<0 || in_w1_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w1_idx,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in2=(in_w2_idx<0 || in_w2_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w2_idx,input+inp_offset_base));\n" " COMPUTE_FLOAT4 in3=(in_w3_idx<0 || in_w3_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w3_idx,input+inp_offset_base));\n" " COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset));\n" " COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset));\n" " COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*2));\n" " COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_ic_offset*3));\n" " out0=mad(in0.x,weight0,out0);\n" " out0=mad(in0.y,weight1,out0);\n" " out0=mad(in0.z,weight2,out0);\n" " out0=mad(in0.w,weight3,out0);\n" " \n" " out1=mad(in1.x,weight0,out1);\n" " out1=mad(in1.y,weight1,out1);\n" " out1=mad(in1.z,weight2,out1);\n" " out1=mad(in1.w,weight3,out1);\n" " \n" " out2=mad(in2.x,weight0,out2);\n" " out2=mad(in2.y,weight1,out2);\n" " out2=mad(in2.z,weight2,out2);\n" " out2=mad(in2.w,weight3,out2);\n" " \n" " out3=mad(in3.x,weight0,out3);\n" " out3=mad(in3.y,weight1,out3);\n" " out3=mad(in3.z,weight2,out3);\n" " out3=mad(in3.w,weight3,out3);\n" " \n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " weight0=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n" " weight1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n" " weight2=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n" " weight3=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n" " #else\n" " weight0=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset));\n" " weight1=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset));\n" " weight2=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2));\n" " weight3=CONVERT_COMPUTE_FLOAT4(vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3));\n" " #endif\n" " out4=mad(in0.x,weight0,out4);\n" " out4=mad(in0.y,weight1,out4);\n" " out4=mad(in0.z,weight2,out4);\n" " out4=mad(in0.w,weight3,out4);\n" " \n" " out5=mad(in1.x,weight0,out5);\n" " out5=mad(in1.y,weight1,out5);\n" " out5=mad(in1.z,weight2,out5);\n" " out5=mad(in1.w,weight3,out5);\n" " \n" " out6=mad(in2.x,weight0,out6);\n" " out6=mad(in2.y,weight1,out6);\n" " out6=mad(in2.z,weight2,out6);\n" " out6=mad(in2.w,weight3,out6);\n" " \n" " out7=mad(in3.x,weight0,out7);\n" " out7=mad(in3.y,weight1,out7);\n" " out7=mad(in3.z,weight2,out7);\n" " out7=mad(in3.w,weight3,out7);\n" " \n" " weight_offset += 4;\n" " }\n" " }\n" " }\n" "#ifdef RELU\n" " out0=fmax(out0,(COMPUTE_FLOAT4)0);\n" " out1=fmax(out1,(COMPUTE_FLOAT4)0);\n" " out2=fmax(out2,(COMPUTE_FLOAT4)0);\n" " out3=fmax(out3,(COMPUTE_FLOAT4)0);\n" " out4=fmax(out4,(COMPUTE_FLOAT4)0);\n" " out5=fmax(out5,(COMPUTE_FLOAT4)0);\n" " out6=fmax(out6,(COMPUTE_FLOAT4)0);\n" " out7=fmax(out7,(COMPUTE_FLOAT4)0);\n" "#endif\n" "#ifdef RELU6\n" " out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out4=clamp(out4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out5=clamp(out5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" " int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" "#ifdef BLOCK_LEAVE\n" " const int remain=out_hw.y-out_w_idx;\n" " if(remain >= 4){\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n" " }else if(remain == 3){\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out2),2,output+out_offset);\n" " }else if(remain == 2){\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n" " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " if(out_c_idx_1 >= out_c_blocks)return;\n" " #endif\n" " out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " if(remain >= 4){\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n" " }else if(remain == 3){\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out6),2,output+out_offset);\n" " }else if(remain == 2){\n" " vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,output+out_offset);\n" " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n" " }\n" "#else\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n" " #ifdef CHANNEL_BOUNDARY_PROTECT\n" " if(out_c_idx_1 >= out_c_blocks)return;\n" " #endif\n" " out_offset=(((out_b_idx+(out_c_idx_1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n" "#endif\n" "}\n" ; #endif }

source/backend/opencl/execution/cl/conv_2d_buf_mnn_cl.cpp (1,499 lines of code) (raw):