// source/backend/opencl/execution/cl/conv_2d_buf.cl
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0, __private const int global_size_dim1,
#define DEAL_NON_UNIFORM_DIM2(input1, input2) \
if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \
return; \
}
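// The true 2-D global sizes are passed as the first two kernel arguments
// (GLOBAL_SIZE_2_DIMS); DEAL_NON_UNIFORM_DIM2 early-returns the padding
// work-items so the host can round the launch size up to the work-group size.
// Illustrative host-side sketch only (ROUND_UP is a hypothetical helper, not
// the actual MNN host code):
//   globalWorkSize[0] = ROUND_UP(trueDim0, localWorkSize[0]);
//   globalWorkSize[1] = ROUND_UP(trueDim1, localWorkSize[1]);
//   clSetKernelArg(kernel, 0, sizeof(int), &trueDim0);
//   clSetKernelArg(kernel, 1, sizeof(int), &trueDim1);
//
// Naming convention: conv_2d[_1x1]_c<C>h<H>w<W> computes C output channels x
// H output rows x W output columns per work-item.
//
// conv_2d_1x1_local: 1x1 convolution in which one work-group of
// CONV_LOCAL_SIZE work-items cooperates on a single (batch, h, w, c4) output
// block; each work-item accumulates a strided subset of the input-channel
// blocks and the partial sums are combined by a tree reduction in local memory.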
#ifdef CONV_LOCAL_SIZE
__kernel
void conv_2d_1x1_local(__private const int out_w_blocks,
__global const FLOAT *input,
__global const FLOAT *kernel_ptr,
__global const FLOAT *bias_ptr,
__global FLOAT *output,
__private const int in_c_block,
__private const int batch,
__private const int out_h,
__private const int out_w,
__private const int out_c_block,
__private const int out_c_pack) {
const int lid = get_local_id(0);
const int out_c_w_idx = get_global_id(1); //c/4 w
const int out_b_h_idx = get_global_id(2); //b h
COMPUTE_FLOAT4 local sum_mnn[CONV_LOCAL_SIZE];
const int out_c_idx = out_c_w_idx / out_w_blocks;
const int out_w_idx = out_c_w_idx % out_w_blocks;
const int out_b_idx = out_b_h_idx / out_h; // equal to in_b_idx
const int out_h_idx = out_b_h_idx % out_h; // equal to in_h_idx
COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias_ptr));
COMPUTE_FLOAT4 out0 = (COMPUTE_FLOAT4)0;
int inp_offset = ((out_b_idx*out_h + out_h_idx)* out_w + out_w_idx) << 2;
const int inp_add = batch*out_h*out_w*4;
for (ushort in_channel_block_idx = lid; in_channel_block_idx < in_c_block; in_channel_block_idx+=CONV_LOCAL_SIZE) {
int offset = mad24(in_channel_block_idx*4, out_c_pack, out_c_idx*4);
COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset+in_channel_block_idx*inp_add));
COMPUTE_FLOAT4 weights0 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset));
COMPUTE_FLOAT4 weights1 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack));
COMPUTE_FLOAT4 weights2 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack));
COMPUTE_FLOAT4 weights3 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack + out_c_pack));
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
out0 = mad(in0.z, weights2, out0);
out0 = mad(in0.w, weights3, out0);
}
sum_mnn[lid] = out0;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = CONV_LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum_mnn[lid] = sum_mnn[lid] + sum_mnn[lid + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
out0 = sum_mnn[0] + bias0;
if(lid == 0){
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
const int out_offset = (((out_b_idx + out_c_idx*batch)*out_h + out_h_idx)* out_w + out_w_idx)*4;
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
}
}
#endif
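// conv_2d_1x1_c4h1w4: 1x1 convolution, one output-channel block (4 channels)
// at 4 consecutive width positions per work-item. Input and output are packed
// 4 channels per block (see the inp_offset/out_offset math below).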
__kernel
void conv_2d_1x1_c4h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
__global const FLOAT *input,
__global const FLOAT *kernel_ptr,
__global const FLOAT *bias_ptr,
__global FLOAT *output,
__private const int in_c_block,
__private const int out_h,
__private const int out_w,
__private const int out_b,
__private const int out_c_block,
__private const int out_c_pack) {
const int out_c_w_idx = get_global_id(0); //c/4 w
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx = out_c_w_idx / out_w_blocks;
const int out_w_idx = out_c_w_idx % out_w_blocks;
const int out_b_idx = out_b_h_idx / out_h; // equal to in_b_idx
const int out_h_idx = out_b_h_idx % out_h; // equal to in_h_idx
const int out_w4_idx = mul24(out_w_idx, 4);
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias_ptr));
COMPUTE_FLOAT4 out1 = out0;
COMPUTE_FLOAT4 out2 = out0;
COMPUTE_FLOAT4 out3 = out0;
const int input_width_idx0 = out_w4_idx;
int inp_offset = ((out_b_idx * out_h + out_h_idx)* out_w + input_width_idx0) << 2;
int offset = out_c_idx*4;
const int inp_add = out_b*out_h*out_w*4;
for (ushort in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) {
COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset));
COMPUTE_FLOAT4 in1 = CONVERT_COMPUTE_FLOAT4(vload4(1, input+inp_offset));
COMPUTE_FLOAT4 in2 = CONVERT_COMPUTE_FLOAT4(vload4(2, input+inp_offset));
COMPUTE_FLOAT4 in3 = CONVERT_COMPUTE_FLOAT4(vload4(3, input+inp_offset));
COMPUTE_FLOAT4 weights0 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset));
COMPUTE_FLOAT4 weights1 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack));
COMPUTE_FLOAT4 weights2 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack));
COMPUTE_FLOAT4 weights3 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack + out_c_pack));
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
out0 = mad(in0.z, weights2, out0);
out0 = mad(in0.w, weights3, out0);
out1 = mad(in1.x, weights0, out1);
out1 = mad(in1.y, weights1, out1);
out1 = mad(in1.z, weights2, out1);
out1 = mad(in1.w, weights3, out1);
out2 = mad(in2.x, weights0, out2);
out2 = mad(in2.y, weights1, out2);
out2 = mad(in2.z, weights2, out2);
out2 = mad(in2.w, weights3, out2);
out3 = mad(in3.x, weights0, out3);
out3 = mad(in3.y, weights1, out3);
out3 = mad(in3.z, weights2, out3);
out3 = mad(in3.w, weights3, out3);
offset += 4 * out_c_pack;
inp_offset += inp_add;
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
out1 = fmax(out1, (COMPUTE_FLOAT4)0);
out2 = fmax(out2, (COMPUTE_FLOAT4)0);
out3 = fmax(out3, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
const int out_offset = (((out_b_idx + out_c_idx * out_b)*out_h + out_h_idx)* out_w + out_w4_idx)*4;
#ifdef BLOCK_LEAVE
const int remain = out_w - out_w4_idx;
if (remain >= 4) {
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, output+out_offset);
} else if (remain == 3) {
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out2), 2, output+out_offset);
} else if (remain == 2) {
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, output+out_offset);
} else if (remain == 1) {
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
}
#else
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, output+out_offset);
#endif
}
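// conv_2d_1x1_c8h1w4: 1x1 convolution, two output-channel blocks (8 channels)
// at 4 consecutive width positions per work-item. With CHANNEL_BOUNDARY_PROTECT
// the second block is skipped when out_c_idx_1 >= out_c_block.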
__kernel
void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
__global const FLOAT *input,
__global const FLOAT *kernel_ptr,
__global const FLOAT *bias_ptr,
__global FLOAT *output,
__private const int in_c_block,
__private const int out_h,
__private const int out_w,
__private const int out_b,
__private const int out_c_block,
__private const int out_c_pack) {
const int out_c_w_idx = get_global_id(0); //c/8 w/4
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx_0 = (out_c_w_idx / out_w_blocks) << 1;
const int out_c_idx_1 = out_c_idx_0 + 1;
const int out_w_idx = out_c_w_idx % out_w_blocks;
const int out_b_idx = out_b_h_idx / out_h;//equal to in_b_idx
const int out_h_idx = out_b_h_idx % out_h;//equal to in_h_idx
const int out_w4_idx = mul24(out_w_idx, 4);
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias_ptr));
COMPUTE_FLOAT4 out1 = out0;
COMPUTE_FLOAT4 out2 = out0;
COMPUTE_FLOAT4 out3 = out0;
#ifdef CHANNEL_BOUNDARY_PROTECT
COMPUTE_FLOAT4 out4 = out_c_idx_1 >= out_c_block ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias_ptr));
COMPUTE_FLOAT4 out5 = out4;
COMPUTE_FLOAT4 out6 = out4;
COMPUTE_FLOAT4 out7 = out4;
#else
COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias_ptr));
COMPUTE_FLOAT4 out5 = out4;
COMPUTE_FLOAT4 out6 = out4;
COMPUTE_FLOAT4 out7 = out4;
#endif
const int input_width_idx0 = out_w4_idx;
int inp_offset = ((out_b_idx * out_h + out_h_idx)* out_w + input_width_idx0)<<2;
int offset = out_c_idx_0*4;
const int inp_add = out_b*out_h*out_w*4;
for (int in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) {
COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset));
COMPUTE_FLOAT4 in1 = CONVERT_COMPUTE_FLOAT4(vload4(1, input+inp_offset));
COMPUTE_FLOAT4 in2 = CONVERT_COMPUTE_FLOAT4(vload4(2, input+inp_offset));
COMPUTE_FLOAT4 in3 = CONVERT_COMPUTE_FLOAT4(vload4(3, input+inp_offset));
// output channels are packed to at least 8, so the weight loads need no boundary protection
COMPUTE_FLOAT4 weights0 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset));
COMPUTE_FLOAT4 weights1 = CONVERT_COMPUTE_FLOAT4(vload4(1, kernel_ptr + offset));
COMPUTE_FLOAT4 weights2 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack));
COMPUTE_FLOAT4 weights3 = CONVERT_COMPUTE_FLOAT4(vload4(1, kernel_ptr + offset + out_c_pack));
COMPUTE_FLOAT4 weights4 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack));
COMPUTE_FLOAT4 weights5 = CONVERT_COMPUTE_FLOAT4(vload4(1, kernel_ptr + offset + out_c_pack + out_c_pack));
COMPUTE_FLOAT4 weights6 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack + out_c_pack));
COMPUTE_FLOAT4 weights7 = CONVERT_COMPUTE_FLOAT4(vload4(1, kernel_ptr + offset + out_c_pack + out_c_pack + out_c_pack));
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights2, out0);
out0 = mad(in0.z, weights4, out0);
out0 = mad(in0.w, weights6, out0);
out1 = mad(in1.x, weights0, out1);
out1 = mad(in1.y, weights2, out1);
out1 = mad(in1.z, weights4, out1);
out1 = mad(in1.w, weights6, out1);
out2 = mad(in2.x, weights0, out2);
out2 = mad(in2.y, weights2, out2);
out2 = mad(in2.z, weights4, out2);
out2 = mad(in2.w, weights6, out2);
out3 = mad(in3.x, weights0, out3);
out3 = mad(in3.y, weights2, out3);
out3 = mad(in3.z, weights4, out3);
out3 = mad(in3.w, weights6, out3);
out4 = mad(in0.x, weights1, out4);
out4 = mad(in0.y, weights3, out4);
out4 = mad(in0.z, weights5, out4);
out4 = mad(in0.w, weights7, out4);
out5 = mad(in1.x, weights1, out5);
out5 = mad(in1.y, weights3, out5);
out5 = mad(in1.z, weights5, out5);
out5 = mad(in1.w, weights7, out5);
out6 = mad(in2.x, weights1, out6);
out6 = mad(in2.y, weights3, out6);
out6 = mad(in2.z, weights5, out6);
out6 = mad(in2.w, weights7, out6);
out7 = mad(in3.x, weights1, out7);
out7 = mad(in3.y, weights3, out7);
out7 = mad(in3.z, weights5, out7);
out7 = mad(in3.w, weights7, out7);
offset += 4 * out_c_pack;
inp_offset += inp_add;
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
out1 = fmax(out1, (COMPUTE_FLOAT4)0);
out2 = fmax(out2, (COMPUTE_FLOAT4)0);
out3 = fmax(out3, (COMPUTE_FLOAT4)0);
out4 = fmax(out4, (COMPUTE_FLOAT4)0);
out5 = fmax(out5, (COMPUTE_FLOAT4)0);
out6 = fmax(out6, (COMPUTE_FLOAT4)0);
out7 = fmax(out7, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out4 = clamp(out4, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out5 = clamp(out5, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out6 = clamp(out6, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
const int out_offset = (((out_b_idx + out_c_idx_0*out_b)*out_h + out_h_idx)* out_w + out_w4_idx)*4;
__global FLOAT * _tempoutput = output + out_offset;
__global FLOAT * _tempoutput1 = _tempoutput + 4*out_h*out_w*out_b;
#ifdef BLOCK_LEAVE
const int remain = out_w - out_w4_idx;
if (remain >= 4) {
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, _tempoutput);
} else if (remain == 3) {
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, _tempoutput);
vstore4(CONVERT_FLOAT4(out2), 2, _tempoutput);
} else if (remain == 2) {
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, _tempoutput);
} else if (remain == 1) {
vstore4(CONVERT_FLOAT4(out0), 0, _tempoutput);
}
#ifdef CHANNEL_BOUNDARY_PROTECT
if(out_c_idx_1 >= out_c_block) {
return;
}
#endif
if (remain >= 4) {
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, _tempoutput1);
} else if (remain == 3) {
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4, out5)), 0, _tempoutput1);
vstore4(CONVERT_FLOAT4(out6), 2, _tempoutput1);
} else if (remain == 2) {
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4, out5)), 0, _tempoutput1);
} else if (remain == 1) {
vstore4(CONVERT_FLOAT4(out4), 0, _tempoutput1);
}
#else
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, _tempoutput);
#ifdef CHANNEL_BOUNDARY_PROTECT
if(out_c_idx_1 >= out_c_block) {
return;
}
#endif
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, _tempoutput1);
#endif
}
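// conv_2d_1x1_c8h1w2: 1x1 convolution, two output-channel blocks (8 channels)
// at 2 consecutive width positions per work-item.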
__kernel
void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
__global const FLOAT *input,
__global const FLOAT *kernel_ptr,
__global const FLOAT *bias_ptr,
__global FLOAT *output,
__private const int in_c_block,
__private const int out_h,
__private const int out_w,
__private const int out_b,
__private const int out_c_block,
__private const int out_c_pack) {
const int out_c_w_idx = get_global_id(0); //c/8 w/2
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx_0 = (out_c_w_idx / out_w_blocks) << 1;
const int out_c_idx_1 = out_c_idx_0 + 1;
const int out_w_idx = out_c_w_idx % out_w_blocks;
const int out_b_idx = out_b_h_idx / out_h;//equal to in_b_idx
const int out_h_idx = out_b_h_idx % out_h;//equal to in_h_idx
const int out_w2_idx = mul24(out_w_idx, 2);
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias_ptr));
COMPUTE_FLOAT4 out1 = out0;
#ifdef CHANNEL_BOUNDARY_PROTECT
COMPUTE_FLOAT4 out4 = out_c_idx_1 >= out_c_block ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias_ptr));
#else
COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias_ptr));
#endif
COMPUTE_FLOAT4 out5 = out4;
const int input_width_idx0 = out_w2_idx;
int inp_offset = ((out_b_idx * out_h + out_h_idx)* out_w + input_width_idx0)<<2;
int offset = out_c_idx_0*4;
const int inp_add = out_b*out_h*out_w*4;
for (int in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) {
COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset));
COMPUTE_FLOAT4 in1 = CONVERT_COMPUTE_FLOAT4(vload4(1, input+inp_offset));
COMPUTE_FLOAT4 weights0 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset));
COMPUTE_FLOAT4 weights1 = CONVERT_COMPUTE_FLOAT4(vload4(1, kernel_ptr + offset));
COMPUTE_FLOAT4 weights2 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack));
COMPUTE_FLOAT4 weights3 = CONVERT_COMPUTE_FLOAT4(vload4(1, kernel_ptr + offset + out_c_pack));
COMPUTE_FLOAT4 weights4 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack));
COMPUTE_FLOAT4 weights5 = CONVERT_COMPUTE_FLOAT4(vload4(1, kernel_ptr + offset + out_c_pack + out_c_pack));
COMPUTE_FLOAT4 weights6 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack + out_c_pack));
COMPUTE_FLOAT4 weights7 = CONVERT_COMPUTE_FLOAT4(vload4(1, kernel_ptr + offset + out_c_pack + out_c_pack + out_c_pack));
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights2, out0);
out0 = mad(in0.z, weights4, out0);
out0 = mad(in0.w, weights6, out0);
out1 = mad(in1.x, weights0, out1);
out1 = mad(in1.y, weights2, out1);
out1 = mad(in1.z, weights4, out1);
out1 = mad(in1.w, weights6, out1);
out4 = mad(in0.x, weights1, out4);
out4 = mad(in0.y, weights3, out4);
out4 = mad(in0.z, weights5, out4);
out4 = mad(in0.w, weights7, out4);
out5 = mad(in1.x, weights1, out5);
out5 = mad(in1.y, weights3, out5);
out5 = mad(in1.z, weights5, out5);
out5 = mad(in1.w, weights7, out5);
offset += 4 * out_c_pack;
inp_offset += inp_add;
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
out1 = fmax(out1, (COMPUTE_FLOAT4)0);
out4 = fmax(out4, (COMPUTE_FLOAT4)0);
out5 = fmax(out5, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out4 = clamp(out4, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out5 = clamp(out5, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
const int out_offset = (((out_b_idx + out_c_idx_0*out_b)*out_h + out_h_idx)* out_w + out_w2_idx)*4;
__global FLOAT * _tempoutput = output + out_offset;
__global FLOAT * _tempoutput1 = _tempoutput + 4*out_h*out_w*out_b;
#ifdef BLOCK_LEAVE
const int remain = out_w - out_w2_idx;
if (remain >= 2) {
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, _tempoutput);
} else if (remain == 1) {
vstore4(CONVERT_FLOAT4(out0), 0, _tempoutput);
}
#ifdef CHANNEL_BOUNDARY_PROTECT
if(out_c_idx_1 >= out_c_block) {
return;
}
#endif
if (remain >= 2) {
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4, out5)), 0, _tempoutput1);
} else if (remain == 1) {
vstore4(CONVERT_FLOAT4(out4), 0, _tempoutput1);
}
#else
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, _tempoutput);
#ifdef CHANNEL_BOUNDARY_PROTECT
if(out_c_idx_1 >= out_c_block) {
return;
}
#endif
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4, out5)), 0, _tempoutput1);
#endif
}
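// conv_2d_1x1_c4h1w1: 1x1 convolution, one output-channel block (4 channels)
// at a single output position per work-item.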
__kernel
void conv_2d_1x1_c4h1w1(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
__global const FLOAT *input,
__global const FLOAT *kernel_ptr,
__global const FLOAT *bias_ptr,
__global FLOAT *output,
__private const int in_c_block,
__private const int out_h,
__private const int out_w,
__private const int out_b,
__private const int out_c_block,
__private const int out_c_pack) {
const int out_c_w_idx = get_global_id(0); //c/4 w
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx = out_c_w_idx / out_w;
const int out_w_idx = out_c_w_idx % out_w;
const int out_b_idx = out_b_h_idx / out_h;//equal to in_b_idx
const int out_h_idx = out_b_h_idx % out_h;//equal to in_h_idx
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias_ptr));
const int input_width_idx0 = out_w_idx;
int offset = out_c_idx*4;
int inp_offset = ((out_b_idx * out_h + out_h_idx) * out_w + input_width_idx0)*4;
const int inp_add = out_b*out_h*out_w*4;
for (int in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) {
COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset));
COMPUTE_FLOAT4 weights0 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset));
COMPUTE_FLOAT4 weights1 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack));
COMPUTE_FLOAT4 weights2 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack));
COMPUTE_FLOAT4 weights3 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack + out_c_pack));
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
out0 = mad(in0.z, weights2, out0);
out0 = mad(in0.w, weights3, out0);
offset += 4 * out_c_pack;
inp_offset += inp_add;
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
const int out_offset = (((out_b_idx + out_c_idx*out_b)*out_h + out_h_idx)* out_w + out_w_idx)*4;
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
}
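// conv_2d_1x1_c4h1w2: 1x1 convolution, one output-channel block (4 channels)
// at 2 consecutive width positions per work-item.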
__kernel
void conv_2d_1x1_c4h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks,
__global const FLOAT *input,
__global const FLOAT *kernel_ptr,
__global const FLOAT *bias_ptr,
__global FLOAT *output,
__private const int in_c_block,
__private const int out_h,
__private const int out_w,
__private const int out_b,
__private const int out_c_block,
__private const int out_c_pack) {
const int out_c_w_idx = get_global_id(0); //c/4 w
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx = out_c_w_idx / out_w_blocks;
const int out_w_idx = out_c_w_idx % out_w_blocks;
const int out_b_idx = out_b_h_idx / out_h;//equal to in_b_idx
const int out_h_idx = out_b_h_idx % out_h;//equal to in_h_idx
const int out_w2_idx = mul24(out_w_idx, 2);
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias_ptr));
COMPUTE_FLOAT4 out1 = out0;
const int input_width_idx0 = out_w2_idx;
int offset = out_c_idx*4;
int inp_offset = ((out_b_idx*out_h + out_h_idx)* out_w + input_width_idx0)*4;
const int inp_add = out_b*out_h*out_w*4;
for (int in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) {
COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset));
COMPUTE_FLOAT4 in1 = CONVERT_COMPUTE_FLOAT4(vload4(1, input+inp_offset));
COMPUTE_FLOAT4 weights0 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset));
COMPUTE_FLOAT4 weights1 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack));
COMPUTE_FLOAT4 weights2 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack));
COMPUTE_FLOAT4 weights3 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack + out_c_pack));
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
out0 = mad(in0.z, weights2, out0);
out0 = mad(in0.w, weights3, out0);
out1 = mad(in1.x, weights0, out1);
out1 = mad(in1.y, weights1, out1);
out1 = mad(in1.z, weights2, out1);
out1 = mad(in1.w, weights3, out1);
offset += 4 * out_c_pack;
inp_offset += inp_add;
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
out1 = fmax(out1, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
const int out_offset = (((out_b_idx + out_c_idx*out_b)*out_h + out_h_idx)* out_w + out_w2_idx)*4;
#ifdef BLOCK_LEAVE
const int remain = out_w - out_w2_idx;
if (remain >= 2) {
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, output+out_offset);
} else if (remain == 1) {
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
}
#else
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, output+out_offset);
#endif
}
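// conv_2d_c4h1w1: general convolution, one output-channel block (4 channels)
// at a single output position per work-item. Padding is handled by clamping
// the kernel window in both dimensions (kh_start/kw_start and the *_end
// bounds) rather than by zero-filling out-of-range taps.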
__kernel
void conv_2d_c4h1w1(GLOBAL_SIZE_2_DIMS
__global const FLOAT *input,
__global const FLOAT *weight,
__global const FLOAT *bias,
__global FLOAT *output,
__private const int2 in_hw,
__private const int inChannel,
__private const int in_c_blocks,
__private const int batch,
__private const int2 out_hw,
__private const int2 filter_hw,
__private const int2 stride_hw,
__private const int2 pad_hw,
__private const int2 dilate_hw,
__private const int out_w_blocks,
__private const int out_c_blocks,
__private const int out_h_blocks,
__private const int out_c_base_index
) {
const int out_c_w_idx = get_global_id(0); //c/4 w
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx = out_c_w_idx / out_hw.y + out_c_base_index;
if(out_c_idx >= out_c_blocks) return;
const int out_w_idx = out_c_w_idx % out_hw.y;
const int out_b_idx = out_b_h_idx / out_hw.x;//equal to in_b_idx
const int out_h_idx = out_b_h_idx % out_hw.x;
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias));
const int in_w_idx_base = mad24(out_w_idx, stride_hw.y, -pad_hw.y);
const int in_h_idx_base = mad24(out_h_idx, stride_hw.x, -pad_hw.x);
const int kw_start = select(0, (-in_w_idx_base + dilate_hw.y - 1) / dilate_hw.y, in_w_idx_base < 0);
const int kh_start = select(0, (-in_h_idx_base + dilate_hw.x - 1) / dilate_hw.x, in_h_idx_base < 0);
const int in_w_idx_start = mad24(kw_start, dilate_hw.y, in_w_idx_base);
const int in_w_idx_end = min(mad24(filter_hw.y, dilate_hw.y, in_w_idx_base), in_hw.y);
const int in_h_idx_start = mad24(kh_start, dilate_hw.x, in_h_idx_base);
const int in_h_idx_end = min(mad24(filter_hw.x, dilate_hw.x, in_h_idx_base), in_hw.x);
const int weight_oc_offset = out_c_blocks * filter_hw.x * filter_hw.y * 4;
for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
//weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4
//index: [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0]
int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + kw_start) * 4;
for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) {
for(int ix = in_w_idx_start; ix < in_w_idx_end; ix += dilate_hw.y) {
int inp_offset = (((out_b_idx + in_c_idx * batch) * in_hw.x + iy) * in_hw.y + ix) * 4;
COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset));
const int filter_w_inc = (ix-in_w_idx_start)/dilate_hw.y;
COMPUTE_FLOAT4 weight0 = CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc, weight+weight_offset));
COMPUTE_FLOAT4 weight1 = CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc, weight+weight_offset+weight_oc_offset));
COMPUTE_FLOAT4 weight2 = CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc, weight+weight_offset+weight_oc_offset*2));
COMPUTE_FLOAT4 weight3 = CONVERT_COMPUTE_FLOAT4(vload4(filter_w_inc, weight+weight_offset+weight_oc_offset*3));
out0 = mad(in0.x, weight0, out0);
out0 = mad(in0.y, weight1, out0);
out0 = mad(in0.z, weight2, out0);
out0 = mad(in0.w, weight3, out0);
}
weight_offset += 4*filter_hw.y;
}
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
}
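// conv_2d_c4h1w2: general convolution, one output-channel block at 2 width
// positions per work-item; the height window is clamped via kh_start and
// out-of-range width taps are read as zero.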
__kernel
void conv_2d_c4h1w2(GLOBAL_SIZE_2_DIMS
__global const FLOAT *input,
__global const FLOAT *weight,
__global const FLOAT *bias,
__global FLOAT *output,
__private const int2 in_hw,
__private const int inChannel,
__private const int in_c_blocks,
__private const int batch,
__private const int2 out_hw,
__private const int2 filter_hw,
__private const int2 stride_hw,
__private const int2 pad_hw,
__private const int2 dilate_hw,
__private const int out_w_blocks, // number of output-width blocks (2 columns per block in this kernel)
__private const int out_c_blocks,
__private const int out_h_blocks,
__private const int out_c_base_index
) {
const int out_c_w_idx = get_global_id(0); //c/4 w
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx = out_c_w_idx / out_w_blocks + out_c_base_index;
if(out_c_idx >= out_c_blocks) return;
const int out_w_idx = (out_c_w_idx % out_w_blocks) << 1;
const int out_b_idx = out_b_h_idx / out_hw.x;//equal to in_b_idx
const int out_h_idx = out_b_h_idx % out_hw.x;
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias));
COMPUTE_FLOAT4 out1 = out0;
const int in_w0_idx_base = mad24(out_w_idx, stride_hw.y, -pad_hw.y);
const int in_w1_idx_base = in_w0_idx_base + stride_hw.y;
const int in_h_idx_base = mad24(out_h_idx, stride_hw.x, -pad_hw.x);
const int kh_start = select(0, (-in_h_idx_base + dilate_hw.x - 1) / dilate_hw.x, in_h_idx_base < 0);
const int in_h_idx_start = mad24(kh_start, dilate_hw.x, in_h_idx_base);
const int in_h_idx_end = min(mad24(filter_hw.x, dilate_hw.x, in_h_idx_base), in_hw.x);
const int weight_oc_offset = out_c_blocks * filter_hw.x * filter_hw.y * 4;
for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
//weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4
//index: [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0]
int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4;
for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) {
const int inp_offset_base = (((out_b_idx + in_c_idx*batch) * in_hw.x + iy) * in_hw.y + 0) * 4;
for(int fw = 0; fw < filter_hw.y; fw++) {
const int in_w0_idx = fw * dilate_hw.y + in_w0_idx_base;
const int in_w1_idx = fw * dilate_hw.y + in_w1_idx_base;
COMPUTE_FLOAT4 in0 = (in_w0_idx < 0 || in_w0_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w0_idx, input+inp_offset_base));
COMPUTE_FLOAT4 in1 = (in_w1_idx < 0 || in_w1_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w1_idx, input+inp_offset_base));
COMPUTE_FLOAT4 weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset));
COMPUTE_FLOAT4 weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
COMPUTE_FLOAT4 weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset*2));
COMPUTE_FLOAT4 weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset*3));
out0 = mad(in0.x, weight0, out0);
out0 = mad(in0.y, weight1, out0);
out0 = mad(in0.z, weight2, out0);
out0 = mad(in0.w, weight3, out0);
out1 = mad(in1.x, weight0, out1);
out1 = mad(in1.y, weight1, out1);
out1 = mad(in1.z, weight2, out1);
out1 = mad(in1.w, weight3, out1);
weight_offset += 4;
}
}
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
out1 = fmax(out1, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
#ifdef BLOCK_LEAVE
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
if(out_w_idx + 1 >= out_hw.y) return;
vstore4(CONVERT_FLOAT4(out1), 1, output+out_offset);
#else
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, output+out_offset);
#endif
}
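// conv_2d_c4h1w4: general convolution, one output-channel block at 4 width
// positions per work-item (same border handling as conv_2d_c4h1w2).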
__kernel
void conv_2d_c4h1w4(GLOBAL_SIZE_2_DIMS
__global const FLOAT *input,
__global const FLOAT *weight,
__global const FLOAT *bias,
__global FLOAT *output,
__private const int2 in_hw,
__private const int inChannel,
__private const int in_c_blocks,
__private const int batch,
__private const int2 out_hw,
__private const int2 filter_hw,
__private const int2 stride_hw,
__private const int2 pad_hw,
__private const int2 dilate_hw,
__private const int out_w_blocks,
__private const int out_c_blocks,
__private const int out_h_blocks,
__private const int out_c_base_index
) {
const int out_c_w_idx = get_global_id(0); //c/4 w
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx = out_c_w_idx / out_w_blocks + out_c_base_index;
if(out_c_idx >= out_c_blocks) return;
const int out_w_idx = (out_c_w_idx % out_w_blocks) << 2;
const int out_b_idx = out_b_h_idx / out_hw.x;//equal to in_b_idx
const int out_h_idx = out_b_h_idx % out_hw.x;
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias));
COMPUTE_FLOAT4 out1 = out0;
COMPUTE_FLOAT4 out2 = out0;
COMPUTE_FLOAT4 out3 = out0;
const int in_w0_idx_base = mad24(out_w_idx, stride_hw.y, -pad_hw.y);
const int in_w1_idx_base = in_w0_idx_base + stride_hw.y;
const int in_w2_idx_base = in_w1_idx_base + stride_hw.y;
const int in_w3_idx_base = in_w2_idx_base + stride_hw.y;
const int in_h_idx_base = mad24(out_h_idx, stride_hw.x, -pad_hw.x);
const int kh_start = select(0, (-in_h_idx_base + dilate_hw.x - 1) / dilate_hw.x, in_h_idx_base < 0);
const int in_h_idx_start = mad24(kh_start, dilate_hw.x, in_h_idx_base);
const int in_h_idx_end = min(mad24(filter_hw.x, dilate_hw.x, in_h_idx_base), in_hw.x);
const int weight_oc_offset = out_c_blocks * filter_hw.x * filter_hw.y * 4;
for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
//weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4
//index: [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0]
int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4;
for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) {
const int inp_offset_base = (((out_b_idx + in_c_idx*batch) * in_hw.x + iy) * in_hw.y + 0) * 4;
for(int fw = 0; fw < filter_hw.y; fw++) {
const int in_w0_idx = fw * dilate_hw.y + in_w0_idx_base;
const int in_w1_idx = fw * dilate_hw.y + in_w1_idx_base;
const int in_w2_idx = fw * dilate_hw.y + in_w2_idx_base;
const int in_w3_idx = fw * dilate_hw.y + in_w3_idx_base;
COMPUTE_FLOAT4 in0 = (in_w0_idx < 0 || in_w0_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w0_idx, input+inp_offset_base));
COMPUTE_FLOAT4 in1 = (in_w1_idx < 0 || in_w1_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w1_idx, input+inp_offset_base));
COMPUTE_FLOAT4 in2 = (in_w2_idx < 0 || in_w2_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w2_idx, input+inp_offset_base));
COMPUTE_FLOAT4 in3 = (in_w3_idx < 0 || in_w3_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w3_idx, input+inp_offset_base));
COMPUTE_FLOAT4 weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset));
COMPUTE_FLOAT4 weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
COMPUTE_FLOAT4 weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset*2));
COMPUTE_FLOAT4 weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset*3));
out0 = mad(in0.x, weight0, out0);
out0 = mad(in0.y, weight1, out0);
out0 = mad(in0.z, weight2, out0);
out0 = mad(in0.w, weight3, out0);
out1 = mad(in1.x, weight0, out1);
out1 = mad(in1.y, weight1, out1);
out1 = mad(in1.z, weight2, out1);
out1 = mad(in1.w, weight3, out1);
out2 = mad(in2.x, weight0, out2);
out2 = mad(in2.y, weight1, out2);
out2 = mad(in2.z, weight2, out2);
out2 = mad(in2.w, weight3, out2);
out3 = mad(in3.x, weight0, out3);
out3 = mad(in3.y, weight1, out3);
out3 = mad(in3.z, weight2, out3);
out3 = mad(in3.w, weight3, out3);
weight_offset += 4;
}
}
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
out1 = fmax(out1, (COMPUTE_FLOAT4)0);
out2 = fmax(out2, (COMPUTE_FLOAT4)0);
out3 = fmax(out3, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
#ifdef BLOCK_LEAVE
const int remain = out_hw.y - out_w_idx;
if (remain >= 4) {
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, output+out_offset);
}else if(remain == 3){
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out2), 2, output+out_offset);
}else if(remain == 2){
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, output+out_offset);
}else if(remain == 1){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
}
#else
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, output+out_offset);
#endif
}
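// conv_2d_c4h4w1: general convolution, one output-channel block at 4 height
// positions per work-item; the width window is clamped via kw_start and
// out-of-range rows are read as zero.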
__kernel
void conv_2d_c4h4w1(GLOBAL_SIZE_2_DIMS
__global const FLOAT *input,
__global const FLOAT *weight,
__global const FLOAT *bias,
__global FLOAT *output,
__private const int2 in_hw,
__private const int inChannel,
__private const int in_c_blocks,
__private const int batch,
__private const int2 out_hw,
__private const int2 filter_hw,
__private const int2 stride_hw,
__private const int2 pad_hw,
__private const int2 dilate_hw,
__private const int out_w_blocks,
__private const int out_c_blocks,
__private const int out_h_blocks,
__private const int out_c_base_index
) {
const int out_c_w_idx = get_global_id(0); //c/4 w
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx = out_c_w_idx / out_w_blocks + out_c_base_index;
if(out_c_idx >= out_c_blocks) return;
const int out_w_idx = out_c_w_idx % out_w_blocks;
const int out_b_idx = out_b_h_idx / out_h_blocks;//equal to in_b_idx
const int out_h_idx = (out_b_h_idx % out_h_blocks) << 2;
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias));
COMPUTE_FLOAT4 out1 = out0;
COMPUTE_FLOAT4 out2 = out0;
COMPUTE_FLOAT4 out3 = out0;
const int in_w_idx_base = mad24(out_w_idx, stride_hw.y, -pad_hw.y);
const int in_h0_idx_base = mad24(out_h_idx, stride_hw.x, -pad_hw.x);
const int in_h1_idx_base = in_h0_idx_base + stride_hw.x;
const int in_h2_idx_base = in_h1_idx_base + stride_hw.x;
const int in_h3_idx_base = in_h2_idx_base + stride_hw.x;
const int kw_start = select(0, (-in_w_idx_base + dilate_hw.y - 1) / dilate_hw.y, in_w_idx_base < 0);
const int in_w_idx_start = mad24(kw_start, dilate_hw.y, in_w_idx_base);
const int in_w_idx_end = min(mad24(filter_hw.y, dilate_hw.y, in_w_idx_base), in_hw.y);
const int weight_oc_offset = out_c_blocks * filter_hw.x * filter_hw.y * 4;
const int in_hw_size = in_hw.x * in_hw.y;
for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
//weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4
//index: [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0]
const int inp_offset_base = (out_b_idx + in_c_idx*batch) * in_hw.x * in_hw.y * 4;
for(int iy = 0; iy < filter_hw.x; iy++) {
int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
const int in_h0_idx = (iy * dilate_hw.x + in_h0_idx_base) * in_hw.y;
const int in_h1_idx = (iy * dilate_hw.x + in_h1_idx_base) * in_hw.y;
const int in_h2_idx = (iy * dilate_hw.x + in_h2_idx_base) * in_hw.y;
const int in_h3_idx = (iy * dilate_hw.x + in_h3_idx_base) * in_hw.y;
for(int fw = in_w_idx_start; fw < in_w_idx_end; fw += dilate_hw.y) {
COMPUTE_FLOAT4 in0 = (in_h0_idx < 0 || in_h0_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h0_idx + fw, input+inp_offset_base));
COMPUTE_FLOAT4 in1 = (in_h1_idx < 0 || in_h1_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h1_idx + fw, input+inp_offset_base));
COMPUTE_FLOAT4 in2 = (in_h2_idx < 0 || in_h2_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h2_idx + fw, input+inp_offset_base));
COMPUTE_FLOAT4 in3 = (in_h3_idx < 0 || in_h3_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h3_idx + fw, input+inp_offset_base));
COMPUTE_FLOAT4 weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset));
COMPUTE_FLOAT4 weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
COMPUTE_FLOAT4 weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset*2));
COMPUTE_FLOAT4 weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset*3));
out0 = mad(in0.x, weight0, out0);
out0 = mad(in0.y, weight1, out0);
out0 = mad(in0.z, weight2, out0);
out0 = mad(in0.w, weight3, out0);
out1 = mad(in1.x, weight0, out1);
out1 = mad(in1.y, weight1, out1);
out1 = mad(in1.z, weight2, out1);
out1 = mad(in1.w, weight3, out1);
out2 = mad(in2.x, weight0, out2);
out2 = mad(in2.y, weight1, out2);
out2 = mad(in2.z, weight2, out2);
out2 = mad(in2.w, weight3, out2);
out3 = mad(in3.x, weight0, out3);
out3 = mad(in3.y, weight1, out3);
out3 = mad(in3.z, weight2, out3);
out3 = mad(in3.w, weight3, out3);
weight_offset += 4;
}
}
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
out1 = fmax(out1, (COMPUTE_FLOAT4)0);
out2 = fmax(out2, (COMPUTE_FLOAT4)0);
out3 = fmax(out3, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
#ifdef BLOCK_LEAVE
const int remain = out_hw.x - out_h_idx;
if(remain >= 4){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out2), 2 * out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out3), 3 * out_hw.y, output+out_offset);
}else if(remain == 3){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out2), 2 * out_hw.y, output+out_offset);
}else if(remain == 2){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
}else if(remain == 1){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
}
#else
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out2), 2 * out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out3), 3 * out_hw.y, output+out_offset);
#endif
}
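// conv_2d_c8h4w1: general convolution, two output-channel blocks (8 channels)
// at 4 height positions per work-item. Weights are laid out as
// [ic/4, ic_4, oc/4, kh*kw, oc_4]; with CHANNEL_BOUNDARY_PROTECT the second
// block's bias/weights are zeroed and its stores skipped when
// out_c_idx_1 >= out_c_blocks.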
__kernel
void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS
__global const FLOAT *input,
__global const FLOAT *weight,
__global const FLOAT *bias,
__global FLOAT *output,
__private const int2 in_hw,
__private const int inChannel,
__private const int in_c_blocks,
__private const int batch,
__private const int2 out_hw,
__private const int2 filter_hw,
__private const int2 stride_hw,
__private const int2 pad_hw,
__private const int2 dilate_hw,
__private const int out_w_blocks,
__private const int out_c_blocks,
__private const int out_h_blocks,
__private const int out_c_base_index
) {
const int out_c_w_idx = get_global_id(0); //c/8 w
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx_0 = ((out_c_w_idx / out_w_blocks + out_c_base_index) << 1);
if(out_c_idx_0 >= out_c_blocks) return;
const int out_c_idx_1 = out_c_idx_0 + 1;
const int out_w_idx = out_c_w_idx % out_w_blocks;
const int out_b_idx = out_b_h_idx / out_h_blocks;//equal to in_b_idx
const int out_h_idx = (out_b_h_idx % out_h_blocks) << 2;
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias));
COMPUTE_FLOAT4 out1 = out0;
COMPUTE_FLOAT4 out2 = out0;
COMPUTE_FLOAT4 out3 = out0;
#ifdef CHANNEL_BOUNDARY_PROTECT
COMPUTE_FLOAT4 out4 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
#else
COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
#endif
COMPUTE_FLOAT4 out5 = out4;
COMPUTE_FLOAT4 out6 = out4;
COMPUTE_FLOAT4 out7 = out4;
const int in_w_idx_base = mad24(out_w_idx, stride_hw.y, -pad_hw.y);
const int in_h0_idx_base = mad24(out_h_idx, stride_hw.x, -pad_hw.x);
const int in_h1_idx_base = in_h0_idx_base + stride_hw.x;
const int in_h2_idx_base = in_h1_idx_base + stride_hw.x;
const int in_h3_idx_base = in_h2_idx_base + stride_hw.x;
const int kw_start = select(0, (-in_w_idx_base + dilate_hw.y - 1) / dilate_hw.y, in_w_idx_base < 0);
const int in_w_idx_start = mad24(kw_start, dilate_hw.y, in_w_idx_base);
const int in_w_idx_end = min(mad24(filter_hw.y, dilate_hw.y, in_w_idx_base), in_hw.y);
const int weight_oc_offset = filter_hw.x * filter_hw.y * 4;
const int weight_ic_offset = out_c_blocks * weight_oc_offset;
const int in_hw_size = in_hw.x * in_hw.y;
for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
//weights NC4HW4 [ic/4, ic_4, oc/4, kh*kw, oc_4]
//index: [0, 4*in_c_idx, out_c_idx_0*kh*kw + kh_start*kw + kw_start, 0]
const int inp_offset_base = (out_b_idx + in_c_idx * batch) * in_hw.x * in_hw.y * 4;
for(int iy = 0; iy < filter_hw.x; iy++) {
int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx_0) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
const int in_h0_idx = (iy * dilate_hw.x + in_h0_idx_base) * in_hw.y;
const int in_h1_idx = (iy * dilate_hw.x + in_h1_idx_base) * in_hw.y;
const int in_h2_idx = (iy * dilate_hw.x + in_h2_idx_base) * in_hw.y;
const int in_h3_idx = (iy * dilate_hw.x + in_h3_idx_base) * in_hw.y;
for(int fw = in_w_idx_start; fw < in_w_idx_end; fw += dilate_hw.y) {
COMPUTE_FLOAT4 in0 = (in_h0_idx < 0 || in_h0_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h0_idx + fw, input+inp_offset_base));
COMPUTE_FLOAT4 in1 = (in_h1_idx < 0 || in_h1_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h1_idx + fw, input+inp_offset_base));
COMPUTE_FLOAT4 in2 = (in_h2_idx < 0 || in_h2_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h2_idx + fw, input+inp_offset_base));
COMPUTE_FLOAT4 in3 = (in_h3_idx < 0 || in_h3_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h3_idx + fw, input+inp_offset_base));
COMPUTE_FLOAT4 weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset));
COMPUTE_FLOAT4 weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_ic_offset));
COMPUTE_FLOAT4 weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_ic_offset*2));
COMPUTE_FLOAT4 weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_ic_offset*3));
out0 = mad(in0.x, weight0, out0);
out0 = mad(in0.y, weight1, out0);
out0 = mad(in0.z, weight2, out0);
out0 = mad(in0.w, weight3, out0);
out1 = mad(in1.x, weight0, out1);
out1 = mad(in1.y, weight1, out1);
out1 = mad(in1.z, weight2, out1);
out1 = mad(in1.w, weight3, out1);
out2 = mad(in2.x, weight0, out2);
out2 = mad(in2.y, weight1, out2);
out2 = mad(in2.z, weight2, out2);
out2 = mad(in2.w, weight3, out2);
out3 = mad(in3.x, weight0, out3);
out3 = mad(in3.y, weight1, out3);
out3 = mad(in3.z, weight2, out3);
out3 = mad(in3.w, weight3, out3);
// weight: [ic/4, ic_4, oc/4, kh*kw, oc_4]
#ifdef CHANNEL_BOUNDARY_PROTECT
weight0 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
weight1 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
weight2 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
weight3 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
#else
weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
#endif
out4 = mad(in0.x, weight0, out4);
out4 = mad(in0.y, weight1, out4);
out4 = mad(in0.z, weight2, out4);
out4 = mad(in0.w, weight3, out4);
out5 = mad(in1.x, weight0, out5);
out5 = mad(in1.y, weight1, out5);
out5 = mad(in1.z, weight2, out5);
out5 = mad(in1.w, weight3, out5);
out6 = mad(in2.x, weight0, out6);
out6 = mad(in2.y, weight1, out6);
out6 = mad(in2.z, weight2, out6);
out6 = mad(in2.w, weight3, out6);
out7 = mad(in3.x, weight0, out7);
out7 = mad(in3.y, weight1, out7);
out7 = mad(in3.z, weight2, out7);
out7 = mad(in3.w, weight3, out7);
weight_offset += 4;
}
}
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
out1 = fmax(out1, (COMPUTE_FLOAT4)0);
out2 = fmax(out2, (COMPUTE_FLOAT4)0);
out3 = fmax(out3, (COMPUTE_FLOAT4)0);
out4 = fmax(out4, (COMPUTE_FLOAT4)0);
out5 = fmax(out5, (COMPUTE_FLOAT4)0);
out6 = fmax(out6, (COMPUTE_FLOAT4)0);
out7 = fmax(out7, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out4 = clamp(out4, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out5 = clamp(out5, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out6 = clamp(out6, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
int out_offset = (((out_b_idx + out_c_idx_0*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
#ifdef BLOCK_LEAVE
const int remain = out_hw.x - out_h_idx;
if(remain >= 4){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out2), 2 * out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out3), 3 * out_hw.y, output+out_offset);
}else if(remain == 3){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out2), 2 * out_hw.y, output+out_offset);
}else if(remain == 2){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
}else if(remain == 1){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
}
#ifdef CHANNEL_BOUNDARY_PROTECT
if(out_c_idx_1 >= out_c_blocks){
return;
}
#endif
out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
if(remain >= 4){
vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out6), 2 * out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out7), 3 * out_hw.y, output+out_offset);
}else if(remain == 3){
vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out6), 2 * out_hw.y, output+out_offset);
}else if(remain == 2){
vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset);
}else if(remain == 1){
vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset);
}
#else
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out2), 2 * out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out3), 3 * out_hw.y, output+out_offset);
#ifdef CHANNEL_BOUNDARY_PROTECT
if(out_c_idx_1 >= out_c_blocks){
return;
}
#endif
out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out6), 2 * out_hw.y, output+out_offset);
vstore4(CONVERT_FLOAT4(out7), 3 * out_hw.y, output+out_offset);
#endif
}
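// conv_2d_c8h2w1: general convolution, two output-channel blocks (8 channels)
// at 2 height positions per work-item.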
__kernel
void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS
__global const FLOAT *input,
__global const FLOAT *weight,
__global const FLOAT *bias,
__global FLOAT *output,
__private const int2 in_hw,
__private const int inChannel,
__private const int in_c_blocks,
__private const int batch,
__private const int2 out_hw,
__private const int2 filter_hw,
__private const int2 stride_hw,
__private const int2 pad_hw,
__private const int2 dilate_hw,
__private const int out_w_blocks,
__private const int out_c_blocks,
__private const int out_h_blocks,
__private const int out_c_base_index
) {
const int out_c_w_idx = get_global_id(0); //c/8 w
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx_0 = (out_c_w_idx / out_w_blocks + out_c_base_index) << 1;
if(out_c_idx_0 >= out_c_blocks) return;
const int out_c_idx_1 = out_c_idx_0 + 1;
const int out_w_idx = out_c_w_idx % out_w_blocks;
const int out_b_idx = out_b_h_idx / out_h_blocks;//equal to in_b_idx
const int out_h_idx = (out_b_h_idx % out_h_blocks) << 1;
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias));
COMPUTE_FLOAT4 out1 = out0;
#ifdef CHANNEL_BOUNDARY_PROTECT
COMPUTE_FLOAT4 out2 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
#else
COMPUTE_FLOAT4 out2 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
#endif
COMPUTE_FLOAT4 out3 = out2;
const int in_w_idx_base = mad24(out_w_idx, stride_hw.y, -pad_hw.y);
const int in_h0_idx_base = mad24(out_h_idx, stride_hw.x, -pad_hw.x);
const int in_h1_idx_base = in_h0_idx_base + stride_hw.x;
const int kw_start = select(0, (-in_w_idx_base + dilate_hw.y - 1) / dilate_hw.y, in_w_idx_base < 0);
const int in_w_idx_start = mad24(kw_start, dilate_hw.y, in_w_idx_base);
const int in_w_idx_end = min(mad24(filter_hw.y, dilate_hw.y, in_w_idx_base), in_hw.y);
const int weight_oc_offset = filter_hw.x * filter_hw.y * 4;
const int weight_ic_offset = out_c_blocks * weight_oc_offset;
const int in_hw_size = in_hw.x * in_hw.y;
// weight: [ic/4, oc, 4], loop: ic/4
for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
//weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4
//index: [0, 4*in_c_idx, out_c_idx_0*kh*kw + kh_start*kw + kw_start, 0]
const int inp_offset_base = (out_b_idx + in_c_idx*batch) * in_hw.x * in_hw.y * 4;
for(int iy = 0; iy < filter_hw.x; iy++) {
int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx_0) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4;
const int in_h0_idx = (iy * dilate_hw.x + in_h0_idx_base) * in_hw.y;
const int in_h1_idx = (iy * dilate_hw.x + in_h1_idx_base) * in_hw.y;
for(int fw = in_w_idx_start; fw < in_w_idx_end; fw += dilate_hw.y) {
COMPUTE_FLOAT4 in0 = (in_h0_idx < 0 || in_h0_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h0_idx + fw, input+inp_offset_base));
COMPUTE_FLOAT4 in1 = (in_h1_idx < 0 || in_h1_idx >= in_hw_size) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_h1_idx + fw, input+inp_offset_base));
COMPUTE_FLOAT4 weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset));
COMPUTE_FLOAT4 weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_ic_offset));
COMPUTE_FLOAT4 weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_ic_offset*2));
COMPUTE_FLOAT4 weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_ic_offset*3));
out0 = mad(in0.x, weight0, out0);
out0 = mad(in0.y, weight1, out0);
out0 = mad(in0.z, weight2, out0);
out0 = mad(in0.w, weight3, out0);
out1 = mad(in1.x, weight0, out1);
out1 = mad(in1.y, weight1, out1);
out1 = mad(in1.z, weight2, out1);
out1 = mad(in1.w, weight3, out1);
#ifdef CHANNEL_BOUNDARY_PROTECT
weight0 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
weight1 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
weight2 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
weight3 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
#else
weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
#endif
out2 = mad(in0.x, weight0, out2);
out2 = mad(in0.y, weight1, out2);
out2 = mad(in0.z, weight2, out2);
out2 = mad(in0.w, weight3, out2);
out3 = mad(in1.x, weight0, out3);
out3 = mad(in1.y, weight1, out3);
out3 = mad(in1.z, weight2, out3);
out3 = mad(in1.w, weight3, out3);
weight_offset += 4;
}
}
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
out1 = fmax(out1, (COMPUTE_FLOAT4)0);
out2 = fmax(out2, (COMPUTE_FLOAT4)0);
out3 = fmax(out3, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
int out_offset = (((out_b_idx + out_c_idx_0*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
#ifdef BLOCK_LEAVE
const int remain = out_hw.x - out_h_idx;
if(remain >= 2){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
}else if(remain == 1){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
}
#ifdef CHANNEL_BOUNDARY_PROTECT
if(out_c_idx_1 >= out_c_blocks){
return;
}
#endif
out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
if(remain >= 2){
vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out3), out_hw.y, output+out_offset);
}else if(remain == 1){
vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset);
}
#else
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset);
#ifdef CHANNEL_BOUNDARY_PROTECT
if(out_c_idx_1 >= out_c_blocks){
return;
}
#endif
out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out3), out_hw.y, output+out_offset);
#endif
}
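// conv_2d_c8h1w4: general convolution, two output-channel blocks (8 channels)
// at 4 width positions per work-item.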
__kernel
void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS
__global const FLOAT *input,
__global const FLOAT *weight,
__global const FLOAT *bias,
__global FLOAT *output,
__private const int2 in_hw,
__private const int inChannel,
__private const int in_c_blocks,
__private const int batch,
__private const int2 out_hw,
__private const int2 filter_hw,
__private const int2 stride_hw,
__private const int2 pad_hw,
__private const int2 dilate_hw,
__private const int out_w_blocks,
__private const int out_c_blocks,
__private const int out_h_blocks,
__private const int out_c_base_index
) {
const int out_c_w_idx = get_global_id(0); //c/8 w
const int out_b_h_idx = get_global_id(1); //b h
DEAL_NON_UNIFORM_DIM2(out_c_w_idx, out_b_h_idx);
const int out_c_idx_0 = (out_c_w_idx / out_w_blocks + out_c_base_index) << 1;
if(out_c_idx_0 >= out_c_blocks) return;
const int out_c_idx_1 = out_c_idx_0 + 1;
const int out_w_idx = (out_c_w_idx % out_w_blocks) << 2;
const int out_b_idx = out_b_h_idx / out_hw.x;//equal to in_b_idx
const int out_h_idx = out_b_h_idx % out_hw.x;
COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0, bias));
COMPUTE_FLOAT4 out1 = out0;
COMPUTE_FLOAT4 out2 = out0;
COMPUTE_FLOAT4 out3 = out0;
#ifdef CHANNEL_BOUNDARY_PROTECT
COMPUTE_FLOAT4 out4 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
#else
COMPUTE_FLOAT4 out4 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1, bias));
#endif
COMPUTE_FLOAT4 out5 = out4;
COMPUTE_FLOAT4 out6 = out4;
COMPUTE_FLOAT4 out7 = out4;
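    // out0..out3 hold columns out_w_idx..out_w_idx+3 for block out_c_idx_0, out4..out7 the same columns for
    // block out_c_idx_1; with CHANNEL_BOUNDARY_PROTECT the second bias load is zeroed when out_c_idx_1 falls
    // past out_c_blocks, so an odd block count cannot read beyond the bias buffer.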
const int in_w0_idx_base = mad24(out_w_idx, stride_hw.y, -pad_hw.y);
const int in_w1_idx_base = in_w0_idx_base + stride_hw.y;
const int in_w2_idx_base = in_w1_idx_base + stride_hw.y;
const int in_w3_idx_base = in_w2_idx_base + stride_hw.y;
const int in_h_idx_base = mad24(out_h_idx, stride_hw.x, -pad_hw.x);
const int kh_start = select(0, (-in_h_idx_base + dilate_hw.x - 1) / dilate_hw.x, in_h_idx_base < 0);
const int in_h_idx_start = mad24(kh_start, dilate_hw.x, in_h_idx_base);
const int in_h_idx_end = min(mad24(filter_hw.x, dilate_hw.x, in_h_idx_base), in_hw.x);
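    // Clamp the kernel-height range to rows inside the input: kh_start skips taps above row 0 and
    // in_h_idx_end is the exclusive bound at in_hw.x, so the iy loop needs no per-row bounds check.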
const int weight_oc_offset = filter_hw.x * filter_hw.y * 4;
const int weight_ic_offset = out_c_blocks * weight_oc_offset;
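    // Weight strides: weight_oc_offset steps to the next output-channel block (kh*kw*4 values);
    // weight_ic_offset steps to the next of the 4 input-channel lanes (out_c_blocks such planes apart).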
for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) {
        //weights layout NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1]: each of the 4 lanes of an input-channel block owns a full ocC4*kh*kw plane
        //index: [0, 4*in_c_idx, out_c_idx_0*kh*kw + kh_start*kw + 0, 0] (the fw loop below always starts at 0)
int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx_0) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4;
for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) {
const int inp_offset_base = (((out_b_idx + in_c_idx * batch) * in_hw.x + iy) * in_hw.y + 0) * 4;
for(int fw = 0; fw < filter_hw.y; fw++) {
const int in_w0_idx = fw * dilate_hw.y + in_w0_idx_base;
const int in_w1_idx = fw * dilate_hw.y + in_w1_idx_base;
const int in_w2_idx = fw * dilate_hw.y + in_w2_idx_base;
const int in_w3_idx = fw * dilate_hw.y + in_w3_idx_base;
COMPUTE_FLOAT4 in0 = (in_w0_idx < 0 || in_w0_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w0_idx, input+inp_offset_base));
COMPUTE_FLOAT4 in1 = (in_w1_idx < 0 || in_w1_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w1_idx, input+inp_offset_base));
COMPUTE_FLOAT4 in2 = (in_w2_idx < 0 || in_w2_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w2_idx, input+inp_offset_base));
COMPUTE_FLOAT4 in3 = (in_w3_idx < 0 || in_w3_idx >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(in_w3_idx, input+inp_offset_base));
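                // Columns that fall into left/right padding are replaced by zero vectors by the selects above,
                // so padded taps contribute nothing to the accumulators.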
COMPUTE_FLOAT4 weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset));
COMPUTE_FLOAT4 weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_ic_offset));
COMPUTE_FLOAT4 weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_ic_offset*2));
COMPUTE_FLOAT4 weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_ic_offset*3));
out0 = mad(in0.x, weight0, out0);
out0 = mad(in0.y, weight1, out0);
out0 = mad(in0.z, weight2, out0);
out0 = mad(in0.w, weight3, out0);
out1 = mad(in1.x, weight0, out1);
out1 = mad(in1.y, weight1, out1);
out1 = mad(in1.z, weight2, out1);
out1 = mad(in1.w, weight3, out1);
out2 = mad(in2.x, weight0, out2);
out2 = mad(in2.y, weight1, out2);
out2 = mad(in2.z, weight2, out2);
out2 = mad(in2.w, weight3, out2);
out3 = mad(in3.x, weight0, out3);
out3 = mad(in3.y, weight1, out3);
out3 = mad(in3.z, weight2, out3);
out3 = mad(in3.w, weight3, out3);
#ifdef CHANNEL_BOUNDARY_PROTECT
weight0 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
weight1 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
weight2 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
weight3 = out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
#else
weight0 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset));
weight1 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset));
weight2 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*2));
weight3 = CONVERT_COMPUTE_FLOAT4(vload4(0, weight+weight_offset+weight_oc_offset+weight_ic_offset*3));
#endif
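                // Reuse the same four input vectors against the second channel block's weights (offset by
                // weight_oc_offset); the guarded path zeroes them when out_c_idx_1 is out of range.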
out4 = mad(in0.x, weight0, out4);
out4 = mad(in0.y, weight1, out4);
out4 = mad(in0.z, weight2, out4);
out4 = mad(in0.w, weight3, out4);
out5 = mad(in1.x, weight0, out5);
out5 = mad(in1.y, weight1, out5);
out5 = mad(in1.z, weight2, out5);
out5 = mad(in1.w, weight3, out5);
out6 = mad(in2.x, weight0, out6);
out6 = mad(in2.y, weight1, out6);
out6 = mad(in2.z, weight2, out6);
out6 = mad(in2.w, weight3, out6);
out7 = mad(in3.x, weight0, out7);
out7 = mad(in3.y, weight1, out7);
out7 = mad(in3.z, weight2, out7);
out7 = mad(in3.w, weight3, out7);
weight_offset += 4;
}
}
}
#ifdef RELU
out0 = fmax(out0, (COMPUTE_FLOAT4)0);
out1 = fmax(out1, (COMPUTE_FLOAT4)0);
out2 = fmax(out2, (COMPUTE_FLOAT4)0);
out3 = fmax(out3, (COMPUTE_FLOAT4)0);
out4 = fmax(out4, (COMPUTE_FLOAT4)0);
out5 = fmax(out5, (COMPUTE_FLOAT4)0);
out6 = fmax(out6, (COMPUTE_FLOAT4)0);
out7 = fmax(out7, (COMPUTE_FLOAT4)0);
#endif
#ifdef RELU6
out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out4 = clamp(out4, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out5 = clamp(out5, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out6 = clamp(out6, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
#endif
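    // Store phase: 4 adjacent columns per channel block. Under BLOCK_LEAVE the row tail is handled by the
    // width-remainder cases below; CHANNEL_BOUNDARY_PROTECT returns before the second block's store when
    // that block does not exist.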
int out_offset = (((out_b_idx + out_c_idx_0*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
#ifdef BLOCK_LEAVE
const int remain = out_hw.y - out_w_idx;
if(remain >= 4){
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, output+out_offset);
}else if(remain == 3){
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out2), 2, output+out_offset);
}else if(remain == 2){
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0, out1)), 0, output+out_offset);
}else if(remain == 1){
vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset);
}
#ifdef CHANNEL_BOUNDARY_PROTECT
    if(out_c_idx_1 >= out_c_blocks) return;
#endif
out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
if(remain >= 4){
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, output+out_offset);
}else if(remain == 3){
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4, out5)), 0, output+out_offset);
vstore4(CONVERT_FLOAT4(out6), 2, output+out_offset);
}else if(remain == 2){
vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4, out5)), 0, output+out_offset);
}else if(remain == 1){
vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset);
}
#else
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, output+out_offset);
#ifdef CHANNEL_BOUNDARY_PROTECT
    if(out_c_idx_1 >= out_c_blocks) return;
#endif
out_offset = (((out_b_idx + (out_c_idx_1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4;
vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, output+out_offset);
#endif
}
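/* Host-side dispatch sketch for conv_2d_c8h1w4 (illustrative only, not part of this file; the actual launch
   logic is presumed to live in the backend's host code, e.g. ConvBufExecution). Derived solely from how the
   kernel decodes its global ids above, and ignoring any channel split expressed through out_c_base_index,
   a matching NDRange would be:

       // hypothetical helper, assumed here for illustration
       #define UP_DIV(x, y) (((x) + (y) - 1) / (y))

       size_t gws0 = UP_DIV(out_c_blocks, 2) * UP_DIV(out_w, 4); // channel-block pairs x blocks of 4 columns
       size_t gws1 = batch * out_h;                              // batch x output rows

   global_size_dim0/global_size_dim1 passed through GLOBAL_SIZE_2_DIMS must equal these logical sizes so that
   DEAL_NON_UNIFORM_DIM2 can discard padded work-items when the enqueued sizes are rounded up. */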