Tensor CpuConvForward()

in src/model/operation/convolution.cc [96:225]
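Computes the forward pass of a 2D convolution on CPU. When built with USE_DNNL, the work is delegated to DNNL (oneDNN), including automatic layout reordering of the input and weights; without it, only a commented-out naive im2col + GEMM fallback remains, so the DNNL build is effectively required for this path.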


Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b,
                      const ConvHandle &ch) {
  CHECK_EQ(x.device()->lang(), kCpp);

  CHECK(x.shape(1) == ch.channels && x.shape(2) == ch.height &&
        x.shape(3) == ch.width)
      << "input sample shape should not change";

  CHECK(W.shape(0) == ch.num_filters && W.shape(1) == ch.channels &&
        W.shape(2) == ch.kernel_h && W.shape(3) == ch.kernel_w)
      << "weights shape should not change";

#ifdef USE_DNNL
  DataType dtype = x.data_type();
  auto dev = x.device();

  Shape shape{ch.batchsize, ch.num_filters, ch.conv_height, ch.conv_width};
  Tensor output(shape, dev, dtype);

  output.device()->Exec(
      [output, x, &W, &b, &ch](Context *ctx) mutable {
        using namespace dnnl;
        using tag = memory::format_tag;
        auto eng = ctx->dnnl_engine;
        auto s = ctx->dnnl_stream;
        auto dtype = dnnl::memory::data_type::f32;
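        // note: dtype is hard-coded to f32 here, so this DNNL path assumes
        // float32 tensors regardless of x.data_type()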

        // DNNL design pattern:
        // xxx_user_xxx_memory (and its format tag) describes the data as laid
        // out by the caller; it may need to be reordered into the layout the
        // primitive prefers
        auto conv_user_src_memory = memory({{ch.x_dims}, dtype, tag::nchw}, eng,
                                           x.block()->mutable_data());
        auto conv_user_weights_memory = memory({{ch.w_dims}, dtype, tag::goihw},
                                               eng, W.block()->mutable_data());
        auto conv_user_bias_memory = memory({{ch.b_dims}, dtype, tag::x}, eng,
                                            b.block()->mutable_data());

        // the xxx_md descriptors exist only to build conv_desc; their format
        // tag is `any` so DNNL is free to choose the optimal layout
        auto conv_src_md = memory::desc({ch.x_dims}, dtype, tag::any);
        auto conv_bias_md = memory::desc({ch.b_dims}, dtype, tag::any);
        auto conv_weights_md = memory::desc({ch.w_dims}, dtype, tag::any);
        auto conv_dst_md = memory::desc(
            {ch.o_dims}, dtype,
            tag::nchw);  // cannot be `any`: the output buffer is handed back
                         // to the caller directly, with no reorder afterwards

        auto conv_desc = convolution_forward::desc(
            prop_kind::forward, algorithm::convolution_direct, conv_src_md,
            conv_weights_md, conv_bias_md, conv_dst_md, ch.s_dims, ch.p_dims,
            ch.p_dims);
        auto conv_pd = convolution_forward::primitive_desc(conv_desc, eng);

        // auto conv_pd = *ch.conv_pd;  // reusing the cached primitive_desc
        // was measured to be 1 ms to 70 ms slower

        // memory placeholders for a possible reorder; replaced below if the
        // primitive prefers a different layout
        auto conv_src_memory = conv_user_src_memory;
        auto conv_weights_memory = conv_user_weights_memory;

        // output memory
        auto conv_dst_memory =
            memory(conv_pd.dst_desc(), eng, output.block()->mutable_data());

        // Tensors backing the reorder buffers; performance testing showed no
        // significant improvement from this
        Tensor x_reo;
        x_reo.ResetLike(x);
        Tensor W_reo;
        W_reo.ResetLike(W);

        if (conv_pd.src_desc() != conv_user_src_memory.get_desc()) {
          conv_src_memory =
              memory(conv_pd.src_desc(), eng, x_reo.block()->mutable_data());
          reorder(conv_user_src_memory, conv_src_memory)
              .execute(s, {{DNNL_ARG_FROM, conv_user_src_memory},
                           {DNNL_ARG_TO, conv_src_memory}});
        }
        if (conv_pd.weights_desc() != conv_user_weights_memory.get_desc()) {
          conv_weights_memory = memory(conv_pd.weights_desc(), eng,
                                       W_reo.block()->mutable_data());
          reorder(conv_user_weights_memory, conv_weights_memory)
              .execute(s, {{DNNL_ARG_FROM, conv_user_weights_memory},
                           {DNNL_ARG_TO, conv_weights_memory}});
        }
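        // the bias is 1-D (tag::x), so it never needs a reorder and the user
        // memory is passed to the primitive directly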

        // execute the forward primitive
        convolution_forward(conv_pd).execute(
            s, {{DNNL_ARG_SRC, conv_src_memory},
                {DNNL_ARG_WEIGHTS, conv_weights_memory},
                {DNNL_ARG_BIAS, conv_user_bias_memory},
                {DNNL_ARG_DST, conv_dst_memory}});

        // synchronize stream
        s.wait();
      },
      {x.block(), W.block(), b.block()}, {output.block()}, "CpuConvForward");

  return output;
#else   // naive C++ fallback, disabled: build error from importing Im2col
/*
  Shape w_shape = W.shape();
  Shape b_shape;
  if (ch.bias_term) b_shape = b.shape();

  W.Reshape(Shape{ch.num_filters, ch.col_height});
  if (ch.bias_term) b.Reshape(Shape{ch.num_filters});

  DataType dtype = x.data_type();
  auto dev = x.device();
  Shape shape{ch.batchsize, ch.num_filters, ch.conv_height, ch.conv_width};
  Tensor output(shape, dev, dtype);
  Tensor col_data(Shape{ch.col_height, ch.col_width});  // im2col-unrolled image

  float *data_col = new float[ch.col_height * ch.col_width];
  auto in_data = x.data<float>();
  for (size_t num = 0; num < ch.batchsize; num++) {
    Im2col(in_data + num * ch.imagesize, ch.channels, ch.height, ch.width,
           ch.kernel_h, ch.kernel_w, ch.pad_h, ch.pad_w, ch.stride_h,
           ch.stride_w, data_col);

    col_data.CopyDataFromHostPtr(data_col, ch.col_height * ch.col_width);
    Tensor each = Mult(W, col_data);
    if (ch.bias_term) {
      AddColumn(b, &each);
    }
    CopyDataToFrom(&output, each, each.Size(), num * each.Size());
  }
  delete[] data_col;  // free the im2col scratch buffer
  W.Reshape(w_shape);
  if (ch.bias_term) b.Reshape(b_shape);
  return output;
*/
#endif  // USE_DNNL
}
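
Usage (illustrative): the sketch below shows one way to drive this function. It is a minimal sketch, not taken from the source tree; it assumes a ConvHandle constructor of the form (input, kernel_size, stride, padding, in_channels, out_channels, bias, groups), which may differ between versions, so check convolution.h for the exact signature.

  // Hypothetical driver, assuming the ConvHandle signature noted above.
  using namespace singa;

  Tensor x(Shape{8, 3, 32, 32});  // NCHW input batch on the default (CPU) device
  Tensor W(Shape{16, 3, 3, 3});   // 16 filters, each 3 channels x 3x3
  Tensor b(Shape{16});            // one bias per filter
  Uniform(-0.1f, 0.1f, &x);
  Uniform(-0.1f, 0.1f, &W);
  Uniform(-0.1f, 0.1f, &b);

  ConvHandle ch(x, {3, 3}, {1, 1}, {1, 1},
                /*in_channels=*/3, /*out_channels=*/16,
                /*bias=*/true, /*groups=*/1);
  Tensor y = CpuConvForward(x, W, b, ch);  // y has shape {8, 16, 32, 32}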