in src/operator/convolution_v1-inl.h [180:292]
virtual void Backward(const OpContext &ctx,
const std::vector<TBlob> &out_grad,
const std::vector<TBlob> &in_data,
const std::vector<TBlob> &out_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &in_grad,
const std::vector<TBlob> &aux_args) {
using namespace mshadow;
using namespace mshadow::expr;
// TODO(bing): check the BLAS Handle, be careful
if (param_.kernel.ndim() > 2) {
LOG(FATAL) << "Volume convolution is not implmented in mshadow";
}
CHECK_EQ(out_grad.size(), 1);
size_t expected = param_.no_bias == 0 ? 3 : 2;
CHECK(in_data.size() == expected && in_grad.size() == expected);
CHECK_EQ(req.size(), expected);
CHECK_EQ(in_data[conv_v1::kWeight].CheckContiguous(), true);
// get data
Stream<xpu> *s = ctx.get_stream<xpu>();
Tensor<xpu, 4, DType> data = in_data[conv_v1::kData].get<xpu, 4, DType>(s);
Shape<3> wmat_shape =
Shape3(param_.num_group,
param_.num_filter / param_.num_group,
data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]);
Tensor<xpu, 3, DType> wmat =
in_data[conv_v1::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
Tensor<xpu, 4, DType> grad = out_grad[conv_v1::kOut].get<xpu, 4, DType>(s);
Tensor<xpu, 4, DType> gdata = in_grad[conv_v1::kData].get<xpu, 4, DType>(s);
Tensor<xpu, 3, DType> gwmat =
in_grad[conv_v1::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
#if defined(__CUDACC__)
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
<< "Must init CuBLAS handle in stream";
#endif
const index_t nbatch = data.size(0);
Tensor<xpu, 1, DType> workspace =
ctx.requested[conv_v1::kTempSpace].get_space_typed<xpu, 1, DType>(
Shape1(this->InitTemp(data.shape_, grad.shape_)), s);
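// process the batch in chunks of at most nstep_ samples so the im2col
// buffer and the per-group GEMM inputs fit in the requested workspace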
for (index_t i = 0; i < nbatch; i += nstep_) {
const index_t step = std::min(nstep_, nbatch - i);
Tensor<xpu, 2, DType> temp_col = Tensor<xpu, 2, DType>(workspace.dptr_,
Shape2(shape_colunit_[0],
shape_colunit_[1] * step), s);
Tensor<xpu, 3, DType> temp_dst = Tensor<xpu, 3, DType>(
workspace.dptr_ + temp_col.shape_.Size(),
Shape3(shape_dstunit_[0],
shape_dstunit_[1],
shape_dstunit_[2] * step), s);
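// bring the output gradient into (num_group, num_filter / num_group, out_spatial * step) layout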
temp_dst = reshape(swapaxis<1, 0>(grad.Slice(i, i + step)), temp_dst.shape_);
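// im2col: unfold the input patches into columns, padding the input first when requested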
if (param_.pad[0] == 0 && param_.pad[1] == 0) {
temp_col = unpack_patch2col(data.Slice(i, i + step),
param_.kernel[0],
param_.kernel[1],
param_.stride[0],
param_.stride[1],
param_.dilate[0],
param_.dilate[1]);
} else {
temp_col = unpack_patch2col(pad(data.Slice(i, i + step), param_.pad[0], param_.pad[1]),
param_.kernel[0],
param_.kernel[1],
param_.stride[0],
param_.stride[1],
param_.dilate[0],
param_.dilate[1]);
}
const index_t gstride = temp_col.size(0) / param_.num_group;
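// weight gradient per group: dW[g] = dY[g] * col(X)[g]^T, accumulated across batch chunks;
// only the first chunk honours req[kWeight], later chunks always add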
for (uint32_t gid = 0; gid < param_.num_group; ++gid) {
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1));
if (i == 0) {
Tensor<xpu, 2, DType> tmp_gwmat = gwmat[gid];
Assign(tmp_gwmat, req[conv_v1::kWeight], dot(temp_dst[gid], tmpc.T()));
} else {
gwmat[gid] += dot(temp_dst[gid], tmpc.T());
}
}
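// data gradient in column space: col(dX)[g] = W[g]^T * dY[g]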
for (uint32_t gid = 0; gid < param_.num_group; ++gid) {
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1));
tmpc = dot(wmat[gid].T(), temp_dst[gid]);
}
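// col2im: fold the column gradient back into the input layout;
// when padding was applied, crop the result back to the unpadded shape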
if (param_.pad[0] == 0 && param_.pad[1] == 0) {
Assign(gdata.Slice(i, i + step), req[conv_v1::kData],
pack_col2patch(temp_col,
data.Slice(i, i + step).shape_,
param_.kernel[0],
param_.kernel[1],
param_.stride[0],
param_.stride[1],
param_.dilate[0],
param_.dilate[1]));
} else {
Shape<4> pshape = data.Slice(i, i + step).shape_;
pshape[2] += 2 * param_.pad[0];
pshape[3] += 2 * param_.pad[1];
Assign(gdata.Slice(i, i + step), req[conv_v1::kData],
crop(pack_col2patch(temp_col,
pshape,
param_.kernel[0],
param_.kernel[1],
param_.stride[0],
param_.stride[1],
param_.dilate[0],
param_.dilate[1]),
gdata[i][0].shape_));
}
}
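// bias gradient: sum the output gradient over every axis except the channel axis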
if (!param_.no_bias) {
Tensor<xpu, 1, DType> gbias = in_grad[conv_v1::kBias].get<xpu, 1, DType>(s);
Assign(gbias, req[conv_v1::kBias], sumall_except_dim<1>(grad));
}
}