in src/operator/softmax_output-inl.h [108:228]
  virtual void Backward(const OpContext &ctx,
                        const std::vector<TBlob> &out_grad,
                        const std::vector<TBlob> &in_data,
                        const std::vector<TBlob> &out_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &in_grad,
                        const std::vector<TBlob> &aux_args) {
    using namespace mshadow;
    using namespace mshadow::expr;
    CHECK_EQ(in_data.size(), 2U);
    CHECK_EQ(out_grad.size(), 1U);
    CHECK_GE(in_grad.size(), 1U);
    CHECK_GE(req.size(), 1U);
    Stream<xpu> *s = ctx.get_stream<xpu>();
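    // The gradient is computed in one of three ways, depending on how the
    // label is provided:
    //   1. the label has the same shape as the output -> it is a per-class
    //      probability distribution (soft label);
    //   2. param_.multi_output is set -> softmax runs along axis 1 and the
    //      label carries one class index per spatial position;
    //   3. otherwise -> the data is viewed as 2D and the label holds one
    //      class index per example.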
    if (out_data[softmaxout_enum::kOut].shape_ ==
        in_data[softmaxout_enum::kLabel].shape_) {
      // Use the probability distribution itself as the label: the gradient of
      // the cross-entropy loss w.r.t. the softmax input is (out - label),
      // scaled by grad_scale and, when requested, by the incoming out_grad.
      Tensor<xpu, 2, DType> label = in_data[softmaxout_enum::kLabel].FlatTo2D<xpu, DType>(s);
      Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
      Tensor<xpu, 2, DType> grad = in_grad[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
      if (param_.out_grad) {
        Tensor<xpu, 2, DType> ograd = out_grad[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
        grad = scalar<DType>(param_.grad_scale) * (out - label) * ograd;
      } else {
        grad = (out - label) * scalar<DType>(param_.grad_scale);
      }
    } else if (param_.multi_output) {
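      // Multi-output mode: softmax was taken along axis 1, so view the output
      // and the gradient as (batch, num_class, spatial) and the label as
      // (batch, spatial), one class index per spatial position.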
      int n = out_data[softmaxout_enum::kOut].size(0);
      int k = out_data[softmaxout_enum::kOut].size(1);
      Shape<3> s3 = Shape3(n, k, static_cast<int>(out_data[softmaxout_enum::kOut].Size()/n/k));
      Shape<2> s2 = Shape2(s3[0], s3[2]);
      Tensor<xpu, 2, DType> label =
          in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 2, DType>(s2, s);
      Tensor<xpu, 3, DType> out =
          out_data[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
      Tensor<xpu, 3, DType> grad =
          in_grad[softmaxout_enum::kData].get_with_shape<xpu, 3, DType>(s3, s);
      index_t valid_cnt = label.shape_.Size();
      if (param_.use_ignore) {
        SoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label));
      } else {
        SoftmaxGrad(grad, out, label);
      }
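      // Normalization: divide by the batch size (kBatch), by the number of
      // labels that differ from ignore_label (kValid), or not at all (kNull,
      // valid_cnt stays 1).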
      if (param_.normalization == softmaxout_enum::kBatch) {
        valid_cnt = label.size(0);
      } else if (param_.normalization == softmaxout_enum::kValid) {
        int i_label = static_cast<int>(param_.ignore_label);
        Tensor<cpu, 2, DType> workspace =
            ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<2, DType>(
                label.shape_);
        Copy(workspace, label, label.stream_);
        for (index_t i = 0; i < workspace.size(0); ++i) {
          for (index_t j = 0; j < workspace.size(1); ++j) {
            if (static_cast<int>(workspace[i][j]) == i_label) {
              valid_cnt--;
            }
          }
        }
        valid_cnt = valid_cnt == 0 ? 1 : valid_cnt;
      } else {
        valid_cnt = 1;
      }
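      // Scale the gradient.  For kValid the number of contributing positions
      // is already counted in valid_cnt, so the extra division by the spatial
      // size s3[2] is only applied for the other normalization modes.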
      grad *= DType(param_.grad_scale /
                    (param_.normalization == softmaxout_enum::kValid ? 1 : s3[2]) /
                    valid_cnt);
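      // When out_grad is enabled, chain the head gradient flowing in from the
      // output into the result instead of treating this op as the terminal loss.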
      if (param_.out_grad) {
        Tensor<xpu, 3, DType> ograd =
            out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
        grad *= ograd;
      }
    } else {
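      // Default case: one integer class label per example.  The output is
      // viewed as a 2D matrix -- (batch, rest) normally, or (leading dims,
      // last dim) when preserve_shape is set -- and SoftmaxGrad produces
      // out - one_hot(label) for each row.  E.g. for a row out = {0.2, 0.5,
      // 0.3} with label = 1, the gradient row is {0.2, -0.5, 0.3}.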
      Shape<1> label_shape = Shape1(in_data[softmaxout_enum::kLabel].Size());
      Shape<2> data_shape;
      if (param_.preserve_shape) {
        data_shape = out_data[softmaxout_enum::kOut].shape_.FlatTo2D();
        // Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].FlatTo1D<xpu, DType>(s);
        // Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
        // Tensor<xpu, 2, DType> grad = in_grad[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
      } else {
        int n = out_data[softmaxout_enum::kOut].size(0);
        data_shape = Shape2(n, out_data[softmaxout_enum::kOut].Size()/n);
      }
      Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 1, DType>(
          label_shape, s);
      Tensor<xpu, 2, DType> out =
          out_data[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(data_shape, s);
      Tensor<xpu, 2, DType> grad =
          in_grad[softmaxout_enum::kData].get_with_shape<xpu, 2, DType>(data_shape, s);
      index_t valid_cnt = label.shape_.Size();
      if (param_.use_ignore) {
        SoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label));
      } else {
        SoftmaxGrad(grad, out, label);
      }
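      // kValid normalization needs the number of labels that differ from
      // ignore_label; the labels are copied into host memory so the count can
      // be taken on the CPU regardless of where the data lives.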
      if (param_.normalization == softmaxout_enum::kBatch) {
        valid_cnt = label.size(0);
      } else if (param_.normalization == softmaxout_enum::kValid) {
        int i_label = static_cast<int>(param_.ignore_label);
        Tensor<cpu, 1, DType> workspace =
            ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<1, DType>(
                label.shape_);
        Copy(workspace, label, label.stream_);
        for (index_t i = 0; i < label.size(0); ++i) {
          if (static_cast<int>(workspace[i]) == i_label) {
            valid_cnt--;
          }
        }
        valid_cnt = valid_cnt == 0 ? 1 : valid_cnt;
      } else {
        valid_cnt = 1;
      }
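      // Final scaling: grad_scale divided by valid_cnt, i.e. by the label
      // count (kBatch), the number of non-ignored labels (kValid), or 1 (kNull).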
      grad *= DType(param_.grad_scale / valid_cnt);
      if (param_.out_grad) {
        Tensor<xpu, 2, DType> ograd =
            out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(data_shape, s);
        grad *= ograd;
      }
    }
  }