src/operator/contrib/proposal.cc (371 lines of code) (raw):

/*! * Copyright (c) 2015 by Contributors * \file proposal.cc * \brief * \author Piotr Teterwak, Bing Xu, Jian Guo */ #include "./proposal-inl.h" //============================ // Bounding Box Transform Utils //============================ namespace mxnet { namespace op { namespace utils { // bbox prediction and clip to the image borders inline void BBoxTransformInv(const mshadow::Tensor<cpu, 2>& boxes, const mshadow::Tensor<cpu, 4>& deltas, const float im_height, const float im_width, const int real_height, const int real_width, mshadow::Tensor<cpu, 2> *out_pred_boxes) { CHECK_GE(boxes.size(1), 4); CHECK_GE(out_pred_boxes->size(1), 4); int anchors = deltas.size(1)/4; int heights = deltas.size(2); int widths = deltas.size(3); for (int a = 0; a < anchors; ++a) { for (int h = 0; h < heights; ++h) { for (int w = 0; w < widths; ++w) { index_t index = h * (widths * anchors) + w * (anchors) + a; float width = boxes[index][2] - boxes[index][0] + 1.0; float height = boxes[index][3] - boxes[index][1] + 1.0; float ctr_x = boxes[index][0] + 0.5 * (width - 1.0); float ctr_y = boxes[index][1] + 0.5 * (height - 1.0); float dx = deltas[0][a*4 + 0][h][w]; float dy = deltas[0][a*4 + 1][h][w]; float dw = deltas[0][a*4 + 2][h][w]; float dh = deltas[0][a*4 + 3][h][w]; float pred_ctr_x = dx * width + ctr_x; float pred_ctr_y = dy * height + ctr_y; float pred_w = exp(dw) * width; float pred_h = exp(dh) * height; float pred_x1 = pred_ctr_x - 0.5 * (pred_w - 1.0); float pred_y1 = pred_ctr_y - 0.5 * (pred_h - 1.0); float pred_x2 = pred_ctr_x + 0.5 * (pred_w - 1.0); float pred_y2 = pred_ctr_y + 0.5 * (pred_h - 1.0); pred_x1 = std::max(std::min(pred_x1, im_width - 1.0f), 0.0f); pred_y1 = std::max(std::min(pred_y1, im_height - 1.0f), 0.0f); pred_x2 = std::max(std::min(pred_x2, im_width - 1.0f), 0.0f); pred_y2 = std::max(std::min(pred_y2, im_height - 1.0f), 0.0f); (*out_pred_boxes)[index][0] = pred_x1; (*out_pred_boxes)[index][1] = pred_y1; (*out_pred_boxes)[index][2] = pred_x2; (*out_pred_boxes)[index][3] = pred_y2; if (h >= real_height || w >= real_width) { (*out_pred_boxes)[index][4] = -1.0; } } } } } // iou prediction and clip to the image border inline void IoUTransformInv(const mshadow::Tensor<cpu, 2>& boxes, const mshadow::Tensor<cpu, 4>& deltas, const float im_height, const float im_width, const int real_height, const int real_width, mshadow::Tensor<cpu, 2> *out_pred_boxes) { CHECK_GE(boxes.size(1), 4); CHECK_GE(out_pred_boxes->size(1), 4); int anchors = deltas.size(1)/4; int heights = deltas.size(2); int widths = deltas.size(3); for (int a = 0; a < anchors; ++a) { for (int h = 0; h < heights; ++h) { for (int w = 0; w < widths; ++w) { index_t index = h * (widths * anchors) + w * (anchors) + a; float x1 = boxes[index][0]; float y1 = boxes[index][1]; float x2 = boxes[index][2]; float y2 = boxes[index][3]; float dx1 = deltas[0][a * 4 + 0][h][w]; float dy1 = deltas[0][a * 4 + 1][h][w]; float dx2 = deltas[0][a * 4 + 2][h][w]; float dy2 = deltas[0][a * 4 + 3][h][w]; float pred_x1 = x1 + dx1; float pred_y1 = y1 + dy1; float pred_x2 = x2 + dx2; float pred_y2 = y2 + dy2; pred_x1 = std::max(std::min(pred_x1, im_width - 1.0f), 0.0f); pred_y1 = std::max(std::min(pred_y1, im_height - 1.0f), 0.0f); pred_x2 = std::max(std::min(pred_x2, im_width - 1.0f), 0.0f); pred_y2 = std::max(std::min(pred_y2, im_height - 1.0f), 0.0f); (*out_pred_boxes)[index][0] = pred_x1; (*out_pred_boxes)[index][1] = pred_y1; (*out_pred_boxes)[index][2] = pred_x2; (*out_pred_boxes)[index][3] = pred_y2; if (h >= real_height || w >= real_width) { (*out_pred_boxes)[index][4] = -1.0f; } } } } } // filter box by set confidence to zero // * height or width < rpn_min_size inline void FilterBox(mshadow::Tensor<cpu, 2> *dets, const float min_size) { for (index_t i = 0; i < dets->size(0); i++) { float iw = (*dets)[i][2] - (*dets)[i][0] + 1.0f; float ih = (*dets)[i][3] - (*dets)[i][1] + 1.0f; if (iw < min_size || ih < min_size) { (*dets)[i][0] -= min_size / 2; (*dets)[i][1] -= min_size / 2; (*dets)[i][2] += min_size / 2; (*dets)[i][3] += min_size / 2; (*dets)[i][4] = -1.0f; } } } } // namespace utils } // namespace op } // namespace mxnet //===================== // NMS Utils //===================== namespace mxnet { namespace op { namespace utils { struct ReverseArgsortCompl { const float *val_; explicit ReverseArgsortCompl(float *val) : val_(val) {} bool operator() (float i, float j) { return (val_[static_cast<index_t>(i)] > val_[static_cast<index_t>(j)]); } }; // copy score and init order inline void CopyScore(const mshadow::Tensor<cpu, 2>& dets, mshadow::Tensor<cpu, 1> *score, mshadow::Tensor<cpu, 1> *order) { for (index_t i = 0; i < dets.size(0); i++) { (*score)[i] = dets[i][4]; (*order)[i] = i; } } // sort order array according to score inline void ReverseArgsort(const mshadow::Tensor<cpu, 1>& score, mshadow::Tensor<cpu, 1> *order) { ReverseArgsortCompl cmpl(score.dptr_); std::sort(order->dptr_, order->dptr_ + score.size(0), cmpl); } // reorder proposals according to order and keep the pre_nms_top_n proposals // dets.size(0) == pre_nms_top_n inline void ReorderProposals(const mshadow::Tensor<cpu, 2>& prev_dets, const mshadow::Tensor<cpu, 1>& order, const index_t pre_nms_top_n, mshadow::Tensor<cpu, 2> *dets) { CHECK_EQ(dets->size(0), pre_nms_top_n); for (index_t i = 0; i < dets->size(0); i++) { const index_t index = order[i]; for (index_t j = 0; j < dets->size(1); j++) { (*dets)[i][j] = prev_dets[index][j]; } } } // greedily keep the max detections (already sorted) inline void NonMaximumSuppression(const mshadow::Tensor<cpu, 2>& dets, const float thresh, const index_t post_nms_top_n, mshadow::Tensor<cpu, 1> *area, mshadow::Tensor<cpu, 1> *suppressed, mshadow::Tensor<cpu, 1> *keep, index_t *out_size) { CHECK_EQ(dets.shape_[1], 5) << "dets: [x1, y1, x2, y2, score]"; CHECK_GT(dets.shape_[0], 0); CHECK_EQ(dets.CheckContiguous(), true); CHECK_EQ(area->CheckContiguous(), true); CHECK_EQ(suppressed->CheckContiguous(), true); CHECK_EQ(keep->CheckContiguous(), true); // calculate area for (index_t i = 0; i < dets.size(0); ++i) { (*area)[i] = (dets[i][2] - dets[i][0] + 1) * (dets[i][3] - dets[i][1] + 1); } // calculate nms *out_size = 0; for (index_t i = 0; i < dets.size(0) && (*out_size) < post_nms_top_n; ++i) { float ix1 = dets[i][0]; float iy1 = dets[i][1]; float ix2 = dets[i][2]; float iy2 = dets[i][3]; float iarea = (*area)[i]; if ((*suppressed)[i] > 0.0f) { continue; } (*keep)[(*out_size)++] = i; for (index_t j = i + 1; j < dets.size(0); j ++) { if ((*suppressed)[j] > 0.0f) { continue; } float xx1 = std::max(ix1, dets[j][0]); float yy1 = std::max(iy1, dets[j][1]); float xx2 = std::min(ix2, dets[j][2]); float yy2 = std::min(iy2, dets[j][3]); float w = std::max(0.0f, xx2 - xx1 + 1.0f); float h = std::max(0.0f, yy2 - yy1 + 1.0f); float inter = w * h; float ovr = inter / (iarea + (*area)[j] - inter); if (ovr > thresh) { (*suppressed)[j] = 1.0f; } } } } } // namespace utils } // namespace op } // namespace mxnet namespace mxnet { namespace op { template<typename xpu> class ProposalOp : public Operator{ public: explicit ProposalOp(ProposalParam param) { this->param_ = param; } virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data, const std::vector<OpReqType> &req, const std::vector<TBlob> &out_data, const std::vector<TBlob> &aux_states) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_data.size(), 3); CHECK_EQ(out_data.size(), 2); CHECK_GT(req.size(), 1); CHECK_EQ(req[proposal::kOut], kWriteTo); CHECK_EQ(in_data[proposal::kClsProb].shape_[0], 1) << "Sorry, multiple images each device is not implemented."; Stream<xpu> *s = ctx.get_stream<xpu>(); Shape<4> scores_shape = Shape4(in_data[proposal::kClsProb].shape_[0], in_data[proposal::kClsProb].shape_[1] / 2, in_data[proposal::kClsProb].shape_[2], in_data[proposal::kClsProb].shape_[3]); real_t* foreground_score_ptr = in_data[proposal::kClsProb].dptr<real_t>() + scores_shape.Size(); Tensor<cpu, 4> scores = Tensor<cpu, 4>(foreground_score_ptr, scores_shape); Tensor<cpu, 4> bbox_deltas = in_data[proposal::kBBoxPred].get<cpu, 4, real_t>(s); Tensor<cpu, 2> im_info = in_data[proposal::kImInfo].get<cpu, 2, real_t>(s); Tensor<cpu, 2> out = out_data[proposal::kOut].get<cpu, 2, real_t>(s); Tensor<cpu, 2> out_score = out_data[proposal::kScore].get<cpu, 2, real_t>(s); int num_anchors = in_data[proposal::kClsProb].shape_[1] / 2; int height = scores.size(2); int width = scores.size(3); int count = num_anchors * height * width; int rpn_pre_nms_top_n = (param_.rpn_pre_nms_top_n > 0) ? param_.rpn_pre_nms_top_n : count; rpn_pre_nms_top_n = std::min(rpn_pre_nms_top_n, count); int rpn_post_nms_top_n = std::min(param_.rpn_post_nms_top_n, rpn_pre_nms_top_n); int workspace_size = count * 5 + 2 * count + rpn_pre_nms_top_n * 5 + 3 * rpn_pre_nms_top_n; Tensor<cpu, 1> workspace = ctx.requested[proposal::kTempResource].get_space<cpu>( Shape1(workspace_size), s); int start = 0; Tensor<cpu, 2> workspace_proposals(workspace.dptr_ + start, Shape2(count, 5)); start += count * 5; Tensor<cpu, 2> workspace_pre_nms(workspace.dptr_ + start, Shape2(2, count)); start += 2 * count; Tensor<cpu, 2> workspace_ordered_proposals(workspace.dptr_ + start, Shape2(rpn_pre_nms_top_n, 5)); start += rpn_pre_nms_top_n * 5; Tensor<cpu, 2> workspace_nms(workspace.dptr_ + start, Shape2(3, rpn_pre_nms_top_n)); start += 3 * rpn_pre_nms_top_n; CHECK_EQ(workspace_size, start) << workspace_size << " " << start << std::endl; // Generate anchors std::vector<float> base_anchor(4); base_anchor[0] = 0.0; base_anchor[1] = 0.0; base_anchor[2] = param_.feature_stride - 1.0; base_anchor[3] = param_.feature_stride - 1.0; CHECK_EQ(num_anchors, param_.ratios.info.size() * param_.scales.info.size()); std::vector<float> anchors; utils::GenerateAnchors(base_anchor, param_.ratios.info, param_.scales.info, &anchors); std::memcpy(workspace_proposals.dptr_, &anchors[0], sizeof(float) * anchors.size()); // Enumerate all shifted anchors for (index_t i = 0; i < num_anchors; ++i) { for (index_t j = 0; j < height; ++j) { for (index_t k = 0; k < width; ++k) { index_t index = j * (width * num_anchors) + k * (num_anchors) + i; workspace_proposals[index][0] = workspace_proposals[i][0] + k * param_.feature_stride; workspace_proposals[index][1] = workspace_proposals[i][1] + j * param_.feature_stride; workspace_proposals[index][2] = workspace_proposals[i][2] + k * param_.feature_stride; workspace_proposals[index][3] = workspace_proposals[i][3] + j * param_.feature_stride; workspace_proposals[index][4] = scores[0][i][j][k]; } } } // prevent padded predictions int real_height = static_cast<int>(im_info[0][0] / param_.feature_stride); int real_width = static_cast<int>(im_info[0][1] / param_.feature_stride); CHECK_GE(height, real_height) << height << " " << real_height << std::endl; CHECK_GE(width, real_width) << width << " " << real_width << std::endl; if (param_.iou_loss) { utils::IoUTransformInv(workspace_proposals, bbox_deltas, im_info[0][0], im_info[0][1], real_height, real_width, &(workspace_proposals)); } else { utils::BBoxTransformInv(workspace_proposals, bbox_deltas, im_info[0][0], im_info[0][1], real_height, real_width, &(workspace_proposals)); } utils::FilterBox(&workspace_proposals, param_.rpn_min_size * im_info[0][2]); Tensor<cpu, 1> score = workspace_pre_nms[0]; Tensor<cpu, 1> order = workspace_pre_nms[1]; utils::CopyScore(workspace_proposals, &score, &order); utils::ReverseArgsort(score, &order); utils::ReorderProposals(workspace_proposals, order, rpn_pre_nms_top_n, &workspace_ordered_proposals); index_t out_size = 0; Tensor<cpu, 1> area = workspace_nms[0]; Tensor<cpu, 1> suppressed = workspace_nms[1]; Tensor<cpu, 1> keep = workspace_nms[2]; suppressed = 0; // surprised! utils::NonMaximumSuppression(workspace_ordered_proposals, param_.threshold, rpn_post_nms_top_n, &area, &suppressed, &keep, &out_size); // fill in output rois for (index_t i = 0; i < out.size(0); ++i) { // batch index 0 out[i][0] = 0; if (i < out_size) { index_t index = keep[i]; for (index_t j = 0; j < 4; ++j) { out[i][j + 1] = workspace_ordered_proposals[index][j]; } } else { index_t index = keep[i % out_size]; for (index_t j = 0; j < 4; ++j) { out[i][j + 1] = workspace_ordered_proposals[index][j]; } } } // fill in output score for (index_t i = 0; i < out_score.size(0); i++) { if (i < out_size) { index_t index = keep[i]; out_score[i][0] = workspace_ordered_proposals[index][4]; } else { index_t index = keep[i % out_size]; out_score[i][0] = workspace_ordered_proposals[index][4]; } } } virtual void Backward(const OpContext &ctx, const std::vector<TBlob> &out_grad, const std::vector<TBlob> &in_data, const std::vector<TBlob> &out_data, const std::vector<OpReqType> &req, const std::vector<TBlob> &in_grad, const std::vector<TBlob> &aux_states) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_grad.size(), 3); Stream<xpu> *s = ctx.get_stream<xpu>(); Tensor<xpu, 4> gscores = in_grad[proposal::kClsProb].get<xpu, 4, real_t>(s); Tensor<xpu, 4> gbbox = in_grad[proposal::kBBoxPred].get<xpu, 4, real_t>(s); Tensor<xpu, 2> ginfo = in_grad[proposal::kImInfo].get<xpu, 2, real_t>(s); // can not assume the grad would be zero Assign(gscores, req[proposal::kClsProb], 0); Assign(gbbox, req[proposal::kBBoxPred], 0); Assign(ginfo, req[proposal::kImInfo], 0); } private: ProposalParam param_; }; // class ProposalOp template<> Operator *CreateOp<cpu>(ProposalParam param) { return new ProposalOp<cpu>(param); } Operator* ProposalProp::CreateOperator(Context ctx) const { DO_BIND_DISPATCH(CreateOp, param_); } DMLC_REGISTER_PARAMETER(ProposalParam); MXNET_REGISTER_OP_PROPERTY(_contrib_Proposal, ProposalProp) .describe("Generate region proposals via RPN") .add_argument("cls_score", "NDArray-or-Symbol", "Score of how likely proposal is object.") .add_argument("bbox_pred", "NDArray-or-Symbol", "BBox Predicted deltas from anchors for proposals") .add_argument("im_info", "NDArray-or-Symbol", "Image size and scale.") .add_arguments(ProposalParam::__FIELDS__()); } // namespace op } // namespace mxnet