void CustomizedVideoInputOp::DecodeAndTransform()

in caffe2_customized_ops/video/customized_video_input_op.h [358:488]


template <class Context>
void CustomizedVideoInputOp<Context>::DecodeAndTransform(
    const std::string value,
    float* clip_data,
    int* label_data,
    const int crop_size,  // -1 means no cropping
    const bool mirror,
    const float mean,
    const float std,
    std::mt19937* randgen,
    std::bernoulli_distribution* mirror_this_clip,
    int* height_out,
    int* width_out
  ) {
  float* buffer = nullptr;

  // Decode the video from memory or read from a local file
  int height_raw = -1;
  int width_raw = -1;
  int height_scaled = -1;
  int width_scaled = -1;
  CHECK(GetClipAndLabelFromDBValue(
    value, buffer, label_data, randgen, height_raw, width_raw)
  );
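  // GetClipAndLabelFromDBValue decodes the frames into `buffer` (presumably an
  // out-parameter allocated inside the helper), fills in the label, and reports
  // the raw frame dimensions; `buffer` is released at the end of this function.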

  if ((height_raw <= 0) || (width_raw <= 0)) return;

  const int num_clips = 1;

  for (int i = 0; i < num_clips; i++) {
    if (use_scale_augmentaiton_) {
      const int buffer_sample_size = height_raw * width_raw * length_ * 3;
      float* buffer_scaled = nullptr;

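      // Rescale the raw frames into buffer_scaled (allocated by ScaleTransform);
      // min_size_ / max_size_ presumably bound the rescaled size, with the exact
      // scale chosen via randgen for scale augmentation.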
      ScaleTransform(
          buffer + buffer_sample_size * i,
          3,
          length_,
          height_raw,
          width_raw,
          max_size_,
          min_size_,
          buffer_scaled,
          randgen,
          height_scaled,
          width_scaled);


      // determine the returned output size
      if (i == 0) {
        if (crop_size > 0) {
          *height_out = crop_size;
          *width_out = crop_size;
        } else {
          *height_out = height_scaled;
          *width_out = width_scaled;

          // avoid extreme aspect ratios (even when returning the "full" image)
          const float ratio_max = 1.6f;
          const float ratio = (height_scaled > width_scaled) ?
            ((float)height_scaled / width_scaled) :
            ((float)width_scaled / height_scaled);

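          // e.g. a 320 x 640 scaled frame has ratio 2.0 > 1.6, so the longer
          // side is capped below: width_out = (int)(1.6 * 320) = 512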
          if (ratio > ratio_max) {
            if (height_scaled > width_scaled) {
                *height_out = (int)(ratio_max * width_scaled);
            } else {
                *width_out = (int)(ratio_max * height_scaled);
            }
            // LOG(INFO) << "Truncate image: "
            //   << "width: " << width_scaled << ", height: " << height_scaled
            //   << " to width: " << *width_out << ", height: " << *height_out;
          } // if ratio > ratio_max
        } // else crop_size > 0
      } // if i

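      // Each output clip holds length_ frames of height_out x width_out pixels
      // with 3 channels, laid out as floats in clip_data.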
      const int clip_size = *height_out * *width_out * length_ * 3;
      int spatial_pos = -1;
      if (use_multi_crop_ > 0) {
        // crop along the longer side
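        // The DB value is a TensorProtos; proto index 3 presumably stores a
        // precomputed spatial position so the crop location is deterministic
        // rather than chosen inside ClipTransformFlex.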
        TensorProtos protos;
        CAFFE_ENFORCE(protos.ParseFromString(value));
        const TensorProto& spatial_pos_proto = protos.protos(3);
        spatial_pos = spatial_pos_proto.int32_data(0);
      }

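      // Crop (and optionally mirror) the scaled frames to (height_out, width_out),
      // normalize with mean/std, and write the result into this clip's slice of
      // clip_data; channel order follows the use_bgr_ flag.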
      ClipTransformFlex(
          buffer_scaled,
          3,
          length_,
          height_scaled,
          width_scaled,
          (*height_out),
          (*width_out),
          mirror,
          mean,
          std,
          clip_data + clip_size * i,
          randgen,
          mirror_this_clip,
          is_test_,
          use_bgr_,
          spatial_pos
        );

      if (buffer_scaled != nullptr)
        delete[] buffer_scaled;
    } else {
      LOG(FATAL) << "We don't recommend using unrestricted input size, "
                 << "as it is heavily dependent on dataset preparation.";

      // const int buffer_sample_size = height * width * length_ * 3;
      // it would cause problems if the side < crop_size
      // ClipTransform(
      //     buffer + buffer_sample_size * i,
      //     3,
      //     length_,
      //     height,
      //     width,
      //     crop_size + ,
      //     mirror,
      //     mean,
      //     std,
      //     clip_data + clip_size * i,
      //     randgen,
      //     mirror_this_clip,
      //     is_test_);
    } // else
  } // i
  delete[] buffer;
}