in caffe2_customized_ops/video/customized_video_input_op.h [133:250]
OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
mean_(OperatorBase::template GetSingleArgument<float>("mean", 0.)),
std_(OperatorBase::template GetSingleArgument<float>("std", 1.)),
crop_(OperatorBase::template GetSingleArgument<int>("crop", -1)),
scale_h_(OperatorBase::template GetSingleArgument<int>("height", 0)),
scale_w_(OperatorBase::template GetSingleArgument<int>("width", 0)),
length_(OperatorBase::template GetSingleArgument<int>("length", 0)),
sampling_rate_(
OperatorBase::template GetSingleArgument<int>("sampling_rate", 1)),
mirror_(OperatorBase::template GetSingleArgument<int>("mirror", 0)),
temporal_jitter_(
OperatorBase::template GetSingleArgument<int>("temporal_jitter", 1)),
use_image_(OperatorBase::template GetSingleArgument<int>("use_image", 0)),
multiple_label_(
OperatorBase::template GetSingleArgument<int>("multiple_label", 0)),
num_of_labels_(
OperatorBase::template GetSingleArgument<int>("num_of_labels", 0)),
use_local_file_(
OperatorBase::template GetSingleArgument<int>("use_local_file", 0)),
is_test_(OperatorBase::template GetSingleArgument<int>("is_test", 0)),
im_extension_(
OperatorBase::template GetSingleArgument<string>("im_extension", "")),
num_decode_threads_(
OperatorBase::template GetSingleArgument<int>("decode_threads", 4)),
use_bgr_(OperatorBase::template GetSingleArgument<int>("use_bgr", 0)),
min_size_(OperatorBase::template GetSingleArgument<int>("min_size", 256)),
max_size_(OperatorBase::template GetSingleArgument<int>("max_size", 480)),
use_scale_augmentaiton_(
OperatorBase::template GetSingleArgument<int>(
"use_scale_augmentaiton", 0)),
sample_times_(
OperatorBase::template GetSingleArgument<int>("sample_times", 10)),
use_multi_crop_(
OperatorBase::template GetSingleArgument<int>("use_multi_crop", 0)),
thread_pool_(new TaskThreadPool(num_decode_threads_)) {
CAFFE_ENFORCE_GT(batch_size_, 0, "Batch size should be nonnegative.");
// CAFFE_ENFORCE_GE(scale_h_, 0, "Must provide the scale value.");
// CAFFE_ENFORCE_GE(scale_w_, 0, "Must provide the cropping value.");
CAFFE_ENFORCE_GT(length_, 0, "Must provide the clip length value.");
// CAFFE_ENFORCE_GT(crop_, 0, "Must provide the cropping value.");
// CAFFE_ENFORCE_GE(
// scale_h_,
// crop_,
// "The scaled height must be no smaller than the crop value.");
// CAFFE_ENFORCE_GE(
// scale_w_,
// crop_,
// "The scaled width must be no smaller than the crop value.");
if (multiple_label_) {
CAFFE_ENFORCE_GT(
num_of_labels_,
0,
"Number of labels must be set for using multiple label output.");
}
if (crop_ <= 0){ // not cropping
CAFFE_ENFORCE_EQ(
is_test_,
1,
"Cannot use spatial uncrop at training.");
}
// Always need a dbreader, even when using local video files
CAFFE_ENFORCE_GT(
operator_def.input_size(), 0, "Need to have a DBReader blob input");
LOG(INFO) << "Creating a clip input op with the following setting: ";
LOG(INFO) << " Using " << num_decode_threads_ << " CPU threads;";
if (temporal_jitter_) {
LOG(INFO) << " Using temporal jittering;";
}
LOG(INFO) << " Outputting in batches of " << batch_size_ << " clips;";
LOG(INFO) << " Scaling image to " << scale_h_ << "x" << scale_w_;
LOG(INFO) << " Cropping video frame to " << crop_
<< (mirror_ ? " with " : " without ") << "random mirroring;";
LOG(INFO) << " Using " << (is_test_ ? "center" : "random") << " crop";
LOG(INFO) << " Using a clip of " << length_ << " frames;";
LOG(INFO) << " Using a sampling rate of 1:" << sampling_rate_;
LOG(INFO) << " Subtract mean " << mean_ << " and divide by std " << std_
<< ".";
// extra attributes follow
LOG(INFO) << " Scaling image from " << min_size_ << " to " << max_size_;
LOG(INFO) << " Using scale augmentaiton?: " << use_scale_augmentaiton_ ;
LOG(INFO) << " Using BGR order?: " << use_bgr_ ;
LOG(INFO) << " Using sample_times_:" << sample_times_;
LOG(INFO) << " Using use_multi_crop_: " << use_multi_crop_ ;
vector<TIndex> data_shape(5);
vector<TIndex> label_shape(2);
data_shape[0] = batch_size_;
// Assume color videos; the output always has 3 channels, even for
// black & white input videos
data_shape[1] = 3;
data_shape[2] = length_;
if (crop_ > 0) {
data_shape[3] = crop_;
data_shape[4] = crop_;
} else { // uncrop
data_shape[3] = 320; // rough estimate
data_shape[4] = 320; // rough estimate
}
prefetched_clip_.Resize(data_shape);
// If multiple labels are used, the output label is a binary vector of length
// num_of_labels_ indicating which labels are present
if (multiple_label_) {
label_shape[0] = batch_size_;
label_shape[1] = num_of_labels_;
prefetched_label_.Resize(label_shape);
} else {
prefetched_label_.Resize(vector<TIndex>(1, batch_size_));
}
}