in ops/av_input_op.h [302:422]
OperatorBase::template GetSingleArgument<int>("length_rgb", 0)),
sampling_rate_rgb_(OperatorBase::template GetSingleArgument<int>(
"sampling_rate_rgb",
1)),
length_of_(OperatorBase::template GetSingleArgument<int>("length_of", 0)),
sampling_rate_of_(
OperatorBase::template GetSingleArgument<int>("sampling_rate_of", 1)),
frame_gap_of_(
OperatorBase::template GetSingleArgument<int>("frame_gap_of", 1)),
random_mirror_(OperatorBase::template GetSingleArgument<bool>(
"random_mirror",
true)),
num_of_class_(
OperatorBase::template GetSingleArgument<int>("num_of_class", 0)),
use_local_file_(OperatorBase::template GetSingleArgument<bool>(
"use_local_file",
false)),
random_crop_(
OperatorBase::template GetSingleArgument<bool>("random_crop", true)),
decode_type_(
OperatorBase::template GetSingleArgument<int>("decode_type", 0)),
video_res_type_(
OperatorBase::template GetSingleArgument<int>("video_res_type", 0)),
get_rgb_(OperatorBase::template GetSingleArgument<bool>(
"get_rgb",
false)),
get_logmels_(OperatorBase::template GetSingleArgument<bool>("get_logmels",
false)),
get_video_id_(OperatorBase::template GetSingleArgument<bool>(
"get_video_id",
false)),
do_multi_label_(OperatorBase::template GetSingleArgument<bool>(
"do_multi_label",
false)),
logMelFrames_(OperatorBase::template GetSingleArgument<int>(
"logmel_frames", kNumLogMelFrames)),
logMelFilters_(OperatorBase::template GetSingleArgument<int>(
"logmel_filters", kNumLogMelFilters)),
logMelWindowSizeMs_(OperatorBase::template GetSingleArgument<int>(
"logmel_winsize_ms", kWindowLength)),
logMelWindowStepMs_(OperatorBase::template GetSingleArgument<int>(
"logmel_winstep_ms", kWindowStep)),
logMelAudioSamplingRate_(OperatorBase::template GetSingleArgument<int>(
"logmel_audio_sr", kAudioSamplingRate)),
align_audio_(OperatorBase::template GetSingleArgument<int>(
"align_audio", 1)),
audio_length_(OperatorBase::template GetSingleArgument<int>(
"audio_length", 0)),
tune_audio_step_(OperatorBase::template GetSingleArgument<bool>(
"tune_audio_step",
false)),
num_decode_threads_(OperatorBase::template GetSingleArgument<int>(
"num_decode_threads", 4)),
thread_pool_(std::make_shared<TaskThreadPool>(num_decode_threads_)) {
try {
num_of_required_frame_ = 0;
// Mean and std for normalizing the different optical flow data types.
// Note that the statistics are generated from SOA, and you may
// want to change them if you are running on a different dataset.
// Each dimension represents: horizontal component of optical flow,
// vertical component of optical flow, magnitude of optical flow,
// Gray, R, G, B.
const std::vector<float> InputDataMean = {0.0046635, 0.0046261,
0.963986, 102.976, 110.201, 100.64, 95.9966};
const std::vector<float> InputDataStd = {0.972347, 0.755146,
1.43588, 55.3691, 58.1489, 56.4701, 55.3324};
// if we need RGB as an input
if (get_rgb_) {
// how many frames we need for RGB
num_of_required_frame_ = std::max(
num_of_required_frame_, (length_rgb_ - 1) * sampling_rate_rgb_ + 1);
channels_rgb_ = 3;
for (int i = 4; i < 7; i++) {
mean_rgb_.push_back(InputDataMean[i]);
inv_std_rgb_.push_back(1.f / InputDataStd[i]);
}
}
CheckParamsAndPrint();
// Always need a dbreader, even when using local video files
CAFFE_ENFORCE_GT(
operator_def.input_size(), 0, "Need to have a DBReader blob input");
vector<int64_t> data_shape(5);
vector<int64_t> label_shape(2);
vector<int64_t> logmels_shape(4);
// for RGB data
data_shape[0] = batch_size_ * clip_per_video_;
data_shape[1] = channels_rgb_;
data_shape[2] = length_rgb_;
data_shape[3] = crop_size_;
data_shape[4] = crop_size_;
prefetched_clip_rgb_.Resize(data_shape);
// for audio data
logmels_shape[0] = batch_size_ * clip_per_video_;
logmels_shape[1] = 1;
logmels_shape[2] = logMelFrames_;
logmels_shape[3] = logMelFilters_;
prefetched_clip_logmels_.Resize(logmels_shape);
// If do_multi_label is used, the output label is a binary vector
// of length num_of_class indicating which labels are present.
if (do_multi_label_) {
label_shape[0] = batch_size_ * clip_per_video_;
label_shape[1] = num_of_class_;
prefetched_label_.Resize(label_shape);
} else {
prefetched_label_.Resize(
vector<int64_t>(1, batch_size_ * clip_per_video_));
}
prefetched_video_id_.Resize(
vector<int64_t>(1, batch_size_ * clip_per_video_));
} catch (const std::exception& exc) {
std::cerr << "While calling AVInputOp initialization\n";
std::cerr << exc.what();
}
}