in ops/av_input_op.h [558:654]
void AVInputOp<Context>::DecodeAndTransform(
    const std::string& value,
    float* clip_rgb_data,
    float* clip_of_logmels_data,
    int* label_data,
    int64_t* video_id_data,
    std::mt19937* randgen,
    std::bernoulli_distribution* mirror_this_clip) {
  // Decodes one record and writes the transformed clips into the
  // caller-provided output buffers.
  //
  //   value                 record handed to GetClipsAndLabelsFromDBValue().
  //   clip_rgb_data         destination for the cropped RGB clips; clip i is
  //                         written at offset i * clip_offset_rgb. May be
  //                         null, in which case no RGB output is produced.
  //   clip_of_logmels_data  destination for the log-mel audio features. May
  //                         be null, in which case no audio output is
  //                         produced.
  //   label_data,
  //   video_id_data         forwarded to GetClipsAndLabelsFromDBValue().
  //   randgen               random engine used for the crop offsets and the
  //                         mirroring decision.
  //   mirror_this_clip      distribution sampled once per clip when
  //                         random_mirror_ is set.
  //
  // Any std::exception raised while decoding or transforming is reported on
  // std::cerr and swallowed; the output buffers may then be only partially
  // written.
  try {
    std::vector<unsigned char*> buffer_rgb;

    // The decoded frame buffers are raw new[] allocations owned by this
    // function. Release them through a scope guard so they are freed on
    // every exit path, including when one of the calls below throws.
    // The guard is declared after buffer_rgb, so it runs before the vector
    // itself is destroyed.
    struct FrameBufferReleaser {
      std::vector<unsigned char*>& buffers;
      ~FrameBufferReleaser() {
        for (unsigned char* buff : buffers) {
          delete[] buff;
        }
        buffers.clear();
      }
    };
    FrameBufferReleaser release_buffers{buffer_rgb};

    // get the video resolution after decoding
    int height = 0;
    int width = 0;
    // get the number of visual frames to use for synchronizing with audio
    int number_of_frames = 0;
    int clip_start_frame = 0;

    // Decode the video from memory or read from a local file
    std::vector<float> audioSamples;
    audioSamples.reserve(logMelAudioSamplingRate_);
    CHECK(GetClipsAndLabelsFromDBValue(
        value,
        height,
        width,
        buffer_rgb,
        audioSamples,
        label_data,
        video_id_data,
        number_of_frames,
        clip_start_frame,
        randgen));

    // Number of floats occupied by one transformed RGB clip in the output.
    const int clip_offset_rgb =
        channels_rgb_ * length_rgb_ * crop_size_ * crop_size_;

    // Never process more clips than were actually decoded.
    const int num_clips =
        std::min(clip_per_video_, static_cast<int>(buffer_rgb.size()));

    for (int i = 0; i < num_clips; ++i) {
      // get the offsets of the crop window
      // NOTE(review): assumes height >= crop_size_ and width >= crop_size_;
      // uniform_int_distribution requires a <= b -- confirm the decoder
      // guarantees this.
      int h_off = 0;
      int w_off = 0;
      if (random_crop_) {
        // using random crop for training
        h_off =
            std::uniform_int_distribution<>(0, height - crop_size_)(*randgen);
        w_off =
            std::uniform_int_distribution<>(0, width - crop_size_)(*randgen);
      } else {
        // using center crop for testing
        h_off = (height - crop_size_) / 2;
        w_off = (width - crop_size_) / 2;
      }

      // randomly mirror the image or not; the distribution is sampled for
      // every clip (when random_mirror_ is set) so the random stream does
      // not depend on whether RGB output is requested
      const bool mirror_me = random_mirror_ && (*mirror_this_clip)(*randgen);

      if (get_rgb_ && clip_rgb_data) {
        ClipTransformRGB(
            buffer_rgb[i],
            crop_size_,
            length_rgb_,
            channels_rgb_,
            sampling_rate_rgb_,
            height,
            width,
            h_off,
            w_off,
            mirror_me,
            mean_rgb_,
            inv_std_rgb_,
            clip_rgb_data + (i * clip_offset_rgb));
      }
    }

    if (get_logmels_ && clip_of_logmels_data) {
      ClipTransformAudioLogmel(
          decode_type_,
          get_rgb_,
          clip_rgb_data,
          number_of_frames,
          tune_audio_step_,
          logMelFrames_,
          logMelAudioSamplingRate_,
          logMelWindowSizeMs_,
          logMelWindowStepMs_,
          logMelFilters_,
          num_of_required_frame_,
          align_audio_,
          clip_per_video_,
          audio_length_,
          clip_start_frame,
          audioSamples,
          clip_of_logmels_data);
    }
  } catch (const std::exception& exc) {
    std::cerr << "While calling DecodeAndTransform()\n";
    std::cerr << exc.what();
  }
}