void AVInputOp::DecodeAndTransform()

in ops/av_input_op.h [558:654]


// Decodes one serialized DB record and writes the transformed clips into the
// caller-provided output buffers.
//
//   value                 serialized record passed to
//                         GetClipsAndLabelsFromDBValue.
//   clip_rgb_data         destination for the cropped RGB clips; clip i is
//                         written at offset
//                         i * channels_rgb_ * length_rgb_ * crop_size_^2.
//                         RGB output is skipped when this is null or get_rgb_
//                         is false.
//   clip_of_logmels_data  destination for the log-mel audio output; skipped
//                         when this is null or get_logmels_ is false.
//   label_data,
//   video_id_data         forwarded to GetClipsAndLabelsFromDBValue.
//   randgen               random engine used for the crop offsets and the
//                         mirror decision; also forwarded to the decoder.
//   mirror_this_clip      distribution sampled once per clip when
//                         random_mirror_ is set.
//
// A false return from GetClipsAndLabelsFromDBValue trips CHECK. Exceptions
// derived from std::exception are caught, reported on std::cerr, and the
// function then returns normally; the decoded frame buffers are released on
// that path as well.
void AVInputOp<Context>::DecodeAndTransform(
    const std::string& value,
    float* clip_rgb_data,
    float* clip_of_logmels_data,
    int* label_data,
    int64_t* video_id_data,
    std::mt19937* randgen,
    std::bernoulli_distribution* mirror_this_clip) {
  try {
    std::vector<unsigned char*> buffer_rgb;

    // Scope guard: delete[]s every decoded frame buffer when the try scope is
    // left, whether normally or by an exception thrown from the decode or
    // transform calls below. Without it, a throw skipped the cleanup and
    // leaked all frames.
    // NOTE(review): assumes the decoder allocates each buffer with new[] and
    // hands ownership to this function, as the original delete[] implied.
    struct FrameBufferReleaser {
      explicit FrameBufferReleaser(std::vector<unsigned char*>& frames)
          : frames_(frames) {}
      FrameBufferReleaser(const FrameBufferReleaser&) = delete;
      FrameBufferReleaser& operator=(const FrameBufferReleaser&) = delete;
      ~FrameBufferReleaser() {
        for (unsigned char* frame : frames_) {
          delete[] frame;
        }
        frames_.clear();
      }
      std::vector<unsigned char*>& frames_;
    } release_frames(buffer_rgb);

    // Video resolution reported by the decoder.
    int height = 0;
    int width = 0;
    // Number of visual frames, used for synchronizing with audio.
    int number_of_frames = 0;
    int clip_start_frame = 0;

    // Decode the video from memory or read it from a local file.
    std::vector<float> audioSamples;
    audioSamples.reserve(logMelAudioSamplingRate_);
    CHECK(GetClipsAndLabelsFromDBValue(
        value,
        height,
        width,
        buffer_rgb,
        audioSamples,
        label_data,
        video_id_data,
        number_of_frames,
        clip_start_frame,
        randgen));

    // Number of floats one transformed RGB clip occupies in clip_rgb_data.
    const int clip_offset_rgb =
        channels_rgb_ * length_rgb_ * crop_size_ * crop_size_;
    // Never emit more clips than were decoded.
    const int clips_to_emit =
        std::min(clip_per_video_, static_cast<int>(buffer_rgb.size()));

    for (int i = 0; i < clips_to_emit; ++i) {
      // Top-left corner of the crop window.
      // NOTE(review): std::uniform_int_distribution requires lower <= upper;
      // nothing here verifies height/width >= crop_size_ -- confirm that the
      // decoder guarantees it.
      int h_off = 0;
      int w_off = 0;
      if (random_crop_) {
        // Random crop for training.
        h_off =
            std::uniform_int_distribution<>(0, height - crop_size_)(*randgen);
        w_off =
            std::uniform_int_distribution<>(0, width - crop_size_)(*randgen);
      } else {
        // Center crop for testing.
        h_off = (height - crop_size_) / 2;
        w_off = (width - crop_size_) / 2;
      }

      // Randomly mirror the clip or not. The draw happens even when RGB
      // output is disabled so the random sequence stays unchanged.
      const bool mirror_me = random_mirror_ && (*mirror_this_clip)(*randgen);

      if (get_rgb_ && clip_rgb_data) {
        ClipTransformRGB(
            buffer_rgb[i],
            crop_size_,
            length_rgb_,
            channels_rgb_,
            sampling_rate_rgb_,
            height,
            width,
            h_off,
            w_off,
            mirror_me,
            mean_rgb_,
            inv_std_rgb_,
            clip_rgb_data + (i * clip_offset_rgb));
      }
    }

    if (get_logmels_ && clip_of_logmels_data) {
      ClipTransformAudioLogmel(
          decode_type_,
          get_rgb_,
          clip_rgb_data,
          number_of_frames,
          tune_audio_step_,
          logMelFrames_,
          logMelAudioSamplingRate_,
          logMelWindowSizeMs_,
          logMelWindowStepMs_,
          logMelFilters_,
          num_of_required_frame_,
          align_audio_,
          clip_per_video_,
          audio_length_,
          clip_start_frame,
          audioSamples,
          clip_of_logmels_data);
    }
  } catch (const std::exception& exc) {
    std::cerr << "While calling DecodeAndTransform()\n";
    std::cerr << exc.what();
  }
}