in ops/av_io.cc [72:266]
void ClipTransformAudioLogmel(
const int decode_type_,
const bool get_rgb_,
const float* clip_rgb_data,
const int number_of_frames,
const bool tune_audio_step_,
const int logMelFrames_,
const int logMelAudioSamplingRate_,
const int logMelWindowSizeMs_,
const int logMelWindowStepMs_,
const int logMelFilters_,
const int num_of_required_frame_,
const int align_audio_,
const int clip_per_video_,
const int audio_length_,
const int clip_start_frame,
std::vector<float> audioSamples,
float* clip_of_logmels_data
) {
int newlogMelWindowStepMs_ = logMelWindowStepMs_;
int newAudioLength_ = audio_length_;
int newClipStartFrame_ = clip_start_frame;
if (get_rgb_ && clip_rgb_data && number_of_frames && tune_audio_step_) {
float framesNeed =
number_of_frames * logMelFrames_ / num_of_required_frame_;
float adjInterval =
logMelWindowSizeMs_ * logMelAudioSamplingRate_ / 1000.0f;
float newAdjStep =
(audioSamples.size() - adjInterval) / (framesNeed - 1);
int newStep =
std::ceil(newAdjStep * 1000.0f / logMelAudioSamplingRate_);
newlogMelWindowStepMs_ = std::max(newStep, 1);
}
LogSpectrum spectrum(
logMelFilters_,
logMelAudioSamplingRate_,
logMelWindowSizeMs_,
newlogMelWindowStepMs_
);
spectrum.Write(audioSamples.data(), audioSamples.size());
int framesRead = 0;
vector<vector<float>> logmels;
while(true){
vector<float> buffer(logMelFilters_);
if (spectrum.Read(buffer.data()) <= 0){
break;
}
framesRead++;
logmels.push_back(buffer);
}
if (get_rgb_ && clip_rgb_data && number_of_frames &&
framesRead && align_audio_) {
if (align_audio_ > 1 && newAudioLength_ == 0) {
if (clip_per_video_ == 1 && framesRead >= logMelFrames_) {
int audio_start_frame = std::floor(
newClipStartFrame_ * framesRead / number_of_frames);
int audio_left_start = audio_start_frame;
if (align_audio_ == 3) { // align by center
int clip_end_frame = newClipStartFrame_ + num_of_required_frame_;
int audio_end_frame = std::ceil(
clip_end_frame * framesRead / number_of_frames);
int audio_middle_frame = std::round(
(audio_start_frame + audio_end_frame) / 2);
audio_left_start =
std::max<int>(audio_middle_frame - std::ceil(logMelFrames_ / 2), 0);
}
int audio_right_end =
std::min(audio_left_start + logMelFrames_, framesRead);
logmels.erase(logmels.begin() + audio_right_end, logmels.end());
if (audio_left_start > 0 &&
audio_left_start < static_cast<int>(logmels.size())) {
logmels.erase(
logmels.begin(), logmels.begin() + audio_left_start);
}
framesRead = static_cast<int>(logmels.size());
}
} else {
cv::Mat1f logmel_cv(framesRead, logMelFilters_);
cv::Mat1f logmel_interpolated;
if (align_audio_ == 1) { // perfect align
newAudioLength_ = num_of_required_frame_;
}
framesRead = std::ceil(
number_of_frames * logMelFrames_ / newAudioLength_);
for (int i = 0; i < logmel_cv.rows; ++i) {
for (int j = 0; j < logmel_cv.cols; ++j) {
logmel_cv.at<float>(i,j) = logmels[i][j];
}
}
if (framesRead >= logmel_cv.rows) {
cv::resize(logmel_cv, logmel_interpolated,
cv::Size(logMelFilters_, framesRead));
} else {
cv::resize(logmel_cv, logmel_interpolated,
cv::Size(logMelFilters_, framesRead), 0, 0,
cv::INTER_NEAREST);
}
logmels.clear();
if (decode_type_ == DecodeType::DO_UNIFORM_SMP) {
for(int i = 0; i < logmel_interpolated.rows; ++i) {
vector<float> buffer(logMelFilters_);
for (int j = 0; j < logmel_interpolated.cols; ++j) {
buffer[j] = logmel_interpolated.at<float>(i,j);
}
logmels.push_back(buffer);
}
} else {
if (newAudioLength_ != num_of_required_frame_ &&
align_audio_ == 3) { // center align
int clip_middle_frame =
std::round(newClipStartFrame_ + num_of_required_frame_ / 2);
newClipStartFrame_ =
std::max(clip_middle_frame - newAudioLength_ / 2, 0);
}
int audio_start_frame = std::floor(
newClipStartFrame_ * framesRead / number_of_frames);
int audio_end_frame = std::min(
logmel_interpolated.rows, audio_start_frame + logMelFrames_);
for(int i = audio_start_frame; i < audio_end_frame; ++i) {
vector<float> buffer(logMelFilters_);
for (int j = 0; j < logmel_interpolated.cols; ++j) {
buffer[j] = logmel_interpolated.at<float>(i,j);
}
logmels.push_back(buffer);
}
}
framesRead = static_cast<int>(logmels.size());
}
// pad during testing; disable for now due to poor performance;
// if (decode_type_ == DecodeType::DO_UNIFORM_SMP &&
// align_audio_ > 1 && clip_per_video_ > 1) {
// int audio_pad_difference = 0;
// int aligned_audio_length = std::ceil(
// num_of_required_frame_ * framesRead / number_of_frames);
// if (logMelFrames_ > aligned_audio_length) {
// audio_pad_difference = logMelFrames_ - aligned_audio_length;
// }
// if (align_audio_ == 3) {
// vector<float> pad(logMelFilters_);
// logmels.insert(
// logmels.begin(), std::floor(audio_pad_difference / 2), pad);
// logmels.insert(
// logmels.end(), std::ceil(audio_pad_difference / 2), pad);
// }
// if (align_audio_ == 2) {
// vector<float> pad(logMelFilters_);
// logmels.insert(
// logmels.end(), audio_pad_difference, pad);
// }
// }
}
float audio_pad_difference = 0.0;
if (decode_type_ == DecodeType::DO_UNIFORM_SMP &&
align_audio_ > 1 && clip_per_video_ > 1 && number_of_frames) {
float aligned_audio_length =
num_of_required_frame_ * framesRead / number_of_frames;
audio_pad_difference = logMelFrames_ - aligned_audio_length;
}
const float frameStep = std::max<float>(
0, (framesRead + audio_pad_difference - logMelFrames_))
/ std::max<int>(1, clip_per_video_ - 1);
const int clipSize = logMelFilters_ * logMelFrames_;
for (int i = 0; i < clip_per_video_; ++i){
float* clip_of_logmels_data_start = clip_of_logmels_data + i * clipSize;
memset(clip_of_logmels_data_start, 0,
clipSize * sizeof(float));
float start_left_shift = 0.0;
if (align_audio_ > 1 && clip_per_video_ > 1
&& i * frameStep + logMelFrames_ > framesRead) {
// last frame, uniform sampling, not perfect alignment, shift by the
// audio_pad_difference
start_left_shift = i * frameStep + logMelFrames_ - framesRead;
} else if (align_audio_ == 3 && clip_per_video_ > 1) {
// align by center, every frame needs to shift by half of pad diff
start_left_shift = audio_pad_difference / 2;
}
const int startFrame =
std::max<int>(std::floor(i * frameStep - start_left_shift), 0);
const int endFrame =
std::min<int>(startFrame + logMelFrames_, framesRead);
for (int j = startFrame; j < endFrame; ++j){
memcpy(clip_of_logmels_data_start + (j - startFrame) * logMelFilters_,
logmels[j].data(), logMelFilters_ * sizeof(float));
}
VLOG(2) << "Copied data " << (startFrame) << " to " << endFrame
<< " total frames " << framesRead << " index " << i
<< " out of " << clip_per_video_;
}
}