in tensorflow_io/core/kernels/audio_video_mp4_kernels.cc [537:681]
// Encodes float PCM audio as AAC-LC and muxes the packets into an in-memory
// MP4 container.
//
// Inputs:
//   "input": float tensor; dimension 1 is taken as the channel count, so it is
//            expected to be a [samples, channels] matrix.
//   "rate":  single-element int64 tensor with the sample rate; it must be one
//            of the rates in AUDIO_SPECIFIC_CONFIG_SAMPLING_RATE_TABLE.
// Output 0: scalar tstring that receives the bytes written by the muxer.
//
// Reports InvalidArgument through `context` (and returns early) when the
// inputs are malformed, the rate or channel count is unsupported, or the
// encoder / muxer reports a failure.
void Compute(OpKernelContext* context) override {
  const Tensor* input_tensor;
  OP_REQUIRES_OK(context, context->input("input", &input_tensor));
  const Tensor* rate_tensor;
  OP_REQUIRES_OK(context, context->input("rate", &rate_tensor));

  // Validate shapes before touching dim_size(1) / scalar<>(): both accessors
  // assert internally instead of returning a status on malformed inputs.
  OP_REQUIRES(context, (input_tensor->dims() == 2),
              errors::InvalidArgument(
                  "input must be a 2-D tensor [samples, channels], got shape ",
                  input_tensor->shape().DebugString()));
  OP_REQUIRES(context, (rate_tensor->NumElements() == 1),
              errors::InvalidArgument(
                  "rate must hold exactly one element, got shape ",
                  rate_tensor->shape().DebugString()));

  const int64 channels = input_tensor->shape().dim_size(1);
  // A zero channel count would match table slot 0 below and later be used as
  // a divisor when sizing the packet buffers, so reject it explicitly.
  OP_REQUIRES(context, (channels > 0),
              errors::InvalidArgument("channels ", channels, " not supported"));
  OP_REQUIRES(
      context, (channels == static_cast<int16>(channels)),
      errors::InvalidArgument("channels ", channels, " > max(int16)"));
  const int64 rate = rate_tensor->scalar<int64>()();
  OP_REQUIRES(context, (rate == static_cast<int32>(rate)),
              errors::InvalidArgument("rate ", rate, " > max(int32)"));

  // Code from buildAacAudioSpecificConfig:
  // ExoPlayer/library/core/src/main/java/com/google/android/exoplayer2/util/CodecSpecificDataUtil.java
  static const int AUDIO_SPECIFIC_CONFIG_SAMPLING_RATE_TABLE[] = {
      96000, 88200, 64000, 48000, 44100, 32000, 24000,
      22050, 16000, 12000, 11025, 8000,  7350};
  static const int AUDIO_SPECIFIC_CONFIG_CHANNEL_COUNT_TABLE[] = {
      0,  //
      1,  // mono: <FC>
      2,  // stereo: (FL, FR)
      3,  // 3.0: <FC>, (FL, FR)
      4,  // 4.0: <FC>, (FL, FR), <BC>
      5,  // 5.0 back: <FC>, (FL, FR), (SL, SR)
      6,  // 5.1 back: <FC>, (FL, FR), (SL, SR), <BC>, [LFE]
      8,  // 7.1 wide back: <FC>, (FCL, FCR), (FL, FR), (SL, SR), [LFE]
          // AUDIO_SPECIFIC_CONFIG_CHANNEL_CONFIGURATION_INVALID,
          // AUDIO_SPECIFIC_CONFIG_CHANNEL_CONFIGURATION_INVALID,
          // AUDIO_SPECIFIC_CONFIG_CHANNEL_CONFIGURATION_INVALID,
          // 7, // 6.1: <FC>, (FL, FR), (SL, SR), <RC>, [LFE]
          // 8, // 7.1: <FC>, (FL, FR), (SL, SR), (BL, BR), [LFE]
          // AUDIO_SPECIFIC_CONFIG_CHANNEL_CONFIGURATION_INVALID,
          // 8, // 7.1 top: <FC>, (FL, FR), (SL, SR), [LFE], (FTL, FTR)
          // AUDIO_SPECIFIC_CONFIG_CHANNEL_CONFIGURATION_INVALID
  };
  const int sampling_rate_count =
      static_cast<int>(sizeof(AUDIO_SPECIFIC_CONFIG_SAMPLING_RATE_TABLE) /
                       sizeof(AUDIO_SPECIFIC_CONFIG_SAMPLING_RATE_TABLE[0]));
  const int channel_count_entries =
      static_cast<int>(sizeof(AUDIO_SPECIFIC_CONFIG_CHANNEL_COUNT_TABLE) /
                       sizeof(AUDIO_SPECIFIC_CONFIG_CHANNEL_COUNT_TABLE[0]));

  const int audioObjectType = 2;  // AUDIO_OBJECT_TYPE_AAC_LC

  // The table position (not the rate itself) is what gets packed into the
  // decoder specific info below.
  int sampleRateIndex = -1;
  for (int i = 0; i < sampling_rate_count; ++i) {
    if (rate == AUDIO_SPECIFIC_CONFIG_SAMPLING_RATE_TABLE[i]) {
      sampleRateIndex = i;
      break;
    }
  }
  OP_REQUIRES(
      context, (sampleRateIndex >= 0),
      errors::InvalidArgument("sample rate ", rate, " not supported"));

  // Likewise the channel configuration is the table position of the count.
  int channelConfig = -1;
  for (int i = 0; i < channel_count_entries; ++i) {
    if (channels == AUDIO_SPECIFIC_CONFIG_CHANNEL_COUNT_TABLE[i]) {
      channelConfig = i;
      break;
    }
  }
  OP_REQUIRES(
      context, (channelConfig >= 0),
      errors::InvalidArgument("channels ", channels, " not supported"));

  // Encoder state is released through EncodeAACFunctionFini on every exit
  // path, including the early returns taken by OP_REQUIRES.
  std::unique_ptr<void, void (*)(void*)> state(nullptr, [](void* p) {
    if (p != nullptr) {
      EncodeAACFunctionFini(p);
    }
  });
  state.reset(EncodeAACFunctionInit(0, rate, channels));
  OP_REQUIRES(context, (state.get() != nullptr),
              errors::InvalidArgument("unable to initialize encoder"));

  const float* data_in = input_tensor->flat<float>().data();
  const int64_t size_in = input_tensor->NumElements();
  // One slot per 1024 samples per channel, plus one extra slot. `chunk` is
  // passed by pointer to the encoder and is used afterwards as the number of
  // packets to mux.
  int64_t chunk = size_in / channels / 1024 + 1;
  std::vector<char*> data_out_chunk(static_cast<size_t>(chunk));
  std::vector<int64_t> size_out_chunk(static_cast<size_t>(chunk));
  int status =
      EncodeAACFunctionCall(state.get(), data_in, size_in,
                            data_out_chunk.data(), size_out_chunk.data(),
                            &chunk);
  OP_REQUIRES(context, (status == 0),
              errors::InvalidArgument("unable to encode aac"));

  Tensor* output_tensor = nullptr;
  OP_REQUIRES_OK(
      context, context->allocate_output(0, TensorShape({}), &output_tensor));
  tstring& output = output_tensor->scalar<tstring>()();

  int64 size_out = 0;
  for (int64 i = 0; i < chunk; i++) {
    size_out += size_out_chunk[i];
  }
  // At least size_out + 4096
  output.reserve(size_out + 4096);

  // The muxer is closed by the deleter on early return, or explicitly at the
  // end of this method.
  std::unique_ptr<MP4E_mux_t, void (*)(MP4E_mux_t*)> mux(
      nullptr, [](MP4E_mux_t* p) {
        if (p != nullptr) {
          MP4E_close(p);
        }
      });
  mux.reset(MP4E_open(0, 0, &output, AudioEncodeMP4AACWriteCallback));
  OP_REQUIRES(context, (mux.get() != nullptr),
              errors::InvalidArgument("unable open mux"));

  // Zero-initialize so that no member is handed to the muxer uninitialized.
  MP4E_track_t tr = {};
  tr.track_media_kind = e_audio;
  tr.language[0] = 'u';
  tr.language[1] = 'n';
  tr.language[2] = 'd';
  tr.language[3] = 0;
  tr.object_type_indication = MP4_OBJECT_TYPE_AUDIO_ISO_IEC_14496_3;
  tr.time_scale = rate;
  tr.default_duration = 0;
  tr.u.a.channelcount = channels;
  const int audio_track_id = MP4E_add_track(mux.get(), &tr);
  // NOTE(review): assumes minimp4 reports failures as negative status codes
  // and valid track ids as non-negative -- confirm against minimp4.h.
  OP_REQUIRES(
      context, (audio_track_id >= 0),
      errors::InvalidArgument("unable to add audio track: ", audio_track_id));

  // Two-byte decoder specific info: 5 bits object type, 4 bits sample rate
  // index, 4 bits channel configuration, remaining bits zero.
  unsigned char dsi[2];
  dsi[0] = static_cast<unsigned char>(((audioObjectType << 3) & 0xF8) |
                                      ((sampleRateIndex >> 1) & 0x07));
  dsi[1] = static_cast<unsigned char>(((sampleRateIndex << 7) & 0x80) |
                                      ((channelConfig << 3) & 0x78));
  status = MP4E_set_dsi(mux.get(), audio_track_id, dsi, sizeof(dsi));
  OP_REQUIRES(context, (status == 0),
              errors::InvalidArgument("unable to set dsi: ", status));

  // Each encoded packet is written with a duration of 1024 time-scale units.
  for (int64 i = 0; i < chunk; i++) {
    status =
        MP4E_put_sample(mux.get(), audio_track_id, data_out_chunk[i],
                        size_out_chunk[i], 1024, MP4E_SAMPLE_RANDOM_ACCESS);
    OP_REQUIRES(
        context, (status == 0),
        errors::InvalidArgument("unable to mux packet ", i, ":", status));
  }

  // close mux
  mux.reset(nullptr);
}