torchaudio/csrc/pybind/sox/io.cpp

#include <torchaudio/csrc/pybind/sox/effects.h>
#include <torchaudio/csrc/pybind/sox/effects_chain.h>
#include <torchaudio/csrc/pybind/sox/io.h>
#include <torchaudio/csrc/pybind/sox/utils.h>
#include <torchaudio/csrc/sox/io.h>
#include <torchaudio/csrc/sox/types.h>

#include <cstdlib>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>

using namespace torchaudio::sox_utils;

namespace torchaudio::sox_io {

/// Read audio metadata from a Python file-like object.
///
/// Only the leading chunk of the file object is read (see the capacity
/// computation below); it is handed to libsox as an in-memory file so that
/// the header can be parsed.
///
/// @param fileobj Python file-like object to read from.
/// @param format  Optional libsox file type; when absent, `nullptr` is passed
///                to `sox_open_mem_read` as the file type.
/// @return Tuple of (sample rate, `signal.length / signal.channels`,
///         number of channels, bits per sample, encoding name).
auto get_info_fileobj(py::object fileobj, c10::optional<std::string> format)
    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
  // Prepare in-memory file object
  // When libsox opens a file, it also reads the header.
  // When opening a file there are two functions that might touch FILE* (and the
  // underlying buffer).
  // * `auto_detect_format`
  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
  // * `startread` handler of detected format.
  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
  // To see the handler of a particular format, go to
  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
  // For example, vorbis can be found
  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
  //
  // `auto_detect_format` function only requires 256 bytes, but format-dependent
  // `startread` handler might require more data. In case of vorbis, the size of
  // header is unbounded, but typically 4kB maximum.
  //
  // "The header size is unbounded, although for streaming a rule-of-thumb of
  // 4kB or less is recommended (and Xiph.Org's Vorbis encoder follows this
  // suggestion)."
  //
  // See:
  // https://xiph.org/vorbis/doc/Vorbis_I_spec.html
  const auto capacity = [&]() {
    // NOTE:
    // Use the abstraction provided by `libtorchaudio` to access the global
    // config defined by libsox. Directly using `sox_get_globals` function will
    // end up retrieving the static variable defined in `_torchaudio`, which is
    // not correct.
    const auto bufsiz = get_buffer_size();
    const int64_t kDefaultCapacityInBytes = 4096;
    return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
                                              : kDefaultCapacityInBytes;
  }();

  std::string buffer(capacity, '\0');
  // `capacity` is at least 4096, so the string is never empty and `&buffer[0]`
  // yields a writable pointer to its storage without casting away constness.
  char* buf = &buffer[0];
  auto num_read = read_fileobj(&fileobj, capacity, buf);

  // If the file is shorter than 256, then libsox cannot read the header.
  // The buffer is zero-initialized and larger than 256 bytes, so reporting a
  // size of 256 stays within the allocation.
  auto buf_size = (num_read > 256) ? num_read : 256;

  SoxFormat sf(sox_open_mem_read(
      buf,
      buf_size,
      /*signal=*/nullptr,
      /*encoding=*/nullptr,
      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));

  // In case of streamed data, length can be 0
  validate_input_memfile(sf);

  return std::make_tuple(
      static_cast<int64_t>(sf->signal.rate),
      static_cast<int64_t>(sf->signal.length / sf->signal.channels),
      static_cast<int64_t>(sf->signal.channels),
      static_cast<int64_t>(sf->encoding.bits_per_sample),
      get_encoding(sf->encoding.encoding));
}

/// Load audio from a Python file-like object.
///
/// Builds an effect list from `frame_offset` / `num_frames` via `get_effects`
/// and delegates the actual decoding to
/// `torchaudio::sox_effects::apply_effects_fileobj`.
///
/// @return Whatever `apply_effects_fileobj` returns: a (tensor, int64) tuple.
auto load_audio_fileobj(
    py::object fileobj,
    c10::optional<int64_t> frame_offset,
    c10::optional<int64_t> num_frames,
    c10::optional<bool> normalize,
    c10::optional<bool> channels_first,
    c10::optional<std::string> format)
    -> std::tuple<torch::Tensor, int64_t> {
  auto effects = get_effects(frame_offset, num_frames);
  return torchaudio::sox_effects::apply_effects_fileobj(
      std::move(fileobj),
      effects,
      normalize,
      channels_first,
      std::move(format));
}

namespace {

// Helper class to automatically release buffer, to be used by
// save_audio_fileobj.
//
// The addresses of `ptr` and `size` are handed to `sox_open_memstream_write`;
// whatever `ptr` holds at destruction is released with `free`.
// NOTE(review): presumably libsox allocates this buffer with the malloc
// family -- confirm against the libsox memstream implementation.
struct AutoReleaseBuffer {
  char* ptr = nullptr;
  size_t size = 0;

  AutoReleaseBuffer() = default;
  // Non-copyable and non-movable: the buffer has exactly one owner.
  AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete;
  AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete;
  auto operator=(const AutoReleaseBuffer& other) -> AutoReleaseBuffer& = delete;
  auto operator=(AutoReleaseBuffer&& other) -> AutoReleaseBuffer& = delete;

  ~AutoReleaseBuffer() {
    if (ptr) {
      std::free(ptr);
    }
  }
};

// Throws std::runtime_error unless the channel dimension of `tensor` (dim 0
// when `channels_first`, dim 1 otherwise) has size 1.
void require_single_channel(
    const torch::Tensor& tensor,
    bool channels_first,
    const std::string& filetype) {
  const auto num_channels = tensor.size(channels_first ? 0 : 1);
  if (num_channels != 1) {
    throw std::runtime_error(
        filetype + " format only supports single channel audio.");
  }
}

} // namespace

/// Encode a tensor with libsox and write the result to a Python file-like
/// object.
///
/// The audio is encoded into an in-memory stream; after the effects chain has
/// run and the stream has been closed, the remaining buffer contents are
/// passed to `fileobj.write`.
///
/// @param fileobj         Python object exposing a `write` method.
/// @param tensor          Audio data; checked by `validate_input_tensor`.
/// @param sample_rate     Sample rate of `tensor`.
/// @param channels_first  Whether dim 0 (true) or dim 1 (false) is channels.
/// @param compression     Optional compression setting forwarded to
///                        `get_encodinginfo_for_save`.
/// @param format          libsox file type; required.
/// @param encoding        Optional encoding name forwarded to
///                        `get_encodinginfo_for_save`.
/// @param bits_per_sample Optional bit depth forwarded to
///                        `get_encodinginfo_for_save`.
/// @throws std::runtime_error if `format` is missing, if a format-specific
///         constraint (mono-only for amr-nb/htk/gsm, 8 kHz for gsm) is
///         violated, or if the memory stream cannot be opened.
void save_audio_fileobj(
    py::object fileobj,
    torch::Tensor tensor,
    int64_t sample_rate,
    bool channels_first,
    c10::optional<double> compression,
    c10::optional<std::string> format,
    c10::optional<std::string> encoding,
    c10::optional<int64_t> bits_per_sample) {
  validate_input_tensor(tensor);

  if (!format.has_value()) {
    throw std::runtime_error(
        "`format` is required when saving to file object.");
  }
  // `format` is a by-value parameter that outlives every use of `filetype`,
  // so a reference avoids an unnecessary string copy.
  const std::string& filetype = format.value();

  // Format-specific constraints. For gsm the channel check precedes the
  // sample-rate check.
  if (filetype == "amr-nb" || filetype == "htk" || filetype == "gsm") {
    require_single_channel(tensor, channels_first, filetype);
  }
  if (filetype == "gsm" && sample_rate != 8000) {
    throw std::runtime_error(
        "gsm format only supports a sampling rate of 8kHz.");
  }

  const auto signal_info =
      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
  const auto encoding_info = get_encodinginfo_for_save(
      filetype,
      tensor.dtype(),
      compression,
      std::move(encoding),
      bits_per_sample);

  AutoReleaseBuffer buffer;

  SoxFormat sf(sox_open_memstream_write(
      &buffer.ptr,
      &buffer.size,
      &signal_info,
      &encoding_info,
      filetype.c_str(),
      /*oob=*/nullptr));

  if (static_cast<sox_format_t*>(sf) == nullptr) {
    throw std::runtime_error(
        "Error saving audio file: failed to open memory stream.");
  }

  torchaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
      /*output_encoding=*/sf->encoding);
  chain.addInputTensor(&tensor, sample_rate, channels_first);
  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
  chain.run();

  // Closing the sox_format_t is necessary for flushing the last chunk to the
  // buffer
  sf.close();
  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
}

} // namespace torchaudio::sox_io

torchaudio/csrc/pybind/sox/io.cpp (134 lines of code) (raw):