in torchbenchmark/models/demucs/demucs/audio.py [0:0]
def read(self,
seek_time=None,
duration=None,
streams=slice(None),
samplerate=None,
channels=None,
temp_folder=None):
"""
Slightly more efficient implementation than stempeg,
in particular, this will extract all stems at once
rather than having to loop over one file multiple times
for each stream.
Args:
seek_time (float): seek time in seconds or None if no seeking is needed.
duration (float): duration in seconds to extract or None to extract until the end.
streams (slice, int or list): streams to extract, can be a single int, a list or
a slice. If it is a slice or list, the output will be of size [S, C, T]
with S the number of streams, C the number of channels and T the number of samples.
If it is an int, the output will be [C, T].
samplerate (int): if provided, will resample on the fly. If None, no resampling will
be done. Original sampling rate can be obtained with :method:`samplerate`.
channels (int): if 1, will convert to mono. We do not rely on ffmpeg for that
as ffmpeg automatically scale by +3dB to conserve volume when playing on speakers.
See https://sound.stackexchange.com/a/42710.
Our definition of mono is simply the average of the two channels. Any other
value will be ignored.
temp_folder (str or Path or None): temporary folder to use for decoding.
"""
streams = np.array(range(len(self)))[streams]
single = not isinstance(streams, np.ndarray)
if single:
streams = [streams]
if duration is None:
target_size = None
query_duration = None
else:
target_size = int((samplerate or self.samplerate()) * duration)
query_duration = float((target_size + 1) / (samplerate or self.samplerate()))
with temp_filenames(len(streams)) as filenames:
command = ['ffmpeg', '-y']
command += ['-loglevel', 'panic']
if seek_time:
command += ['-ss', str(seek_time)]
command += ['-i', str(self.path)]
for stream, filename in zip(streams, filenames):
command += ['-map', f'0:{self._audio_streams[stream]}']
if query_duration is not None:
command += ['-t', str(query_duration)]
command += ['-threads', '1']
command += ['-f', 'f32le']
if samplerate is not None:
command += ['-ar', str(samplerate)]
command += [filename]
sp.run(command, check=True)
wavs = []
for filename in filenames:
wav = np.fromfile(filename, dtype=np.float32)
wav = torch.from_numpy(wav)
wav = wav.view(-1, self.channels()).t()
if channels == 1:
# Case 1:
# The caller asked 1-channel audio, but the stream have multiple
# channels, downmix all channels.
# We do mono convertion here as ffmpeg mess up the volume of mono output
# otherwise. See https://sound.stackexchange.com/a/42710.
wav = wav.mean(dim=0, keepdim=True)
elif self.channels() == 1 and channels != 1:
# Case 2:
# The caller asked for multiple channels, but the input file have
# one single channel, replicate the audio over all channels.
wav = wav.as_strided(size=(channels, wav.shape[1]), stride=(0, 1))
elif self.channels() >= channels:
# Case 3:
# The caller asked for multiple channels, and the input file have
# more channels than requested. In that case return the first channels.
wav = wav[:channels, :]
else:
# Case 4: What is a reasonable choice here?
raise ValueError('The input file has less channels than requested')
if target_size is not None:
wav = wav[..., :target_size]
wavs.append(wav)
wav = torch.stack(wavs, dim=0)
if single:
wav = wav[0]
return wav