in torchrecipes/audio/source_separation/datamodule/utils.py
def __call__(self, samples: SampleType) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
samples (SampleType): The Tuple that contains
sample_rate, mixture waveform, clean waveforms of all speakers.
Returns:
(Tuple(Tensor, Tensor, Tensor)):
The Tensor of mixture speech wavecforms of dimension `[batch, time]`.
The Tensor of clean speech wavecforms of dimension `[batch, num_speaker, time]`.
The Tensor of padding mask of dimension `[batch, time]`.
"""
    if self.duration == -1:
        target_num_frames = max(s[1].shape[-1] for s in samples)
    else:
        target_num_frames = int(self.duration * self.sample_rate)
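    # Bring each sample to exactly target_num_frames (random_start=True
    # presumably selects a random crop offset for clips longer than the target).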
    mixes, srcs, masks = [], [], []
    for sample in samples:
        mix, src, mask = self._fix_num_frames(
            sample, target_num_frames, self.sample_rate, random_start=True
        )
        mixes.append(mix)
        srcs.append(src)
        masks.append(mask)
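    # Stack the per-sample tensors along a new leading batch dimension.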
    return (torch.stack(mixes, 0), torch.stack(srcs, 0), torch.stack(masks, 0))
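
To make the pattern concrete, below is a minimal, self-contained sketch of the pad-or-crop-and-mask collation this method performs. fix_num_frames is a simplified stand-in for the class's _fix_num_frames helper, whose implementation is not shown in this excerpt; the 1-D per-waveform layout is likewise an assumption for illustration.

import torch

def fix_num_frames(waveform, target_num_frames, random_start=True):
    # Simplified stand-in for self._fix_num_frames (not shown above): crop
    # longer clips, zero-pad shorter ones, and return a mask that is 1 over
    # real frames and 0 over padding.
    num_frames = waveform.shape[-1]
    if num_frames >= target_num_frames:
        start = (
            torch.randint(num_frames - target_num_frames + 1, (1,)).item()
            if random_start
            else 0
        )
        out = waveform[..., start : start + target_num_frames]
        mask = torch.ones(target_num_frames)
    else:
        num_padding = target_num_frames - num_frames
        out = torch.nn.functional.pad(waveform, (0, num_padding))
        mask = torch.cat([torch.ones(num_frames), torch.zeros(num_padding)])
    return out, mask

waveforms = [torch.randn(24000), torch.randn(16000)]
target = max(w.shape[-1] for w in waveforms)  # mirrors the duration == -1 branch
fixed = [fix_num_frames(w, target) for w in waveforms]
batch = torch.stack([out for out, _ in fixed], 0)   # [batch, time]
masks = torch.stack([mask for _, mask in fixed], 0) # [batch, time]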