in ultravox/model/ultravox_model.py [0:0]
def init_latency_mask(
    self, audio_latency_block_size: int, dtype: torch.dtype
) -> None:
    """Build and register the block lower-triangular additive mask.

    The mask covers ``self.max_context_length`` positions split into
    contiguous blocks of ``audio_latency_block_size``. Entry ``[i, j]`` is
    ``0`` when the block containing column ``j`` is not later than the block
    containing row ``i``, and ``torch.finfo(dtype).min`` otherwise. The result
    has shape ``(1, 1, max_context_length, max_context_length)`` and is stored
    as the non-persistent buffer ``audio_streaming_mask``.

    Args:
        audio_latency_block_size: Number of positions per block. ``None``
            disables the mask: ``self.audio_streaming_mask`` is set to
            ``None`` and nothing else happens.
        dtype: Floating dtype whose most negative finite value fills the
            masked-out entries. The tensor itself is created by
            ``torch.ones`` without an explicit dtype, so it uses torch's
            default floating dtype rather than ``dtype``.

    Raises:
        ValueError: If ``audio_latency_block_size`` is zero or negative.
        AssertionError: If ``self.max_context_length`` is not positive or is
            not a multiple of ``audio_latency_block_size``.
    """
    if audio_latency_block_size is None:
        self.audio_streaming_mask = None
        return

    # Reject non-positive sizes up front: 0 would otherwise surface as a bare
    # ZeroDivisionError from the modulo below, and a negative divisor can pass
    # the divisibility check and then fail inside torch.ones with an opaque
    # negative-dimension error.
    if audio_latency_block_size <= 0:
        raise ValueError(
            "audio_latency_block_size must be a positive integer, "
            f"got {audio_latency_block_size}"
        )

    # Use max_context_length directly in the calculation
    max_seqlen = self.max_context_length
    assert (
        max_seqlen > 0
    ), f"maximum sequence length must be positive, got {max_seqlen}"
    assert (
        max_seqlen % audio_latency_block_size == 0
    ), f"audio_latency_block_size {audio_latency_block_size} must divide {max_seqlen} evenly."

    # Given the block size, we calculate number of blocks.
    audio_latency_nblocks = max_seqlen // audio_latency_block_size

    # Block-level lower-triangular pattern (1 = visible), then every entry is
    # expanded to a block_size x block_size tile along both axes.
    block_visibility = torch.tril(
        torch.ones(audio_latency_nblocks, audio_latency_nblocks),
        diagonal=0,
    )
    visibility = block_visibility.repeat_interleave(
        audio_latency_block_size, dim=0
    ).repeat_interleave(audio_latency_block_size, dim=1)

    # Turn the 1/0 visibility map into an additive mask: 0 where visible,
    # the most negative finite value of `dtype` where hidden.
    audio_streaming_mask = (1.0 - visibility) * torch.finfo(dtype).min

    # Add two leading singleton dims so the mask can broadcast.
    # NOTE(review): presumably these are the batch and head dims of the
    # attention scores -- confirm against the consumer of this buffer.
    audio_streaming_mask = audio_streaming_mask[None, None, :, :]
    self.register_buffer(
        "audio_streaming_mask", audio_streaming_mask, persistent=False
    )