in optimum/intel/openvino/modeling_visual_language.py [0:0]
def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs):
    """
    Embed ``pixel_values`` and run the vision merger with full and windowed attention masks.

    The method calls, in order, ``self.vision_embeddings``, ``self.rot_pos_emb`` and
    ``self.get_window_index``, builds two additive block-diagonal attention masks, and
    feeds everything to ``self.vision_embeddings_merger``.

    Args:
        pixel_values: Input forwarded unchanged to ``self.vision_embeddings``.
        grid_thw: 2-D integer tensor read column-wise: column 0 is used as a repeat
            count and the product of columns 1 and 2 as a segment length
            (presumably temporal/height/width grid sizes per image -- confirm with caller).
        **kwargs: Accepted for call-site compatibility; not used here.

    Returns:
        The first element of the output of ``self.vision_embeddings_merger``.
    """
    hidden_states = self.vision_embeddings(pixel_values)[0]
    # Both masks are square over the first dimension of the embedded input.
    seq_len = hidden_states.shape[0]

    def _block_diagonal_mask(boundaries):
        # Additive float mask of shape (1, seq_len, seq_len): 0.0 inside every
        # [start, end) diagonal block delimited by consecutive boundaries,
        # -inf everywhere else. Built directly, without a boolean intermediate.
        mask = torch.full((1, seq_len, seq_len), float("-inf"), dtype=torch.float32)
        for start, end in zip(boundaries[:-1], boundaries[1:]):
            mask[..., start:end, start:end] = 0.0
        return mask

    rotary_pos_emb = self.rot_pos_emb(grid_thw)
    window_index, cu_window_seqlens = self.get_window_index(grid_thw)

    # Collapse repeated consecutive boundaries so that no empty window is iterated.
    window_boundaries = torch.unique_consecutive(
        torch.tensor(cu_window_seqlens, dtype=torch.int32)
    ).tolist()

    # Cumulative segment ends: one segment of (col1 * col2) tokens per repeat of col0,
    # with a leading 0 so that consecutive entries delimit each segment.
    cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
        dim=0, dtype=torch.int32
    )
    cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0)
    segment_boundaries = cu_seqlens.tolist()

    causal_mask = _block_diagonal_mask(segment_boundaries)
    window_causal_mask = _block_diagonal_mask(window_boundaries)

    res = self.vision_embeddings_merger(
        pixel_values=hidden_states,
        attention_mask=causal_mask,
        window_attention_mask=window_causal_mask,
        window_index=window_index,
        rotary_pos_emb=rotary_pos_emb,
    )[0]
    return res