in optimum/intel/openvino/modeling_visual_language.py [0:0]
def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
    # During decoding (a single-token step) the vision features were already
    # computed on the prefill step, so there is nothing to recompute here.
    if input_ids is not None and input_ids.shape[1] == 1:
        return None
    tgt_sizes = kwargs["tgt_sizes"]
    pixel_values_list = pixel_values
    vision_hidden_states = []
    all_pixel_values = []
    img_cnt = []
    # Collect the image slices of every sample into one flat list; each slice is
    # flattened to a (num_patches, channels * patch_size) sequence so that slices
    # of different sizes can be padded into a single batch below.
    for pixel_value in pixel_values_list:
        img_cnt.append(len(pixel_value))
        all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_value])

    vision_embedding = None
    # at least one image is present
    if all_pixel_values:
        tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)]
        tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32)

        max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])

        # Pad the variable-length patch sequences to the longest one and build a
        # boolean attention mask marking the valid (non-padding) patches.
        all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0)
        B, L, _ = all_pixel_values.shape
        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)

        patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
        for i in range(B):
            patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
        # Position ids map every valid patch onto the 2D patch grid of the vision encoder.
        position_ids = self._prepare_vis_position_ids(
            all_pixel_values,
            patch_attn_mask,
            tgt_sizes,
            self.config.vision_config.patch_size,
            self.config.vision_config.image_size // self.config.patch_size,
        )
        # Run the OpenVINO vision encoder (returns numpy) and resample the patch
        # features to the fixed number of query tokens expected by the language model.
        vision_embedding = torch.from_numpy(
            self.vision_embeddings(
                pixel_values=all_pixel_values, patch_attention_mask=patch_attn_mask, position_ids=position_ids
            )[0]
        )
        vision_embedding = self.resampling(vision_embedding, tgt_sizes)

        # Split the stacked embeddings back into one entry per input sample.
        start = 0
        for pixel_value in pixel_values_list:
            img_cnt = len(pixel_value)
            if img_cnt > 0:
                vision_hidden_states.append(vision_embedding[start : start + img_cnt])
                start += img_cnt
            else:
                vision_hidden_states.append([])
    else:  # no image in the batch
        dummy_feature = []
        for _ in range(len(pixel_values_list)):
            vision_hidden_states.append(dummy_feature)
    return vision_hidden_states
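
# Minimal, self-contained sketch (not from the source) of the padding/masking
# pattern used by the method above: variable-length patch sequences are
# right-padded with torch.nn.utils.rnn.pad_sequence and a boolean mask marks
# the real patches. The sizes below (patch_size=14, two dummy images) are
# illustrative assumptions, not values taken from the model config.
import torch

patch_dim = 3 * 14  # channels * assumed patch_size
tgt_sizes = torch.tensor([[4, 6], [8, 8]], dtype=torch.int32)  # (h, w) patches per image
seqs = [torch.randn(int(h * w), patch_dim) for h, w in tgt_sizes]  # per-image patch sequences

padded = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=0.0)
B, L, _ = padded.shape  # L == longest patch sequence (64 here)
mask = torch.zeros((B, 1, L), dtype=torch.bool)
for i in range(B):
    mask[i, 0, : int(tgt_sizes[i, 0] * tgt_sizes[i, 1])] = True

print(padded.shape)                      # torch.Size([2, 64, 42])
print(mask.sum(dim=-1).squeeze(-1))      # tensor([24, 64]) valid patches per image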