in optimum/intel/openvino/modeling_visual_language.py [0:0]
def get_vision_embeddings(self, pixel_values, input_ids, **kwargs):
    # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0-SmolVLM-2/src/transformers/models/smolvlm/modeling_smolvlm.py#L899-L942
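    # Decode step with cached keys/values: vision features were already computed during prefill, so skip re-encoding.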
    if input_ids is not None and input_ids.shape[1] == 1 and kwargs.get("past_key_values") is not None:
        return None
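    # Fold the image dimension into the batch so each image is encoded independently by the vision tower.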
    batch_size, num_images, num_channels, height, width = pixel_values.shape
    pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
    # Remove padding images - padding images are full 0.
    nb_values_per_image = pixel_values.shape[1:].numel()
    real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
    if not any(real_images_inds):
        # no images, leave one empty image.
        real_images_inds[0] = True
    pixel_values = pixel_values[real_images_inds].contiguous()
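    # An optional pixel_attention_mask marks which pixels belong to real image content rather than padding.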
    pixel_attention_mask = kwargs.get("pixel_attention_mask")
    # Handle the vision attention mask
    if pixel_attention_mask is None:
        pixel_attention_mask = torch.ones(
            size=[pixel_values.shape[i] for i in (0, 2, 3)],
            dtype=torch.bool,
            device=pixel_values.device,
        )
    else:
        # Remove padding images from the mask
        pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
        pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
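    # Downsample the pixel-level mask to the patch grid: a patch is kept if it contains at least one unmasked pixel.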
    patch_size = self.config.vision_config.patch_size
    num_patches_per_side = self.config.vision_config.image_size // patch_size
    patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
    patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
    patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
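    # For each image, bucketize the fractional patch coordinates onto the fixed num_patches_per_side grid,
    # so images covering fewer patches still index positions spread across the full positional-embedding table.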
    batch_size_, _, max_im_h, max_im_w = pixel_values.shape
    max_nb_patches_h, max_nb_patches_w = max_im_h // patch_size, max_im_w // patch_size
    boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side)
    position_ids = torch.full(size=(batch_size_, max_nb_patches_h * max_nb_patches_w), fill_value=0)
    for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
        nb_patches_h = p_attn_mask[:, 0].sum()
        nb_patches_w = p_attn_mask[0].sum()
        fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
        fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
        bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
        bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
        pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten()
        position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
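    # Encode the filtered images with the patch mask and position ids; the last hidden state holds the patch embeddings.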
    return self.vision_embeddings(
        pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, patch_position_ids=position_ids
    ).last_hidden_state
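
# Standalone sketch (not part of the file above): a minimal, self-contained illustration of the
# torch.bucketize position-id scheme used in the loop of get_vision_embeddings. The grid sizes below
# are illustrative assumptions, not values taken from any model config.
import torch

num_patches_per_side = 8            # side length of the full positional-embedding grid (assumed)
nb_patches_h, nb_patches_w = 6, 4   # patches actually covered by the unpadded image (assumed)

boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side)
fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten()
print(pos_ids.shape)  # torch.Size([24]) -> one position id per valid patch, row-major on the 8 x 8 grid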