def get_vision_embeddings()

in optimum/intel/openvino/modeling_visual_language.py
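
In outline: the method flattens each image slice into a patch sequence, pads all sequences into a single batch, runs the OpenVINO vision encoder with a patch attention mask and precomputed position ids, resamples the encoder output, and regroups the embeddings per input sample. It returns one entry per sample: a tensor of slice embeddings if the sample contains images, otherwise an empty list. During a single-token decoding step it returns None.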


    def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
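        # Decoding step with a single new token: no new image content to embed.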
        if input_ids is not None and input_ids.shape[1] == 1:
            return None
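        # tgt_sizes gives each image slice's patch grid as (height, width), in patches.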
        tgt_sizes = kwargs["tgt_sizes"]
        pixel_values_list = pixel_values
        vision_hidden_states = []
        all_pixel_values = []
        for pixel_value in pixel_values_list:
            # Flatten each image slice into a (sequence_length, 3 * patch_size) patch
            # sequence so slices of different sizes can be padded into one batch below.
            all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_value])

        vision_embedding = None
        # at least one image slice is present
        if all_pixel_values:
            tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)]
            tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32)

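            # The longest slice, measured in patches (grid height * grid width).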
            max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])

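            # Pad the per-slice patch sequences to a common length, then reshape
            # back to the 4-D (B, 3, -1, L) layout the vision encoder expects.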
            all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0)
            B, L, _ = all_pixel_values.shape
            all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)

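            # Attention mask over patch positions: True for a slice's real patches,
            # False for the padded tail.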
            patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
            for i in range(B):
                patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
            position_ids = self._prepare_vis_position_ids(
                all_pixel_values,
                patch_attn_mask,
                tgt_sizes,
                self.config.vision_config.patch_size,
                self.config.vision_config.image_size // self.config.vision_config.patch_size,
            )
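            # The OpenVINO vision encoder returns numpy arrays; wrap its first
            # output back into a torch tensor for the resampler.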
            vision_embedding = torch.from_numpy(
                self.vision_embeddings(
                    pixel_values=all_pixel_values, patch_attention_mask=patch_attn_mask, position_ids=position_ids
                )[0]
            )
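            # Resample each slice's patch embeddings onto a fixed number of query tokens.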
            vision_embedding = self.resampling(vision_embedding, tgt_sizes)

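            # Split the stacked slice embeddings back into per-sample groups.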
            start = 0
            for pixel_value in pixel_values_list:
                img_cnt = len(pixel_value)
                if img_cnt > 0:
                    vision_hidden_states.append(vision_embedding[start : start + img_cnt])
                    start += img_cnt
                else:
                    vision_hidden_states.append([])
        else:  # no images in the batch
            dummy_feature = []
            for _ in range(len(pixel_values_list)):
                vision_hidden_states.append(dummy_feature)
        return vision_hidden_states
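
To make the flatten → pad → reshape round-trip concrete, below is a minimal self-contained sketch. The (3, patch_size, num_patches * patch_size) slice layout and patch_size = 14 are illustrative assumptions, not values taken from the function above:

    import torch

    # Two fake image slices with patch grids (2, 3) and (1, 4) -- illustrative values.
    # Assumed per-slice layout: (3, patch_size, num_patches * patch_size).
    patch_size = 14
    tgt_sizes = torch.tensor([[2, 3], [1, 4]], dtype=torch.int32)
    slices = [torch.randn(3, patch_size, h * w * patch_size) for h, w in tgt_sizes.tolist()]

    # Same flatten -> pad -> reshape sequence as in get_vision_embeddings.
    seqs = [s.flatten(end_dim=1).permute(1, 0) for s in slices]  # each (L_i, 3 * patch_size)
    padded = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=0.0)
    B, L, _ = padded.shape
    batched = padded.permute(0, 2, 1).reshape(B, 3, -1, L)
    print(batched.shape)  # torch.Size([2, 3, 14, 84]); L = max(h * w) * patch_size

    # The matching attention mask is indexed in patches, not pixel columns,
    # which is why it is max_patches wide rather than L.
    max_patches = int(torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]))
    patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
    for i in range(B):
        patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
    print(patch_attn_mask.sum(dim=-1).flatten())  # tensor([6, 4])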