Excerpt: the `forward()` method from `optimum/intel/openvino/modeling.py`.


    def forward(self, input_ids, pixel_values, attention_mask: Optional[torch.Tensor] = None, **kwargs):
        """Run the compiled CLIP model on text/image inputs.

        Accepts either numpy arrays or torch tensors; the outputs are returned
        in the same framework as the inputs (numpy in -> numpy out, torch in ->
        torch tensors moved to ``self.device``).

        Args:
            input_ids: Tokenized text input.
            pixel_values: Preprocessed image input.
            attention_mask: Optional text attention mask; when the compiled
                model expects one and none is given, an all-ones mask is used.

        Returns:
            CLIPOutput with ``logits_per_image``, ``logits_per_text``,
            ``text_embeds`` and ``image_embeds``.
        """
        self.compile()

        # Remember which framework the caller used so outputs can match it.
        inputs_are_numpy = isinstance(input_ids, np.ndarray)
        if not inputs_are_numpy:
            input_ids = input_ids.cpu().numpy()
            pixel_values = pixel_values.cpu().numpy()
            if attention_mask is not None:
                attention_mask = attention_mask.cpu().numpy()

        model_inputs = {"input_ids": input_ids, "pixel_values": pixel_values}
        # Feed attention_mask only when the compiled model declares that input.
        if "attention_mask" in self.input_names:
            model_inputs["attention_mask"] = (
                np.ones_like(input_ids) if attention_mask is None else attention_mask
            )

        raw_outputs = self._inference(model_inputs)

        def _as_caller_type(key):
            # Numpy callers get the raw array; torch callers get a device tensor.
            value = raw_outputs[key]
            if inputs_are_numpy:
                return value
            return torch.from_numpy(value).to(self.device)

        return CLIPOutput(
            logits_per_image=_as_caller_type("logits_per_image"),
            logits_per_text=_as_caller_type("logits_per_text"),
            text_embeds=_as_caller_type("text_embeds"),
            image_embeds=_as_caller_type("image_embeds"),
        )