in optimum/intel/openvino/modeling.py [0:0]
def forward(self, input_ids, pixel_values, attention_mask: Optional[torch.Tensor] = None, **kwargs):
    # Make sure the OpenVINO model is compiled before running inference.
    self.compile()

    # Detect whether the caller passed numpy arrays or torch tensors, so the
    # outputs can be returned in the same flavor as the inputs.
    np_inputs = isinstance(input_ids, np.ndarray)
    if not np_inputs:
        input_ids = input_ids.cpu().numpy()
        pixel_values = pixel_values.cpu().numpy()
        attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask

    inputs = {"input_ids": input_ids, "pixel_values": pixel_values}

    # Add the attention_mask only when the exported model expects it,
    # defaulting to an all-ones mask (no padding) if none was provided.
    if "attention_mask" in self.input_names:
        inputs["attention_mask"] = attention_mask if attention_mask is not None else np.ones_like(input_ids)

    outputs = self._inference(inputs)

    # Convert the outputs back to torch tensors on the model device when the
    # inputs were torch tensors; otherwise pass the numpy arrays through.
    logits_per_image = (
        torch.from_numpy(outputs["logits_per_image"]).to(self.device)
        if not np_inputs
        else outputs["logits_per_image"]
    )
    logits_per_text = (
        torch.from_numpy(outputs["logits_per_text"]).to(self.device)
        if not np_inputs
        else outputs["logits_per_text"]
    )
    text_embeds = (
        torch.from_numpy(outputs["text_embeds"]).to(self.device) if not np_inputs else outputs["text_embeds"]
    )
    image_embeds = (
        torch.from_numpy(outputs["image_embeds"]).to(self.device) if not np_inputs else outputs["image_embeds"]
    )

    return CLIPOutput(
        logits_per_image=logits_per_image,
        logits_per_text=logits_per_text,
        text_embeds=text_embeds,
        image_embeds=image_embeds,
    )
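
A minimal usage sketch (not part of the source file), assuming this forward belongs to optimum.intel's OVModelForZeroShotImageClassification wrapping a CLIP-style checkpoint and that an AutoProcessor prepares the inputs; the checkpoint name and image path are illustrative:

import numpy as np
from PIL import Image
from transformers import AutoProcessor
from optimum.intel import OVModelForZeroShotImageClassification

# Illustrative checkpoint; any CLIP-style model exported to OpenVINO
# should exercise the same forward path.
model_id = "openai/clip-vit-base-patch32"
processor = AutoProcessor.from_pretrained(model_id)
model = OVModelForZeroShotImageClassification.from_pretrained(model_id, export=True)

image = Image.open("cat.png")
labels = ["a photo of a cat", "a photo of a dog"]

# Torch path: torch tensors in, torch tensors out (moved to model.device).
inputs = processor(text=labels, images=image, return_tensors="pt")
outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=1)

# Numpy path: numpy arrays in, numpy arrays out, no torch round-trip.
np_inputs = processor(text=labels, images=image, return_tensors="np")
np_outputs = model(**np_inputs)
np_logits = np_outputs.logits_per_image

The branch on np_inputs is what lets both calls above go through the same method: the conversion to numpy only happens for torch callers, and the same flag decides whether the OpenVINO results are wrapped back into torch tensors.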