community-content/vertex_model_garden/model_oss/diffusers/handler.py
def inference(self, data: Any, *args, **kwargs) -> List[Image.Image]:
    """Run the inference.

    Returns a list of PIL images for image tasks, or a list of encoded video
    bytes for the video tasks.
    """
    prompts, images, mask_images = data
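    # Dispatch to the diffusers pipeline loaded for this handler's task.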
    if self.task == TEXT_TO_IMAGE:
        predicted_images = self.pipeline(prompt=prompts).images
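    # Image-to-image uses the input image as the starting point for denoising.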
    elif self.task == IMAGE_TO_IMAGE:
        predicted_images = self.pipeline(prompt=prompts, image=images).images
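    # Inpainting regenerates only the regions selected by the mask images.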
    elif self.task == IMAGE_INPAINTING:
        predicted_images = self.pipeline(
            prompt=prompts, image=images, mask_image=mask_images
        ).images
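    # For InstructPix2Pix, the prompt is the editing instruction to apply to
    # the input image.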
    elif self.task == INSTRUCT_PIX2PIX:
        predicted_images = self.pipeline(prompt=prompts, image=images).images
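    # For ControlNet, the image is the conditioning input (e.g. edges or pose).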
    elif self.task == CONTROLNET:
        predicted_images = self.pipeline(
            prompt=prompts, image=images, num_inference_steps=20
        ).images
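    # For conditioned super-resolution, the image is the low-resolution input
    # to upscale.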
    elif self.task == CONDITIONED_SUPER_RES:
        predicted_images = self.pipeline(
            prompt=prompts, image=images, num_inference_steps=20
        ).images
    elif self.task == TEXT_TO_VIDEO_ZERO_SHOT:
        # Generate a short video for each prompt. The pipeline does not yet
        # support multiple prompts in a single run, so iterate over them.
        videos = []
        for prompt in prompts:
            numpy_arrays = self.pipeline(prompt=prompt).images
            # Frames come back as floats in [0, 1]; scale to uint8 before
            # encoding the video.
            numpy_arrays = [(i * 255).astype("uint8") for i in numpy_arrays]
            videos.append(frames_to_video_bytes(numpy_arrays, fps=4))
        return videos
    elif self.task == TEXT_TO_VIDEO:
        predicted_images = np.asarray(self.pipeline(prompt=prompts).frames)
        # For multiple prompts, the model concatenates video frames along the
        # width axis, i.e. the output shape is
        # (num_frames, height, width * len(prompts), channels). Therefore we
        # need to split the output into one video per prompt.
        predicted_images = np.array_split(predicted_images, len(prompts), axis=2)
        videos = [
            frames_to_video_bytes(images, fps=8) for images in predicted_images
        ]
        return videos
    else:
        raise ValueError(f"Invalid TASK: {self.task}")
    return predicted_images