optimum/habana/transformers/models/idefics2/image_processing_idefics2.py (54 lines of code) (raw):

# coding=utf-8 # Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Iterable, List, Optional, Union import numpy as np from transformers.image_processing_utils import BatchFeature from transformers.image_utils import ChannelDimension, infer_channel_dimension_format from transformers.models.idefics2.image_processing_idefics2 import ( Idefics2ImageProcessor, get_max_height_width, make_pixel_mask, ) from transformers.utils import TensorType class Gaudi2Idefics2ImageProcessor(Idefics2ImageProcessor): def pad( self, images: List[np.ndarray], constant_values: Union[float, Iterable[float]] = 0, return_pixel_mask: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> BatchFeature: """ Inherits from Idefics2ImageProcessor::pad https://github.com/huggingface/transformers/blob/v4.43.4/src/transformers/models/idefics2/image_processing_idefics2.py#L314 The only differences are: - pad size use longest_edge, so the image size will not change, aims to accelerate finetune speed """ if getattr(self, "pad_to_longest_edge", False): pad_size = (self.size["longest_edge"], self.size["longest_edge"]) else: pad_size = get_max_height_width(images, input_data_format=input_data_format) batch_size = len(images) max_num_images = max(len(images_) for images_ in images) input_data_format = ( infer_channel_dimension_format(images[0][0]) if input_data_format is None else input_data_format ) data_format = input_data_format if data_format is None else data_format def empty_image(size, input_data_format): if input_data_format == ChannelDimension.FIRST: return np.zeros((3, *size), dtype=np.uint8) elif input_data_format == ChannelDimension.LAST: return np.zeros((*size, 3), dtype=np.uint8) raise ValueError("Invalid channel dimension format.") padded_images_list = [ [empty_image(pad_size, data_format) for _ in range(max_num_images)] for _ in range(batch_size) ] padded_masks = [[np.zeros(pad_size) for _ in range(max_num_images)] for _ in range(batch_size)] for batch_idx in range(batch_size): for sample_idx, image in enumerate(images[batch_idx]): padded_images_list[batch_idx][sample_idx] = self._pad_image( image, pad_size, constant_values=constant_values, data_format=data_format, input_data_format=input_data_format, ) padded_masks[batch_idx][sample_idx] = make_pixel_mask( image, output_size=pad_size, input_data_format=input_data_format ) padded_masks = padded_masks if return_pixel_mask else None return padded_images_list, padded_masks