def format_messages_v2()

in deepseek_vl2/models/processing_deepseek_vl_v2.py [0:0]


    def format_messages_v2(self, messages, pil_images, systems=None):
        """Tokenize a whole conversation and collect the per-image metadata.

        Plays the role of format_messages_v2 and get_images_info in the last
        version: every message is formatted, tokenized and appended to flat
        per-conversation lists.

        Args:
            messages: sequence of mappings, each read through ``message['role']``
                and ``message['content']``. Accepted roles are ``conv.roles[0]``
                / ``"user"``, ``conv.roles[1]`` / ``"assistant"``, and
                ``'system'`` / ``'deepseekapi-sys'`` (first message only).
            pil_images: sequence of images, consumed in order; each user turn
                takes as many images as its formatted prompt contains
                occurrences of ``self.image_token``.
            systems: accepted for interface compatibility; not used here.

        Returns:
            A 6-tuple of lists
            ``(tokenized_data, masked_tokenized_data, images_list,
            images_seq_mask, images_spatial_crop, num_image_tokens)``.
            ``masked_tokenized_data`` holds the labels: identical to
            ``tokenized_data`` except that user/system tokens are replaced by
            ``self.ignore_id`` when ``self.mask_prompt`` is truthy.
            ``images_seq_mask`` has one entry per token of ``tokenized_data``.

        Raises:
            AssertionError: on an unknown role, a system message that is not
                the first message, an image token inside an assistant reply,
                or inconsistent lengths of the accumulated lists.
        """
        tokenized_data = []
        masked_tokenized_data = []  # labels
        images_list = []
        images_seq_mask = []
        images_spatial_crop = []
        num_image_tokens = []

        # Position of the next unconsumed image in pil_images.
        image_index = 0

        conv = get_conv_template(self.sft_format)
        conv_system_message = conv.system_message

        for idx, message in enumerate(messages):
            if idx == 0:
                # The conversation starts with a single BOS token, which is
                # never an image position.
                tokenized_data += [self.bos_id]
                masked_tokenized_data += [self.bos_id]
                images_seq_mask += [False]
                conv.system_message = conv_system_message
            else:
                # The template's system message is only emitted with the
                # first message.
                conv.system_message = ''

            role = message['role']

            if role == conv.roles[0] or role == "user":
                conv.reset_message()
                conv.append_message(conv.roles[0], str(message['content']).strip())
                conv.append_message(conv.roles[1], '')
                formatted_question = conv.get_prompt()

                # Each image token in the formatted prompt consumes one image.
                n_images = formatted_question.count(self.image_token)
                tokenized_str, images, seq_mask, spatial_crop, n_image_tokens = self.tokenize_with_images(
                    formatted_question,
                    pil_images[image_index: image_index + n_images],
                    bos=False,
                    eos=False,
                    cropping=len(pil_images) <= 2
                )
                image_index += n_images

                tokenized_data += tokenized_str
                if self.mask_prompt:
                    masked_tokenized_data += [self.ignore_id] * len(tokenized_str)
                else:
                    masked_tokenized_data += tokenized_str
                images_list += images
                images_seq_mask += seq_mask
                images_spatial_crop += spatial_crop
                num_image_tokens += n_image_tokens

            elif role == conv.roles[1] or role == "assistant":
                formatted_answer = message['content'].strip()
                assert formatted_answer.count(
                    self.image_token) == 0, f"there should be no {self.image_token} in the assistant's reply, but got {messages}"
                # No images are passed for assistant turns, so only the tokens
                # and their mask are kept.
                tokenized_str, _, seq_mask, _, _ = self.tokenize_with_images(
                    formatted_answer,
                    [],
                    bos=False,
                    eos=True,
                    cropping=len(pil_images) <= 2)

                # Assistant tokens are always kept as labels.
                tokenized_data += tokenized_str
                masked_tokenized_data += tokenized_str
                images_seq_mask += seq_mask

            elif role == 'system' or role == 'deepseekapi-sys':
                # A system message inside `messages` is only allowed as the
                # very first message; in that case the template's own system
                # message no longer takes effect.
                assert idx == 0, 'system information should only exist in the begining of the conversation'
                formatted_system = message['content'].strip()
                tokenized_str = self.encode(formatted_system, bos=False, eos=False)
                tokenized_data += tokenized_str
                if self.mask_prompt:
                    masked_tokenized_data += [self.ignore_id] * len(tokenized_str)
                else:
                    masked_tokenized_data += tokenized_str
                images_seq_mask += [False] * len(tokenized_str)

            else:
                # Raised explicitly so the check survives `python -O`, which
                # strips `assert` statements.
                raise AssertionError(f"Unknown role: {message['role']}")

        # Report the lengths of the lists actually being compared.
        assert len(tokenized_data) == len(
            images_seq_mask), f"format_messages_v2: tokenized_data's length {len(tokenized_data)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
        assert len(images_spatial_crop) == len(num_image_tokens), "image number should be compatible"

        return tokenized_data, masked_tokenized_data, images_list, images_seq_mask, images_spatial_crop, num_image_tokens