# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import dataclasses
import re
import sys
import traceback
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
from collections import defaultdict

from megatron_patch.data.image_processing import get_visual_transform
import numpy as np
import torch
from torchvision import transforms as T
import json

from megatron.energon import (
    Batch,
    DefaultTaskEncoder,
    VQASample,
)

from megatron_patch.data.energon.chatml import ChatMLSample

from megatron.training import get_args
from megatron_patch.tokenizer import get_tokenizer

# Type for an encoded sample, produced by encode_sample() and consumed by batch()
@dataclass
class ImageTaskSample:
    __key__: str
    __subflavors__: Dict
    
    imgs: List[np.ndarray]  # each (n_patches, c * temporal_patch_size * patch_size ** 2)
    videos: List[np.ndarray]  # each (n_patches, c * temporal_patch_size * patch_size ** 2)

    image_thw_grids: np.ndarray
    video_thw_grids: np.ndarray
    image_input_mask: np.ndarray
    video_input_mask: np.ndarray
    second_per_grid_ts: np.ndarray # (n_videos, )

    text: np.ndarray
    target: np.ndarray

# Type for the resulting batch data after batch()
@dataclass
class VQATaskBatch(Batch):
    __keys__: List[str]
    __subflavors__: List[Dict]
    # flattened visual patches, (total_patches, c * temporal_patch_size * patch_size ** 2)
    imgs: torch.Tensor
    videos: torch.Tensor
    image_thw_grids: torch.Tensor
    video_thw_grids: torch.Tensor
    image_input_mask: torch.Tensor
    video_input_mask: torch.Tensor
    second_per_grid_ts: torch.Tensor  # (n_videos,), seconds per temporal grid, read from sample metadata when available

    # (n, seq_len)
    text: torch.Tensor
    # (n, seq_len)
    target: torch.Tensor

class InternalWarning(Warning):
    ...

def convert_to_qwen2vl_content(
        user_input: str, 
        image_pattern: str = '<image>',
        video_pattern: str = '<video>'
    ):
    """
        Split user input into format Qwen2VL tokenizer accepts.
    """
    pattern = r"({image}|{video})".format(image=image_pattern, video=video_pattern)
    contents = []
    cur = 0
    mm_idx = defaultdict(int)
    for matched in re.finditer(pattern, user_input):
        start, end = matched.span()
        if start > cur:
            contents.append({
                "type": "text",
                "text": user_input[cur:start].strip()
            })

        mm_type = matched.group(0)[1:-1]  # '<image>' -> 'image', '<video>' -> 'video'
        contents.append({
            "type": mm_type,
            mm_type: str(mm_idx[mm_type])
        })

        cur = end
        mm_idx[mm_type] += 1

    if cur < len(user_input):
        contents.append({
            "type": "text",
            "text": user_input[cur:].strip()
        })

    return contents
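
# Illustrative example (assuming the default '<image>' / '<video>' patterns):
#   convert_to_qwen2vl_content("Describe <image> briefly.")
#   -> [{'type': 'text', 'text': 'Describe'},
#       {'type': 'image', 'image': '0'},
#       {'type': 'text', 'text': 'briefly.'}]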

class TaskEncoder(DefaultTaskEncoder[Union[VQASample, ChatMLSample], ImageTaskSample, VQATaskBatch, dict]):
    """A simple task encoder for captioning."""

    def __init__(
        self,
    ):
        # Specify the batch_type for default batching (batching is performed here "manually" by
        # overwriting the `batch` method)
        super().__init__()

        self.args = get_args()

        self.tokenizer = get_tokenizer()
        
        self.temporal_patch_size = self.args.temporal_patch_size
        self.merge_size = self.args.spatial_merge_size
        self.patch_size = self.args.patch_size

        self.seq_len = self.args.max_padding_length

    def encode_sample(self, sample: Union[VQASample, ChatMLSample]):
        if isinstance(sample, VQASample):
            is_llava_training = sample.__subflavors__.get('is_llava_training', False)
            if is_llava_training:
                raise NotImplementedError('Sample format not supported')
            else:
                yield self.encode_vqa(sample)
        elif isinstance(sample, ChatMLSample):
            yield self.encode_chatml(sample)
        else:
            raise NotImplementedError('Sample format not supported')

    def _flatten_visual_inputs(self, visuals, is_image: bool = True):
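        """Convert visual inputs into flattened patch sequences plus their (t, h, w) grids.

        Shape sketch (illustrative, assuming patch_size=14, merge_size=2 and
        temporal_patch_size=2): a single 3x224x224 image is repeated to
        (2, 3, 224, 224), giving grid (t, h, w) = (1, 16, 16) and a flattened
        patch array of shape (256, 3 * 2 * 14 * 14) = (256, 1176).
        """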
        flattened = []
        thw_grids = []
        for visual in visuals:
            if is_image:
                resized_height, resized_width = visual.shape[-2:]
                patches = np.tile(np.array(visual), (self.temporal_patch_size, 1, 1, 1))
            else:
                assert len(visual) % self.temporal_patch_size == 0
                patches = np.array(visual)
                resized_height, resized_width = patches.shape[-2:]

            channel = patches.shape[1]
            grid_t = patches.shape[0] // self.temporal_patch_size
            grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
            patches = patches.reshape(
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            )
            patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)      
            flatten_patches = patches.reshape(
                grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
            )     
            flattened.append(flatten_patches)
            thw_grids.append((grid_t, grid_h, grid_w))
        return flattened, np.array(thw_grids)

    def encode_chatml(self, sample: ChatMLSample):
        # TODO: modify get_visual_transform to add more augmentations
        imgs = [get_visual_transform(img)[0] for img in sample.imgs]
        videos = [[get_visual_transform(frame)[0] for frame in video] for video in sample.videos]
        # NOTE: truncate each video to a multiple of temporal_patch_size frames,
        # as required by the divisibility assert in _flatten_visual_inputs
        for i, video in enumerate(videos):
            n_frames = len(video) // self.temporal_patch_size * self.temporal_patch_size
            videos[i] = video[:n_frames]

        # NOTE: flatten all images
        flattened_imgs, image_thw_grids = self._flatten_visual_inputs(imgs, is_image=True)
        flattened_videos, video_thw_grids = self._flatten_visual_inputs(videos, is_image=False)

        # NOTE: generate qwen2vl conversations
        conversation = json.loads(sample.conversation) if isinstance(sample.conversation, (str, bytes)) else sample.conversation
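        # Fallback temporal spacing used when the conversation metadata provides no
        # per-video timing; overridden by 'second_per_grid_ts' from the metadata below.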
        second_per_grid_ts = [1 / 2.0] * len(video_thw_grids)
        if 'conversations' in conversation:
            second_per_grid_ts = conversation.get('second_per_grid_ts', second_per_grid_ts)
            second_per_grid_ts = [float(i) for i in second_per_grid_ts]
            conversation = conversation['conversations']
 
        role_key = 'from' if 'from' in conversation[0] else 'role'
        content_key = 'value' if 'from' in conversation[0] else 'content'
        
        # NOTE: assume the conversation format is: [System]? (User Assistant)+
        converted_conversation = []
        if len(conversation) % 2 == 0:
            # Default Prompt
            converted_conversation.append({
                'role': 'system',
                'content': 'You are a helpful assistant.'
            })
        else:
            converted_conversation.append({
                'role': 'system',
                'content': conversation[0][content_key]
            })
            conversation = conversation[1:]
        
        EXPECTED_ROLE = ['human', 'gpt']
        for turn_idx, turn in enumerate(conversation):
            role = turn[role_key]
            if role != EXPECTED_ROLE[turn_idx % len(EXPECTED_ROLE)]:
                raise InternalWarning(f"Expect conversation organized in order: [sys] human gpt human gpt..., but got role '{role}' in turn {turn_idx}")
            content = turn[content_key]

            if role == 'human':
                role = 'user'
                content = convert_to_qwen2vl_content(content)
            elif role == 'gpt':
                role = 'assistant'
            
            converted_conversation.append({
                'role': role,
                'content': content
            })
        conversation = converted_conversation

        # NOTE: we need to mask all system/user input tokens and assistant generation prefix tokens
        input_ids = self.tokenizer.apply_chat_template(conversation, tokenize=True, return_tensors="np")[0]
        target = input_ids.copy()

        system_prompt_prefix = len(self.tokenizer.apply_chat_template([conversation[0]], tokenize=True))
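        # Number of tokens in the assistant generation prefix ("<|im_start|>assistant\n");
        # 3 is an assumption that matches the Qwen2-VL chat template tokenization.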
        assistant_generation_prefix = 3
        pad_token_id = self.tokenizer.pad_token_id

        target[:system_prompt_prefix] = pad_token_id
        offset = system_prompt_prefix
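        # Re-tokenize each turn on its own; this assumes the chat template prepends a
        # system prompt of the same token length to a lone turn, which the equality
        # check below verifies (mismatching samples are dropped).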
        for turn_idx, turn in enumerate(conversation[1:]):
            turn_tokens = self.tokenizer.apply_chat_template([turn], tokenize=True, return_tensors="np")[0]
            turn_content = turn_tokens[system_prompt_prefix:]
            n_tokens = len(turn_content)
            if (target[offset: offset + n_tokens] != turn_content).any():
                raise InternalWarning("Per-turn tokenization does not match the full conversation encoding, dropped...")

            if turn['role'] == 'user':
                target[offset: offset + n_tokens] = pad_token_id
            elif turn['role'] == 'assistant':
                target[offset: offset + assistant_generation_prefix] = pad_token_id   
            offset += n_tokens

        # NOTE: expand image_pad & video_pad
        merge_length = self.merge_size**2
        image_token_id, video_token_id = self.tokenizer.encode(['<|image_pad|>', '<|video_pad|>'])

        image_token_indices = np.where(input_ids == image_token_id)[0]
        assert len(image_token_indices) == len(image_thw_grids), f"Got {len(image_thw_grids)} images in the sample, but {len(image_token_indices)} image placeholders!"
        video_token_indices = np.where(input_ids == video_token_id)[0]
        assert len(video_token_indices) == len(video_thw_grids), f"Got {len(video_thw_grids)} videos in the sample, but {len(video_token_indices)} video placeholders!"
        image_thw_grids, video_thw_grids = np.array(image_thw_grids, dtype=np.int64), np.array(video_thw_grids, dtype=np.int64)

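        # Each placeholder expands to prod((t, h, w)) // merge_length patch tokens,
        # e.g. a (1, 16, 16) grid with merge_size=2 becomes 256 // 4 = 64 tokens;
        # subtracting shape[0] removes the one placeholder token per visual.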
        target_length = (
            input_ids.shape[0] 
            - image_thw_grids.shape[0] + image_thw_grids.prod(axis=-1).sum() // merge_length
            - video_thw_grids.shape[0] + video_thw_grids.prod(axis=-1).sum() // merge_length
        )
        if target_length > self.seq_len:
            raise InternalWarning(f"Long sequence with length {target_length} found, dropped...")
        final_input_ids = np.zeros(target_length, dtype=input_ids.dtype)
        final_input_masks = final_input_ids.copy()

        image_idx, video_idx = 0, 0
        indices = np.sort(np.concatenate([image_token_indices, video_token_indices]))

        cur_x, cur_y = 0, 0
        for idx in indices:
            token_id = input_ids[idx]
            if token_id == image_token_id:
                size = image_thw_grids[image_idx].prod() // merge_length
                image_idx += 1
            elif token_id == video_token_id:
                size = video_thw_grids[video_idx].prod() // merge_length
                video_idx += 1
            # NOTE:
            # input_ids[cur_x:idx] -> final_input_ids[cur_y:cur_y + idx - cur_x]
            # input_ids[idx] -> final_input_ids[cur_y + idx - cur_x: cur_y + idx - cur_x + size]
            final_input_ids[cur_y: cur_y + idx - cur_x] = input_ids[cur_x:idx]
            final_input_masks[cur_y: cur_y + idx - cur_x] = target[cur_x:idx]
            cur_y += idx - cur_x
            final_input_ids[cur_y: cur_y + size] = token_id
            final_input_masks[cur_y: cur_y + size] = pad_token_id
            cur_y += size
            cur_x = idx + 1
        
        if cur_x < len(input_ids):
            final_input_ids[cur_y:] = input_ids[cur_x:]
            final_input_masks[cur_y:] = target[cur_x:]

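        # Shift labels left by one for next-token prediction; the last position has
        # no successor token, so it is masked with the pad id.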
        target = np.roll(final_input_masks, shift=-1)
        target[-1] = pad_token_id

        if (target == pad_token_id).all():
            raise InternalWarning("Sample with all masked label, dropped.")

        image_input_mask = final_input_ids == self.tokenizer.image_token_id
        video_input_mask = final_input_ids == self.tokenizer.video_token_id
        # collect data
        return ImageTaskSample(
            __key__=sample.__key__,
            __subflavors__=sample.__subflavors__,
            imgs=flattened_imgs,
            videos=flattened_videos,

            image_thw_grids=image_thw_grids,
            video_thw_grids=video_thw_grids,
            second_per_grid_ts=np.array(second_per_grid_ts, dtype=np.float32),

            image_input_mask=image_input_mask,
            video_input_mask=video_input_mask,

            text=final_input_ids,
            target=target,
        )
    
    def encode_vqa(self, sample: VQASample):
        augment = sample.__subflavors__.get('augmentation', False)
        has_video = sample.__subflavors__.get('has_video', False)

        if has_video:
            raise NotImplementedError("You should use sharegpt dataset to train with videos.")
        else:
            # TODO: add args
            imgs = get_visual_transform(sample.image)
            flatten_patches, thw_grids = self._flatten_visual_inputs(imgs, is_image=True)

        # NOTE: we expect the context to be a string containing an <image> placeholder
        assert "<image>" in sample.context
        if isinstance(sample.answers, list):
            answer_list = sample.answers
            weight_list = np.array(sample.answer_weights).astype(np.float32)
            weight_list = weight_list / np.sum(weight_list)
            answer_idx = np.random.choice(weight_list.shape[0], 1, p=weight_list)[0]
            answer = answer_list[answer_idx]
        else:
            answer = sample.answers

        conversation = [
            {"role": "user", "content": convert_to_qwen2vl_content(sample.context)},
            {"role": "assistant", "content": answer},
        ]

        user_inputs = self.tokenizer.apply_chat_template(conversation[:-1], tokenize=False)
        text = self.tokenizer.apply_chat_template(conversation, tokenize=False)

        # replace each <|image_pad|> token with (t * h * w // merge_length) copies
        merge_length = self.merge_size**2
        image_token = '<|image_pad|>'
        assert len(thw_grids) == 1, "Only one image per sample is supported!"
        index = 0
        while image_token in text:
            grid_t, grid_h, grid_w = thw_grids[index]
            num_patches = grid_t * grid_h * grid_w
            text = text.replace(
                image_token, "<|placeholder|>" * (num_patches // merge_length), 1
            )
            user_inputs = user_inputs.replace(
                image_token, "<|placeholder|>" * (num_patches // merge_length), 1
            )
            index += 1
        text = text.replace("<|placeholder|>", image_token)
        user_inputs = user_inputs.replace("<|placeholder|>", image_token)

        input_ids = self.tokenizer.tokenize(text)
        user_input_ids = self.tokenizer.tokenize(user_inputs)
        if len(input_ids) > self.seq_len:
            raise InternalWarning(f"Long sequence with length {len(input_ids)} found, dropped...")
        
        target = np.array(input_ids[1:] + [self.tokenizer.pad_token_id])
        if len(user_input_ids) >= len(input_ids):
            raise InternalWarning("Sample without assistant response, dropped...")
        # ensure the user input token ids form a prefix of the full text
        if not (np.array(user_input_ids) == np.array(input_ids[:len(user_input_ids)])).all():
            raise InternalWarning("User input is not a prefix of the full text, dropped...")
        # mask input
        target[:len(user_input_ids)-1] = self.tokenizer.pad_token_id

        img_token_id = self.tokenizer.image_token_id
        image_input_mask = np.array(input_ids) == img_token_id

        # collect data
        return ImageTaskSample(
            __key__=sample.__key__,
            __subflavors__=sample.__subflavors__,

            imgs=flatten_patches,
            videos=list(),

            image_thw_grids=thw_grids,
            video_thw_grids=np.empty([0, 3], dtype=np.int64),

            image_input_mask=image_input_mask,
            video_input_mask=None,
            second_per_grid_ts=np.zeros(0, dtype=np.float32),
            
            text=input_ids,
            target=target,
        )

    def batch(self, samples: List[ImageTaskSample]) -> VQATaskBatch:
        # Stack image patches into a single (total_patches, patch_dim) tensor. If there are no images (text-only), use an empty tensor.
        imgs = [img for s in samples for img in s.imgs]
        if len(imgs) > 0:
            imgs = torch.cat([torch.from_numpy(img) for img in imgs])
        else:
            imgs = torch.empty([0, 3 * self.temporal_patch_size * self.patch_size * self.patch_size], dtype=torch.float32)
        
        image_thw_grids = [thw_grids for s in samples for thw_grids in s.image_thw_grids]
        if len(image_thw_grids) > 0:
            image_thw_grids = torch.from_numpy(np.array(image_thw_grids)).long()
            assert image_thw_grids.prod(dim=-1).sum() == imgs.shape[0]
        else:
            image_thw_grids = torch.empty([0, 3], dtype=torch.long)
        
        # Stack video patches into a single (total_patches, patch_dim) tensor. If there are no videos (text-only), use an empty tensor.
        videos = [video for s in samples for video in s.videos]
        if len(videos) > 0:
            videos = torch.cat([torch.from_numpy(video) for video in videos])
        else:
            videos = torch.empty([0, 3 * self.temporal_patch_size * self.patch_size * self.patch_size], dtype=torch.float32)
        
        second_per_grid_ts = [second_per_grid for s in samples for second_per_grid in s.second_per_grid_ts]
        if len(second_per_grid_ts) > 0:
            second_per_grid_ts = torch.from_numpy(np.array(second_per_grid_ts)).float()
        else:
            second_per_grid_ts = torch.empty([0, ], dtype=torch.float32)
        
        video_thw_grids = [thw_grids for s in samples for thw_grids in s.video_thw_grids]
        if len(video_thw_grids) > 0:
            video_thw_grids = torch.from_numpy(np.array(video_thw_grids)).long()
            assert video_thw_grids.prod(dim=-1).sum() == videos.shape[0]
        else:
            video_thw_grids = torch.empty([0, 3], dtype=torch.long)

        # If the user hasn't defined a target sequence length, then use the max along the sample lengths.
        max_seq_len = self.seq_len
        if not max_seq_len:
            max_seq_len = max(len(s.text) for s in samples)

        text_mat = np.full((len(samples), max_seq_len), self.tokenizer.pad_token_id, dtype=np.int64)
        target_mat = np.full((len(samples), max_seq_len), self.tokenizer.pad_token_id, dtype=np.int64)
        
        image_input_masks = np.zeros_like(text_mat, dtype=bool)
        video_input_masks = np.zeros_like(text_mat, dtype=bool)
        for i, s in enumerate(samples):
            # If the sample/target length exceeds the target sequence length, then truncate.
            text_len = min(max_seq_len, len(s.text))
            target_len = min(max_seq_len, len(s.target))

            text_mat[i, :text_len] = np.array(s.text)[:text_len]
            # NOTE: we should ideally assert that the user input prefix is never truncated
            if s.image_input_mask is not None:
                image_input_masks[i, :text_len] = np.array(s.image_input_mask)[:text_len]
            if s.video_input_mask is not None:
                video_input_masks[i, :text_len] = np.array(s.video_input_mask)[:text_len]
            target_mat[i, :target_len] = np.array(s.target)[:target_len]
        
        batch = VQATaskBatch(
            __keys__=[s.__key__ for s in samples],
            __subflavors__=[s.__subflavors__ for s in samples],
            imgs=imgs,
            videos=videos,
            image_thw_grids=image_thw_grids,
            video_thw_grids=video_thw_grids,
            second_per_grid_ts=second_per_grid_ts,
            image_input_mask=torch.from_numpy(image_input_masks),    
            video_input_mask=torch.from_numpy(video_input_masks),
            text=torch.from_numpy(text_mat),
            target=torch.from_numpy(target_mat),
        )

        return batch

    def encode_batch(self, batch: VQATaskBatch) -> dict:
        raw = dataclasses.asdict(batch)
        del raw["__subflavors__"]
        return raw


def print_error_handler(exc: Exception, key: Optional[str], debug=False):
    if not debug and isinstance(exc, InternalWarning):
        return
    print(
        f"The following exception occurred in the dataloader for sample {key} and is skipped",
        file=sys.stderr,
    )
    traceback.print_exc()
