vision/smolvlm2/smolvlm/datasets/dataset.py [242:425]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    if not frames:
        raise RuntimeError(f"No frames successfully loaded from '{folder_path}' after sampling.")
        
    return frames, timestamps, duration_seconds


##############################################################################
# Image Loader
##############################################################################
def load_single_image(img_path: str) -> Image.Image:
    # Image.open() is lazy; copy() forces a full decode so the file handle
    # can be closed before the RGB-converted image is returned.
    with Image.open(img_path) as img:
        return img.copy().convert('RGB')


##############################################################################
# Helper Functions for Masking
##############################################################################
def find_global_img_patterns(tokens: List[str]) -> List[int]:
    """
    Finds every 5-token run that spells the '<global-img>' marker
    (tokenized as '<', 'global', '-', 'img', '>') and returns the
    positions of those tokens so they can be masked out of the labels.
    """
    mask_positions = []
    for i in range(len(tokens) - 4):
        if (
            tokens[i] == '<'
            and tokens[i+1] == 'global'
            and tokens[i+2] == '-'
            and tokens[i+3] == 'img'
            and tokens[i+4] == '>'
        ):
            mask_positions.extend([i, i+1, i+2, i+3, i+4])
    return mask_positions


def find_row_col_patterns(tokens: List[str]) -> List[int]:
    """
    Finds every 9-token run that spells a tile marker such as
    '<row_1_col_2>' (tokenized as '<', 'row', '_', '1', '_', 'col',
    '_', '2', '>') and returns the positions of those tokens so they
    can be masked out of the labels.
    """
    # '^'/'$' anchors are redundant with fullmatch(), so the pattern is bare.
    pattern = re.compile(r'< row _ [1-9] _ col _ [1-9] >')
    mask_positions = []
    for i in range(len(tokens) - 8):
        # Join exactly 9 tokens (e.g. '<', 'row', '_', '1', '_', 'col', '_', '1', '>')
        group = tokens[i : i + 9]
        if pattern.fullmatch(" ".join(group)):
            mask_positions.extend(range(i, i + 9))
    return mask_positions
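
# A hedged sketch of both pattern finders on a toy token list (the exact
# token splits are illustrative; real splits depend on the tokenizer):
#
#     toks = ['<', 'global', '-', 'img', '>', 'hello',
#             '<', 'row', '_', '1', '_', 'col', '_', '2', '>']
#     find_global_img_patterns(toks)  # -> [0, 1, 2, 3, 4]
#     find_row_col_patterns(toks)     # -> [6, 7, 8, 9, 10, 11, 12, 13, 14]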


def _search_subsequence(
    sequence: torch.Tensor,
    pattern: List[int],
    start: int = 0
) -> int:
    """
    Searches for the first occurrence of 'pattern' in 'sequence'
    starting at offset 'start'. Returns the index of that occurrence,
    or -1 if not found.
    """
    # Convert the tensor to a plain Python list so slices compare cheaply
    seq_list = sequence.tolist()
    pat_len = len(pattern)
    if pat_len == 0:
        return -1

    # Simple forward search
    for i in range(start, len(seq_list) - pat_len + 1):
        if seq_list[i : i + pat_len] == pattern:
            return i
    return -1
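
# A minimal worked example (values are illustrative):
#
#     seq = torch.tensor([7, 1, 2, 3, 1, 2, 3])
#     _search_subsequence(seq, [1, 2, 3])           # -> 1  (first occurrence)
#     _search_subsequence(seq, [1, 2, 3], start=2)  # -> 4  (search resumes at 2)
#     _search_subsequence(seq, [9])                 # -> -1 (not found)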


def _mask_system_tokens(
    input_ids: torch.Tensor,
    labels: torch.Tensor,
    tokenizer
):
    """
    Identifies every occurrence of "System:" in `input_ids` (tokenized form),
    then masks (sets to IGNORE_INDEX) from the first token of "System:" up to 
    the next "<end_of_utterance>" marker or the end of the entire sequence.

    Args:
        input_ids (torch.Tensor): The token IDs for the conversation.
        labels (torch.Tensor): A copy of `input_ids` that we modify in-place 
           to set certain spans to IGNORE_INDEX.
        tokenizer: The tokenizer.
    """ 
    system_str = "System:"
    end_str    = "<end_of_utterance>"

    system_ids = tokenizer.encode(system_str, add_special_tokens=False)
    end_ids    = tokenizer.encode(end_str,   add_special_tokens=False)

    start_pos = 0
    while True:
        # 1) find next "System:"
        sys_start = _search_subsequence(input_ids, system_ids, start=start_pos)
        if sys_start == -1:
            break  # no more occurrences

        # 2) find next "<end_of_utterance>" after that
        sys_end = _search_subsequence(input_ids, end_ids, start=sys_start + len(system_ids))
        if sys_end == -1:
            sys_end = len(input_ids)  # if not found, go to end of sequence

        # 3) Mask [sys_start .. sys_end) in 'labels'
        labels[sys_start:sys_end] = IGNORE_INDEX

        # 4) Move forward
        start_pos = sys_end + len(end_ids)


def _mask_user_tokens(
    input_ids: torch.Tensor,
    labels: torch.Tensor,
    tokenizer
):
    """
    Identifies every occurrence of "User:" in `input_ids`,
    then masks (sets to IGNORE_INDEX) from that token to the next "<end_of_utterance>" 
    or the end of the sequence. This removes the user's text from the training labels,
    so the model won't try to predict user text.

    Args:
        input_ids (torch.Tensor): The token IDs for the conversation.
        labels (torch.Tensor): A copy of `input_ids` that we modify in-place 
           to set certain spans to IGNORE_INDEX.
        tokenizer: The tokenizer.
    """
    user_str = "User:"
    end_str  = "<end_of_utterance>"

    user_ids = tokenizer.encode(user_str, add_special_tokens=False)
    end_ids  = tokenizer.encode(end_str,  add_special_tokens=False)

    start_pos = 0
    while True:
        # 1) find next "User:"
        usr_start = _search_subsequence(input_ids, user_ids, start=start_pos)
        if usr_start == -1:
            break  # no more occurrences

        # 2) find next "<end_of_utterance>" after that
        usr_end = _search_subsequence(input_ids, end_ids, start=usr_start + len(user_ids))
        if usr_end == -1:
            usr_end = len(input_ids)

        # 3) Mask [usr_start .. usr_end) in 'labels'
        labels[usr_start:usr_end] = IGNORE_INDEX

        # 4) Move forward
        start_pos = usr_end + len(end_ids)
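
# A hedged end-to-end sketch of both maskers (assumes a SmolVLM-style
# tokenizer for which "<end_of_utterance>" encodes to a known id; the exact
# token ids are tokenizer-dependent):
#
#     text = ("System: be brief<end_of_utterance>"
#             "User: hi<end_of_utterance>"
#             "Assistant: hello<end_of_utterance>")
#     input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=False))
#     labels = input_ids.clone()
#     _mask_system_tokens(input_ids, labels, tokenizer)
#     _mask_user_tokens(input_ids, labels, tokenizer)
#     # labels is now IGNORE_INDEX over the "System: ..." and "User: ..."
#     # spans, so the loss is computed only on the assistant reply.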
        
##############################################################################
# Dataset
##############################################################################
class SupervisedDataset(Dataset):
    def __init__(
        self,
        dataset_args: Dict[str, Any],
        processor: transformers.ProcessorMixin,
        data_args: DataArguments,
        training_args: TrainingArguments,
        model_args: ModelArguments,
    ):
        """
        A dataset class that loads text/images/multi-image/videos, 
        tokenizes them via `processor`, and optionally masks user/system text.

        Args:
            dataset_args (Dict[str, Any]): Info specifying the dataset path, 
              sampling_strategy, possibly "source_fps", etc.
            processor (ProcessorMixin): Usually a multi-modal HF processor 
              that has a tokenizer + image_processor for vision.
            data_args (DataArguments): Contains config like `mask_user_tokens`, 
              `mask_system_tokens`, `fps`, etc.
            training_args (TrainingArguments): Possibly used for sampling or logging.
        """
        super().__init__()
        self.mask_user_tokens = getattr(data_args, "mask_user_tokens", False)
        self.mask_system_tokens = getattr(data_args, "mask_system_tokens", True)
        self.add_media_intro_outro = getattr(data_args, "add_media_intro_outro", False)
        
        self.processor = processor
        self.tokenizer = processor.tokenizer
        self.data_args = data_args
        self.training_args = training_args
        
        # TODO: verify that these args are propagated here.
        self.target_fps = getattr(model_args, "fps", 1.0)  # CLIP sampling FPS
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



vision/smolvlm2/smolvlm/datasets/dataset_clip_sampling.py [374:557]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    if not frames:
        raise RuntimeError(f"No frames successfully loaded from '{folder_path}' after sampling.")
        
    return frames, timestamps, duration_seconds


##############################################################################
# Image Loader
##############################################################################
def load_single_image(img_path: str) -> Image.Image:
    # Image.open() is lazy; copy() forces a full decode so the file handle
    # can be closed before the RGB-converted image is returned.
    with Image.open(img_path) as img:
        return img.copy().convert('RGB')


##############################################################################
# Helper Functions for Masking
##############################################################################
def find_global_img_patterns(tokens: List[str]) -> List[int]:
    """
    Finds every 5-token run that spells the '<global-img>' marker
    (tokenized as '<', 'global', '-', 'img', '>') and returns the
    positions of those tokens so they can be masked out of the labels.
    """
    mask_positions = []
    for i in range(len(tokens) - 4):
        if (
            tokens[i] == '<'
            and tokens[i+1] == 'global'
            and tokens[i+2] == '-'
            and tokens[i+3] == 'img'
            and tokens[i+4] == '>'
        ):
            mask_positions.extend([i, i+1, i+2, i+3, i+4])
    return mask_positions


def find_row_col_patterns(tokens: List[str]) -> List[int]:
    """
    Finds every 9-token run that spells a tile marker such as
    '<row_1_col_2>' (tokenized as '<', 'row', '_', '1', '_', 'col',
    '_', '2', '>') and returns the positions of those tokens so they
    can be masked out of the labels.
    """
    # '^'/'$' anchors are redundant with fullmatch(), so the pattern is bare.
    pattern = re.compile(r'< row _ [1-9] _ col _ [1-9] >')
    mask_positions = []
    for i in range(len(tokens) - 8):
        # Join exactly 9 tokens (e.g. '<', 'row', '_', '1', '_', 'col', '_', '1', '>')
        group = tokens[i : i + 9]
        if pattern.fullmatch(" ".join(group)):
            mask_positions.extend(range(i, i + 9))
    return mask_positions


def _search_subsequence(
    sequence: torch.Tensor,
    pattern: List[int],
    start: int = 0
) -> int:
    """
    Searches for the first occurrence of 'pattern' in 'sequence'
    starting at offset 'start'. Returns the index of that occurrence,
    or -1 if not found.
    """
    # Convert the tensor to a plain Python list so slices compare cheaply
    seq_list = sequence.tolist()
    pat_len = len(pattern)
    if pat_len == 0:
        return -1

    # Simple forward search
    for i in range(start, len(seq_list) - pat_len + 1):
        if seq_list[i : i + pat_len] == pattern:
            return i
    return -1


def _mask_system_tokens(
    input_ids: torch.Tensor,
    labels: torch.Tensor,
    tokenizer
):
    """
    Identifies every occurrence of "System:" in `input_ids` (tokenized form),
    then masks (sets to IGNORE_INDEX) from the first token of "System:" up to 
    the next "<end_of_utterance>" marker or the end of the entire sequence.

    Args:
        input_ids (torch.Tensor): The token IDs for the conversation.
        labels (torch.Tensor): A copy of `input_ids` that we modify in-place 
           to set certain spans to IGNORE_INDEX.
        tokenizer: The tokenizer.
    """ 
    system_str = "System:"
    end_str    = "<end_of_utterance>"

    system_ids = tokenizer.encode(system_str, add_special_tokens=False)
    end_ids    = tokenizer.encode(end_str,   add_special_tokens=False)

    start_pos = 0
    while True:
        # 1) find next "System:"
        sys_start = _search_subsequence(input_ids, system_ids, start=start_pos)
        if sys_start == -1:
            break  # no more occurrences

        # 2) find next "<end_of_utterance>" after that
        sys_end = _search_subsequence(input_ids, end_ids, start=sys_start + len(system_ids))
        if sys_end == -1:
            sys_end = len(input_ids)  # if not found, go to end of sequence

        # 3) Mask [sys_start .. sys_end) in 'labels'
        labels[sys_start:sys_end] = IGNORE_INDEX

        # 4) Move forward
        start_pos = sys_end + len(end_ids)


def _mask_user_tokens(
    input_ids: torch.Tensor,
    labels: torch.Tensor,
    tokenizer
):
    """
    Identifies every occurrence of "User:" in `input_ids`,
    then masks (sets to IGNORE_INDEX) from that token to the next "<end_of_utterance>" 
    or the end of the sequence. This removes the user's text from the training labels,
    so the model won't try to predict user text.

    Args:
        input_ids (torch.Tensor): The token IDs for the conversation.
        labels (torch.Tensor): A copy of `input_ids` that we modify in-place 
           to set certain spans to IGNORE_INDEX.
        tokenizer: The tokenizer.
    """
    user_str = "User:"
    end_str  = "<end_of_utterance>"

    user_ids = tokenizer.encode(user_str, add_special_tokens=False)
    end_ids  = tokenizer.encode(end_str,  add_special_tokens=False)

    start_pos = 0
    while True:
        # 1) find next "User:"
        usr_start = _search_subsequence(input_ids, user_ids, start=start_pos)
        if usr_start == -1:
            break  # no more occurrences

        # 2) find next "<end_of_utterance>" after that
        usr_end = _search_subsequence(input_ids, end_ids, start=usr_start + len(user_ids))
        if usr_end == -1:
            usr_end = len(input_ids)

        # 3) Mask [usr_start .. usr_end) in 'labels'
        labels[usr_start:usr_end] = IGNORE_INDEX

        # 4) Move forward
        start_pos = usr_end + len(end_ids)
        
##############################################################################
# Dataset
##############################################################################
class SupervisedDataset(Dataset):
    def __init__(
        self,
        dataset_args: Dict[str, Any],
        processor: transformers.ProcessorMixin,
        data_args: DataArguments,
        training_args: TrainingArguments,
        model_args: ModelArguments,
    ):
        """
        A dataset class that loads text/images/multi-image/videos, 
        tokenizes them via `processor`, and optionally masks user/system text.

        Args:
            dataset_args (Dict[str, Any]): Info specifying the dataset path, 
              sampling_strategy, possibly "source_fps", etc.
            processor (ProcessorMixin): Usually a multi-modal HF processor 
              that has a tokenizer + image_processor for vision.
            data_args (DataArguments): Contains config like `mask_user_tokens`, 
              `mask_system_tokens`, `fps`, etc.
            training_args (TrainingArguments): Possibly used for sampling or logging.
        """
        super().__init__()
        self.mask_user_tokens = getattr(data_args, "mask_user_tokens", False)
        self.mask_system_tokens = getattr(data_args, "mask_system_tokens", True)
        self.add_media_intro_outro = getattr(data_args, "add_media_intro_outro", False)
        
        self.processor = processor
        self.tokenizer = processor.tokenizer
        self.data_args = data_args
        self.training_args = training_args
        
        # TODO: verify that these args are propagated here.
        self.target_fps = getattr(model_args, "fps", 1.0)  # CLIP sampling FPS
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



