def verify_eos_tokens_in_tokenizer()

in optimum/exporters/executorch/utils.py [0:0]


def verify_eos_tokens_in_tokenizer(model_eos_ids: List[int], tokenizer) -> bool:
    """
    Verifies that the model's EOS token IDs are present in the tokenizer's
    set of potential end-of-sequence tokens.

    Candidate IDs are collected from three places on the tokenizer:
      1. ``eos_token_id`` and ``pad_token_id``;
      2. every token in ``special_tokens_map``, including list-valued entries
         (e.g. ``additional_special_tokens``);
      3. added tokens whose text contains "end" or starts with "</".

    Args:
        model_eos_ids: A list of EOS token IDs recorded in the PTE file (the source of truth).
        tokenizer: The Hugging Face tokenizer instance to check.

    Returns:
        True if at least one model EOS ID is found among the tokenizer's potential
        EOS tokens, False otherwise. An empty ``model_eos_ids`` also returns True
        (after printing a warning), because there is nothing to verify.
    """
    if not model_eos_ids:
        print("Warning: model_eos_ids list is empty. No verification can be performed.")
        return True

    candidate_eos_ids: Set[int] = set()

    # 1. Check primary eos_token and pad_token attributes
    for special_id in (tokenizer.eos_token_id, tokenizer.pad_token_id):
        if special_id is not None:
            candidate_eos_ids.add(special_id)

    # 2. Check all tokens listed in the special_tokens_map
    for entry in tokenizer.special_tokens_map.values():
        if not entry:
            continue
        # A map value is either a single token or a list of tokens. Resolve
        # tokens one at a time: passing a list to convert_tokens_to_ids yields
        # a list of IDs, which the int check below would discard wholesale.
        tokens = entry if isinstance(entry, (list, tuple)) else [entry]
        for token in tokens:
            token_id = tokenizer.convert_tokens_to_ids(token)
            # Skip anything that did not resolve to a single integer ID.
            if isinstance(token_id, int):
                candidate_eos_ids.add(token_id)

    # 3. Check added tokens for "end-of-X" patterns
    # getattr keeps tokenizers without this attribute from raising here.
    added_tokens = getattr(tokenizer, "added_tokens_decoder", None) or {}
    for token_id, added_token in added_tokens.items():
        token_str = added_token.content.lower()
        # Heuristic to find tokens that signify an end
        if "end" in token_str or token_str.startswith("</"):
            candidate_eos_ids.add(token_id)

    # The check: is any "true" ID present in the candidate set?
    return any(model_id in candidate_eos_ids for model_id in model_eos_ids)