def convert_processors()

in utils/create_dummy_models.py [0:0]
172 lines of code
58 McCabe index (conditional complexity)

def convert_processors(processors, tiny_config, output_folder, result):
    """Change a processor to work with smaller inputs.

    For tokenizers, we try to reduce their vocabulary size.

    For feature extractor, we use smaller image size or change
    other attributes using the values from `tiny_config`. See `convert_feature_extractor`.

    This method should not fail: we catch the errors and put them in `result["warnings"]` with descriptive messages.
    """

    def _sanity_check(fast_tokenizer, slow_tokenizer, keep_fast_tokenizer=False):
        """Set tokenizer(s) to `None` if the fast/slow tokenizers have different values for `vocab_size` or `length`.

        If `keep_fast_tokenizer=True`, the fast tokenizer will be kept.
        """
        # sanity check 1: fast and slow tokenizers should be compatible (vocab_size)
        if fast_tokenizer is not None and slow_tokenizer is not None:
            if fast_tokenizer.vocab_size != slow_tokenizer.vocab_size:
                warning_message = (
                    "The fast/slow tokenizers "
                    f"({fast_tokenizer.__class__.__name__}/{slow_tokenizer.__class__.__name__}) have different "
                    "vocabulary size: "
                    f"fast_tokenizer.vocab_size = {fast_tokenizer.vocab_size} and "
                    f"slow_tokenizer.vocab_size = {slow_tokenizer.vocab_size}."
                )
                result["warnings"].append(warning_message)
                if not keep_fast_tokenizer:
                    fast_tokenizer = None
                slow_tokenizer = None

        # sanity check 2: fast and slow tokenizers should be compatible (length)
        if fast_tokenizer is not None and slow_tokenizer is not None:
            if len(fast_tokenizer) != len(slow_tokenizer):
                warning_message = (
                    f"The fast/slow tokenizers () have different length: "
                    f"len(fast_tokenizer) = {len(fast_tokenizer)} and "
                    f"len(slow_tokenizer) = {len(slow_tokenizer)}."
                )
                result["warnings"].append(warning_message)
                if not keep_fast_tokenizer:
                    fast_tokenizer = None
                slow_tokenizer = None

        return fast_tokenizer, slow_tokenizer

    tokenizers = []
    feature_extractors = []
    for processor in processors:
        if isinstance(processor, PreTrainedTokenizerBase):
            if processor.__class__.__name__ not in {x.__class__.__name__ for x in tokenizers}:
                tokenizers.append(processor)
        elif isinstance(processor, BaseImageProcessor):
            if processor.__class__.__name__ not in {x.__class__.__name__ for x in feature_extractors}:
                feature_extractors.append(processor)
        elif isinstance(processor, FeatureExtractionMixin):
            if processor.__class__.__name__ not in {x.__class__.__name__ for x in feature_extractors}:
                feature_extractors.append(processor)
        elif isinstance(processor, ProcessorMixin):
            if hasattr(processor, "tokenizer"):
                if processor.tokenizer.__class__.__name__ not in {x.__class__.__name__ for x in tokenizers}:
                    tokenizers.append(processor.tokenizer)
            # Currently, we only have these 2 possibilities
            if hasattr(processor, "image_processor"):
                if processor.image_processor.__class__.__name__ not in {
                    x.__class__.__name__ for x in feature_extractors
                }:
                    feature_extractors.append(processor.image_processor)
            elif hasattr(processor, "feature_extractor"):
                if processor.feature_extractor.__class__.__name__ not in {
                    x.__class__.__name__ for x in feature_extractors
                }:
                    feature_extractors.append(processor.feature_extractor)

    # check the built processors have the unique type
    num_types = len({x.__class__.__name__ for x in feature_extractors})
    if num_types >= 2:
        raise ValueError(f"`feature_extractors` should contain at most 1 type, but it contains {num_types} types!")
    num_types = len({x.__class__.__name__.replace("Fast", "") for x in tokenizers})
    if num_types >= 2:
        raise ValueError(f"`tokenizers` should contain at most 1 tokenizer type, but it contains {num_types} types!")

    fast_tokenizer = None
    slow_tokenizer = None

    for tokenizer in tokenizers:
        if isinstance(tokenizer, PreTrainedTokenizerFast):
            fast_tokenizer = tokenizer
        else:
            slow_tokenizer = tokenizer

    # If the (original) fast/slow tokenizers don't correspond, keep only the fast tokenizer.
    # This doesn't necessarily imply the fast/slow tokenizers in a single Hub repo. has issues.
    # It's more of an issue in `build_processor` which tries to get a checkpoint with as much effort as possible.
    # For `YosoModel` (which uses `AlbertTokenizer(Fast)`), its real (Hub) checkpoint doesn't contain valid files to
    # load the slower tokenizer (`AlbertTokenizer`), and it ends up finding the (canonical) checkpoint of `AlbertModel`,
    # which has different vocabulary.
    # TODO: Try to improve `build_processor`'s definition and/or usage to avoid the above situation in the first place.
    fast_tokenizer, slow_tokenizer = _sanity_check(fast_tokenizer, slow_tokenizer, keep_fast_tokenizer=True)
    original_fast_tokenizer, original_slow_tokenizer = fast_tokenizer, slow_tokenizer

    if fast_tokenizer:
        try:
            # Wav2Vec2ForCTC , ByT5Tokenizer etc. all are already small enough and have no fast version that can
            # be retrained
            if fast_tokenizer.vocab_size > TARGET_VOCAB_SIZE:
                fast_tokenizer = convert_tokenizer(fast_tokenizer)
        except Exception:
            result["warnings"].append(
                (
                    f"Failed to convert the fast tokenizer for {fast_tokenizer.__class__.__name__}.",
                    traceback.format_exc(),
                )
            )

    # If `fast_tokenizer` exists, `slow_tokenizer` should correspond to it.
    if fast_tokenizer:
        # Make sure the fast tokenizer can be saved
        try:
            # We don't save it to `output_folder` at this moment - only at the end of this function.
            with tempfile.TemporaryDirectory() as tmpdir:
                fast_tokenizer.save_pretrained(tmpdir)
                try:
                    slow_tokenizer = AutoTokenizer.from_pretrained(tmpdir, use_fast=False)
                except Exception:
                    result["warnings"].append(
                        (
                            f"Failed to load the slow tokenizer saved from {fast_tokenizer.__class__.__name__}.",
                            traceback.format_exc(),
                        )
                    )
                    # Let's just keep the fast version
                    slow_tokenizer = None
        except Exception:
            result["warnings"].append(
                (
                    f"Failed to save the fast tokenizer for {fast_tokenizer.__class__.__name__}.",
                    traceback.format_exc(),
                )
            )
            fast_tokenizer = None

    # If the (possibly converted) fast/slow tokenizers don't correspond, set them to `None`, and use the original
    # tokenizers.
    fast_tokenizer, slow_tokenizer = _sanity_check(fast_tokenizer, slow_tokenizer, keep_fast_tokenizer=False)

    # If there is any conversion failed, we keep the original tokenizers.
    if (original_fast_tokenizer is not None and fast_tokenizer is None) or (
        original_slow_tokenizer is not None and slow_tokenizer is None
    ):
        warning_messagae = (
            "There are some issues when converting the fast/slow tokenizers. The original tokenizers from the Hub "
            " will be used instead."
        )
        result["warnings"].append(warning_messagae)
        # Let's use the original version at the end (`original_fast_tokenizer` and `original_slow_tokenizer`)
        fast_tokenizer = original_fast_tokenizer
        slow_tokenizer = original_slow_tokenizer

    # Make sure the fast tokenizer can be saved
    if fast_tokenizer:
        # We don't save it to `output_folder` at this moment - only at the end of this function.
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                fast_tokenizer.save_pretrained(tmpdir)
            except Exception:
                result["warnings"].append(
                    (
                        f"Failed to save the fast tokenizer for {fast_tokenizer.__class__.__name__}.",
                        traceback.format_exc(),
                    )
                )
                fast_tokenizer = None
    # Make sure the slow tokenizer can be saved
    if slow_tokenizer:
        # We don't save it to `output_folder` at this moment - only at the end of this function.
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                slow_tokenizer.save_pretrained(tmpdir)
            except Exception:
                result["warnings"].append(
                    (
                        f"Failed to save the slow tokenizer for {slow_tokenizer.__class__.__name__}.",
                        traceback.format_exc(),
                    )
                )
                slow_tokenizer = None

    # update feature extractors using the tiny config
    try:
        feature_extractors = [convert_feature_extractor(p, tiny_config) for p in feature_extractors]
    except Exception:
        result["warnings"].append(
            (
                "Failed to convert feature extractors.",
                traceback.format_exc(),
            )
        )
        feature_extractors = []

    if hasattr(tiny_config, "max_position_embeddings") and tiny_config.max_position_embeddings > 0:
        if fast_tokenizer is not None:
            if fast_tokenizer.__class__.__name__ in [
                "RobertaTokenizerFast",
                "XLMRobertaTokenizerFast",
                "LongformerTokenizerFast",
                "MPNetTokenizerFast",
            ]:
                fast_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
            else:
                fast_tokenizer.model_max_length = tiny_config.max_position_embeddings
        if slow_tokenizer is not None:
            if slow_tokenizer.__class__.__name__ in [
                "RobertaTokenizer",
                "XLMRobertaTokenizer",
                "LongformerTokenizer",
                "MPNetTokenizer",
            ]:
                slow_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
            else:
                slow_tokenizer.model_max_length = tiny_config.max_position_embeddings

    processors = [fast_tokenizer, slow_tokenizer] + feature_extractors
    processors = [p for p in processors if p is not None]
    for p in processors:
        p.save_pretrained(output_folder)

    return processors