in utils/create_dummy_models.py
def build_processor(config_class, processor_class, allow_no_checkpoint=False):
    """Create a processor for `processor_class`.

    If a processor cannot be built with the original arguments, this method tries to change the arguments and call
    itself recursively, by inferring a new `config_class` or a new `processor_class` from another one, in order to
    find a checkpoint containing the necessary files to build a processor.

    The processor is not saved here. Instead, it will be saved in `convert_processors` after further changes in
    `convert_processors`. For each model architecture, a copy will be created and saved along the built model.
    """
    # Currently, this solely uses the docstring in the source file of `config_class` to find a checkpoint.
    checkpoint = get_checkpoint_from_config_class(config_class)

    if checkpoint is None:
        # Try to get the checkpoint from the config class for `processor_class`.
        # This helps cases like `XCLIPConfig` and `VideoMAEFeatureExtractor` to find a checkpoint from `VideoMAEConfig`.
        config_class_from_processor_class = get_config_class_from_processor_class(processor_class)
        checkpoint = get_checkpoint_from_config_class(config_class_from_processor_class)
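
    # First attempt: load the requested processor class directly from the inferred checkpoint. If `checkpoint` is
    # `None` (or lacks the required files), `from_pretrained` raises, the error is logged, and `processor` stays
    # `None` so the fallbacks below can run.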
    processor = None
    try:
        processor = processor_class.from_pretrained(checkpoint)
    except Exception as e:
        logger.error(f"{e.__class__.__name__}: {e}")
    # Try to get a new processor class from the checkpoint. This is helpful when the checkpoint lacks the necessary
    # file(s) to load a processor while `processor_class` is an Auto class. For example, `sew` has `Wav2Vec2Processor`
    # in `PROCESSOR_MAPPING_NAMES`, its `tokenizer_class` is `AutoTokenizer`, and the checkpoint
    # `https://huggingface.co/asapp/sew-tiny-100k` has no tokenizer file, but we can get
    # `tokenizer_class: Wav2Vec2CTCTokenizer` from the config file. (The new processor class won't be able to load from
    # `checkpoint`, but it helps this recursive method to find a way to build a processor.)
    if (
        processor is None
        and checkpoint is not None
        and issubclass(processor_class, (PreTrainedTokenizerBase, AutoTokenizer))
    ):
        try:
            config = AutoConfig.from_pretrained(checkpoint)
        except Exception as e:
            logger.error(f"{e.__class__.__name__}: {e}")
            config = None

        if config is not None:
            if not isinstance(config, config_class):
                raise ValueError(
                    f"`config` (which is of type {config.__class__.__name__}) should be an instance of `config_class`"
                    f" ({config_class.__name__})!"
                )
            tokenizer_class = config.tokenizer_class
            new_processor_class = None
            if tokenizer_class is not None:
                new_processor_class = getattr(transformers_module, tokenizer_class)
                if new_processor_class != processor_class:
                    processor = build_processor(config_class, new_processor_class)
            # If `tokenizer_class` is not specified in `config`, let's use `config` to get the processor class via the
            # auto mappings, but only allow the tokenizer mapping to be used. This is to make `Wav2Vec2Conformer` build.
            if processor is None:
                new_processor_classes = get_processor_types_from_config_class(
                    config.__class__, allowed_mappings=["tokenizer"]
                )
                # Used to avoid infinite recursion between a pair of fast/slow tokenizer types
                names = [
                    x.__name__.replace("Fast", "") for x in [processor_class, new_processor_class] if x is not None
                ]
                new_processor_classes = [
                    x for x in new_processor_classes if x is not None and x.__name__.replace("Fast", "") not in names
                ]
                if len(new_processor_classes) > 0:
                    new_processor_class = new_processor_classes[0]
                    # Let's use a fast tokenizer if there is any
                    for x in new_processor_classes:
                        if x.__name__.endswith("Fast"):
                            new_processor_class = x
                            break
                    processor = build_processor(config_class, new_processor_class)
    if processor is None:
        # Try to build each component (tokenizer & feature extractor) of a `ProcessorMixin`.
        if issubclass(processor_class, ProcessorMixin):
            attrs = {}
            for attr_name in processor_class.attributes:
                attrs[attr_name] = []
                # This could be a tuple (for tokenizers). For example, `CLIPProcessor` has
                #   - feature_extractor_class = "CLIPFeatureExtractor"
                #   - tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
                attr_class_names = getattr(processor_class, f"{attr_name}_class")
                if not isinstance(attr_class_names, tuple):
                    attr_class_names = (attr_class_names,)

                for name in attr_class_names:
                    attr_class = getattr(transformers_module, name)
                    attr = build_processor(config_class, attr_class)
                    if attr is not None:
                        attrs[attr_name].append(attr)

            # Try to build a `ProcessorMixin`, so we can return a single value
            if all(len(v) > 0 for v in attrs.values()):
                try:
                    processor = processor_class(**{k: v[0] for k, v in attrs.items()})
                except Exception as e:
                    logger.error(f"{e.__class__.__name__}: {e}")
        else:
            # `checkpoint` might lack some file(s) to load a processor. For example, `facebook/hubert-base-ls960`
            # has no tokenizer file to load `Wav2Vec2CTCTokenizer`. In this case, we try to build a processor
            # with the configuration class (for example, `Wav2Vec2Config`) corresponding to `processor_class`.
            config_class_from_processor_class = get_config_class_from_processor_class(processor_class)
            if config_class_from_processor_class != config_class:
                processor = build_processor(config_class_from_processor_class, processor_class)
    # Try to create an image processor or a feature extractor without any checkpoint
    if (
        processor is None
        and allow_no_checkpoint
        and (issubclass(processor_class, BaseImageProcessor) or issubclass(processor_class, FeatureExtractionMixin))
    ):
        try:
            processor = processor_class()
        except Exception as e:
            logger.error(f"{e.__class__.__name__}: {e}")
    # Validation
    if processor is not None:
        if not (isinstance(processor, processor_class) or processor_class.__name__.startswith("Auto")):
            raise ValueError(
                f"`processor` (which is of type {processor.__class__.__name__}) should be an instance of"
                f" {processor_class.__name__} or an Auto class!"
            )

    return processor
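

# A minimal usage sketch (illustrative only, not part of the original file): the driver code in this script pairs a
# model's config class with the processor types registered for it, roughly along these lines; the exact call sites
# may differ, and `BertConfig` is just an example choice.
#
#     from transformers import BertConfig
#
#     config_class = BertConfig
#     for processor_class in get_processor_types_from_config_class(config_class):
#         processor = build_processor(config_class, processor_class, allow_no_checkpoint=True)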