def create_image_encoder()

in src/pixparse/models/image_encoder_timm.py [0:0]


def create_image_encoder(cfg: ImageEncoderCfg) -> nn.Module:
    """Build the image encoder described by ``cfg`` via ``timm.create_model``.

    Args:
        cfg: Encoder configuration. The fields read here are ``name``
            (forwarded as the timm model name, must be truthy),
            ``pretrained``, ``image_fmt`` (``'L'`` or ``'RGB'``) and
            ``image_size`` (optional; forwarded as ``img_size`` only when
            it is not ``None``).

    Returns:
        The module returned by ``timm.create_model``, requested with
        ``num_classes=0`` and ``global_pool=''``.
        NOTE(review): in timm these settings are expected to drop the
        classifier head and the pooling layer -- confirm against timm docs.

    Raises:
        ValueError: If ``cfg.name`` is empty or ``cfg.image_fmt`` is not one
            of ``'L'`` / ``'RGB'``.
    """
    # Validate with explicit raises instead of ``assert``: assertions are
    # stripped under ``python -O``, which would let an unsupported image_fmt
    # silently fall through to the 3-channel branch below.
    if not cfg.name:
        raise ValueError('Image encoder config must specify a non-empty model name.')
    if cfg.image_fmt not in ('L', 'RGB'):
        raise ValueError(
            f"Unsupported image_fmt {cfg.image_fmt!r}; expected 'L' or 'RGB'."
        )

    # Only forward img_size when it was configured, so that the model's own
    # default is used otherwise.
    extra_kwargs = {}
    if cfg.image_size is not None:
        extra_kwargs['img_size'] = cfg.image_size

    model = timm.create_model(
        cfg.name,
        pretrained=cfg.pretrained,
        # 'L' is single-channel input; 'RGB' is three channels.
        in_chans=1 if cfg.image_fmt == 'L' else 3,
        num_classes=0,
        global_pool='',
        **extra_kwargs,
    )

    # FIXME need to add support for changing input resolution / attn window sizes for models like swin,
    #  the original Donut added some hacks to resize rel-pos bias

    return model