in src/pixparse/models/image_encoder_timm.py [0:0]
def create_image_encoder(cfg: ImageEncoderCfg) -> nn.Module:
    """Build a timm image encoder from ``cfg``.

    The model is created via ``timm.create_model`` with ``num_classes=0`` and
    ``global_pool=''``, i.e. no classifier head and no global pooling are
    requested from timm.

    Args:
        cfg: Encoder configuration. The fields read here are:
            ``name`` (timm model name, must be non-empty),
            ``pretrained`` (forwarded to timm as-is),
            ``image_size`` (forwarded as ``img_size`` only when not ``None``),
            ``image_fmt`` (``'L'`` selects 1 input channel, ``'RGB'`` selects 3).

    Returns:
        The module returned by ``timm.create_model``.

    Raises:
        ValueError: If ``cfg.name`` is empty or ``cfg.image_fmt`` is not one of
            ``'L'`` / ``'RGB'``.
    """
    # Validate with explicit raises rather than ``assert``: asserts are removed
    # under ``python -O``, which would let an unsupported image format fall
    # through silently to a 3-channel model.
    if not cfg.name:
        raise ValueError('cfg.name must be a non-empty timm model name')
    if cfg.image_fmt not in ('L', 'RGB'):
        raise ValueError(
            f"cfg.image_fmt must be 'L' or 'RGB', got {cfg.image_fmt!r}"
        )

    # Only pass ``img_size`` when explicitly configured, so that timm's own
    # default applies otherwise.
    extra_kwargs = {}
    if cfg.image_size is not None:
        extra_kwargs['img_size'] = cfg.image_size

    model = timm.create_model(
        cfg.name,
        pretrained=cfg.pretrained,
        in_chans=1 if cfg.image_fmt == 'L' else 3,
        num_classes=0,
        global_pool='',
        **extra_kwargs,
    )
    # FIXME need to add support for changing input resolution / attn window sizes for models like swin,
    # the original Donut added some hacks to resize rel-pos bias
    return model