# timm/models/mobilenetv5.py
from functools import partial
from typing import Callable, List, Optional, Sequence, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from timm.layers import (
SelectAdaptivePool2d, Linear, LayerType, PadType, RmsNorm2d, ConvNormAct, create_conv2d, get_norm_act_layer,
to_2tuple
)
from ._builder import build_model_with_cfg
from ._efficientnet_blocks import SqueezeExcite, UniversalInvertedResidual
from ._efficientnet_builder import BlockArgs, EfficientNetBuilder, decode_arch_def, efficientnet_init_weights, \
round_channels, resolve_act_layer
from ._features import feature_take_indices
from ._features_fx import register_notrace_module
from ._manipulate import checkpoint_seq, checkpoint
from ._registry import generate_default_cfgs, register_model
__all__ = ['MobileNetV5', 'MobileNetV5Encoder']
@register_notrace_module
class MobileNetV5MultiScaleFusionAdapter(nn.Module):
"""Multi-layer fusion token adapter.
Args:
in_chs: List of input channel counts for each feature scale.
out_chs: The number of output channels.
output_resolution: The output resolution.
expansion_ratio: The FFN expansion ratio.
interpolation_mode: The upsampling interpolation mode.
layer_scale_init_value: The initial value of the layer scale, no layer scale if None.
"""
def __init__(
self,
in_chs: Union[int, List[int]],
out_chs: int,
output_resolution: int,
expansion_ratio: float = 2.0,
interpolation_mode: str = "nearest",
layer_scale_init_value: Optional[float] = None,
noskip: bool = True,
act_layer: Optional[LayerType] = None,
norm_layer: Optional[LayerType] = None,
):
super().__init__()
self.in_channels = sum(in_chs) if isinstance(in_chs, Sequence) else in_chs
self.out_channels = out_chs
self.output_resolution = to_2tuple(output_resolution)
self.expansion_ratio = expansion_ratio
self.interpolation_mode = interpolation_mode
self.layer_scale_init_value = layer_scale_init_value
self.noskip = noskip
act_layer = act_layer or nn.GELU
norm_layer = norm_layer or RmsNorm2d
self.ffn = UniversalInvertedResidual(
in_chs=self.in_channels,
out_chs=self.out_channels,
dw_kernel_size_mid=0,
exp_ratio=self.expansion_ratio,
act_layer=act_layer,
norm_layer=norm_layer,
noskip=self.noskip,
layer_scale_init_value=self.layer_scale_init_value,
)
self.norm = norm_layer(self.out_channels)
def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor:
# Inputs list of [B, C, H, W] tensors
high_resolution = inputs[0].shape[-2:] # Assuming the first input is the highest resolution.
resized_inputs = []
        for img in inputs:
feat_size = img.shape[-2:]
if feat_size[0] < high_resolution[0] or feat_size[1] < high_resolution[1]:
img = F.interpolate(img, size=high_resolution, mode=self.interpolation_mode)
resized_inputs.append(img)
channel_cat_imgs = torch.cat(resized_inputs, dim=1) # Cat on channel dim, must equal self.in_channels
img = self.ffn(channel_cat_imgs)
if high_resolution[0] != self.output_resolution[0] or high_resolution[1] != self.output_resolution[1]:
# Interpolate / pool to target output_resolution if highest feature resolution differs
if (
high_resolution[0] % self.output_resolution[0] != 0 or
high_resolution[1] % self.output_resolution[1] != 0
):
img = F.interpolate(img, size=self.output_resolution, mode="bilinear")
else:
h_strides = high_resolution[0] // self.output_resolution[0]
w_strides = high_resolution[1] // self.output_resolution[1]
img = F.avg_pool2d(
img,
kernel_size=(h_strides, w_strides),
stride=(h_strides, w_strides),
)
img = self.norm(img)
return img
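# Example usage (a minimal sketch, shapes are illustrative): the adapter upsamples all
# inputs to the highest input resolution, concatenates on the channel dim (sum must
# equal in_chs), mixes with the FFN, then pools/interpolates to output_resolution.
#
#   adapter = MobileNetV5MultiScaleFusionAdapter(in_chs=[256, 512], out_chs=2048, output_resolution=16)
#   feats = [torch.randn(1, 256, 32, 32), torch.randn(1, 512, 16, 16)]
#   fused = adapter(feats)  # -> torch.Size([1, 2048, 16, 16])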
class MobileNetV5(nn.Module):
""" MobiletNet-V5
"""
def __init__(
self,
block_args: BlockArgs,
num_classes: int = 1000,
in_chans: int = 3,
stem_size: int = 16,
fix_stem: bool = False,
num_features: int = 2048,
pad_type: str = '',
use_msfa: bool = True,
            msfa_indices: Sequence[int] = (-3, -2, -1),
msfa_output_resolution: int = 16,
act_layer: Optional[LayerType] = None,
norm_layer: Optional[LayerType] = None,
aa_layer: Optional[LayerType] = None,
se_layer: Optional[LayerType] = None,
se_from_exp: bool = True,
round_chs_fn: Callable = round_channels,
drop_rate: float = 0.,
drop_path_rate: float = 0.,
layer_scale_init_value: Optional[float] = None,
global_pool: str = 'avg',
):
"""
Args:
block_args: Arguments for blocks of the network.
num_classes: Number of classes for classification head.
in_chans: Number of input image channels.
stem_size: Number of output channels of the initial stem convolution.
fix_stem: If True, don't scale stem by round_chs_fn.
            num_features: Number of output channels of the MSFA neck, or of the conv head when use_msfa is False.
            pad_type: Type of padding to use for convolution layers.
            use_msfa: If True, aggregate stage features with a Multi-Scale Fusion Adapter neck.
            msfa_indices: Feature indices fed to the MSFA neck.
            msfa_output_resolution: Spatial output resolution of the MSFA neck.
act_layer: Type of activation layer.
norm_layer: Type of normalization layer.
aa_layer: Type of anti-aliasing layer.
se_layer: Type of Squeeze-and-Excite layer.
se_from_exp: If True, calculate SE channel reduction from expanded mid channels.
round_chs_fn: Callable to round number of filters based on depth multiplier.
drop_rate: Dropout rate.
drop_path_rate: Stochastic depth rate.
layer_scale_init_value: Enable layer scale on compatible blocks if not None.
global_pool: Type of pooling to use for global pooling features of the FC head.
"""
super().__init__()
act_layer = act_layer or nn.GELU
norm_layer = norm_layer or RmsNorm2d
norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
se_layer = se_layer or SqueezeExcite
self.num_classes = num_classes
self.drop_rate = drop_rate
self.grad_checkpointing = False
self.msfa_indices = msfa_indices
self.msfa_output_resolution = msfa_output_resolution
# Stem
if not fix_stem:
stem_size = round_chs_fn(stem_size)
self.conv_stem = ConvNormAct(
in_chans,
stem_size,
kernel_size=3,
stride=2,
padding=pad_type,
norm_layer=norm_layer,
act_layer=act_layer,
)
# Middle stages (IR/ER/DS Blocks)
builder = EfficientNetBuilder(
output_stride=32,
pad_type=pad_type,
round_chs_fn=round_chs_fn,
se_from_exp=se_from_exp,
act_layer=act_layer,
norm_layer=norm_layer,
aa_layer=aa_layer,
se_layer=se_layer,
drop_path_rate=drop_path_rate,
layer_scale_init_value=layer_scale_init_value,
)
self.blocks = nn.Sequential(*builder(stem_size, block_args))
self.feature_info = builder.features
self.stage_ends = [f['stage'] for f in self.feature_info]
self.num_features = builder.in_chs # features of last stage, output of forward_features()
# Neck (aggregation) + Head + Pooling
if use_msfa:
self.num_features = self.head_hidden_size = num_features # output of msfa is output of forward_features()
# Map msfa indices to feature info and calculate sum of feature channels
self.msfa_indices = feature_take_indices(len(self.feature_info), self.msfa_indices)[0]
self.msfa_in_chs = sum([self.feature_info[mi]['num_chs'] for mi in self.msfa_indices])
self.msfa = MobileNetV5MultiScaleFusionAdapter(
in_chs=self.msfa_in_chs,
out_chs=num_features,
output_resolution=self.msfa_output_resolution,
norm_layer=norm_layer,
act_layer=act_layer,
)
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.conv_head = None
self.norm_head = None
else:
self.num_features = builder.in_chs # features of last stage, output of forward_features()
self.head_hidden_size = num_features
self.msfa = None
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
num_pooled_chs = self.num_features * self.global_pool.feat_mult()
# mobilenet-v4 style post-pooling PW conv is followed by a norm+act layer
self.conv_head = create_conv2d(num_pooled_chs, self.head_hidden_size, 1, padding=pad_type)
self.norm_head = norm_act_layer(self.head_hidden_size)
self.flatten = nn.Flatten(1) if global_pool else nn.Identity() # don't flatten if pooling disabled
self.classifier = Linear(self.head_hidden_size, num_classes) if num_classes > 0 else nn.Identity()
efficientnet_init_weights(self)
    def as_sequential(self):
        # NOTE: stem norm + act are fused into conv_stem (ConvNormAct), the MSFA neck (if present) is not included
        layers = [self.conv_stem]
layers.extend(self.blocks)
layers.append(self.global_pool)
if self.conv_head is not None:
layers.append(self.conv_head)
if self.norm_head is not None:
layers.append(self.norm_head)
layers.extend([nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier])
return nn.Sequential(*layers)
@torch.jit.ignore
def group_matcher(self, coarse: bool = False):
return dict(
            stem=r'^conv_stem',
blocks=r'^blocks\.(\d+)' if coarse else r'^blocks\.(\d+)\.(\d+)'
)
@torch.jit.ignore
def set_grad_checkpointing(self, enable: bool = True):
self.grad_checkpointing = enable
@torch.jit.ignore
def get_classifier(self) -> nn.Module:
return self.classifier
def reset_classifier(self, num_classes: int, global_pool: str = 'avg'):
self.num_classes = num_classes
# NOTE: cannot meaningfully change pooling of efficient head after creation
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.flatten = nn.Flatten(1) if global_pool else nn.Identity() # don't flatten if pooling disabled
self.classifier = Linear(self.head_hidden_size, num_classes) if num_classes > 0 else nn.Identity()
def forward_intermediates(
self,
x: torch.Tensor,
indices: Optional[Union[int, List[int]]] = None,
norm: bool = False,
stop_early: bool = False,
output_fmt: str = 'NCHW',
intermediates_only: bool = False,
extra_blocks: bool = False,
) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
""" Forward features that returns intermediates.
Args:
x: Input image tensor
indices: Take last n blocks if int, all if None, select matching indices if sequence
norm: Apply norm layer to compatible intermediates
stop_early: Stop iterating over blocks when last desired intermediate hit
output_fmt: Shape of intermediate feature outputs
intermediates_only: Only return intermediate features
            extra_blocks: Include outputs of all blocks, indexed by block rather than stage (does not align with feature_info)
        Returns:
            List of intermediate features, or a tuple of (final features, list of intermediates).
        """
assert output_fmt in ('NCHW',), 'Output shape must be NCHW.'
if stop_early:
assert intermediates_only, 'Must use intermediates_only for early stopping.'
intermediates = []
if extra_blocks:
take_indices, max_index = feature_take_indices(len(self.blocks) + 1, indices)
else:
take_indices, max_index = feature_take_indices(len(self.stage_ends), indices)
take_indices = [self.stage_ends[i] for i in take_indices]
max_index = self.stage_ends[max_index]
        # FIXME MSFA and forward_intermediates overlap, they both take indices from specific features
# When a user wants to grab specific feature maps for a downstream task AND have the msfa output
# what should we do? Accumulate two intermediates? One for msfa and one for take_indices?
# forward pass
feat_idx = 0 # stem is index 0
x = self.conv_stem(x)
if feat_idx in take_indices:
intermediates.append(x)
if torch.jit.is_scripting() or not stop_early: # can't slice blocks in torchscript
blocks = self.blocks
else:
blocks = self.blocks[:max_index]
for blk in blocks:
feat_idx += 1
x = blk(x)
if feat_idx in take_indices:
intermediates.append(x)
if intermediates_only:
return intermediates
        # FIXME see note above
        # self.msfa(msfa_intermediates)
return x, intermediates
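    # Example (sketch, assuming a built model `m` of this class): with an int index,
    # the last n feature stages are taken.
    #
    #   inters = m.forward_intermediates(x, indices=2, intermediates_only=True)
    #   # -> list with the outputs of the last two feature stages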
def prune_intermediate_layers(
self,
indices: Union[int, List[int]] = 1,
prune_norm: bool = False,
prune_head: bool = True,
extra_blocks: bool = False,
):
""" Prune layers not required for specified intermediates.
"""
if extra_blocks:
take_indices, max_index = feature_take_indices(len(self.blocks) + 1, indices)
else:
take_indices, max_index = feature_take_indices(len(self.stage_ends), indices)
max_index = self.stage_ends[max_index]
        num_blocks = len(self.blocks)
        self.blocks = self.blocks[:max_index]  # truncate blocks w/ stem as idx 0
        if max_index < num_blocks:
self.conv_head = None
self.norm_head = None
if prune_head:
self.conv_head = None
self.norm_head = None
self.reset_classifier(0, '')
return take_indices
def forward_features(self, x: torch.Tensor) -> torch.Tensor:
if self.msfa is not None:
            # When MSFA aggregation layer is present, we gather intermediates as in forward_intermediates
feat_idx = 0 # offset by one from blocks index due to stem feature
intermediates = []
x = self.conv_stem(x)
if feat_idx in self.msfa_indices:
intermediates.append(x)
for blk in self.blocks:
feat_idx += 1
                # checkpoint each block to trade compute for memory when grad checkpointing enabled
                if self.grad_checkpointing and not torch.jit.is_scripting():
                    x = checkpoint(blk, x)
                else:
                    x = blk(x)
if feat_idx in self.msfa_indices:
intermediates.append(x)
x = self.msfa(intermediates)
else:
x = self.conv_stem(x)
if self.grad_checkpointing and not torch.jit.is_scripting():
x = checkpoint_seq(self.blocks, x, flatten=True)
else:
x = self.blocks(x)
return x
def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor:
x = self.global_pool(x)
if self.conv_head is not None:
x = self.conv_head(x)
if self.norm_head is not None:
x = self.norm_head(x)
x = self.flatten(x)
if self.drop_rate > 0.:
x = F.dropout(x, p=self.drop_rate, training=self.training)
if pre_logits:
return x
return self.classifier(x)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.forward_features(x)
x = self.forward_head(x)
return x
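# Example (a minimal sketch): building the classifier variant through the timm registry
# (models are registered at the bottom of this file) and running a forward pass at the
# default 256x256 config resolution.
#
#   import timm
#   model = timm.create_model('mobilenetv5_base', num_classes=10)
#   logits = model(torch.randn(1, 3, 256, 256))  # -> torch.Size([1, 10])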
class MobileNetV5Encoder(nn.Module):
"""MobileNetV5 Vision Encoder"""
def __init__(
self,
block_args: BlockArgs,
in_chans: int = 3,
stem_size: int = 64,
fix_stem: bool = False,
pad_type: str = '',
msfa_indices: Sequence[int] = (-2, -1),
msfa_output_resolution: int = 16,
act_layer: Optional[LayerType] = None,
norm_layer: Optional[LayerType] = None,
aa_layer: Optional[LayerType] = None,
se_layer: Optional[LayerType] = None,
se_from_exp: bool = True,
round_chs_fn: Callable = round_channels,
drop_rate: float = 0.,
drop_path_rate: float = 0.,
layer_scale_init_value: Optional[float] = None,
):
super().__init__()
act_layer = act_layer or nn.GELU
norm_layer = norm_layer or RmsNorm2d
se_layer = se_layer or SqueezeExcite
self.num_classes = 0 # Exists to satisfy ._hub module APIs.
self.drop_rate = drop_rate
self.grad_checkpointing = False
# Stem
if not fix_stem:
stem_size = round_chs_fn(stem_size)
self.conv_stem = ConvNormAct(
in_chans,
stem_size,
kernel_size=3,
stride=2,
padding=pad_type,
norm_layer=norm_layer,
act_layer=act_layer,
)
builder = EfficientNetBuilder(
output_stride=32,
pad_type=pad_type,
round_chs_fn=round_chs_fn,
se_from_exp=se_from_exp,
act_layer=act_layer,
norm_layer=norm_layer,
aa_layer=aa_layer,
se_layer=se_layer,
drop_path_rate=drop_path_rate,
layer_scale_init_value=layer_scale_init_value,
)
self.blocks = nn.Sequential(*builder(stem_size, block_args))
self.feature_info = builder.features
self.stage_ends = [f['stage'] for f in self.feature_info]
self.num_features = self.head_hidden_size = 2048 # output of msfa is output of forward_features()
# Map msfa indices to feature info and calculate sum of feature channels
self.msfa_indices = feature_take_indices(len(self.feature_info), msfa_indices)[0]
self.msfa_in_chs = sum([self.feature_info[mi]['num_chs'] for mi in self.msfa_indices])
self.msfa_output_resolution = msfa_output_resolution
self.msfa = MobileNetV5MultiScaleFusionAdapter(
in_chs=self.msfa_in_chs,
out_chs=self.num_features,
output_resolution=self.msfa_output_resolution,
norm_layer=norm_layer,
act_layer=act_layer,
)
efficientnet_init_weights(self)
def forward_intermediates(
self,
x: torch.Tensor,
indices: Optional[Union[int, List[int]]] = None,
norm: bool = False,
stop_early: bool = False,
output_fmt: str = 'NCHW',
intermediates_only: bool = False,
extra_blocks: bool = False,
) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
""" Forward features that returns intermediates.
Args:
x: Input image tensor
indices: Take last n blocks if int, all if None, select matching indices if sequence
norm: (Unused) Applies norm layer to compatible intermediates
stop_early: Stop iterating over blocks when last desired intermediate hit
output_fmt: Shape of intermediate feature outputs
intermediates_only: Only return intermediate features
            extra_blocks: Include outputs of all blocks, indexed by block rather than stage (does not align with feature_info)
        Returns:
            List of intermediate features, or a tuple of (MSFA output, list of intermediates).
        """
del norm
assert output_fmt in ('NCHW',), 'Output shape must be NCHW.'
if stop_early:
assert intermediates_only, 'Must use intermediates_only for early stopping.'
        # MobileNet v5's MultiScaleFusionAdapter takes intermediates from specific feature indices and uses them in
# its computation. These MSFA indices are not guaranteed to be captured by the `indices` parameter passed to
# this function, so we accumulate two sets of indices, one that aligns with the `indices` parameter and one
# that is required by the MSFA block.
intermediates = []
msfa_intermediates = []
if extra_blocks:
take_indices, max_index = feature_take_indices(len(self.blocks) + 1, indices)
else:
take_indices, max_index = feature_take_indices(len(self.stage_ends), indices)
take_indices = [self.stage_ends[i] for i in take_indices]
max_index = self.stage_ends[max_index]
# forward pass
feat_idx = 0 # stem is index 0
x = self.conv_stem(x)
if feat_idx in take_indices:
intermediates.append(x)
if feat_idx in self.msfa_indices:
msfa_intermediates.append(x)
if torch.jit.is_scripting() or not stop_early: # can't slice blocks in torchscript
blocks = self.blocks
else:
blocks = self.blocks[:max_index]
for blk in blocks:
feat_idx += 1
x = blk(x)
if feat_idx in take_indices:
intermediates.append(x)
if feat_idx in self.msfa_indices:
msfa_intermediates.append(x)
if intermediates_only:
return intermediates
return self.msfa(msfa_intermediates), intermediates
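    # Example (sketch, assuming a built encoder `enc`): unlike the intermediates_only
    # path, the two-tuple form also runs the MSFA neck over its own msfa_indices.
    #
    #   fused, inters = enc.forward_intermediates(x, indices=2)
    #   # fused: [B, 2048, 16, 16] MSFA output, inters: outputs of the last two stages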
def forward_features(self, x: torch.Tensor) -> torch.Tensor:
feat_idx = 0 # offset by one from blocks index due to stem feature
intermediates = []
x = self.conv_stem(x)
if feat_idx in self.msfa_indices:
intermediates.append(x)
for blk in self.blocks:
feat_idx += 1
            # checkpoint each block to trade compute for memory when grad checkpointing enabled
            if self.grad_checkpointing and not torch.jit.is_scripting():
                x = checkpoint(blk, x)
            else:
                x = blk(x)
if feat_idx in self.msfa_indices:
intermediates.append(x)
return self.msfa(intermediates)
def forward_head(self, x: torch.Tensor) -> torch.Tensor:
raise NotImplementedError("MobileNetV5Encoder does not support classification use cases.")
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.forward_features(x)
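# Example (a minimal sketch): the encoder variant returns the fused MSFA feature map
# rather than logits; at the default 768x768 config resolution:
#
#   import timm
#   enc = timm.create_model('mobilenetv5_300m_enc')
#   out = enc(torch.randn(1, 3, 768, 768))  # -> torch.Size([1, 2048, 16, 16])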
def _create_mnv5_encoder(variant: str, pretrained: bool = False, **kwargs) -> MobileNetV5Encoder:
out_indices = kwargs.pop('out_indices', (0, 1, 2, 3, 4))
feature_cfg = dict(out_indices=out_indices, feature_cls='getter')
kwargs_filter = (
'num_classes',
'num_features',
'head_conv',
'head_bias',
'head_norm',
'global_pool',
)
model = build_model_with_cfg(
MobileNetV5Encoder,
variant,
pretrained,
pretrained_strict=False,
feature_cfg=feature_cfg,
kwargs_filter=kwargs_filter,
**kwargs,
)
return model
def _create_mnv5(variant: str, pretrained: bool = False, **kwargs) -> MobileNetV5:
out_indices = kwargs.pop('out_indices', (0, 1, 2, 3, 4))
feature_cfg = dict(out_indices=out_indices, feature_cls='getter')
model = build_model_with_cfg(
MobileNetV5,
variant,
pretrained,
pretrained_strict=False,
feature_cfg=feature_cfg,
**kwargs,
)
return model
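# The arch definition strings below follow the decode_arch_def conventions shared with
# the timm EfficientNet family, roughly:
#   er = edge residual, uir = universal inverted residual, mqa = mobile multi-query attention
#   r<n> repeats, a<n>/k<n> start/mid dw kernel sizes, s<n> stride, e<n> expansion ratio,
#   c<n> output channels, h<n> attention heads, d<n> head dim, v<n> kv stride
# e.g. 'uir_r1_a3_k5_s2_e6_c256' is one UIR block: 3x3 start dw, 5x5 mid dw, stride 2,
# expansion 6, 256 output channels.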
def _gen_mobilenet_v5(
        variant: str,
        channel_multiplier: float = 1.0,
        group_size: Optional[int] = None,
        pretrained: bool = False,
        encoder: bool = False,
        **kwargs,
) -> Union[MobileNetV5, MobileNetV5Encoder]:
if 'mobilenetv5_base' in variant:
        arch_def: List[List[str]] = [
            # Stage 0: c128 edge residual blocks
            [
                'er_r1_k3_s2_e4_c128',
                'er_r1_k3_s1_e4_c128',
                'er_r1_k3_s1_e4_c128',
            ],
            # Stage 1: c256 universal inverted residual blocks
            [
                'uir_r1_a3_k5_s2_e6_c256',
                'uir_r1_a5_k0_s1_e4_c256',
                'uir_r1_a3_k0_s1_e4_c256',
                'uir_r1_a5_k0_s1_e4_c256',
                'uir_r1_a3_k0_s1_e4_c256',
            ],
            # Stage 2: c512 UIR + mobile MQA blocks
            [
                'uir_r1_a5_k5_s2_e6_c512',
                'uir_r1_a5_k0_s1_e4_c512',
                'uir_r1_a5_k0_s1_e4_c512',
                'uir_r1_a0_k0_s1_e1_c512',
                'mqa_r1_k3_h8_s2_d64_c512',
                'uir_r1_a0_k0_s1_e2_c512',
                'mqa_r1_k3_h8_s2_d64_c512',
                'uir_r1_a0_k0_s1_e2_c512',
                'mqa_r1_k3_h8_s2_d64_c512',
                'uir_r1_a0_k0_s1_e2_c512',
                'mqa_r1_k3_h8_s2_d64_c512',
                'uir_r1_a0_k0_s1_e2_c512',
                'mqa_r1_k3_h8_s2_d64_c512',
                'uir_r1_a0_k0_s1_e2_c512',
                'mqa_r1_k3_h8_s2_d64_c512',
                'uir_r1_a0_k0_s1_e2_c512',
            ],
            # Stage 3: c1024 UIR + mobile MQA blocks
            [
                'uir_r1_a5_k5_s2_e6_c1024',
                'mqa_r1_k3_h16_s1_d64_c1024',
                'uir_r1_a0_k0_s1_e2_c1024',
                'mqa_r1_k3_h16_s1_d64_c1024',
                'uir_r1_a0_k0_s1_e2_c1024',
                'mqa_r1_k3_h16_s1_d64_c1024',
                'uir_r1_a0_k0_s1_e2_c1024',
                'mqa_r1_k3_h16_s1_d64_c1024',
                'uir_r1_a0_k0_s1_e2_c1024',
                'mqa_r1_k3_h16_s1_d64_c1024',
                'uir_r1_a0_k0_s1_e2_c1024',
                'mqa_r1_k3_h16_s1_d64_c1024',
                'uir_r1_a0_k0_s1_e2_c1024',
                'mqa_r1_k3_h16_s1_d64_c1024',
                'uir_r1_a0_k0_s1_e2_c1024',
            ],
        ]
else:
        arch_def: List[List[str]] = [
            # Stage 0: c128 edge residual blocks
            [
                'er_r1_k3_s2_e4_c128',
                'er_r1_k3_s1_e4_c128',
                'er_r1_k3_s1_e4_c128',
            ],
            # Stage 1: c256 universal inverted residual blocks
            [
                'uir_r1_a3_k5_s2_e6_c256',
                'uir_r1_a5_k0_s1_e4_c256',
                'uir_r1_a3_k0_s1_e4_c256',
                'uir_r1_a5_k0_s1_e4_c256',
                'uir_r1_a3_k0_s1_e4_c256',
            ],
            # Stage 2: c640 UIR + mobile MQA (kv-stride 2) blocks
            [
                'uir_r1_a5_k5_s2_e6_c640',
                'uir_r1_a5_k0_s1_e4_c640',
                'uir_r1_a5_k0_s1_e4_c640',
                'uir_r1_a5_k0_s1_e4_c640',
                'uir_r1_a5_k0_s1_e4_c640',
                'uir_r1_a5_k0_s1_e4_c640',
                'uir_r1_a5_k0_s1_e4_c640',
                'uir_r1_a5_k0_s1_e4_c640',
                'uir_r1_a0_k0_s1_e1_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
                'mqa_r1_k3_h12_v2_s1_d64_c640',
                'uir_r1_a0_k0_s1_e2_c640',
            ],
            # Stage 3: c1280 UIR + mobile MQA blocks
            [
                'uir_r1_a5_k5_s2_e6_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
                'mqa_r1_k3_h16_s1_d96_c1280',
                'uir_r1_a0_k0_s1_e2_c1280',
            ],
        ]
model_kwargs = dict(
block_args=decode_arch_def(arch_def, group_size=group_size),
stem_size=64,
fix_stem=channel_multiplier < 1.0,
round_chs_fn=partial(round_channels, multiplier=channel_multiplier),
norm_layer=RmsNorm2d,
act_layer=nn.GELU,
layer_scale_init_value=1e-5,
)
model_kwargs = dict(model_kwargs, **kwargs)
if encoder:
model = _create_mnv5_encoder(variant, pretrained, **model_kwargs)
else:
model = _create_mnv5(variant, pretrained, **model_kwargs)
return model
def _cfg(url: str = '', **kwargs):
return {
'url': url, 'num_classes': 1000, 'input_size': (3, 256, 256), 'pool_size': (16, 16),
'crop_pct': 1.0, 'interpolation': 'bicubic',
'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD,
'first_conv': 'conv_stem.conv', 'classifier': 'classifier',
**kwargs
}
default_cfgs = generate_default_cfgs({
# encoder-only configs
'mobilenetv5_300m_enc': _cfg(
        # hf_hub_id='timm/',
input_size=(3, 768, 768),
num_classes=0),
# WIP classification configs for testing
'mobilenetv5_300m': _cfg(
# hf_hub_id='timm/',
input_size=(3, 768, 768),
num_classes=0),
'mobilenetv5_base.untrained': _cfg(
# hf_hub_id='timm/',
num_classes=1000)
})
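# Example (sketch): the mean/std/input_size in these cfgs can be resolved into eval
# transforms via the timm data helpers.
#
#   from timm.data import resolve_data_config, create_transform
#   model = timm.create_model('mobilenetv5_300m_enc')
#   transform = create_transform(**resolve_data_config(model=model))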
@register_model
def mobilenetv5_300m_enc(pretrained: bool = False, **kwargs) -> MobileNetV5Encoder:
"""MobileNet V5 Vision Encoder"""
pad_type = kwargs.pop('pad_type', 'same')
model = _gen_mobilenet_v5(
'mobilenetv5_300m_enc',
pretrained=pretrained,
encoder=True,
pad_type=pad_type,
**kwargs,
)
return model
@register_model
def mobilenetv5_300m(pretrained: bool = False, **kwargs) -> MobileNetV5:
model = _gen_mobilenet_v5('mobilenetv5_300m', pretrained=pretrained, **kwargs)
return model
@register_model
def mobilenetv5_base(pretrained: bool = False, **kwargs) -> MobileNetV5:
model = _gen_mobilenet_v5('mobilenetv5_base', pretrained=pretrained, **kwargs)
return model