in pytorchvideo/models/vision_transformers.py
def create_multiscale_vision_transformers(
*,
spatial_size: _size_2_t,
temporal_size: int,
cls_embed_on: bool = True,
sep_pos_embed: bool = True,
depth: int = 16,
norm: str = "layernorm",
# Patch embed config.
enable_patch_embed: bool = True,
input_channels: int = 3,
patch_embed_dim: int = 96,
conv_patch_embed_kernel: Tuple[int, int, int] = (3, 7, 7),
conv_patch_embed_stride: Tuple[int, int, int] = (2, 4, 4),
conv_patch_embed_padding: Tuple[int, int, int] = (1, 3, 3),
enable_patch_embed_norm: bool = False,
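# If use_2d_patch is True, the patch embed is a 2D conv applied per frame;
# otherwise it is a 3D conv over (T, H, W) using the kernel/stride/padding above.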
use_2d_patch: bool = False,
# Attention block config.
num_heads: int = 1,
mlp_ratio: float = 4.0,
qkv_bias: bool = True,
dropout_rate_block: float = 0.0,
droppath_rate_block: float = 0.0,
pooling_mode: str = "conv",
pool_first: bool = False,
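# Per-layer overrides below are lists keyed by layer index i:
# embed_dim_mul / atten_head_mul use [[i, multiplier], ...];
# pool_q_stride_size / pool_kv_stride_size use [[i, stride_t, stride_h, stride_w], ...].
# If pool_kv_stride_adaptive is given, pool_kv_stride_size is ignored.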
embed_dim_mul: Optional[List[List[int]]] = None,
atten_head_mul: Optional[List[List[int]]] = None,
pool_q_stride_size: Optional[List[List[int]]] = None,
pool_kv_stride_size: Optional[List[List[int]]] = None,
pool_kv_stride_adaptive: Optional[_size_3_t] = None,
pool_kvq_kernel: Optional[_size_3_t] = None,
# Head config.
head: Optional[Callable] = create_vit_basic_head,
head_dropout_rate: float = 0.5,
head_activation: Optional[Callable] = None,
head_num_classes: int = 400,
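# Example usage (a sketch, not canonical defaults): an MViT-B-style call.
# The multiplier/stride values below roughly follow the published MViT-B 16x4
# configuration and are illustrative only.
# model = create_multiscale_vision_transformers(
#     spatial_size=(224, 224),
#     temporal_size=16,
#     depth=16,
#     embed_dim_mul=[[1, 2.0], [3, 2.0], [14, 2.0]],
#     atten_head_mul=[[1, 2.0], [3, 2.0], [14, 2.0]],
#     pool_q_stride_size=[[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]],
#     pool_kv_stride_adaptive=(1, 8, 8),
#     pool_kvq_kernel=(3, 3, 3),
#     head_num_classes=400,
# )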