in pytorchvideo/models/audio_visual_slowfast.py [0:0]
def create_audio_visual_slowfast(
*,
# SlowFast configs.
slowfast_channel_reduction_ratio: Tuple[int] = (8, 2),
slowfast_conv_channel_fusion_ratio: int = 2,
fusion_builder: Callable[
[int, int], nn.Module
] = None, # Args: fusion_dim_in, stage_idx
# Input clip configs.
input_channels: Tuple[int] = (3, 3, 1),
# Model configs.
model_depth: int = 50,
model_num_class: int = 400,
dropout_rate: float = 0.5,
# Normalization configs.
norm: Callable = nn.BatchNorm3d,
# Activation configs.
activation: Callable = nn.ReLU,
# Stem configs.
stem_dim_outs: Tuple[int] = (64, 8, 32),
stem_conv_kernel_sizes: Tuple[Tuple[int]] = ((1, 7, 7), (5, 7, 7), (9, 1, 9)),
stem_conv_strides: Tuple[Tuple[int]] = ((1, 2, 2), (1, 2, 2), (1, 1, 1)),
stem_pool: Tuple[Callable] = (nn.MaxPool3d, nn.MaxPool3d, None),
stem_pool_kernel_sizes: Tuple[Tuple[int]] = ((1, 3, 3), (1, 3, 3), (1, 3, 3)),
stem_pool_strides: Tuple[Tuple[int]] = ((1, 2, 2), (1, 2, 2), (1, 1, 1)),
# Stage configs.
stage_conv_a_kernel_sizes: Tuple[Tuple[Tuple[int]]] = (
((1, 1, 1), (1, 1, 1), (3, 1, 1), (3, 1, 1)),
((3, 1, 1), (3, 1, 1), (3, 1, 1), (3, 1, 1)),
((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
),
stage_conv_b_kernel_sizes: Tuple[Tuple[Tuple[int]]] = (
((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
((3, 1, 3), (3, 1, 3), (3, 1, 3), (3, 1, 3)),
),
stage_conv_b_num_groups: Tuple[Tuple[int]] = (
(1, 1, 1, 1),
(1, 1, 1, 1),
(1, 1, 1, 1),
),
stage_conv_b_dilations: Tuple[Tuple[Tuple[int]]] = (
((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
),
stage_spatial_strides: Tuple[Tuple[int]] = (
(1, 2, 2, 2),
(1, 2, 2, 2),
(1, 2, 2, 2),
),
stage_temporal_strides: Tuple[Tuple[int]] = (
(1, 1, 1, 1),
(1, 1, 1, 1),
(1, 2, 2, 2),
),
bottleneck: Tuple[Tuple[Callable]] = (
(
create_bottleneck_block,
create_bottleneck_block,
create_bottleneck_block,
create_bottleneck_block,
),
(
create_bottleneck_block,
create_bottleneck_block,
create_bottleneck_block,
create_bottleneck_block,
),
(
create_acoustic_bottleneck_block,
create_acoustic_bottleneck_block,
create_bottleneck_block,
create_bottleneck_block,
),
),
# Head configs.
head_pool: Callable = nn.AvgPool3d,
head_pool_kernel_sizes: Tuple[Tuple[int]] = ((8, 7, 7), (32, 7, 7), (16, 1, 10)),
head_output_size: Tuple[int] = (1, 1, 1),
head_activation: Callable = None,
head_output_with_global_average: bool = True,