Function `create_audio_visual_slowfast()`, defined in pytorchvideo/models/audio_visual_slowfast.py [0:0]:


def create_audio_visual_slowfast(
    *,
    # SlowFast configs.
    slowfast_channel_reduction_ratio: Tuple[int] = (8, 2),
    slowfast_conv_channel_fusion_ratio: int = 2,
    fusion_builder: Callable[
        [int, int], nn.Module
    ] = None,  # Args: fusion_dim_in, stage_idx
    # Input clip configs.
    input_channels: Tuple[int] = (3, 3, 1),
    # Model configs.
    model_depth: int = 50,
    model_num_class: int = 400,
    dropout_rate: float = 0.5,
    # Normalization configs.
    norm: Callable = nn.BatchNorm3d,
    # Activation configs.
    activation: Callable = nn.ReLU,
    # Stem configs.
    stem_dim_outs: Tuple[int] = (64, 8, 32),
    stem_conv_kernel_sizes: Tuple[Tuple[int]] = ((1, 7, 7), (5, 7, 7), (9, 1, 9)),
    stem_conv_strides: Tuple[Tuple[int]] = ((1, 2, 2), (1, 2, 2), (1, 1, 1)),
    stem_pool: Tuple[Callable] = (nn.MaxPool3d, nn.MaxPool3d, None),
    stem_pool_kernel_sizes: Tuple[Tuple[int]] = ((1, 3, 3), (1, 3, 3), (1, 3, 3)),
    stem_pool_strides: Tuple[Tuple[int]] = ((1, 2, 2), (1, 2, 2), (1, 1, 1)),
    # Stage configs.
    stage_conv_a_kernel_sizes: Tuple[Tuple[Tuple[int]]] = (
        ((1, 1, 1), (1, 1, 1), (3, 1, 1), (3, 1, 1)),
        ((3, 1, 1), (3, 1, 1), (3, 1, 1), (3, 1, 1)),
        ((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
    ),
    stage_conv_b_kernel_sizes: Tuple[Tuple[Tuple[int]]] = (
        ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
        ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
        ((3, 1, 3), (3, 1, 3), (3, 1, 3), (3, 1, 3)),
    ),
    stage_conv_b_num_groups: Tuple[Tuple[int]] = (
        (1, 1, 1, 1),
        (1, 1, 1, 1),
        (1, 1, 1, 1),
    ),
    stage_conv_b_dilations: Tuple[Tuple[Tuple[int]]] = (
        ((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
        ((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
        ((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
    ),
    stage_spatial_strides: Tuple[Tuple[int]] = (
        (1, 2, 2, 2),
        (1, 2, 2, 2),
        (1, 2, 2, 2),
    ),
    stage_temporal_strides: Tuple[Tuple[int]] = (
        (1, 1, 1, 1),
        (1, 1, 1, 1),
        (1, 2, 2, 2),
    ),
    bottleneck: Tuple[Tuple[Callable]] = (
        (
            create_bottleneck_block,
            create_bottleneck_block,
            create_bottleneck_block,
            create_bottleneck_block,
        ),
        (
            create_bottleneck_block,
            create_bottleneck_block,
            create_bottleneck_block,
            create_bottleneck_block,
        ),
        (
            create_acoustic_bottleneck_block,
            create_acoustic_bottleneck_block,
            create_bottleneck_block,
            create_bottleneck_block,
        ),
    ),
    # Head configs.
    head_pool: Callable = nn.AvgPool3d,
    head_pool_kernel_sizes: Tuple[Tuple[int]] = ((8, 7, 7), (32, 7, 7), (16, 1, 10)),
    head_output_size: Tuple[int] = (1, 1, 1),
    head_activation: Callable = None,
    head_output_with_global_average: bool = True,