def get_vae_config()

in scripts/convert_ltx_to_diffusers.py [0:0]


def get_vae_config(version: str) -> Dict[str, Any]:
    """Return the VAE configuration dict for the given checkpoint version.

    Supported versions are "0.9.0", "0.9.1", "0.9.5" and "0.9.7". The
    "0.9.5" and "0.9.7" versions share exactly the same configuration.

    Side effects:
        For "0.9.1" the module-level ``VAE_KEYS_RENAME_DICT`` is updated
        in place with ``VAE_091_RENAME_DICT``; for "0.9.5" and "0.9.7" it is
        updated with ``VAE_095_RENAME_DICT``. "0.9.0" leaves it untouched.

    Args:
        version: Version string selecting which configuration to build.

    Returns:
        A freshly built dict of configuration values for that version.

    Raises:
        ValueError: If ``version`` is not one of the supported versions.
    """
    if version == "0.9.0":
        config = {
            "in_channels": 3,
            "out_channels": 3,
            "latent_channels": 128,
            "block_out_channels": (128, 256, 512, 512),
            "down_block_types": (
                "LTXVideoDownBlock3D",
                "LTXVideoDownBlock3D",
                "LTXVideoDownBlock3D",
                "LTXVideoDownBlock3D",
            ),
            "decoder_block_out_channels": (128, 256, 512, 512),
            "layers_per_block": (4, 3, 3, 3, 4),
            "decoder_layers_per_block": (4, 3, 3, 3, 4),
            "spatio_temporal_scaling": (True, True, True, False),
            "decoder_spatio_temporal_scaling": (True, True, True, False),
            "decoder_inject_noise": (False, False, False, False, False),
            "downsample_type": ("conv", "conv", "conv", "conv"),
            "upsample_residual": (False, False, False, False),
            "upsample_factor": (1, 1, 1, 1),
            "patch_size": 4,
            "patch_size_t": 1,
            "resnet_norm_eps": 1e-6,
            "scaling_factor": 1.0,
            "encoder_causal": True,
            "decoder_causal": False,
            "timestep_conditioning": False,
        }
    elif version == "0.9.1":
        config = {
            "in_channels": 3,
            "out_channels": 3,
            "latent_channels": 128,
            "block_out_channels": (128, 256, 512, 512),
            "down_block_types": (
                "LTXVideoDownBlock3D",
                "LTXVideoDownBlock3D",
                "LTXVideoDownBlock3D",
                "LTXVideoDownBlock3D",
            ),
            "decoder_block_out_channels": (256, 512, 1024),
            "layers_per_block": (4, 3, 3, 3, 4),
            "decoder_layers_per_block": (5, 6, 7, 8),
            "spatio_temporal_scaling": (True, True, True, False),
            "decoder_spatio_temporal_scaling": (True, True, True),
            "decoder_inject_noise": (True, True, True, False),
            "downsample_type": ("conv", "conv", "conv", "conv"),
            "upsample_residual": (True, True, True),
            "upsample_factor": (2, 2, 2),
            "timestep_conditioning": True,
            "patch_size": 4,
            "patch_size_t": 1,
            "resnet_norm_eps": 1e-6,
            "scaling_factor": 1.0,
            "encoder_causal": True,
            "decoder_causal": False,
        }
        # This version needs extra key renames on top of the base mapping.
        VAE_KEYS_RENAME_DICT.update(VAE_091_RENAME_DICT)
    elif version in ("0.9.5", "0.9.7"):
        # Both versions use the same configuration and the same key renames,
        # so they are handled by a single branch to keep them in sync.
        config = {
            "in_channels": 3,
            "out_channels": 3,
            "latent_channels": 128,
            "block_out_channels": (128, 256, 512, 1024, 2048),
            "down_block_types": (
                "LTXVideo095DownBlock3D",
                "LTXVideo095DownBlock3D",
                "LTXVideo095DownBlock3D",
                "LTXVideo095DownBlock3D",
            ),
            "decoder_block_out_channels": (256, 512, 1024),
            "layers_per_block": (4, 6, 6, 2, 2),
            "decoder_layers_per_block": (5, 5, 5, 5),
            "spatio_temporal_scaling": (True, True, True, True),
            "decoder_spatio_temporal_scaling": (True, True, True),
            "decoder_inject_noise": (False, False, False, False),
            "downsample_type": ("spatial", "temporal", "spatiotemporal", "spatiotemporal"),
            "upsample_residual": (True, True, True),
            "upsample_factor": (2, 2, 2),
            "timestep_conditioning": True,
            "patch_size": 4,
            "patch_size_t": 1,
            "resnet_norm_eps": 1e-6,
            "scaling_factor": 1.0,
            "encoder_causal": True,
            "decoder_causal": False,
            "spatial_compression_ratio": 32,
            "temporal_compression_ratio": 8,
        }
        VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
    else:
        # Without this branch `config` would be unbound and the return below
        # would fail with an uninformative UnboundLocalError.
        raise ValueError(
            f"Unsupported VAE version: {version!r}. "
            "Supported versions are '0.9.0', '0.9.1', '0.9.5' and '0.9.7'."
        )
    return config