threestudio/models/guidance/zero123_unified_guidance.py
def configure(self) -> None:
    self.min_step: Optional[int] = None
    self.max_step: Optional[int] = None
    self.grad_clip_val: Optional[float] = None
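    # NOTE: min_step/max_step bound the sampled diffusion timesteps; they are
    # presumably filled in later (e.g., by a timestep-annealing schedule in
    # update_step) rather than at configure time.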

    @dataclass
    class NonTrainableModules:
        pipe: Zero123Pipeline
        pipe_phi: Optional[Zero123Pipeline] = None
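
    # The frozen pipelines are kept in a plain dataclass (stored on
    # self._non_trainable_modules below) rather than as attributes of this
    # nn.Module, so their weights are not registered as parameters or
    # included in this module's state_dict.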
    self.weights_dtype = (
        torch.float16 if self.cfg.half_precision_weights else torch.float32
    )

    threestudio.info("Loading Zero123 ...")

    # need to make sure the pipeline file is on the import path
    sys.path.append("extern/")

    pipe_kwargs = {
        "safety_checker": None,
        "requires_safety_checker": False,
        "variant": "fp16" if self.cfg.half_precision_weights else None,
        "torch_dtype": self.weights_dtype,
        "cache_dir": self.cfg.cache_dir,
        "local_files_only": self.cfg.local_files_only,
    }
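    # variant="fp16" selects the half-precision weight files when available,
    # and local_files_only skips network access and loads only from the local
    # cache when set; the safety checker is disabled outright.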
    pipe = Zero123Pipeline.from_pretrained(
        self.cfg.pretrained_model_name_or_path,
        **pipe_kwargs,
    ).to(self.device)
    self.prepare_pipe(pipe)
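    # prepare_pipe (defined elsewhere in this class) presumably applies the
    # usual memory optimizations and freezes/eval-modes the pipeline modules;
    # its exact behavior is not shown in this excerpt.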

    # phi network for VSD
    # introduces two trainable modules:
    # - self.camera_embedding
    # - self.lora_layers
    pipe_phi = None

    # if the phi network shares the same unet as the pretrained network,
    # we need to pass additional cross-attention kwargs to the unet
    self.vsd_share_model = (
        self.cfg.guidance_type == "vsd"
        and self.cfg.vsd_phi_model_name_or_path is None
    )
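    # When the UNet is shared, the same weights serve both roles: the LoRA
    # residual is toggled per forward pass via cross_attention_kwargs (e.g.
    # {"scale": 0.0} to recover the frozen pretrained score, a non-zero scale
    # for the phi network). The exact kwargs are passed at the call sites,
    # not here.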
    if self.cfg.guidance_type == "vsd":
        if self.cfg.vsd_phi_model_name_or_path is None:
            pipe_phi = pipe
        else:
            pipe_phi = Zero123Pipeline.from_pretrained(
                self.cfg.vsd_phi_model_name_or_path,
                **pipe_kwargs,
            ).to(self.device)
            self.prepare_pipe(pipe_phi)

        # set up camera embedding
        if self.cfg.vsd_use_camera_condition:
            if self.cfg.vsd_camera_condition_type in ["extrinsics", "mvp"]:
                self.camera_embedding_dim = 16
            elif self.cfg.vsd_camera_condition_type == "spherical":
                self.camera_embedding_dim = 4
            else:
                raise ValueError("Invalid camera condition type!")
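            # 16 = a flattened 4x4 extrinsics / model-view-projection matrix;
            # 4 presumably matches Zero123's native spherical conditioning
            # (elevation, sin/cos azimuth, distance).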
            # FIXME: hard-coded output dim
            self.camera_embedding = ToDTypeWrapper(
                TimestepEmbedding(self.camera_embedding_dim, 1280),
                self.weights_dtype,
            ).to(self.device)
            pipe_phi.unet.class_embedding = self.camera_embedding
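            # UNet2DConditionModel adds class_embedding(class_labels) to the
            # timestep embedding, so this injects the camera pose into every
            # residual block; 1280 is the SD-style UNet's time-embedding dim
            # (4 * block_out_channels[0]), hence the FIXME about hard-coding.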

        if self.cfg.vsd_use_lora:
            # set up LoRA layers
            lora_attn_procs = {}
            for name in pipe_phi.unet.attn_processors.keys():
                cross_attention_dim = (
                    None
                    if name.endswith("attn1.processor")
                    else pipe_phi.unet.config.cross_attention_dim
                )
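                # attn1 is self-attention (keys/values come from the image
                # features themselves, so no cross-attention dim); attn2
                # cross-attends to the conditioning embeddings.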
if name.startswith("mid_block"):
hidden_size = pipe_phi.unet.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
hidden_size = list(
reversed(pipe_phi.unet.config.block_out_channels)
)[block_id]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = pipe_phi.unet.config.block_out_channels[block_id]
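                # block_out_channels runs in encoder order, so up_blocks index
                # into the reversed list; taking a single character for the
                # block id is safe because SD-style UNets have fewer than 10
                # blocks per side.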
                lora_attn_procs[name] = LoRAAttnProcessor(
                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
                )

            pipe_phi.unet.set_attn_processor(lora_attn_procs)

            self.lora_layers = AttnProcsLayers(pipe_phi.unet.attn_processors).to(
                self.device
            )
            self.lora_layers._load_state_dict_pre_hooks.clear()
            self.lora_layers._state_dict_hooks.clear()
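            # AttnProcsLayers wraps the processor dict in an nn.Module so an
            # optimizer can reach the LoRA parameters; its built-in state-dict
            # hooks remap keys to diffusers' attn-processor naming, which
            # presumably conflicts with the outer checkpointing here, so they
            # are cleared to keep plain module key names.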
threestudio.info(f"Loaded Stable Diffusion!")
self.scheduler = DDPMScheduler.from_config(pipe.scheduler.config)
self.num_train_timesteps = self.scheduler.config.num_train_timesteps

    # q(z_t|x) = N(alpha_t x, sigma_t^2 I)
    # in DDPM, alpha_t = sqrt(alphas_cumprod_t), sigma_t^2 = 1 - alphas_cumprod_t
    self.alphas_cumprod: Float[Tensor, "T"] = self.scheduler.alphas_cumprod.to(
        self.device
    )
    self.alphas: Float[Tensor, "T"] = self.alphas_cumprod**0.5
    self.sigmas: Float[Tensor, "T"] = (1 - self.alphas_cumprod) ** 0.5
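    # i.e. a forward-process sample is z_t = alphas[t] * x + sigmas[t] * eps,
    # with eps ~ N(0, I).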
    # sigma_t / alpha_t = SNR_t^(-1/2) (despite the name, not the log-SNR)
    self.lambdas: Float[Tensor, "T"] = self.sigmas / self.alphas

    self._non_trainable_modules = NonTrainableModules(
        pipe=pipe,
        pipe_phi=pipe_phi,
    )
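    # The pipelines are presumably exposed elsewhere through properties such
    # as self.pipe / self.pipe_phi that read from this container.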

    # populates self.clip_image_embeddings and self.image_latents
    self.prepare_image_embeddings()
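    # Zero123 conditions on the reference image in two ways: its CLIP image
    # embedding feeds the UNet's cross-attention, and its VAE latents are
    # concatenated channel-wise with the noisy latents at the UNet input.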