threestudio/models/guidance/stable_diffusion_bsd_guidance.py [761:927]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        )

        w = (1 - self.alphas[t]).view(-1, 1, 1, 1)

        grad = w * (noise_pred_pretrain - noise_pred_est)
        return grad

    def compute_grad_vsd_hifa(
        self,
        latents: Float[Tensor, "B 4 64 64"],
        text_embeddings_vd: Float[Tensor, "BB 77 768"],
        text_embeddings: Float[Tensor, "BB 77 768"],
        camera_condition: Float[Tensor, "B 4 4"],
        mask=None,
    ):
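        # HiFA-style variant of the VSD gradient: instead of a single-step score
        # difference, run a short denoising loop with both the pretrained UNet and
        # the LoRA UNet, then regress the rendered latents toward the detached
        # target latents + (x_pretrained - x_lora).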
        B, _, DH, DW = latents.shape
        rgb = self.decode_latents(latents)
        self.name = "hifa"

        if mask is not None:
            mask = F.interpolate(mask, (DH, DW), mode="bilinear", antialias=True)
        with torch.no_grad():
            # sample a random timestep
            t = torch.randint(
                self.min_step,
                self.max_step + 1,
                [B],
                dtype=torch.long,
                device=self.device,
            )
            w = (1 - self.alphas[t]).view(-1, 1, 1, 1)
            # add noise
            noise = torch.randn_like(latents)
            latents_noisy = self.scheduler_sample.add_noise(latents, noise, t)
            latents_noisy_lora = self.scheduler_lora_sample.add_noise(latents, noise, t)
            # pred noise: run the remaining denoising steps from timestep t back to 0
            # NOTE: t.item() assumes a batch size of 1
            self.scheduler_sample.config.num_train_timesteps = t.item()
            # roughly one sampling step per 50 diffusion timesteps
            self.scheduler_sample.set_timesteps(t.item() // 50 + 1)
            self.scheduler_lora_sample.config.num_train_timesteps = t.item()
            self.scheduler_lora_sample.set_timesteps(t.item() // 50 + 1)

            for i, timestep in enumerate(self.scheduler_sample.timesteps):     
            # for i, timestep in tqdm(enumerate(self.scheduler.timesteps)):   
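                # denoise the pretrained-branch and LoRA-branch latents in parallel
                # (inputs are duplicated for classifier-free guidance: text / unconditional)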
                latent_model_input = torch.cat([latents_noisy] * 2, dim=0)
                latent_model_input_lora = torch.cat([latents_noisy_lora] * 2, dim=0)

                # print(latent_model_input.shape)
                with self.disable_unet_class_embedding(self.unet) as unet:
                    cross_attention_kwargs = {"scale": 0.0} if self.single_model else None
                    noise_pred_pretrain = self.forward_unet(
                        unet,
                        latent_model_input,
                        timestep,
                        encoder_hidden_states=text_embeddings_vd,
                        cross_attention_kwargs=cross_attention_kwargs,
                    )

                # use view-independent text embeddings in LoRA
                noise_pred_est = self.forward_unet(
                    self.unet_lora,
                    latent_model_input_lora,
                    timestep,
                    encoder_hidden_states=text_embeddings,
                    class_labels=torch.cat(
                        [
                            camera_condition.view(B, -1),
                            torch.zeros_like(camera_condition.view(B, -1)),
                        ],
                        dim=0,
                    ),
                    cross_attention_kwargs={"scale": 1.0},
                )

                (
                    noise_pred_pretrain_text,
                    noise_pred_pretrain_uncond,
                ) = noise_pred_pretrain.chunk(2)

                # NOTE: guidance scale definition here is aligned with diffusers, but different from other guidance
                noise_pred_pretrain = noise_pred_pretrain_uncond + self.cfg.guidance_scale * (
                    noise_pred_pretrain_text - noise_pred_pretrain_uncond
                )
                if mask is not None:
                    # outside the mask, fall back to the originally injected noise
                    noise_pred_pretrain = mask * noise_pred_pretrain + (1 - mask) * noise

                (
                    noise_pred_est_text,
                    noise_pred_est_uncond,
                ) = noise_pred_est.chunk(2)

                # NOTE: guidance scale definition here is aligned with diffusers, but different from other guidance
                # noise_pred_est = noise_pred_est_uncond + self.cfg.guidance_scale_lora * (
                #     noise_pred_est_text - noise_pred_est_uncond
                # )
                # CFG is disabled for the LoRA branch; use the conditional prediction directly
                noise_pred_est = noise_pred_est_text
                if mask is not None:
                    noise_pred_est = mask * noise_pred_est + (1 - mask) * noise

                latents_noisy = self.scheduler_sample.step(noise_pred_pretrain, timestep, latents_noisy).prev_sample
                latents_noisy_lora = self.scheduler_lora_sample.step(noise_pred_est, timestep, latents_noisy_lora).prev_sample

                # noise = torch.randn_like(latents)
                # latents_noisy = self.scheduler.step(noise_pred_pretrain, timestep, latents_noisy).prev_sample
                # latents_noisy = mask * latents_noisy + (1-mask) * latents
                # latents_noisy = self.scheduler_sample.add_noise(latents_noisy, noise, timestep)

                # latents_noisy_lora = self.scheduler_lora.step(noise_pred_est, timestep, latents_noisy_lora).prev_sample
                # latents_noisy_lora = mask * latents_noisy_lora + (1-mask) * latents
                # latents_noisy_lora = self.scheduler_lora_sample.add_noise(latents_noisy_lora, noise, timestep)

            hifa_images = self.decode_latents(latents_noisy)
            hifa_lora_images = self.decode_latents(latents_noisy_lora)

            # debug: dump the denoised images from both branches to the cache directory
            import cv2
            import numpy as np

            if mask is not None:
                print('hifa mask!')
                prefix = 'vsd_mask'
            else:
                prefix = ''
            temp = (hifa_images.permute(0, 2, 3, 1).detach().cpu()[0].numpy() * 255).astype(np.uint8)
            cv2.imwrite(".threestudio_cache/%s%s_test.jpg" % (prefix, self.name), temp[:, :, ::-1])  # RGB -> BGR
            temp = (hifa_lora_images.permute(0, 2, 3, 1).detach().cpu()[0].numpy() * 255).astype(np.uint8)
            cv2.imwrite(".threestudio_cache/%s%s_test_lora.jpg" % (prefix, self.name), temp[:, :, ::-1])  # RGB -> BGR

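        # HiFA-style target: shift the current latents by the gap between the
        # pretrained-branch and LoRA-branch denoised samples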
        target = (latents_noisy - latents_noisy_lora + latents).detach()
        # target = latents_noisy.detach()
        # debug: dump the decoded target image as well
        targets_rgb = self.decode_latents(target)
        # targets_rgb = (hifa_images - hifa_lora_images + rgb).detach()
        temp = (targets_rgb.permute(0, 2, 3, 1).detach().cpu()[0].numpy() * 255).astype(np.uint8)
        cv2.imwrite(".threestudio_cache/%s_target.jpg" % self.name, temp[:, :, ::-1])  # RGB -> BGR

        # target is detached above, so gradients flow only through `latents`
        return w * 0.5 * F.mse_loss(target, latents, reduction='sum')

    def train_lora(
        self,
        latents: Float[Tensor, "B 4 64 64"],
        text_embeddings: Float[Tensor, "BB 77 768"],
        camera_condition: Float[Tensor, "B 4 4"],
    ):
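        # standard denoising loss on the detached rendered latents, used to keep
        # the LoRA UNet fitted to the current rendering distribution; the camera
        # pose is passed as the class-label condition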
        B = latents.shape[0]
        latents = latents.detach().repeat(self.cfg.lora_n_timestamp_samples, 1, 1, 1)

        t = torch.randint(
            int(self.num_train_timesteps * 0.0),
            int(self.num_train_timesteps * 1.0),
            [B * self.cfg.lora_n_timestamp_samples],
            dtype=torch.long,
            device=self.device,
        )

        noise = torch.randn_like(latents)
        noisy_latents = self.scheduler_lora.add_noise(latents, noise, t)
        if self.scheduler_lora.config.prediction_type == "epsilon":
            target = noise
        elif self.scheduler_lora.config.prediction_type == "v_prediction":
            target = self.scheduler_lora.get_velocity(latents, noise, t)
        else:
            raise ValueError(
                f"Unknown prediction type {self.scheduler_lora.config.prediction_type}"
            )
        # use view-independent text embeddings in LoRA
        text_embeddings_cond, _ = text_embeddings.chunk(2)
        if self.cfg.lora_cfg_training and random.random() < 0.1:
            # randomly drop the camera condition to enable classifier-free guidance at sampling time
            camera_condition = torch.zeros_like(camera_condition)
        noise_pred = self.forward_unet(
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



threestudio/models/guidance/stable_diffusion_vsd_guidance.py [720:886]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        )

        w = (1 - self.alphas[t]).view(-1, 1, 1, 1)

        grad = w * (noise_pred_pretrain - noise_pred_est)
        return grad

    def compute_grad_vsd_hifa(
        self,
        latents: Float[Tensor, "B 4 64 64"],
        text_embeddings_vd: Float[Tensor, "BB 77 768"],
        text_embeddings: Float[Tensor, "BB 77 768"],
        camera_condition: Float[Tensor, "B 4 4"],
        mask=None,
    ):
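        # HiFA-style variant of the VSD gradient: instead of a single-step score
        # difference, run a short denoising loop with both the pretrained UNet and
        # the LoRA UNet, then regress the rendered latents toward the detached
        # target latents + (x_pretrained - x_lora).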
        B, _, DH, DW = latents.shape
        rgb = self.decode_latents(latents)
        self.name = "hifa"
        
        if mask is not None:
            mask = F.interpolate(mask, (DH, DW), mode="bilinear", antialias=True)
        with torch.no_grad():
            # sample a random timestep
            t = torch.randint(
                self.min_step,
                self.max_step + 1,
                [B],
                dtype=torch.long,
                device=self.device,
            )
            w = (1 - self.alphas[t]).view(-1, 1, 1, 1)
            # add noise
            noise = torch.randn_like(latents)
            latents_noisy = self.scheduler_sample.add_noise(latents, noise, t)
            latents_noisy_lora = self.scheduler_lora_sample.add_noise(latents, noise, t)
            # pred noise: run the remaining denoising steps from timestep t back to 0
            # NOTE: t.item() assumes a batch size of 1
            self.scheduler_sample.config.num_train_timesteps = t.item()
            # roughly one sampling step per 50 diffusion timesteps
            self.scheduler_sample.set_timesteps(t.item() // 50 + 1)
            self.scheduler_lora_sample.config.num_train_timesteps = t.item()
            self.scheduler_lora_sample.set_timesteps(t.item() // 50 + 1)
            
            for i, timestep in enumerate(self.scheduler_sample.timesteps):     
            # for i, timestep in tqdm(enumerate(self.scheduler.timesteps)):   
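                # denoise the pretrained-branch and LoRA-branch latents in parallel
                # (inputs are duplicated for classifier-free guidance: text / unconditional)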
                latent_model_input = torch.cat([latents_noisy] * 2, dim=0)
                latent_model_input_lora = torch.cat([latents_noisy_lora] * 2, dim=0)
                
                # print(latent_model_input.shape)
                with self.disable_unet_class_embedding(self.unet) as unet:
                    cross_attention_kwargs = {"scale": 0.0} if self.single_model else None
                    noise_pred_pretrain = self.forward_unet(
                        unet,
                        latent_model_input,
                        timestep,
                        encoder_hidden_states=text_embeddings_vd,
                        cross_attention_kwargs=cross_attention_kwargs,
                    )

                # use view-independent text embeddings in LoRA
                noise_pred_est = self.forward_unet(
                    self.unet_lora,
                    latent_model_input_lora,
                    timestep,
                    encoder_hidden_states=text_embeddings,
                    class_labels=torch.cat(
                        [
                            camera_condition.view(B, -1),
                            torch.zeros_like(camera_condition.view(B, -1)),
                        ],
                        dim=0,
                    ),
                    cross_attention_kwargs={"scale": 1.0},
                )

                (
                    noise_pred_pretrain_text,
                    noise_pred_pretrain_uncond,
                ) = noise_pred_pretrain.chunk(2)

                # NOTE: guidance scale definition here is aligned with diffusers, but different from other guidance
                noise_pred_pretrain = noise_pred_pretrain_uncond + self.cfg.guidance_scale * (
                    noise_pred_pretrain_text - noise_pred_pretrain_uncond
                )
                if mask is not None:
                    # outside the mask, fall back to the originally injected noise
                    noise_pred_pretrain = mask * noise_pred_pretrain + (1 - mask) * noise
            
                (
                    noise_pred_est_text,
                    noise_pred_est_uncond,
                ) = noise_pred_est.chunk(2)

                # NOTE: guidance scale definition here is aligned with diffusers, but different from other guidance
                # noise_pred_est = noise_pred_est_uncond + self.cfg.guidance_scale_lora * (
                #     noise_pred_est_text - noise_pred_est_uncond
                # )
                # CFG is disabled for the LoRA branch; use the conditional prediction directly
                noise_pred_est = noise_pred_est_text
                if mask is not None:
                    noise_pred_est = mask * noise_pred_est + (1 - mask) * noise
            
                latents_noisy = self.scheduler_sample.step(noise_pred_pretrain, timestep, latents_noisy).prev_sample
                latents_noisy_lora = self.scheduler_lora_sample.step(noise_pred_est, timestep, latents_noisy_lora).prev_sample
                
                # noise = torch.randn_like(latents)
                # latents_noisy = self.scheduler.step(noise_pred_pretrain, timestep, latents_noisy).prev_sample
                # latents_noisy = mask * latents_noisy + (1-mask) * latents
                # latents_noisy = self.scheduler_sample.add_noise(latents_noisy, noise, timestep)
            
                # latents_noisy_lora = self.scheduler_lora.step(noise_pred_est, timestep, latents_noisy_lora).prev_sample
                # latents_noisy_lora = mask * latents_noisy_lora + (1-mask) * latents
                # latents_noisy_lora = self.scheduler_lora_sample.add_noise(latents_noisy_lora, noise, timestep)

            hifa_images = self.decode_latents(latents_noisy)
            hifa_lora_images = self.decode_latents(latents_noisy_lora)
            
            # debug: dump the denoised images from both branches to the cache directory
            import cv2
            import numpy as np

            if mask is not None:
                print('hifa mask!')
                prefix = 'vsd_mask'
            else:
                prefix = ''
            temp = (hifa_images.permute(0, 2, 3, 1).detach().cpu()[0].numpy() * 255).astype(np.uint8)
            cv2.imwrite(".threestudio_cache/%s%s_test.jpg" % (prefix, self.name), temp[:, :, ::-1])  # RGB -> BGR
            temp = (hifa_lora_images.permute(0, 2, 3, 1).detach().cpu()[0].numpy() * 255).astype(np.uint8)
            cv2.imwrite(".threestudio_cache/%s%s_test_lora.jpg" % (prefix, self.name), temp[:, :, ::-1])  # RGB -> BGR

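        # HiFA-style target: shift the current latents by the gap between the
        # pretrained-branch and LoRA-branch denoised samples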
        target = (latents_noisy - latents_noisy_lora + latents).detach()
        # target = latents_noisy.detach()
        # debug: dump the decoded target image as well
        targets_rgb = self.decode_latents(target)
        # targets_rgb = (hifa_images - hifa_lora_images + rgb).detach()
        temp = (targets_rgb.permute(0, 2, 3, 1).detach().cpu()[0].numpy() * 255).astype(np.uint8)
        cv2.imwrite(".threestudio_cache/%s_target.jpg" % self.name, temp[:, :, ::-1])  # RGB -> BGR
        
        # target is detached above, so gradients flow only through `latents`
        return w * 0.5 * F.mse_loss(target, latents, reduction='sum')

    def train_lora(
        self,
        latents: Float[Tensor, "B 4 64 64"],
        text_embeddings: Float[Tensor, "BB 77 768"],
        camera_condition: Float[Tensor, "B 4 4"],
    ):
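        # standard denoising loss on the detached rendered latents, used to keep
        # the LoRA UNet fitted to the current rendering distribution; the camera
        # pose is passed as the class-label condition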
        B = latents.shape[0]
        latents = latents.detach().repeat(self.cfg.lora_n_timestamp_samples, 1, 1, 1)

        t = torch.randint(
            int(self.num_train_timesteps * 0.0),
            int(self.num_train_timesteps * 1.0),
            [B * self.cfg.lora_n_timestamp_samples],
            dtype=torch.long,
            device=self.device,
        )

        noise = torch.randn_like(latents)
        noisy_latents = self.scheduler_lora.add_noise(latents, noise, t)
        if self.scheduler_lora.config.prediction_type == "epsilon":
            target = noise
        elif self.scheduler_lora.config.prediction_type == "v_prediction":
            target = self.scheduler_lora.get_velocity(latents, noise, t)
        else:
            raise ValueError(
                f"Unknown prediction type {self.scheduler_lora.config.prediction_type}"
            )
        # use view-independent text embeddings in LoRA
        text_embeddings_cond, _ = text_embeddings.chunk(2)
        if self.cfg.lora_cfg_training and random.random() < 0.1:
            # randomly drop the camera condition to enable classifier-free guidance at sampling time
            camera_condition = torch.zeros_like(camera_condition)
        noise_pred = self.forward_unet(
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



