in janus/models/vq_model.py [0:0]
def forward(self, z):
    """Snap every feature vector of ``z`` to its nearest codebook entry.

    ``z`` is consumed with the einsum pattern ``b c h w``, i.e. a 4-D
    tensor. It is moved to ``b h w c`` and flattened to rows of
    ``self.e_dim`` values (assumes the ``c`` axis has size ``self.e_dim``
    -- TODO confirm against the caller).

    When ``self.l2_norm`` is truthy, both the features and the codebook
    rows are L2-normalised along their last axis before the search, and
    the quantized output / losses are built from those normalised tensors.

    Returns a 3-tuple:
        * the quantized tensor, permuted back to ``b c h w``;
        * ``(vq_loss, commit_loss, entropy_loss)`` -- all ``None`` unless
          ``self.training`` is truthy;
        * ``(perplexity, min_encodings, min_encoding_indices)`` -- the
          first two are always ``None``; the last holds one codebook
          index per flattened feature row.
    """
    # Channels-last layout so each spatial position is one row below.
    feats = torch.einsum("b c h w -> b h w c", z).contiguous()
    flat_feats = feats.view(-1, self.e_dim)

    codebook = self.embedding.weight
    if self.l2_norm:
        feats = F.normalize(feats, p=2, dim=-1)
        flat_feats = F.normalize(flat_feats, p=2, dim=-1)
        codebook = F.normalize(codebook, p=2, dim=-1)

    # Squared distance via the expansion (z - e)^2 = z^2 + e^2 - 2 e * z.
    feat_sq = torch.sum(flat_feats**2, dim=1, keepdim=True)
    code_sq = torch.sum(codebook**2, dim=1)
    codebook_t = torch.einsum("n d -> d n", codebook)
    cross = torch.einsum("bd,dn->bn", flat_feats, codebook_t)
    d = feat_sq + code_sq - 2 * cross

    # Closest codebook row for every flattened feature row.
    nearest = torch.argmin(d, dim=1)
    quantized = codebook[nearest].view(feats.shape)

    # These two diagnostics are never computed here; they only keep
    # their slots in the returned tuple.
    perplexity = None
    min_encodings = None

    vq_loss = commit_loss = entropy_loss = None
    if self.training:
        # Detaching the features makes this term update only the codebook.
        codebook_err = (quantized - feats.detach()) ** 2
        vq_loss = torch.mean(codebook_err)
        # Detaching the codes makes this term update only the features.
        commit_err = (quantized.detach() - feats) ** 2
        commit_loss = self.beta * torch.mean(commit_err)
        # The negated distances are handed to compute_entropy_loss.
        entropy_loss = self.entropy_loss_ratio * compute_entropy_loss(-d)

    # Straight-through: the forward value is the quantized code, while the
    # detached difference lets gradients reach the features unchanged.
    quantized = feats + (quantized - feats).detach()

    # Restore the layout the input arrived in.
    quantized = torch.einsum("b h w c -> b c h w", quantized)

    losses = (vq_loss, commit_loss, entropy_loss)
    info = (perplexity, min_encodings, nearest)
    return quantized, losses, info