in point_e/models/pretrained_clip.py
def embed_images_grid(self, xs: Iterable[Optional[ImageType]]) -> torch.Tensor:
    """
    Embed images into latent grids.

    :param xs: an iterable of images to embed.
    :return: a tensor of shape [N x C x L], where L = self.grid_size**2.
    """
    # Touch every parameter in the forward pass so that wrappers which expect
    # all parameters to be used (e.g. DistributedDataParallel) do not complain.
    # Multiplying by 0.0 leaves the output numerically unchanged.
    extra_value = 0.0
    if self.ensure_used_params:
        for p in self.parameters():
            extra_value = extra_value + p.mean() * 0.0
    x = self.images_to_tensor(xs).to(self.clip_model.dtype)

    # Re-run CLIP's VisionTransformer forward pass step by step so we can keep
    # the per-patch tokens instead of only the pooled class embedding. See:
    # https://github.com/openai/CLIP/blob/4d120f3ec35b30bd0f992f5d8af2d793aad98d2a/clip/model.py#L225
    vt = self.clip_model.visual
    x = vt.conv1(x)  # shape = [*, width, grid, grid]
    x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
    x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
    x = torch.cat(
        [
            vt.class_embedding.to(x.dtype)
            + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
            x,
        ],
        dim=1,
    )  # shape = [*, grid ** 2 + 1, width]
    x = x + vt.positional_embedding.to(x.dtype)
    x = vt.ln_pre(x)
    x = x.permute(1, 0, 2)  # NLD -> LND
    x = vt.transformer(x)
    x = x.permute(1, 2, 0)  # LND -> NDL

    # Drop the class token (index 0 along L) and return only the grid tokens.
    return x[..., 1:].contiguous().float() + extra_value
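
For context, a minimal usage sketch. It assumes the surrounding ImageCLIP wrapper defined in this file; the constructor call, device choice, and image path are illustrative, not taken from the source.

import torch
from PIL import Image

from point_e.models.pretrained_clip import ImageCLIP

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ImageCLIP(device)  # assumed constructor signature; see this file

images = [Image.open("example.png")]  # hypothetical input image
with torch.no_grad():
    grid = model.embed_images_grid(images)

# grid has shape [N, C, L], where L == model.grid_size ** 2
print(grid.shape)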