def embed_images_grid()

in point_e/models/pretrained_clip.py

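For orientation, the output shape works out as follows (a sketch; the concrete numbers are assumptions for CLIP's ViT-L/14 checkpoint with its 224-pixel input and 14-pixel patches, not values read from this file):

    # Shape sketch (assumed ViT-L/14 numbers, for illustration only):
    input_resolution, patch_size, width = 224, 14, 1024
    grid_size = input_resolution // patch_size  # 16
    L = grid_size ** 2                          # 256 patch tokens per image
    # embed_images_grid then returns [N, width, L], e.g. [N, 1024, 256]
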

    def embed_images_grid(self, xs: Iterable[Optional[ImageType]]) -> torch.Tensor:
        """
        Embed images into latent grids.

        :param xs: an iterable of images to embed.
        :return: a tensor of shape [N x C x L], where L = self.grid_size**2.
        """
        if self.ensure_used_params:
            # Add a zero-weighted term over every parameter so that all of
            # them join the autograd graph without changing the output
            # (helps wrappers such as DDP that expect every param to be used).
            extra_value = 0.0
            for p in self.parameters():
                extra_value = extra_value + p.mean() * 0.0
        else:
            extra_value = 0.0

        x = self.images_to_tensor(xs).to(self.clip_model.dtype)

        # https://github.com/openai/CLIP/blob/4d120f3ec35b30bd0f992f5d8af2d793aad98d2a/clip/model.py#L225
        vt = self.clip_model.visual
        x = vt.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        # Prepend the learned class token, broadcast across the batch.
        x = torch.cat(
            [
                vt.class_embedding.to(x.dtype)
                + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
                x,
            ],
            dim=1,
        )  # shape = [*, grid ** 2 + 1, width]
        x = x + vt.positional_embedding.to(x.dtype)
        x = vt.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND (CLIP's transformer is sequence-first)
        x = vt.transformer(x)
        x = x.permute(1, 2, 0)  # LND -> NDL

        # Drop the class token and return per-patch features as [N x C x L];
        # extra_value is identically zero but keeps unused params in the graph.
        return x[..., 1:].contiguous().float() + extra_value
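
A minimal usage sketch follows. It assumes the ImageCLIP wrapper defined in this file accepts a device as its first constructor argument; "example.png" is a placeholder path, and per the docstring a None entry is embedded as a zero image.

    import torch
    from PIL import Image

    from point_e.models.pretrained_clip import ImageCLIP

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    clip = ImageCLIP(device)  # loads the pretrained CLIP model internally

    img = Image.open("example.png")  # placeholder path; any PIL image works
    with torch.no_grad():
        grid = clip.embed_images_grid([img, None])  # None -> zero image

    print(grid.shape)  # [2, C, grid_size**2], e.g. [2, 1024, 256] for ViT-L/14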