def __getitem__()

in data/multiviewvideo.py
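
Builds one training sample as a dict whose contents are gated by self.keyfilter: conditioning images from a fixed set of cameras, tracked mesh vertices, unwrapped texture maps, per-frame rigid transforms, and, when a camera id is given, calibration, the target image with validity masks, and subsampled pixel coordinates.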


    def __getitem__(self, idx):
        (seg, frame), cam = self.framecamlist[idx]

        result = {}

        result["segid"] = seg
        result["frameid"] = frame
        if cam is not None:
            result["cameraid"] = cam

        validinput = True

        # conditioning images from one or more cameras that are fixed across the dataset
        if "fixedcamimage" in self.keyfilter:
            ninput = len(self.fixedcameras)

            fixedcamimage = []
            for i in range(ninput):
                imagepath = self.imagepath.format(seg=seg, cam=self.fixedcameras[i], frame=int(frame))
                # load, downsample, and convert HWC uint8 -> CHW float32
                image = utils.downsample(
                        np.asarray(Image.open(imagepath), dtype=np.uint8), self.fixedcamdownsample).transpose((2, 0, 1)).astype(np.float32)
                fixedcamimage.append(image)
            # stack the camera views along the height axis -> (3, ninput * H, W)
            fixedcamimage = np.concatenate(fixedcamimage, axis=1)
            # standardize in place
            fixedcamimage[:] -= self.fixedcammean
            fixedcamimage[:] /= self.fixedcamstd
            result["fixedcamimage"] = fixedcamimage

        # image from one or more cameras, always the same frame
        if "fixedframeimage" in self.keyfilter:
            ninput = len(self.fixedcameras)

            fixedframeimage = []
            for i in range(ninput):
                imagepath = self.imagepath.format(
                        seg=self.fixedframesegframe[0],
                        cam=self.fixedcameras[i],
                        frame=int(self.fixedframesegframe[1]))
                image = utils.downsample(
                        np.asarray(Image.open(imagepath), dtype=np.uint8), self.fixedcamdownsample).transpose((2, 0, 1)).astype(np.float32)
                fixedframeimage.append(image)
            # stack the views along the height axis, then standardize in place
            fixedframeimage = np.concatenate(fixedframeimage, axis=1)
            fixedframeimage[:] -= self.fixedcammean
            fixedframeimage[:] /= self.fixedcamstd
            result["fixedframeimage"] = fixedframeimage

        # vertices
        for k in ["verts", "verts_next"]:
            if k in self.keyfilter:
                vertpath = self.vertpath.format(seg=seg, frame=int(frame) + (1 if k == "verts_next" else 0))
                # raw float32 binary dump of the tracked mesh vertices
                verts = np.fromfile(vertpath, dtype=np.float32)
                if self.standardizeverts:
                    verts -= self.vertmean.ravel()
                    verts /= self.vertstd
                result[k] = verts.reshape((-1, 3))

        # texture averaged over all cameras for a single frame
        for k in ["avgtex", "avgtex_next"]:
            if k in self.keyfilter:
                texpath = self.texpath.format(seg=seg, cam="average", frame=int(frame) + (1 if k == "avgtex_next" else 0))
                try:
                    tex = np.asarray(Image.open(texpath), dtype=np.uint8)
                    if tex.shape[0] != self.avgtexsize:
                        tex = cv2.resize(tex, dsize=(self.avgtexsize, self.avgtexsize), interpolation=cv2.INTER_LINEAR)
                    tex = tex.transpose((2, 0, 1)).astype(np.float32)
                except Exception:
                    # missing or unreadable texture: substitute zeros and flag the sample
                    tex = np.zeros((3, self.avgtexsize, self.avgtexsize), dtype=np.float32)
                    validinput = False
                # an all-black texture also counts as a failed load
                if np.sum(tex) == 0:
                    validinput = False
                # texels with any nonzero channel are considered valid
                texmask = np.sum(tex, axis=0) != 0
                if self.standardizeavgtex:
                    tex -= self.avgtexmean
                    tex /= self.texstd
                    tex[:, ~texmask] = 0.
                result[k] = tex

        # keep track of whether we fail to load any of the input
        result["validinput"] = np.float32(1.0 if validinput else 0.0)

        if "modelmatrix" in self.keyfilter or "modelmatrixinv" in self.keyfilter or "camera" in self.keyfilter:
            # promote a 3x4 [R|t] matrix to a full 4x4 homogeneous transform
            def to4x4(m):
                return np.r_[m, np.array([[0., 0., 0., 1.]], dtype=np.float32)]
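            # e.g. to4x4(np.eye(3, 4, dtype=np.float32)) gives the 4x4 identity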

            # per-frame rigid transformation of the scene/object, composed with
            # the base transform (modelmatrixinv below is its exact inverse)
            for k in ["modelmatrix", "modelmatrix_next"]:
                if k in self.keyfilter:
                    if self.transfpath is not None:
                        transfpath = self.transfpath.format(seg=seg, frame=int(frame) + (1 if k == "modelmatrix_next" else 0))
                        try:
                            frametransf = np.genfromtxt(transfpath).astype(np.float32)
                        except Exception:
                            # missing transform file: fall back to the identity
                            # (leaving frametransf as None would crash in to4x4 below)
                            frametransf = np.eye(3, 4, dtype=np.float32)

                        result[k] = to4x4(np.dot(
                            np.linalg.inv(to4x4(frametransf)),
                            to4x4(self.basetransf))[:3, :4])
                    else:
                        result[k] = np.eye(3, 4, dtype=np.float32)

            # inverse of the per-frame rigid transformation of the scene/object
            for k in ["modelmatrixinv", "modelmatrixinv_next"]:
                if k in self.keyfilter:
                    if self.transfpath is not None:
                        transfpath = self.transfpath.format(seg=seg, frame=int(frame) + (1 if k == "modelmatrixinv_next" else 0))
                        try:
                            frametransf = np.genfromtxt(transfpath).astype(np.float32)
                        except Exception:
                            # missing transform file: fall back to the identity
                            frametransf = np.eye(3, 4, dtype=np.float32)

                        result[k] = to4x4(np.dot(
                            np.linalg.inv(to4x4(self.basetransf)),
                            to4x4(frametransf))[:3, :4])
                    else:
                        result[k] = np.eye(3, 4, dtype=np.float32)

        # camera-specific data
        if cam is not None:
            # camera pose, re-expressed in the base (canonical) coordinate frame
            if "camera" in self.keyfilter:
                result["campos"] = np.dot(self.basetransf[:3, :3].T, self.campos[cam] - self.basetransf[:3, 3])
                result["camrot"] = np.dot(self.basetransf[:3, :3].T, self.camrot[cam].T).T
                result["focal"] = self.focal[cam]
                result["princpt"] = self.princpt[cam]
                result["camindex"] = self.allcameras.index(cam)

            # per-frame / per-camera unwrapped texture map
            if "tex" in self.keyfilter:
                texpath = self.texpath.format(seg=seg, cam=cam, frame=int(frame))
                try:
                    tex = np.asarray(Image.open(texpath), dtype=np.uint8).transpose((2, 0, 1)).astype(np.float32)
                except Exception:
                    # missing texture: substitute zeros (texmask below will be all False)
                    tex = np.zeros((3, self.texsize, self.texsize), dtype=np.float32)

                assert tex.shape[1] == self.texsize
                texmask = np.sum(tex, axis=0) != 0
                if self.standardizetex:
                    tex -= self.texmean
                    tex /= self.texstd
                    tex[:, ~texmask] = 0.
                result["tex"] = tex
                result["texmask"] = texmask

            # camera images
            if "image" in self.keyfilter:
                # target image
                imagepath = self.imagepath.format(seg=seg, cam=cam, frame=int(frame))
                image = utils.downsample(
                        np.asarray(Image.open(imagepath), dtype=np.uint8),
                        self.downsample).transpose((2, 0, 1)).astype(np.float32)
                height, width = image.shape[1:3]
                # an all-black image means the camera has no data for this frame
                valid = np.float32(1.0) if np.sum(image) != 0 else np.float32(0.)

                # remove black level
                result["image"] = np.clip(image - np.array(self.blacklevel, dtype=np.float32)[:, None, None], 0., None)
                result["imagevalid"] = valid

                # optionally mask pixels with bright background values
                if self.maskbrightbg and cam in self.bg:
                    result["imagemask"] = self.bg[cam][1]

                # optionally mask out saturated pixels (any channel above 245)
                if self.maskbright:
                    brightmask = np.where(
                            (image[0] > 245.) |
                            (image[1] > 245.) |
                            (image[2] > 245.), 0., 1.).astype(np.float32)[None, :, :]
                    if "imagemask" in result:
                        # multiply out of place so the cached background mask
                        # stored in self.bg is not mutated across samples
                        result["imagemask"] = result["imagemask"] * brightmask
                    else:
                        result["imagemask"] = brightmask

            # background image
            if "bg" in self.keyfilter and self.returnbg:
                result["bg"] = self.bg[cam][0]

            # image pixel coordinates
            if "pixelcoords" in self.keyfilter:
                if self.subsampletype == "patch":
                    indx = torch.randint(0, width - self.subsamplesize + 1, size=(1,)).item()
                    indy = torch.randint(0, height - self.subsamplesize + 1, size=(1,)).item()

                    py, px = torch.meshgrid(
                            torch.arange(indy, indy + self.subsamplesize).float(),
                            torch.arange(indx, indx + self.subsamplesize).float())
                elif self.subsampletype == "random":
                    px = torch.randint(0, width, size=(self.subsamplesize, self.subsamplesize)).float()
                    py = torch.randint(0, height, size=(self.subsamplesize, self.subsamplesize)).float()
                elif self.subsampletype == "random2":
                    px = torch.random(size=(self.subsamplesize, self.subsamplesize)).float() * (width - 1)
                    py = torch.random(size=(self.subsamplesize, self.subsamplesize)).float() * (height - 1)
                elif self.subsampletype == "stratified":
                    ssy = self.subsamplesize
                    ssx = self.subsamplesize
                    bsizex = (width - 1.) / ssx
                    bsizey = (height - 1.) / ssy
                    px = (torch.arange(ssx)[None, :] + torch.rand(size=(ssy, ssx))) * bsizex
                    py = (torch.arange(ssy)[:, None] + torch.rand(size=(ssy, ssx))) * bsizey
                elif self.subsampletype is None:
                    # no subsampling: a dense grid over the full image
                    py, px = torch.meshgrid(torch.arange(height).float(), torch.arange(width).float())
                else:
                    raise ValueError("unknown subsampletype: {}".format(self.subsampletype))

                result["pixelcoords"] = torch.stack([px, py], dim=-1)

        return result
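
A minimal sketch of how the returned sample might be consumed, assuming the surrounding class is a standard torch.utils.data.Dataset and that the keys shown were requested in the keyfilter; the constructor call and the name `dataset` are placeholders, not part of this file:

import torch.utils.data

# `dataset` is an instance of the surrounding Dataset class (hypothetical setup)
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=8)

for batch in loader:
    # the default collate_fn stacks each key's numpy arrays / tensors into a batch
    verts = batch["verts"]              # (B, Nverts, 3)
    avgtex = batch["avgtex"]            # (B, 3, avgtexsize, avgtexsize)
    pixelcoords = batch["pixelcoords"]  # (B, H', W', 2)
    valid = batch["validinput"]         # (B,), 1.0 where all inputs loaded
    break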