in datasets/replica.py [0:0]
def __getitem__(self, idx):
    # Re-seed the module-level RNG (e.g. so forked DataLoader workers do not
    # keep reusing the same random state).
    random.seed()
    trajectory_len = self.seq_len * self.step
    if self.samples_per_epoch:
        # Random sampling: pick a random episode and a random window inside it.
        idx = random.randint(0, len(self.seq_idxs) - 1)
        idxstr = self.seq_idxs[idx]
        seq_start = random.randint(0, self.episode_len - trajectory_len)
    else:
        # Deterministic sampling: idx enumerates consecutive, non-overlapping
        # windows of trajectory_len frames across the episodes.
        trajectories_per_episode = math.floor(self.episode_len / trajectory_len)
        seq_idx = idx // trajectories_per_episode
        seq_idx = int(self.seq_idxs[seq_idx])
        idxstr = str(seq_idx).zfill(2)
        seq_start = int((idx % trajectories_per_episode) * trajectory_len)
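    # Worked example (illustrative numbers only): with episode_len = 100,
    # seq_len = 10 and step = 2, trajectory_len = 20 and each episode holds
    # floor(100 / 20) = 5 windows, so idx = 7 selects episode seq_idxs[7 // 5] =
    # seq_idxs[1] and the window starting at frame (7 % 5) * 20 = 40.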
    # Load cameras
    episode_path = os.path.join(self.datapath, idxstr)
    with open(os.path.join(episode_path, 'cameras.json'), 'r') as f:
        cameras = json.load(f)
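    # cameras.json is indexed by frame number; each entry carries an 'Rt'
    # extrinsic and a 'K' intrinsic matrix (the exact matrix shapes depend on
    # how the episodes were rendered).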
    Rt = []
    K = []
    rgb = []
    depth = []
    sample_indices = list(range(seq_start, seq_start + trajectory_len, self.step))
    for i in sample_indices:
        Rt.append(torch.Tensor(cameras[i]['Rt']))
        K.append(torch.Tensor(cameras[i]['K']))
        _rgb = os.path.join(episode_path, str(i).zfill(3) + '_rgb.png')
        _rgb = self.resize_transform_rgb(Image.open(_rgb))
        rgb.append(_rgb[:3, :, :])  # keep RGB only, dropping any alpha channel
        if self.depth:
            _depth = os.path.join(episode_path, str(i).zfill(3) + '_depth.tiff')
            # We don't want to normalize depth values
            _depth = self.resize_transform_depth(Image.open(_depth))
            depth.append(torch.from_numpy(np.array(_depth)).unsqueeze(0))
    rgb = torch.stack(rgb)
    if self.depth:
        # only stack when depth maps were actually loaded
        depth = torch.stack(depth).float()
    K = torch.stack(K)
    Rt = torch.stack(Rt)
    Rt = Rt.unsqueeze(0)  # add batch dimension
    Rt = normalize_trajectory(Rt, center=self.center, normalize_rotation=self.normalize_rotation)
    Rt = Rt[0]  # remove batch dimension
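    # normalize_trajectory is defined elsewhere in this repo; the center and
    # normalize_rotation flags suggest it re-expresses the poses relative to a
    # canonical frame, but see its definition for the exact convention.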
    if self.single_sample_per_trajectory:
        # keep a single randomly chosen frame from the trajectory
        selected_indices = torch.multinomial(torch.ones(Rt.shape[0]), num_samples=1).squeeze()
        rgb = rgb[selected_indices].unsqueeze(0)
        if self.depth:
            depth = depth[selected_indices].unsqueeze(0)
        K = K[selected_indices].unsqueeze(0)
        Rt = Rt[selected_indices].unsqueeze(0)
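    # Note: multinomial over a uniform weight vector (above) draws one frame
    # index uniformly at random; torch.randint(Rt.shape[0], (1,)) would be an
    # equivalent draw.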
    if self.rot_aug:
        Rt = random_rotation_augment(Rt)
    # Normalize K to img_res
    K = K[:, :3, :3]
    # https://codeyarns.com/tech/2015-09-08-how-to-compute-intrinsic-camera-matrix-for-a-camera.html
    # Images were rendered at 512x512 with a 90 degree field of view, so the
    # focal length in pixels is (512 / 2) / tan(fov / 2).
    fx = 256.0 / np.tan(np.deg2rad(90.0) / 2)
    fy = 256.0 / np.tan(np.deg2rad(90.0) / 2)
    K[:, 0, 0] = K[:, 0, 0] * fx
    K[:, 1, 1] = K[:, 1, 1] * fy
    # Rescale the focal lengths to the resolution the images are resized to
    downsampling_ratio = self.img_res / 512
    K[:, 0, 0] = K[:, 0, 0] * downsampling_ratio
    K[:, 1, 1] = K[:, 1, 1] * downsampling_ratio
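    # Worked example: tan(45 deg) = 1, so fx = fy = 256.0 pixels at 512x512.
    # If img_res were 128, downsampling_ratio = 128 / 512 = 0.25, so the stored
    # K[:, 0, 0] entries end up multiplied by 256 * 0.25 = 64 overall.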
    if self.depth:
        depth = depth * 1000  # recommended scaling from game engine units to real world units
        sample = {'rgb': rgb, 'depth': depth, 'K': K, 'Rt': Rt, 'scene_idx': idx}
    else:
        sample = {'rgb': rgb, 'K': K, 'Rt': Rt, 'scene_idx': idx}
    return sample
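For context, a minimal usage sketch follows. The class name ReplicaDataset and the constructor arguments are assumptions inferred from the attributes this method reads (datapath, seq_len, step, img_res, depth, etc.); check the dataset's __init__ for the actual signature and defaults.

# Hypothetical usage; class name and constructor arguments are assumptions,
# not the repo's documented API.
from torch.utils.data import DataLoader
from datasets.replica import ReplicaDataset

dataset = ReplicaDataset(
    datapath='data/replica',  # directory containing the zero-padded episode folders
    seq_len=4,                # frames per returned trajectory
    step=2,                   # stride between sampled frames
    img_res=128,              # images are rescaled from 512 to this resolution
    depth=True,               # also load the *_depth.tiff files
)
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)

batch = next(iter(loader))
# Assuming the resize transform yields (3, img_res, img_res) tensors,
# batch['rgb'] has shape (8, seq_len, 3, img_res, img_res); 'Rt', 'K' and
# 'depth' share the same leading (batch, seq_len) dimensions.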