in datasets/epic_kitchens.py [0:0]
def _read_rulstm_features(self,
video_path: Path,
start_sec: float,
end_sec: float,
fps: float,
df_row: pd.DataFrame,
pts_unit='sec'):
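        """Load pre-extracted RULSTM features for the clip [start_sec, end_sec].

        Frame IDs are computed from the timestamps and fps, then looked up
        in each LMDB environment in self.lmdb_envs and concatenated
        channel-wise. (Docstring added for clarity; wording is not from the
        original source.)
        """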
del pts_unit # Not supported here
if self.read_type == 'exact_rulstm':
            # Get frames every 0.25s, counting back from end_sec.
# 0.25 comes from their code, and they typically do 2.5s total
# observation time. 14 is the sequence length they use.
time_stamps = end_sec - np.arange(0.0, 0.25 * 11, 0.25)[::-1]
frames = np.floor(time_stamps * fps).astype(int)
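            # Hypothetical example (values not from the source): with
            # end_sec=10.0 and fps=30, time_stamps is [7.5, 7.75, ..., 10.0]
            # (11 steps) and frames is [225, 232, ..., 300].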
elif self.read_type == 'normal':
            # Read every single frame between the start and end; the
            # base_video_dataset code will deal with subsampling down to
            # 4 fps (i.e. 0.25s steps).
            # Rather than first computing the timestamps, just compute the
            # frame IDs of the start and end and do an arange; that avoids
            # any repeated frames due to quantization/floor.
time_stamps = None
start_frame = np.floor(start_sec * fps)
end_frame = np.floor(end_sec * fps)
frames = np.arange(end_frame, start_frame, -1).astype(int)[::-1]
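            # Hypothetical example (values not from the source): with
            # start_sec=7.5, end_sec=10.0 and fps=30, start_frame=225,
            # end_frame=300, and frames is [226, 227, ..., 300] in ascending
            # order; downstream code then subsamples as needed.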
else:
            raise NotImplementedError(f'Unknown read_type: {self.read_type}')
        # If any frame IDs go below 1, replace them with the lowest valid one
        assert frames.max() >= 1, (
            f'The dataset should not contain clips where every frame is '
            f'below 1. {video_path} {start_sec} {end_sec} {df_row} '
            f'{frames} {time_stamps}')
frames[frames < 1] = frames[frames >= 1].min()
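        # E.g. (hypothetical values) frames [-2, 0, 1, 5] becomes
        # [1, 1, 1, 5].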
# Get the features
all_feats = []
for lmdb_env in self.lmdb_envs:
all_feats.append(
self.read_representations(
frames, lmdb_env,
Path(video_path).stem + '_frame_{:010d}.jpg'))
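        # Each lmdb_env holds one pre-extracted feature modality (in the
        # RULSTM setup these are typically TSN RGB, optical flow and object
        # features; this is an assumption here), so concatenating on the
        # last dim below yields one fused feature vector per frame.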
final_feat = torch.cat(all_feats, dim=-1)
        # Must return (rgb, audio, info); pad the audio and info slots
        # with empty dicts.
return final_feat, {}, {}
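
# For reference, a minimal sketch of what `read_representations` (called
# above) might look like, assuming the RULSTM-style LMDB layout where each
# key is a frame file name (the `tmpl` pattern) and each value is a raw
# float32 feature vector. Names and error handling here are illustrative,
# not the repository's actual implementation; `np` and `torch` are assumed
# to be imported at module level, as in the method above.
def _read_representations_sketch(frames, env, tmpl):
    """Read one feature vector per frame ID from an LMDB environment."""
    feats = []
    with env.begin(write=False) as txn:
        for frame_id in frames:
            key = tmpl.format(frame_id).encode('utf-8')
            buf = txn.get(key)
            # A real implementation may fall back to a neighbouring frame
            # when a key is missing; here we simply require it to exist.
            assert buf is not None, f'Missing LMDB key {key}'
            feats.append(np.frombuffer(buf, dtype=np.float32))
    # Stack into a (num_frames, feat_dim) float tensor.
    return torch.from_numpy(np.stack(feats))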