def dense_clip_sampler()

in datasets/base_video_dataset.py [0:0]
51 lines of code
8 McCabe index (conditional complexity)

def dense_clip_sampler(df: pd.DataFrame,
                       root_dir: Sequence[Path],
                       clip_len: Union[float, str] = 'mean_action_len',
                       stride: float = 1.0,
                       shard_per_worker: bool = False,
                       keep_orig_clips: bool = True,
                       featext_skip_done: bool = False):
    """
    Add clips to the data frame sampling the videos densely from the video.
        This function is also compatible with the convert_to_anticipation_fn
        to extract features etc. The class label for those clips
        is -1, it's mostly just used for SSL/feat ext.
    Args:
        df (pd.DataFrame): Original clips. Must provide the `video_path`
            column (entries are used as `Path` objects, via `.stem`), the
            `start`/`end` columns when `clip_len` is 'mean_action_len',
            and the `uid` column when `keep_orig_clips` is set. The frame
            passed in is not modified.
        root_dir (Sequence[Path]): Passed to `get_abs_path` together with
            each video path to locate the video on disk.
        clip_len (float or str): Length of every sampled clip, in the same
            unit as `start`/`end`. The string 'mean_action_len' uses the
            mean of `df.end - df.start`.
        stride (float): stride in seconds on how the clips are sampled.
        shard_per_worker (bool): If true, create subset DF for this process
        keep_orig_clips (bool): If true, the rows of `df` (with `uid` cast
            to str) are placed before the sampled clips in the output.
        featext_skip_done (bool): Set this to true only when extracting
            features. This will go through the saved results file and check
            what features have been stored and skip those from being
            populated into the dataset, hence continuing from what
            has already been done.
    Returns:
        A 2-tuple `(new_df, empty_df)`: the data frame with the sampled
        clips (plus the original ones if requested), and an empty
        DataFrame.
    """
    uniq_videos = sorted(df.video_path.unique())
    if shard_per_worker:
        world_size = get_world_size()
        rank = get_rank()
        vids_per_shard = int(math.ceil(len(uniq_videos) / world_size))
        # Slicing clamps at the end of the list, so the trailing shard(s)
        # simply come out shorter (or empty).
        shard_start = vids_per_shard * rank
        uniq_videos = uniq_videos[shard_start:shard_start + vids_per_shard]
    # A set keeps the per-clip membership test below O(1); with a list the
    # resume path is quadratic in the number of already-extracted clips.
    skip_uids = set()
    if featext_skip_done:
        # TODO replace with RESULTS_SAVE_DIR
        done_uids = read_saved_results_uids(
            Path(f'./results/{get_rank()}.h5'))
        logging.info('Found %d done UIDs, skipping those', len(done_uids))
        skip_uids = set(done_uids)
    if clip_len == 'mean_action_len':
        clip_len = np.mean(df.end - df.start)
    new_rows = []
    total_possible_clips = 0
    for vid_path in uniq_videos:
        end_s = get_video_info(get_abs_path(root_dir, vid_path),
                               ['len'])['len']
        video_id = vid_path.stem
        participant_id = video_id.split('_')[0]
        # Candidate clip END times; the first one is 0, so the earliest
        # clips get a negative `start`.
        new_ends = np.arange(0, end_s, stride)
        total_possible_clips += len(new_ends)
        for new_end in new_ends:
            # NOTE: the UID embeds the float end time as formatted by the
            # f-string; it must keep this exact form to match the UIDs
            # checked against `skip_uids`.
            uid = f'{video_id}_{new_end}'
            if uid in skip_uids:
                continue
            new_rows.append({
                'participant_id': participant_id,
                'narration': '',
                'video_id': video_id,
                'start': new_end - clip_len,
                'end': new_end,
                'verb_class': -1,
                'noun_class': -1,
                'action_class': -1,
                'video_path': vid_path,
                'uid': uid,
            })
    logging.info('Out of %d total potential clips, kept %d',
                 total_possible_clips, len(new_rows))
    new_df = pd.DataFrame(new_rows)
    if keep_orig_clips:
        # Convert the uid to str since the new UIDs being added to the new DF
        # are all strings. Do it on a copy (`assign`) so the caller's frame
        # keeps its original uid dtype.
        orig_df = df.assign(uid=df.uid.astype('str'))
        new_df = pd.concat([orig_df, new_df])
        new_df.reset_index(drop=True, inplace=True)
    return new_df, pd.DataFrame([])