in slowfast/datasets/ptv_datasets.py [0:0]
def Ptvkinetics(cfg, mode):
    """
    Build the PyTorchVideo-backed Kinetics dataset for one split.

    The split is described by a whitespace-separated csv file of the form:
    ```
    path_to_video_1 label_1
    path_to_video_2 label_2
    ...
    path_to_video_N label_N
    ```
    For `train` and `val`, one clip is randomly sampled per video with
    random short-side scaling, random cropping, and optional horizontal
    flipping. For `test`, multiple clips are uniformly sampled per video
    with uniform spatial cropping.

    Args:
        cfg (CfgNode): configs.
        mode (string): one of `train`, `val`, or `test`. Train/val sample a
            single clip per video from the corresponding split; test samples
            multiple clips per video from the test split.
    """
    # Only support train, val, and test mode.
    assert mode in [
        "train",
        "val",
        "test",
    ], "Split '{}' not supported".format(mode)
    logger.info("Constructing Ptvkinetics {}...".format(mode))

    # Clip length in seconds, derived from the frame count and sampling rate.
    clip_duration = (
        cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE / cfg.DATA.TARGET_FPS
    )

    csv_path = os.path.join(
        cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(mode)
    )
    video_paths = LabeledVideoPaths.from_path(csv_path)
    total_videos = len(video_paths)
    video_paths.path_prefix = cfg.DATA.PATH_PREFIX
    logger.info(
        "Constructing kinetics dataloader (size: {}) from {}".format(
            total_videos, csv_path
        )
    )

    if mode in ["train", "val"]:
        # Single random clip / single crop per video.
        clips_per_video = 1
        crops_per_clip = 1

        # Per-frame video ops, built up as a flat list.
        video_ops = [
            UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
            Lambda(div255),
            NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
            RandomShortSideScale(
                min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
            ),
            RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE),
        ]
        if cfg.DATA.RANDOM_FLIP:
            video_ops.append(RandomHorizontalFlipVideo(p=0.5))
        video_ops.append(PackPathway(cfg))

        transform = Compose(
            [
                ApplyTransformToKey(
                    key="video", transform=Compose(video_ops)
                ),
                DictToTuple(clips_per_video, crops_per_clip),
            ]
        )
        clip_sampler = make_clip_sampler("random", clip_duration)
        if cfg.NUM_GPUS > 1:
            video_sampler = DistributedSampler
        elif mode == "train":
            video_sampler = RandomSampler
        else:
            video_sampler = SequentialSampler
    else:
        # Test mode: multiple temporal views x multiple spatial crops per video.
        clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
        crops_per_clip = cfg.TEST.NUM_SPATIAL_CROPS

        transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
                            Lambda(div255),
                            NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                            # NOTE(review): the short-side size reuses the lower
                            # train jitter bound here — presumably intentional
                            # (matches the reference config layout); confirm.
                            ShortSideScale(
                                size=cfg.DATA.TRAIN_JITTER_SCALES[0]
                            ),
                        ]
                    ),
                ),
                UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
                # Pathway packing runs after spatial cropping in test mode.
                ApplyTransformToKey(key="video", transform=PackPathway(cfg)),
                DictToTuple(clips_per_video, crops_per_clip),
            ]
        )
        clip_sampler = make_clip_sampler(
            "constant_clips_per_video",
            clip_duration,
            clips_per_video,
            crops_per_clip,
        )
        video_sampler = (
            DistributedSampler if cfg.NUM_GPUS > 1 else SequentialSampler
        )

    return PTVDatasetWrapper(
        num_videos=total_videos,
        clips_per_video=clips_per_video,
        crops_per_clip=crops_per_clip,
        dataset=LabeledVideoDataset(
            labeled_video_paths=video_paths,
            clip_sampler=clip_sampler,
            video_sampler=video_sampler,
            transform=transform,
            decode_audio=False,
            decoder=cfg.DATA.DECODING_BACKEND,
        ),
    )