in slowfast/datasets/ptv_datasets.py [0:0]
def Ptvssv2(cfg, mode):
    """
    Construct the PyTorchVideo Something-Something v2 (SSv2) video loader.
    Loads SSv2 data (frame paths, labels, etc.) into an SSv2 Dataset object.
    The dataset can be downloaded from the Something-Something official
    website (https://20bn.com/datasets/something-something).
    Please see datasets/DATASET.md for more information about the data format.
    For training and validation, a single clip is randomly sampled from every
    video with random cropping and scaling. For testing, multiple clips are
    uniformly sampled from every video with uniform cropping. For uniform
    cropping, we take the left, center, and right crop if the width is larger
    than the height, or take the top, center, and bottom crop if the height is
    larger than the width.
    Args:
        cfg (CfgNode): configs.
        mode (string): Options includes `train`, `val`, or `test` mode.
    Returns:
        PTVDatasetWrapper: wrapped SSv2 dataset exposing clip/crop counts.
    """
    # Only support train, val, and test mode.
    assert mode in [
        "train",
        "val",
        "test",
    ], "Split '{}' not supported".format(mode)
    logger.info("Constructing Ptvssv2 {}...".format(mode))
    if mode in ["train", "val"]:
        # Single randomly-sampled clip with a single random crop per video.
        num_clips = 1
        num_crops = 1
        transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            Lambda(div255),
                            NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                            RandomShortSideScale(
                                min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                                max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
                            ),
                            RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE),
                            # BGR channel order expected by downstream model.
                            Lambda(rgb2bgr),
                        ]
                        # Horizontal flip augmentation is optional via config.
                        + (
                            [RandomHorizontalFlipVideo(p=0.5)]
                            if cfg.DATA.RANDOM_FLIP
                            else []
                        )
                        + [PackPathway(cfg)]
                    ),
                ),
                DictToTuple(num_clips, num_crops),
            ]
        )
        clip_sampler = make_clip_sampler(
            "constant_clips_per_video",
            1,  # Put arbitrary duration as ssv2 always needs full video clip.
            num_clips,
            num_crops,
        )
        if cfg.NUM_GPUS > 1:
            video_sampler = DistributedSampler
        else:
            video_sampler = (
                RandomSampler if mode == "train" else SequentialSampler
            )
    else:
        # Test mode: SSv2 supports only one temporal view; spatial crops
        # provide the multi-view ensemble instead.
        assert cfg.TEST.NUM_ENSEMBLE_VIEWS == 1
        num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS
        num_crops = cfg.TEST.NUM_SPATIAL_CROPS
        transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            Lambda(div255),
                            NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                            ShortSideScale(size=cfg.DATA.TEST_CROP_SIZE),
                        ]
                    ),
                ),
                # Uniform spatial cropping (left/center/right or
                # top/center/bottom) is applied outside the per-key transform.
                UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [Lambda(rgb2bgr), PackPathway(cfg)],
                    ),
                ),
                DictToTuple(num_clips, num_crops),
            ]
        )
        clip_sampler = make_clip_sampler(
            "constant_clips_per_video",
            1,  # Put arbitrary duration as ssv2 always needs full video clip.
            num_clips,
            num_crops,
        )
        video_sampler = (
            DistributedSampler if cfg.NUM_GPUS > 1 else SequentialSampler
        )
    # SSv2 annotation files: class-name mapping and per-split video labels.
    label_name_file = os.path.join(
        cfg.DATA.PATH_TO_DATA_DIR, "something-something-v2-labels.json"
    )
    video_label_file = os.path.join(
        cfg.DATA.PATH_TO_DATA_DIR,
        "something-something-v2-{}.json".format(
            # Both "val" and "test" modes evaluate on the validation split.
            "train" if mode == "train" else "validation"
        ),
    )
    data_path = os.path.join(
        cfg.DATA.PATH_TO_DATA_DIR,
        "{}.csv".format("train" if mode == "train" else "val"),
    )
    dataset = SSv2(
        label_name_file=label_name_file,
        video_label_file=video_label_file,
        video_path_label_file=data_path,
        clip_sampler=clip_sampler,
        video_sampler=video_sampler,
        transform=transform,
        video_path_prefix=cfg.DATA.PATH_PREFIX,
        frames_per_clip=cfg.DATA.NUM_FRAMES,
        # Random temporal frame sampling only during training.
        rand_sample_frames=mode == "train",
    )
    logger.info(
        "Constructing ssv2 dataloader (size: {}) from {}".format(
            len(dataset._path_to_videos), data_path
        )
    )
    return PTVDatasetWrapper(
        num_videos=len(dataset._path_to_videos),
        clips_per_video=num_clips,
        crops_per_clip=num_crops,
        dataset=dataset,
    )