in slowfast/models/video_model_builder.py [0:0]
def _construct_network(self, cfg):
    """
    Builds a single pathway ResNet model.

    Constructs the stem (s1), the four residual stages (s2-s5), one
    max-pool module per pathway, and either a detection (RoI) head or a
    classification head, and registers all of them on ``self``.

    Args:
        cfg (CfgNode): model building configs, details are in the
            comments of the config file.

    Raises:
        AssertionError: if ``cfg.MODEL.ARCH`` or ``cfg.RESNET.DEPTH`` is not
            a known key, or if the pooling table does not have exactly one
            entry per pathway.
    """
    assert cfg.MODEL.ARCH in _POOL1, "Unknown MODEL.ARCH: {}".format(
        cfg.MODEL.ARCH
    )
    pool_size = _POOL1[cfg.MODEL.ARCH]
    assert (
        len(pool_size) == self.num_pathways
    ), "pool size table must have one entry per pathway"
    assert (
        cfg.RESNET.DEPTH in _MODEL_STAGE_DEPTH
    ), "Unknown RESNET.DEPTH: {}".format(cfg.RESNET.DEPTH)

    (d2, d3, d4, d5) = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH]

    num_groups = cfg.RESNET.NUM_GROUPS
    width_per_group = cfg.RESNET.WIDTH_PER_GROUP
    dim_inner = num_groups * width_per_group

    temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH]

    def build_res_stage(stage_idx, dim_in, num_blocks):
        # Builds residual stage s{stage_idx + 2}. Output and inner widths
        # double with every stage: x4, x8, x16, x32 of width_per_group for
        # the output and x1, x2, x4, x8 of dim_inner for the bottleneck.
        scale = 2 ** stage_idx
        return resnet_helper.ResStage(
            dim_in=[dim_in],
            dim_out=[width_per_group * 4 * scale],
            dim_inner=[dim_inner * scale],
            # temp_kernel[0] belongs to the stem, so stages start at index 1.
            temp_kernel_sizes=temp_kernel[stage_idx + 1],
            stride=cfg.RESNET.SPATIAL_STRIDES[stage_idx],
            num_blocks=[num_blocks],
            num_groups=[num_groups],
            num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[stage_idx],
            nonlocal_inds=cfg.NONLOCAL.LOCATION[stage_idx],
            nonlocal_group=cfg.NONLOCAL.GROUP[stage_idx],
            nonlocal_pool=cfg.NONLOCAL.POOL[stage_idx],
            instantiation=cfg.NONLOCAL.INSTANTIATION,
            trans_func_name=cfg.RESNET.TRANS_FUNC,
            stride_1x1=cfg.RESNET.STRIDE_1X1,
            inplace_relu=cfg.RESNET.INPLACE_RELU,
            dilation=cfg.RESNET.SPATIAL_DILATIONS[stage_idx],
            norm_module=self.norm_module,
        )

    s1 = stem_helper.VideoModelStem(
        dim_in=cfg.DATA.INPUT_CHANNEL_NUM,
        dim_out=[width_per_group],
        # Temporal kernel size comes from the arch table; spatial is 7x7.
        kernel=[temp_kernel[0][0] + [7, 7]],
        stride=[[1, 2, 2]],
        padding=[[temp_kernel[0][0][0] // 2, 3, 3]],
        norm_module=self.norm_module,
    )
    s2 = build_res_stage(0, width_per_group, d2)

    # Based on profiling data of activation size, s1 and s2 have the activation sizes
    # that are 4X larger than the second largest. Therefore, checkpointing them gives
    # best memory savings. Further tuning is possible for better memory saving and tradeoffs
    # with recomputing FLOPs.
    if cfg.MODEL.ACT_CHECKPOINT:
        validate_checkpoint_wrapper_import(checkpoint_wrapper)
        self.s1 = checkpoint_wrapper(s1)
        self.s2 = checkpoint_wrapper(s2)
    else:
        self.s1 = s1
        self.s2 = s2

    # One max-pool per pathway, registered between s2 and s3 so the module
    # registration order stays s1, s2, pools, s3, s4, s5, head.
    for pathway in range(self.num_pathways):
        pool = nn.MaxPool3d(
            kernel_size=pool_size[pathway],
            stride=pool_size[pathway],
            padding=[0, 0, 0],
        )
        self.add_module("pathway{}_pool".format(pathway), pool)

    self.s3 = build_res_stage(1, width_per_group * 4, d3)
    self.s4 = build_res_stage(2, width_per_group * 8, d4)
    self.s5 = build_res_stage(3, width_per_group * 16, d5)

    head_dim_in = [width_per_group * 32]
    if self.enable_detection:
        self.head = head_helper.ResNetRoIHead(
            dim_in=head_dim_in,
            num_classes=cfg.MODEL.NUM_CLASSES,
            pool_size=[[cfg.DATA.NUM_FRAMES // pool_size[0][0], 1, 1]],
            resolution=[[cfg.DETECTION.ROI_XFORM_RESOLUTION] * 2],
            scale_factor=[cfg.DETECTION.SPATIAL_SCALE_FACTOR],
            dropout_rate=cfg.MODEL.DROPOUT_RATE,
            act_func=cfg.MODEL.HEAD_ACT,
            aligned=cfg.DETECTION.ALIGNED,
        )
    else:
        if cfg.MULTIGRID.SHORT_CYCLE:
            # None for AdaptiveAvgPool3d((1, 1, 1)).
            # This model has a single pathway (dim_in has one entry), so
            # pool_size must have exactly one entry as well; the previous
            # [None, None] described two pathways.
            # NOTE(review): assumes ResNetBasicHead requires pool_size and
            # dim_in to have matching lengths - confirm in head_helper.
            head_pool_size = [None]
        else:
            # The // 32 is presumably the backbone's total spatial
            # downsampling - TODO confirm against the stage strides.
            spatial = cfg.DATA.TRAIN_CROP_SIZE // 32
            head_pool_size = [
                [
                    cfg.DATA.NUM_FRAMES // pool_size[0][0],
                    spatial // pool_size[0][1],
                    spatial // pool_size[0][2],
                ]
            ]
        self.head = head_helper.ResNetBasicHead(
            dim_in=head_dim_in,
            num_classes=cfg.MODEL.NUM_CLASSES,
            pool_size=head_pool_size,
            dropout_rate=cfg.MODEL.DROPOUT_RATE,
            act_func=cfg.MODEL.HEAD_ACT,
        )