# eval_video.py
def parse_args():
    """Build and parse command-line arguments for video action finetuning/eval.

    Returns:
        argparse.Namespace: parsed arguments covering data, model,
        transformer, training, logging, audio, and checkpointing options.
    """
    import argparse

    def str2bool(v):
        """Convert a CLI string ('yes'/'no'/'true'/'false'/'t'/'f'/'1'/'0') to bool.

        Registered with the parser under the type name 'bool' so arguments can
        declare type='bool' with string defaults such as 'False' (argparse
        applies the converter to string defaults as well).

        Raises:
            argparse.ArgumentTypeError: if the value is not a recognized
                boolean spelling; argparse turns this into a clean usage error
                instead of an unhandled exception.
        """
        v = v.lower()
        if v in ('yes', 'true', 't', '1'):
            return True
        elif v in ('no', 'false', 'f', '0'):
            return False
        raise argparse.ArgumentTypeError(
            'Boolean argument needs to be true or false. '
            'Instead, it is %s.' % v)

    parser = argparse.ArgumentParser(description='Video Action Finetune')
    parser.register('type', 'bool', str2bool)
    ### DATA
    parser.add_argument('--dataset', default='hmdb51', type=str,
                        help='name of dataset')
    parser.add_argument('--fold', default='1', type=str,
                        help='train/test split (fold) number of the dataset')
    parser.add_argument('--root_dir', default=None,
                        type=str, help='path to the dataset root directory')
    parser.add_argument('--ucf101-annotation-path', default='/datasets01/ucf101/112018/ucfTrainTestlist/',
                        type=str, help='path to the UCF101 train/test split files')
    parser.add_argument('--hmdb51-annotation-path', default='/datasets01/hmdb51/112018/splits/',
                        type=str, help='path to the HMDB51 split files')
    parser.add_argument('--target-fps', type=int, default=30,
                        help='video fps')
    parser.add_argument('--train-crop-size', type=int, default=128,
                        help="train crop size")
    parser.add_argument('--test-crop-size', type=int, default=128,
                        help="test crop size")
    parser.add_argument('--multi-crop', type='bool', default='False',
                        help='do multi-crop comparisons')
    parser.add_argument('--num-large-crops', type=int, default=1,
                        help='Number of Large Crops')
    parser.add_argument('--num-small-crops', type=int, default=0,
                        help='Number of small Crops')
    parser.add_argument('--use-grayscale', type='bool', default='False',
                        help='use grayscale augmentation')
    parser.add_argument('--use-gaussian', type='bool', default='False',
                        help='use gaussian augmentation')
    parser.add_argument('--clip-len', default=32, type=int,
                        help='number of frames per clip')
    parser.add_argument('--colorjitter', default='True', type='bool',
                        help='scale jittering as augmentations')
    parser.add_argument('--steps-bet-clips', default=1, type=int,
                        help='number of steps between clips in video')
    parser.add_argument('--num-data-samples', default=None, type=int,
                        help='number of samples in dataset')
    parser.add_argument('--train-clips-per-video', default=10, type=int,
                        help='maximum number of clips per video to consider for training')
    parser.add_argument('--val-clips-per-video', default=10, type=int,
                        help='maximum number of clips per video to consider for testing')
    parser.add_argument('--num-spatial-crops', default=3, type=int,
                        help='number of spatial clips for testing')
    parser.add_argument('--test-time-cj', default='False', type='bool',
                        help='test time CJ augmentation')
    parser.add_argument('--workers', default=16, type=int,
                        help='number of data loading workers (default: 16)')
    parser.add_argument('--use_random_resize_crop', default='True', type='bool',
                        help='use random resized crop instead of short stide jitter')
    ### MODEL
    parser.add_argument('--weights-path', default='', type=str,
                        help='Path to weights file')
    parser.add_argument('--ckpt-epoch', default='0', type=str,
                        help='Epoch of model checkpoint')
    parser.add_argument('--model', default='av_gdt', help='model',
                        choices=['av_gdt', 'vid_text_gdt', 'stica'])
    parser.add_argument('--vid-base-arch', default='r2plus1d_18', type=str,
                        help='Video Base Arch for A-V model',
                        choices=['r2plus1d_18', 'r2plus1d_34'])
    parser.add_argument('--aud-base-arch', default='resnet9',
                        help='Audio Base Arch for A-V model',
                        choices=['resnet18', 'resnet34', 'resnet50', 'resnet9'])
    parser.add_argument('--pretrained', default='False', type='bool',
                        help='Use pre-trained models from the modelzoo')
    parser.add_argument('--supervised', default='False', type='bool',
                        help='Use supervised model')
    parser.add_argument('--use-mlp', default='False', type='bool',
                        help='Use MLP projection head')
    parser.add_argument('--mlptype', default=0, type=int,
                        help='MLP type (default: 0)')
    parser.add_argument('--headcount', default=1, type=int,
                        help='how many heads each modality has')
    parser.add_argument('--use-dropout', default='False', type='bool',
                        help='Use dropout in classifier')
    parser.add_argument('--use-bn', default='False', type='bool',
                        help='Use BN in classifier')
    parser.add_argument('--use-l2-norm', default='False', type='bool',
                        help='Use L2-Norm in classifier')
    parser.add_argument('--agg-model', default='False', type='bool',
                        help="Aggregate model with transformer")
    parser.add_argument('--num_layer', default=2, type=int,
                        help='num of transformer layers')
    parser.add_argument('--num_sec', default=2, type=int,
                        help='num of seconds')
    parser.add_argument('--dp', default=0.0, type=float,
                        help='dropout rate in transformer')
    parser.add_argument('--num_head', default=4, type=int,
                        help='num head in transformer')
    parser.add_argument('--use_larger_last', type='bool', default='False',
                        help='use larger last layer of res5')
    ### TRANSFORMER PARAMS
    parser.add_argument('--positional_emb', default='False', type='bool',
                        help="use positional emb in transformer")
    parser.add_argument('--qkv_mha', default='False', type='bool',
                        help='complete qkv in MHA')
    parser.add_argument('--cross_modal_nce', default='True', type='bool',
                        help='use cross-modal NCE loss')
    parser.add_argument('--fm_crop', type='bool', default='False',
                        help='use FMCROP model')
    parser.add_argument('--transformer_time_dim', default=8, type=int,
                        help='temporal input for transformer')
    parser.add_argument('--cross_modal_alpha', type=float, default=0.5,
                        help='weighting of cross-modal loss')
    ### TRAINING
    parser.add_argument('--feature-extract', default='False', type='bool',
                        help='Use model as feature extractor;')
    parser.add_argument('--batch-size', default=32, type=int,
                        help='effective batch size')
    parser.add_argument('--epochs', default=12, type=int,
                        help='number of total epochs to run')
    parser.add_argument('--optim-name', default='sgd', type=str,
                        help='Name of optimizer')
    parser.add_argument('--head-lr', default=0.0025, type=float,
                        help='initial learning rate')
    parser.add_argument('--base-lr', default=0.00025, type=float,
                        help='initial learning rate')
    parser.add_argument('--momentum', default=0.9, type=float,
                        help='momentum')
    parser.add_argument('--weight-decay', default=0.005, type=float,
                        help='weight decay for classifier')
    parser.add_argument('--wd-base', default=0.005, type=float,
                        help='weight decay for base encoder')
    parser.add_argument('--use-scheduler', default='True', type='bool',
                        help='Use LR scheduler')
    parser.add_argument('--lr-warmup-epochs', default=2, type=int,
                        help='number of warmup epochs')
    parser.add_argument('--lr-milestones', default='6,10', type=str,
                        help='decrease lr on milestones (epochs)')
    parser.add_argument('--lr-gamma', default=0.05, type=float,
                        help='decrease lr by a factor of lr-gamma')
    parser.add_argument('--tsf_lr', default=0.00025, type=float,
                        help='transformer learning rate')
    parser.add_argument('--wd_tsf', default=0.005, type=float,
                        help='transformer wd')
    ### LOGGING
    parser.add_argument('--print-freq', default=10, type=int,
                        help='print frequency')
    parser.add_argument('--output-dir', default='.', type=str,
                        help='path where to save')
    ### AUDIO
    parser.add_argument('--num-sec-aud', type=int, default=1,
                        help='number of seconds of audio')
    parser.add_argument('--aud-sample-rate', type=int, default=24000,
                        help='audio sample rate')
    parser.add_argument('--audio-augtype', type=str, default='none',
                        choices=['none', 'mild', 'medium', 'heavy'],
                        help='audio augmentation strength with Spec Augment')
    parser.add_argument('--aud-spec-type', type=int, default=2,
                        help="audio spec type")
    parser.add_argument('--use-volume-jittering', type='bool', default='True',
                        help='use volume jittering')
    parser.add_argument('--use-audio-temp-jittering', type='bool', default='False',
                        help='use audio temporal jittering')
    parser.add_argument('--z-normalize', type='bool', default='False',
                        help='z-normalize the audio')
    ### CHECKPOINTING
    parser.add_argument('--resume', default='', type=str,
                        help='resume from checkpoint')
    parser.add_argument('--start-epoch', default=0, type=int,
                        help='start epoch')
    parser.add_argument('--test-only', default='False', type='bool',
                        help='Only test the model')
    args = parser.parse_args()
    return args