# in main_gdt.py [0:0]
def parse_args(argv=None):
    """Build the command-line parser for the training script and parse arguments.

    Boolean options are registered under the custom argparse type name
    ``'bool'`` and accept yes/true/t/1 and no/false/f/0 (case-insensitive).
    Their string defaults (e.g. ``'True'``) go through the same converter, so
    the returned namespace holds real ``bool`` values for them.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default), argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace holding one attribute per option defined below.

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``--help``.
    """
    import argparse

    def str2bool(v):
        """Convert a textual flag value to ``bool``; reject anything else."""
        v = v.lower()
        if v in ('yes', 'true', 't', '1'):
            return True
        elif v in ('no', 'false', 'f', '0'):
            return False
        # ArgumentTypeError (rather than ValueError) makes argparse show this
        # message to the user instead of a generic "invalid value" error.
        raise argparse.ArgumentTypeError(
            'Boolean argument needs to be true or false. '
            'Instead, it is %s.' % v)

    parser = argparse.ArgumentParser(description='Video Representation Learning')
    parser.register('type', 'bool', str2bool)

    # Data
    parser.add_argument('--root_dir', type=str, default=None,
                        help='path to dataset train directory e.g. /path/to/kinetics/train')
    parser.add_argument('--ht100m_caption_root', type=str,
                        default='/private/home/mandelapatrick/data/howto100m_csv',
                        help='path to ht100m caption root directory')
    parser.add_argument('--dataset', default='kinetics', type=str,
                        help='name of dataset')
    parser.add_argument('--dualdata', default='True', type='bool',
                        help='use dataloader that returns two samples per video')
    parser.add_argument('--num_data_samples', default=None, type=int,
                        help='number of samples in dataset')
    # NOTE(review): the default is the int 1 while type=str, so the default
    # stays an int but a value given on the command line is a str. Left as-is
    # because consumers are not visible here -- confirm what they expect.
    parser.add_argument('--fold', default=1, type=str,
                        help='fold of dataset (ucf101/ hmdb51)')
    parser.add_argument('--workers', default=0, type=int,
                        help='number of data loading workers (default: 0)')

    # GDT NCE loss
    parser.add_argument('--hypothesis', default=1, type=int,
                        help="use it for encoding what learning hypothesis we're using")
    parser.add_argument('--nce_t', default=0.07, type=float,
                        help='softmax weighting')
    parser.add_argument('--num_negatives', default=-1, type=int,
                        help='number of negatives in contrastive loss')

    # Video Augmentations
    parser.add_argument('--clip_len', default=30, type=int,
                        help='number of frames per clip')
    parser.add_argument('--target_fps', default=30, type=int,
                        help='target fps')
    parser.add_argument('--sample_rate', default=1, type=int,
                        help='Subsampling rate: num frames between clips')
    parser.add_argument('--clips_per_video', default=1, type=int,
                        help='number of clips to sample from video')
    parser.add_argument('--train_crop_size', default=112, type=int,
                        help='Size of spatial crops')
    parser.add_argument('--colorjitter', default='False', type='bool',
                        help='Apply random color jitter')
    parser.add_argument('--use_scale_jittering', default='False', type='bool',
                        help='scale jittering as augmentations')
    parser.add_argument('--augtype', default=1, type=int,
                        help='augmentation type (default: 1)')
    parser.add_argument('--use_temp_jitter', default='True', type='bool',
                        help='Get clips from random timestamps each epoch')
    parser.add_argument('--center_crop', default='False', type='bool',
                        help='Use center cropping instead of random cropping')

    # Audio Augmentation
    parser.add_argument('--aud_sample_rate', default=24000, type=int,
                        help='audio sample rate')
    parser.add_argument('--aud_spec_type', default=1, type=int,
                        help='audio spec type')  # 1 : (40, 99), (257, 199)
    parser.add_argument('--use_volume_jittering', default='True', type='bool',
                        help='use volume jittering')
    parser.add_argument('--use_temporal_jittering', default='False', type='bool',
                        help='use temporal jittering')
    parser.add_argument('--num_sec', default=1, type=int,
                        help='Number of seconds')
    parser.add_argument('--z_normalize', default='True', type='bool',
                        help='normalize audio')
    parser.add_argument('--aug_audio', default='True', type='bool',
                        help='whether to augment audio')
    parser.add_argument('--audio_augtype', default='medium', type=str,
                        choices=['na', 'mild', 'medium', 'heavy'],
                        help='type of audio-augment default: medium')
    parser.add_argument('--decode_audio', default='True', type='bool',
                        help='whether to decode audio')

    # Model
    parser.add_argument('--model', default='av_gdt', help='model',
                        choices=['av_gdt', 'vid_text_gdt'])
    parser.add_argument('--vid_base_arch', default='r2plus1d_18',
                        help='Video Base Arch for A-V model',
                        choices=['r2plus1d_18', 'r2plus1d_34'])
    parser.add_argument('--aud_base_arch', default='resnet9',
                        help='Audio Base Arch for A-V model',
                        choices=['resnet9', 'resnet18'])
    parser.add_argument('--pretrained', default='False', type='bool',
                        help='Use pre-trained models from the modelzoo')
    parser.add_argument('--headcount', default=1, type=int,
                        help='how many heads each modality has')
    parser.add_argument('--use_mlp', default='True', type='bool',
                        help='Use MLP projection head')
    parser.add_argument('--use_max_pool', default='False', type='bool',
                        help='Use max pool instead of GAP')
    parser.add_argument('--mlptype', default=0, type=int,
                        help='MLP type (default: 0)')

    # Training
    parser.add_argument('--batch_size', default=16, type=int,
                        help='batch-size / GPU')
    parser.add_argument('--epochs', default=200, type=int,
                        help='number of total epochs to run')
    parser.add_argument('--lr', default=0.01, type=float,
                        help='initial learning rate')
    parser.add_argument('--momentum', default=0.9, type=float,
                        help='momentum')
    parser.add_argument('--weight_decay', default=1e-5, type=float,
                        help='weight decay (default: 1e-5)')
    parser.add_argument('--use_scheduler', default='True', type='bool',
                        help='Use LR scheduler')
    parser.add_argument('--scheduler_type', default='multi_step', type=str,
                        choices=['multi_step', 'cosine'],
                        help='Type of LR scheduler')
    # Kept as a comma-separated string; this function does not split it.
    parser.add_argument('--lr_milestones', default='150,175', type=str,
                        help='decrease lr on milestones')
    parser.add_argument('--lr_gamma', default=0.1, type=float,
                        help='decrease lr by a factor of lr-gamma')
    parser.add_argument('--lr_warmup_epochs', default=10, type=int,
                        help='number of warmup epochs')
    parser.add_argument('--sync_bn', default='True', type='bool',
                        help='Use sync batch norm')
    parser.add_argument('--warmup_bn', default='False', type='bool',
                        help='Warmup batchnorm')
    parser.add_argument('--norm_feat', default='True', type='bool',
                        help='Normalize embeddings')

    # Logging
    parser.add_argument('--print_freq', default=10, type=int,
                        help='print frequency')
    parser.add_argument('--output_dir', default='.',
                        help='path where to save')

    # Checkpointing
    parser.add_argument('--resume', default='False', type='bool',
                        help='resume from checkpoint')
    parser.add_argument('--start_epoch', default=0, type=int,
                        help='start epoch')

    # Mixed precision training parameters
    parser.add_argument('--apex', default='False', type='bool',
                        help='Use apex for mixed precision training')
    # Adjacent literals are concatenated verbatim, so each piece carries its
    # own trailing space to keep the rendered help readable.
    parser.add_argument('--apex_opt_level', default='O1', type=str,
                        help='For apex mixed precision training. '
                             'O0 for FP32 training, O1 for mixed precision training. '
                             'For further detail, see '
                             'https://github.com/NVIDIA/apex/tree/master/examples/imagenet')

    # distributed training parameters
    parser.add_argument('--device', default='cuda',
                        help='device')
    parser.add_argument('--distributed', default='False', type='bool',
                        help='ddp mode')
    parser.add_argument('--dist_backend', default='nccl', type=str,
                        help='distributed backend')
    parser.add_argument('--dist_url', default='env://',
                        help='url used to set up distributed training')
    parser.add_argument('--world_size', default=1, type=int,
                        help='number of distributed processes')
    parser.add_argument('--debug_slurm', default='False', type='bool',
                        help="Debug SLURM")
    parser.add_argument('--local_rank', default=-1, type=int,
                        help='Local rank of node')
    parser.add_argument('--master_port', default=-1, type=int,
                        help='Master port of Job')
    parser.add_argument('--bash', default='False', type='bool',
                        help='if in bash')

    # argv=None makes argparse fall back to sys.argv[1:], as before.
    args = parser.parse_args(argv)
    return args