in sagemaker-python-sdk/mxnet_horovod_fasterrcnn/source/train_faster_rcnn.py [0:0]
# Imports inferred from the usage in parse_args() below.
import argparse
import os

from gluoncv import utils as gutils

# Horovod is optional; fall back gracefully when it is not installed.
try:
    import horovod.mxnet as hvd
except ImportError:
    hvd = None

def parse_args():
    parser = argparse.ArgumentParser(description="Train Faster R-CNN network end to end.")
    parser.add_argument("--datasetloc", type=str, default="", help="where the dataset is located")
    parser.add_argument("--sm-save", type=str, default="", help="where to save models")
    parser.add_argument("--sm-output", type=str, default="", help="where to save data for sm")
    parser.add_argument(
        "--num-workers",
        "-j",
        dest="num_workers",
        type=int,
        default=4,
        help="Number of data workers; use a larger number to accelerate "
        "data loading if your CPU and GPUs are powerful.",
    )
    parser.add_argument("--batch-size", type=int, default=8, help="Training mini-batch size.")
    parser.add_argument(
        "--gpus",
        type=str,
        default="0",
        help="Number of GPUs to train with; overridden by SM_NUM_GPUS when running on SageMaker.",
    )
parser.add_argument("--epochs", type=str, default="", help="Training epochs.")
parser.add_argument(
"--resume",
type=str,
default="",
help="Resume from previously saved parameters if not None. "
"For example, you can resume from ./mask_rcnn_xxx_0123.params",
)
parser.add_argument(
"--start-epoch",
type=int,
default=0,
help="Starting epoch for resuming, default is 0 for new training."
"You can specify it to 100 for example to start from 100 epoch.",
)
    parser.add_argument(
        "--lr",
        type=str,
        default="",
        help="Learning rate; defaults to 0.00125 if unset (0.01 for coco training on 8 GPUs).",
    )
    parser.add_argument(
        "--lr-decay", type=float, default=0.1, help="Decay rate of learning rate. Default is 0.1."
    )
    parser.add_argument(
        "--lr-decay-epoch",
        type=str,
        default="",
        help="Epochs at which the learning rate decays; defaults to 8,11 if unset.",
    )
    parser.add_argument(
        "--lr-warmup",
        type=str,
        default="",
        help="Warmup iterations to adjust the learning rate; defaults to 1000 if unset.",
    )
    parser.add_argument(
        "--lr-warmup-factor", type=float, default=1.0 / 3.0, help="Warmup factor of base lr."
    )
parser.add_argument("--clip-gradient", type=float, default=-1.0, help="gradient clipping.")
parser.add_argument("--momentum", type=float, default=0.9, help="SGD momentum, default is 0.9")
parser.add_argument("--wd", type=str, default="", help="Weight decay, default is 1e-4 for coco")
parser.add_argument(
"--log-interval", type=int, default=100, help="Logging mini-batch interval. Default is 100."
)
parser.add_argument("--save-prefix", type=str, default="", help="Saving parameter prefix")
parser.add_argument(
"--save-interval",
type=int,
default=1,
help="Saving parameters epoch interval, best model will always be saved.",
)
parser.add_argument(
"--val-interval",
type=int,
default=1,
help="Epoch interval for validation, increase the number will reduce the "
"training time if validation is slow.",
)
parser.add_argument("--seed", type=int, default=233, help="Random seed to be fixed.")
parser.add_argument(
"--verbose", type=str, default="false", help="Print helpful debugging info once set."
)
    # Norm layer options
    parser.add_argument(
        "--norm-layer",
        type=str,
        default=None,
        help="Type of normalization layer to use. "
        "If None, the backbone normalization layers are frozen "
        "and no extra normalization layer is used. "
        "Currently supports 'bn' and None. Default is None.",
    )
    parser.add_argument(
        "--amp", type=str, default="false", help="Use MXNet AMP for mixed precision training."
    )
    parser.add_argument(
        "--horovod",
        type=str,
        default="false",
        help="Use MXNet Horovod for distributed training. Must be run with OpenMPI. "
        "--gpus is ignored when using --horovod.",
    )
    parser.add_argument(
        "--executor-threads",
        type=int,
        default=1,
        help="Number of threads for the executor to schedule ops. "
        "More threads may incur a higher GPU memory footprint "
        "but may speed up throughput. Forced to 1 when Horovod is used.",
    )
    parser.add_argument(
        "--kv-store",
        type=str,
        default="nccl",
        help="KV store options: local, device, nccl, dist_sync, dist_device_sync, "
        "and dist_async are available.",
    )
    args = parser.parse_args()

    def str_2_bool(value):
        # These flags arrive from argparse as strings; anything other than "true" is falsy.
        return value.lower() == "true"

    args.verbose = str_2_bool(args.verbose)
    args.amp = str_2_bool(args.amp)
    args.horovod = str_2_bool(args.horovod)
    # SageMaker injects these environment variables inside a training job;
    # fall back to the CLI values when running outside SageMaker.
    keys = list(os.environ.keys())
    args.sm_save = (
        os.path.join(os.environ["SM_MODEL_DIR"], args.sm_save)
        if "SM_MODEL_DIR" in keys
        else args.sm_save
    )
    args.datasetloc = (
        os.environ["SM_CHANNEL_DATA"] if "SM_CHANNEL_DATA" in keys else args.datasetloc
    )
    # Note: --gpus is interpreted as a GPU count here, not a comma-separated device list.
    args.gpus = int(os.environ["SM_NUM_GPUS"]) if "SM_NUM_GPUS" in keys else int(args.gpus)
    args.sm_output = (
        os.path.join(os.environ["SM_OUTPUT_DATA_DIR"], args.sm_output)
        if "SM_OUTPUT_DATA_DIR" in keys
        else args.sm_output
    )
    args.batch_size = int(args.batch_size)
    # Fix the seed for mxnet, numpy and python's builtin random generator.
    gutils.random.seed(args.seed)

    if args.horovod:
        if hvd is None:
            raise SystemExit("Horovod not found, please check if you installed it correctly.")
        hvd.init()

    # Fill in defaults for options that were left unset on the command line.
    args.epochs = int(args.epochs) if args.epochs else 26
    args.lr_decay_epoch = args.lr_decay_epoch if args.lr_decay_epoch else "8,11"
    args.lr = float(args.lr) if args.lr else 0.00125
    args.lr_warmup = args.lr_warmup if args.lr_warmup else 1000
    args.wd = float(args.wd) if args.wd else 1e-4
    def str_args2num_args(arguments, args_name, num_type):
        # Helper for parsing a comma-separated string such as "8,11" into a
        # single number or a list of numbers.
        try:
            ret = [num_type(x) for x in arguments.split(",")]
            if len(ret) == 1:
                return ret[0]
            return ret
        except ValueError:
            raise ValueError("invalid value for {}: {}".format(args_name, arguments))

    return args
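

# Minimal usage sketch: simulates the SageMaker environment variables so
# parse_args() can be exercised locally. The SM_* names are the real
# SageMaker ones; the paths below are hypothetical placeholders chosen
# for illustration only.
if __name__ == "__main__":
    os.environ.setdefault("SM_MODEL_DIR", "/opt/ml/model")               # hypothetical path
    os.environ.setdefault("SM_CHANNEL_DATA", "/opt/ml/input/data/data")  # hypothetical path
    os.environ.setdefault("SM_NUM_GPUS", "1")                            # hypothetical count
    demo_args = parse_args()
    print("model dir:", demo_args.sm_save)      # e.g. /opt/ml/model/
    print("data dir :", demo_args.datasetloc)   # e.g. /opt/ml/input/data/data
    print("gpus     :", demo_args.gpus)         # e.g. 1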