in train.py [0:0]
import argparse
import logging
import sys

import torch


def parse_args():
    parser = argparse.ArgumentParser(
        description="Train a handwriting recognition model."
    )
    parser.add_argument(
        "--config", type=str, help="A JSON configuration file for the experiment."
    )
    parser.add_argument("--disable_cuda", action="store_true", help="Disable CUDA")
    parser.add_argument(
        "--restore", action="store_true", help="Restore training from the last checkpoint"
    )
    parser.add_argument(
        "--last_epoch", type=int, default=0, help="Epoch to restore from."
    )
    parser.add_argument(
        "--checkpoint_path",
        default="/tmp/",
        type=str,
        help="Checkpoint path for saving models",
    )
    parser.add_argument(
        "--world_size", default=1, type=int, help="World size for distributed training"
    )
    parser.add_argument(
        "--dist_url",
        default="tcp://localhost:23146",
        type=str,
        help="URL used to set up distributed training. This should be "
        "the IP address and open port number of the master node",
    )
    parser.add_argument(
        "--dist_backend", default="nccl", type=str, help="Distributed backend"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    # Fall back to CPU when CUDA is disabled or unavailable; distributed
    # training requires GPUs, so bail out early in that case.
    use_cpu = args.disable_cuda or not torch.cuda.is_available()
    if args.world_size > 1 and use_cpu:
        logging.fatal("CPU distributed training not supported.")
        sys.exit(1)
    logging.info("World size is: %d", args.world_size)
    if args.restore:
        logging.info(f"Restoring model from epoch {args.last_epoch}")

    # Ensure one CUDA device is available per distributed worker.
    if not use_cpu and torch.cuda.device_count() < args.world_size:
        logging.fatal(
            "At least {} cuda devices required. {} found".format(
                args.world_size, torch.cuda.device_count()
            )
        )
        sys.exit(1)
    return args
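
A minimal usage sketch of how the parsed arguments might be consumed, assuming the common torch.distributed pattern implied by --dist_backend, --dist_url, and --world_size; main_worker and experiment.json are hypothetical placeholders, not the actual training routine or config in train.py.

import torch.distributed as dist
import torch.multiprocessing as mp


def main_worker(rank, args):
    # Hypothetical worker: join the process group using the parsed flags.
    dist.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=rank,
    )
    # ... training loop would go here ...
    dist.destroy_process_group()


if __name__ == "__main__":
    # Example invocation:
    #   python train.py --config experiment.json --world_size 2
    args = parse_args()
    if args.world_size > 1:
        mp.spawn(main_worker, args=(args,), nprocs=args.world_size)
    else:
        main_worker(0, args)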