# train() — training entry point, from container-dp/resources/train.py

def train():
    """SageMaker training entry point for FairMOT.

    Reads the standard SageMaker environment (SM_MODEL_DIR, SM_CHANNEL_*,
    SM_HPS, SM_NUM_GPUS), builds the `train.py` command line from the
    hyperparameters (falling back to defaults for any that are missing),
    runs it in a subprocess while streaming its output, and exits the
    process with the trainer's exit code so SageMaker can detect failures.
    """
    import pprint

    # Dump the full environment for debugging the SageMaker container setup.
    pprint.pprint(dict(os.environ), width=1)

    model_dir = os.environ['SM_MODEL_DIR']

    # If a 'log' channel is configured, write logs there and copy them into
    # the model dir at the end; otherwise log directly into the model dir.
    log_dir = os.environ.get('SM_CHANNEL_LOG')
    copy_logs_to_model_dir = log_dir is not None
    if log_dir is None:
        log_dir = model_dir

    train_data_dir = os.environ['SM_CHANNEL_TRAIN']
    hyperparameters = json.loads(os.environ['SM_HPS'])

    data_name = hyperparameters.get('data_name', 'MOT20')
    load_model = hyperparameters.get('load_model', '')
    num_workers = hyperparameters.get('num_workers', 0)
    num_workers_val = hyperparameters.get('num_workers_val', 2)

    # Model architecture.
    # Currently tested: resdcn_34 | resdcn_50 | resfpndcn_34 | dla_34 | hrnet_18
    arch = hyperparameters.get('arch', 'dla_34')

    # Input height/width; -1 selects the dataset default.
    input_h = hyperparameters.get('input_h', 608)
    input_w = hyperparameters.get('input_w', 1088)

    # Learning rate for batch size 12.
    lr = hyperparameters.get('lr', 1e-4)
    # Epoch(s) at which the learning rate is dropped by 10x.
    lr_step = hyperparameters.get('lr_step', 15)
    # Total training epochs.
    num_epochs = hyperparameters.get('num_epochs', 30)
    # Batch size; for MOT, less than 12 should be set on an NVIDIA V100 GPU.
    batch_size = hyperparameters.get('batch_size', 12)
    batch_size_val = hyperparameters.get('batch_size_val', 12)

    # NOTE(review): the following hyperparameters are parsed but never added
    # to the training command below — presumably placeholders; confirm
    # whether they should be forwarded to train.py.
    # max_label should be greater than 1500 on MOT20.
    max_label = hyperparameters.get('max_label', 1500)
    # Iterations per epoch; -1 means #samples / batch_size.
    num_iters = hyperparameters.get('num_iters', -1)
    val_intervals = hyperparameters.get('val_intervals', 5)
    # Max number of output objects.
    paramK = hyperparameters.get('K', 500)

    # #############################################
    # Loss
    # #############################################
    # Use MSE loss or focal loss to train keypoint heatmaps (unused, see NOTE).
    mse_loss = hyperparameters.get('mse_loss', True)
    # Regression loss: sl1 | l1 | l2
    reg_loss = hyperparameters.get('reg_loss', 'l1')
    # Loss weight for keypoint heatmaps.
    hm_weight = hyperparameters.get('hm_weight', 1)
    # Loss weight for keypoint local offsets.
    off_weight = hyperparameters.get('off_weight', 1)
    # Loss weight for bounding box size.
    wh_weight = hyperparameters.get('wh_weight', 0.1)
    # ReID loss: ce | triplet
    id_loss = hyperparameters.get('id_loss', 'ce')
    # Loss weight for id.
    id_weight = hyperparameters.get('id_weight', 1)
    # Feature dim for ReID.
    reid_dim = hyperparameters.get('reid_dim', 128)
    # The remaining flags are also parsed but currently unused (see NOTE).
    ltrb = hyperparameters.get('ltrb', True)              # regress l,t,r,b of bbox
    norm_wh = hyperparameters.get('norm_wh', True)        # L1(y_hat/y, 1) vs L1(y_hat, y)
    dense_wh = hyperparameters.get('dense_wh', True)      # weighted regression near center
    cat_spec_wh = hyperparameters.get('cat_spec_wh', True)  # category-specific bbox size
    not_reg_offset = hyperparameters.get('not_reg_offset', True)  # skip local offset regression

    gpus_per_host = int(os.environ['SM_NUM_GPUS'])
    gpus = ','.join(str(i) for i in range(gpus_per_host))

    # Scale loader workers, learning rate and batch size linearly with the
    # number of GPUs on the host (linear-scaling rule for multi-GPU training).
    num_workers = gpus_per_host
    lr *= gpus_per_host
    batch_size *= gpus_per_host

    train_cmd = f"""
cd /fairmot/src && python train.py mot \
--batch_size {batch_size} \
--num_epochs {num_epochs} \
--lr_step '{lr_step}' \
--data_cfg {train_data_dir}/{data_name}/data.json \
--num_workers {num_workers} \
--reg_loss {reg_loss} \
--hm_weight {hm_weight} \
--off_weight {off_weight} \
--wh_weight {wh_weight} \
--id_loss {id_loss} \
--id_weight {id_weight} \
--reid_dim {reid_dim} \
--arch {arch} \
--input_h {input_h} \
--input_w {input_w} \
--lr {lr} \
--val_intervals {val_intervals} \
--gpus {gpus} \
--save_dir {model_dir} \
--batch_size_val {batch_size_val} \
--num_workers_val {num_workers_val} \
"""

    if load_model:
        train_cmd += f" --load_model {train_data_dir}/pretrained-models/{load_model}"

    print("--------Begin Model Training Command----------")
    print(train_cmd)
    print("--------End Model Training Comamnd------------")
    exitcode = 0
    try:
        # shell=True is required because the command includes `cd ... &&`;
        # all interpolated values come from the (trusted) SageMaker job config.
        process = subprocess.Popen(
            train_cmd,
            encoding='utf-8',
            shell=True,
            stdout=subprocess.PIPE,
            # Merge stderr into stdout instead of discarding it, so training
            # errors show up in the CloudWatch/job logs.
            stderr=subprocess.STDOUT)

        # Stream trainer output line by line; iterating the pipe (rather than
        # polling) also drains any buffered lines emitted just before exit.
        for line in process.stdout:
            print(line.strip())

        exitcode = process.wait()
        print(f"exit code:{exitcode}")
        # BUG FIX: the previous version reset exitcode to 0 here, which made
        # failed training runs look successful to SageMaker. Propagate the
        # trainer's real exit code instead.
    except Exception as e:
        print("train exception occured", file=sys.stderr)
        exitcode = 1
        print(str(e), file=sys.stderr)
    finally:
        # Best-effort: always preserve logs alongside the model artifacts.
        if copy_logs_to_model_dir:
            copy_files(log_dir, model_dir)

    sys.stdout.flush()
    sys.stderr.flush()
    # Exit with the trainer's status so SageMaker marks the job accordingly.
    sys.exit(exitcode)