in container-dp/resources/train.py [0:0]
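# Entry point for the SageMaker training container: dumps the SM_* training
# environment, resolves hyperparameters from SM_HPS with defaults, assembles
# a FairMOT train.py command line, and streams the subprocess output back to
# the job log.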
import json
import os
import subprocess
import sys


def train():
    import pprint
    # Dump the full SageMaker environment for debugging.
    pprint.pprint(dict(os.environ), width=1)

    model_dir = os.environ['SM_MODEL_DIR']
    # Write logs to the optional log channel when one is configured;
    # otherwise log into the model directory and skip the final copy.
    copy_logs_to_model_dir = 'SM_CHANNEL_LOG' in os.environ
    log_dir = os.environ.get('SM_CHANNEL_LOG', model_dir)
    train_data_dir = os.environ['SM_CHANNEL_TRAIN']
    # SageMaker passes hyperparameters as a JSON object in SM_HPS;
    # fall back to a default for every key the caller did not set.
    hyperparameters = json.loads(os.environ['SM_HPS'])
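    # Illustrative only: a hypothetical SM_HPS payload, assuming values are
    # already JSON-decoded so numeric keys arrive as numbers:
    #   {"arch": "dla_34", "batch_size": 12, "lr": 0.0001, "num_epochs": 30}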
    data_name = hyperparameters.get('data_name', 'MOT20')
    load_model = hyperparameters.get('load_model', '')
    num_workers = hyperparameters.get('num_workers', 0)
    num_workers_val = hyperparameters.get('num_workers_val', 2)
    # Model architecture.
    # Currently tested: resdcn_34 | resdcn_50 | resfpndcn_34 | dla_34 | hrnet_18
    arch = hyperparameters.get('arch', 'dla_34')
    # Input height; -1 for the dataset default.
    input_h = hyperparameters.get('input_h', 608)
    # Input width; -1 for the dataset default.
    input_w = hyperparameters.get('input_w', 1088)
    # Learning rate for batch size 12.
    lr = hyperparameters.get('lr', 1e-4)
    # Epoch at which to drop the learning rate by 10.
    lr_step = hyperparameters.get('lr_step', 15)
    # Total training epochs.
    num_epochs = hyperparameters.get('num_epochs', 30)
    # Batch size; for MOT, less than 12 should be set on an NVIDIA V100 GPU.
    batch_size = hyperparameters.get('batch_size', 12)
    # Validation batch size; same guidance as batch_size.
    batch_size_val = hyperparameters.get('batch_size_val', 12)
    # Should be greater than 1500 on MOT20.
    max_label = hyperparameters.get('max_label', 1500)
    # Default -1 means #samples / batch_size.
    num_iters = hyperparameters.get('num_iters', -1)
    # Number of epochs between validation runs.
    val_intervals = hyperparameters.get('val_intervals', 5)
    # Max number of output objects.
    paramK = hyperparameters.get('K', 500)
    # #############################################
    # Loss
    # #############################################
    # Use MSE loss (rather than focal loss) to train keypoint heatmaps.
    mse_loss = hyperparameters.get('mse_loss', True)
    # Regression loss: sl1 | l1 | l2
    reg_loss = hyperparameters.get('reg_loss', 'l1')
    # Loss weight for keypoint heatmaps.
    hm_weight = hyperparameters.get('hm_weight', 1)
    # Loss weight for keypoint local offsets.
    off_weight = hyperparameters.get('off_weight', 1)
    # Loss weight for bounding box size.
    wh_weight = hyperparameters.get('wh_weight', 0.1)
    # Re-ID loss: ce | triplet
    id_loss = hyperparameters.get('id_loss', 'ce')
    # Loss weight for re-ID.
    id_weight = hyperparameters.get('id_weight', 1)
    # Feature dimension for re-ID embeddings.
    reid_dim = hyperparameters.get('reid_dim', 128)
    # Regress left, top, right, bottom of the bbox.
    ltrb = hyperparameters.get('ltrb', True)
    # L1(\hat{y} / y, 1) or L1(\hat{y}, y)
    norm_wh = hyperparameters.get('norm_wh', True)
    # Apply weighted regression near the center, or just regress the center point.
    dense_wh = hyperparameters.get('dense_wh', True)
    # Category-specific bounding box size.
    cat_spec_wh = hyperparameters.get('cat_spec_wh', True)
    # Do not regress the local offset.
    not_reg_offset = hyperparameters.get('not_reg_offset', True)
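    # Note: max_label, num_iters, paramK, mse_loss, ltrb, norm_wh, dense_wh,
    # cat_spec_wh, and not_reg_offset are parsed above but not forwarded to
    # the training command assembled below.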
    gpus_per_host = int(os.environ['SM_NUM_GPUS'])
    gpus = ','.join(str(i) for i in range(gpus_per_host))
    # Scale data-loading workers, learning rate, and batch size with the
    # number of GPUs (the learning rate follows the linear scaling rule).
    num_workers = gpus_per_host
    lr *= gpus_per_host
    batch_size *= gpus_per_host
    train_cmd = f"""
    cd /fairmot/src && python train.py mot \
    --batch_size {batch_size} \
    --num_epochs {num_epochs} \
    --lr_step '{lr_step}' \
    --data_cfg {train_data_dir}/{data_name}/data.json \
    --num_workers {num_workers} \
    --reg_loss {reg_loss} \
    --hm_weight {hm_weight} \
    --off_weight {off_weight} \
    --wh_weight {wh_weight} \
    --id_loss {id_loss} \
    --id_weight {id_weight} \
    --reid_dim {reid_dim} \
    --arch {arch} \
    --input_h {input_h} \
    --input_w {input_w} \
    --lr {lr} \
    --val_intervals {val_intervals} \
    --gpus {gpus} \
    --save_dir {model_dir} \
    --batch_size_val {batch_size_val} \
    --num_workers_val {num_workers_val} \
    """
    if len(load_model) > 0:
        train_cmd += f" --load_model {train_data_dir}/pretrained-models/{load_model}"

    print("--------Begin Model Training Command----------")
    print(train_cmd)
    print("--------End Model Training Command------------")
    exitcode = 0
    try:
        # Launch training as a shell subprocess; stderr is intentionally
        # discarded, so all progress must come through stdout.
        process = subprocess.Popen(
            train_cmd,
            encoding='utf-8',
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL)
        # Stream training output line by line until the pipe closes, so no
        # trailing output is lost when the process exits.
        for output in process.stdout:
            print(output.strip())
        exitcode = process.wait()
        print(f"exit code: {exitcode}")
    except Exception as e:
        print("train exception occurred", file=sys.stderr)
        exitcode = 1
        print(str(e), file=sys.stderr)
    finally:
        if copy_logs_to_model_dir:
            copy_files(log_dir, model_dir)
        sys.stdout.flush()
        sys.stderr.flush()
        # Propagate the training exit code so SageMaker marks failed runs as failed.
        sys.exit(exitcode)
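

# A minimal entry-point sketch, assuming the container runs this module
# directly; the surrounding file may already provide its own.
if __name__ == '__main__':
    train()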