# TensorFlow/yolov3/train.py
def main(_argv):
if FLAGS.mode == "eager_tf":
tf.compat.v1.enable_eager_execution()
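    # Let GPU memory grow on demand instead of reserving it all up front.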
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
tf.config.experimental.set_memory_growth(physical_devices[0], True)
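    # Build the training model and pick the anchors/masks for the chosen variant.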
if FLAGS.tiny:
model = YoloV3Tiny(FLAGS.size, training=True,
classes=FLAGS.num_classes)
anchors = yolo_tiny_anchors
anchor_masks = yolo_tiny_anchor_masks
else:
model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes)
anchors = yolo_anchors
anchor_masks = yolo_anchor_masks
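    # With --trace, request full RunMetadata (step timelines and partition
    # graphs) and make sure the output directories exist.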
if FLAGS.trace:
run_options = tf.compat.v1.RunOptions(
output_partition_graphs=True,
trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
run_metadata = tf.compat.v1.RunMetadata()
trace_dir = os.path.join("traces", "training")
if not os.path.isdir(trace_dir):
os.makedirs(trace_dir)
graphs_dir = os.path.join("traces", "training", "graphs")
if not os.path.isdir(graphs_dir):
os.makedirs(graphs_dir)
else:
run_options = None
run_metadata = None
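    # Training input pipeline: shuffle, batch, preprocess images and targets,
    # then repeat and prefetch.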
train_dataset = dataset.load_fake_dataset()
if FLAGS.dataset:
train_dataset = dataset.load_tfrecord_dataset(
FLAGS.dataset, FLAGS.classes, FLAGS.size)
train_dataset = train_dataset.shuffle(buffer_size=512)
train_dataset = train_dataset.batch(FLAGS.batch_size)
train_dataset = train_dataset.map(lambda x, y: (
dataset.transform_images(x, FLAGS.size),
dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.prefetch(
buffer_size=tf.data.experimental.AUTOTUNE)
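    # Validation pipeline: same preprocessing, but no shuffling.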
val_dataset = dataset.load_fake_dataset()
if FLAGS.val_dataset:
val_dataset = dataset.load_tfrecord_dataset(
FLAGS.val_dataset, FLAGS.classes, FLAGS.size)
val_dataset = val_dataset.batch(FLAGS.batch_size)
val_dataset = val_dataset.map(lambda x, y: (
dataset.transform_images(x, FLAGS.size),
dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
val_dataset = val_dataset.repeat()
# TF2 doesn't need this, but we're using TF1.15.
if FLAGS.mode == "fit":
sess = tf.keras.backend.get_session()
sess.run(tf.compat.v1.global_variables_initializer(), options=run_options, run_metadata=run_metadata)
if FLAGS.trace:
fetched_timeline = timeline.Timeline(run_metadata.step_stats)
chrome_trace = fetched_timeline.generate_chrome_trace_format()
with open(os.path.join(trace_dir, f"variables_init.json"), 'w') as f:
f.write(chrome_trace)
for i in range(len(run_metadata.partition_graphs)):
with open(os.path.join(graphs_dir, f"variables_init_partition_{i}.pbtxt"), 'w') as f:
f.write(str(run_metadata.partition_graphs[i]))
sess.run(tf.compat.v1.tables_initializer(), options=run_options, run_metadata=run_metadata)
if FLAGS.trace:
fetched_timeline = timeline.Timeline(run_metadata.step_stats)
chrome_trace = fetched_timeline.generate_chrome_trace_format()
with open(os.path.join(trace_dir, f"table_init.json"), 'w') as f:
f.write(chrome_trace)
for i in range(len(run_metadata.partition_graphs)):
with open(os.path.join(graphs_dir, f"table_init_partition_{i}.pbtxt"), 'w') as f:
f.write(str(run_metadata.partition_graphs[i]))
# Configure the model for transfer learning
if FLAGS.transfer == 'none':
pass # Nothing to do
elif FLAGS.transfer in ['darknet', 'no_output']:
        # Darknet transfer is a special case that works even with an
        # incompatible number of classes; the top layers are reset.
if FLAGS.tiny:
model_pretrained = YoloV3Tiny(
FLAGS.size, training=True, classes=FLAGS.weights_num_classes or FLAGS.num_classes)
else:
model_pretrained = YoloV3(
FLAGS.size, training=True, classes=FLAGS.weights_num_classes or FLAGS.num_classes)
model_pretrained.load_weights(FLAGS.weights)
if FLAGS.transfer == 'darknet':
model.get_layer('yolo_darknet').set_weights(
model_pretrained.get_layer('yolo_darknet').get_weights())
freeze_all(model.get_layer('yolo_darknet'))
        elif FLAGS.transfer == 'no_output':
            for layer in model.layers:
                if not layer.name.startswith('yolo_output'):
                    layer.set_weights(model_pretrained.get_layer(
                        layer.name).get_weights())
                    freeze_all(layer)
else:
        # All other transfer modes require a matching number of classes
model.load_weights(FLAGS.weights)
if FLAGS.transfer == 'fine_tune':
            # freeze darknet and fine-tune the other layers
darknet = model.get_layer('yolo_darknet')
freeze_all(darknet)
elif FLAGS.transfer == 'frozen':
# freeze everything
freeze_all(model)
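    # Optimizer plus one YoloLoss per output scale (one loss per anchor mask).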
    optimizer = tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate)
loss = [YoloLoss(anchors[mask], classes=FLAGS.num_classes)
for mask in anchor_masks]
if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging;
        # non-eager graph mode is recommended for real training.
avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)
for epoch in range(1, FLAGS.epochs + 1):
            # train_dataset repeats indefinitely (for model.fit below), so
            # bound each eager epoch to one pass over the data.
            for batch, (images, labels) in enumerate(
                    train_dataset.take(FLAGS.num_samples // FLAGS.batch_size)):
with tf.GradientTape() as tape:
outputs = model(images, training=True)
regularization_loss = tf.reduce_sum(model.losses)
pred_loss = []
for output, label, loss_fn in zip(outputs, labels, loss):
pred_loss.append(loss_fn(label, output))
total_loss = tf.reduce_sum(pred_loss) + regularization_loss
grads = tape.gradient(total_loss, model.trainable_variables)
optimizer.apply_gradients(
zip(grads, model.trainable_variables))
logging.info("{}_train_{}, {}, {}".format(
epoch, batch, total_loss.numpy(),
list(map(lambda x: np.sum(x.numpy()), pred_loss))))
avg_loss.update_state(total_loss)
            # val_dataset repeats as well, so bound the validation loop too.
            for batch, (images, labels) in enumerate(
                    val_dataset.take(FLAGS.num_val_samples // FLAGS.batch_size)):
outputs = model(images)
regularization_loss = tf.reduce_sum(model.losses)
pred_loss = []
for output, label, loss_fn in zip(outputs, labels, loss):
pred_loss.append(loss_fn(label, output))
total_loss = tf.reduce_sum(pred_loss) + regularization_loss
logging.info("{}_val_{}, {}, {}".format(
epoch, batch, total_loss.numpy(),
list(map(lambda x: np.sum(x.numpy()), pred_loss))))
avg_val_loss.update_state(total_loss)
logging.info("{}, train: {}, val: {}".format(
epoch,
avg_loss.result().numpy(),
avg_val_loss.result().numpy()))
avg_loss.reset_states()
avg_val_loss.reset_states()
model.save_weights(
'checkpoints/yolov3_train_{}.tf'.format(epoch))
else:
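        # 'fit' and 'eager_fit' use Keras' built-in training loop.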
model.compile(optimizer=optimizer, loss=loss,
run_eagerly=(FLAGS.mode == 'eager_fit'),
options=run_options, run_metadata=run_metadata)
callbacks = [
ReduceLROnPlateau(verbose=1),
EarlyStopping(patience=3, verbose=1),
ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
verbose=1, save_weights_only=True),
]
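        # Writes a Chrome timeline trace after every training batch; the
        # partition graphs are dumped only for batch 0 since they don't change.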
class TraceCallback(tf.keras.callbacks.Callback):
def on_epoch_begin(self, epoch, logs=None):
self.current_epoch = epoch
def on_train_batch_end(self, batch, logs=None):
fetched_timeline = timeline.Timeline(run_metadata.step_stats)
chrome_trace = fetched_timeline.generate_chrome_trace_format()
with open(os.path.join(trace_dir, f"training_epoch_{self.current_epoch}_batch_{batch}.json"), 'w') as f:
f.write(chrome_trace)
# No need to dump graph partitions for every batch; they should be identical.
if batch == 0:
for i in range(len(run_metadata.partition_graphs)):
with open(os.path.join(graphs_dir, f"training_partition_{i}.pbtxt"), 'w') as f:
f.write(str(run_metadata.partition_graphs[i]))
if FLAGS.trace:
callbacks.append(TraceCallback())
else:
callbacks.append(TensorBoard(write_graph=False, log_dir="logs"))
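        # The datasets repeat indefinitely, so steps_per_epoch and
        # validation_steps define the epoch boundaries.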
history = model.fit(train_dataset,
epochs=FLAGS.epochs,
callbacks=callbacks,
validation_data=val_dataset,
steps_per_epoch=FLAGS.num_samples // FLAGS.batch_size,
validation_steps=FLAGS.num_val_samples // FLAGS.batch_size)