def main()

in ec2-spot-deep-learning-training/ec2_spot_keras_training.py [0:0]


def main():
    """Run one training job end to end, resuming from a checkpoint if present.

    In order, this:

    1. derives the dataset and checkpoint directories under the
       ``/dltraining/`` mount point and records today's date;
    2. loads the train/test splits via ``load_prepare_dataset``;
    3. obtains a model, either from ``load_checkpoint_model`` when the
       checkpoint directory exists and is non-empty, or from
       ``cifar10_model`` with the starting epoch set to 0;
    4. builds the callbacks, compiles the model and fits it from the
       starting epoch up to the configured number of epochs;
    5. evaluates on the test split and prints loss and accuracy;
    6. copies the cloud-init output log onto the mounted volume, with
       today's date in the file name.

    Takes no arguments and returns nothing.
    """
    # --- Configuration -----------------------------------------------------
    mount_root = '/dltraining/'
    samples_per_batch = 512
    total_epochs = 50
    data_dir = os.path.join(mount_root, 'datasets')
    ckpt_dir = os.path.join(mount_root, 'checkpoints')
    # Checkpoint file name template with a zero-padded `epoch` placeholder.
    ckpt_name_template = 'cifar10_model.{epoch:03d}.h5'
    # Captured once, up front, so every artifact of this run shares one date
    # stamp even if the run crosses midnight.
    run_date = datetime.datetime.today().strftime('%Y-%m-%d')

    # --- Data --------------------------------------------------------------
    train_split, test_split = load_prepare_dataset(data_dir)
    x_train, y_train = train_split
    x_test, y_test = test_split
    # Per-sample shape: everything after the leading (sample) axis.
    sample_shape = x_train.shape[1:]

    # --- Model: start fresh unless a saved checkpoint can be resumed --------
    ckpt_glob = os.path.join(ckpt_dir, '*')
    can_resume = os.path.isdir(ckpt_dir) and any(glob.glob(ckpt_glob))
    if not can_resume:
        network = cifar10_model(sample_shape)
        start_epoch = 0
    else:
        network, start_epoch = load_checkpoint_model(ckpt_dir, ckpt_name_template)

    # --- Callbacks ---------------------------------------------------------
    training_callbacks = define_callbacks(mount_root, ckpt_dir, ckpt_name_template, run_date)

    # --- Train -------------------------------------------------------------
    # NOTE(review): compile() also runs on a model restored from a checkpoint;
    # if load_checkpoint_model returns an already-compiled model this may
    # discard its saved optimizer state - confirm against that helper.
    compile_options = {
        'optimizer': 'adam',
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    network.compile(**compile_options)
    network.fit(
        x_train,
        y_train,
        batch_size=samples_per_batch,
        epochs=total_epochs,
        initial_epoch=start_epoch,
        callbacks=training_callbacks,
    )

    # --- Evaluate ----------------------------------------------------------
    evaluation = network.evaluate(x_test, y_test, verbose=1)
    print('Test loss:', evaluation[0])
    print('Test accuracy:', evaluation[1])

    # --- Preserve the console log on the mounted volume --------------------
    log_source = '/var/log/cloud-init-output.log'
    log_backup = os.path.join(mount_root, 'cloud-init-output-{}.log'.format(run_date))
    shutil.copy2(log_source, log_backup)