# 07_training/serverlessml/flowers/classifier/train.py
#!/usr/bin/env python
# Copyright 2020 Google Inc. Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import os, shutil
import argparse
import hypertune
from distutils.util import strtobool
from tensorflow.data.experimental import AUTOTUNE
from flowers.utils.util import cleanup_dir, create_strategy
from flowers.ingest.tfrecords import *
from flowers.classifier.model import *
from flowers.utils.plots import *
from flowers.classifier.model import MODEL_IMG_SIZE
def train_and_evaluate(strategy, opts):
    """Train the flowers classifier, export it, and report the tuning metric.

    Args:
        strategy: tf.distribute.Strategy under which the model is created and
            compiled (so variables are placed on the distributed devices).
        opts: dict of job options; reads 'crop_ratio', 'input_topdir',
            'pattern', 'batch_size', 'num_training_examples', 'outdir',
            'lrate', and 'num_epochs'.

    Returns:
        The trained Keras model.
    """
    # Images are first padded/resized to IMG_HEIGHT x IMG_WIDTH and then
    # center-cropped down to MODEL_IMG_SIZE, so the pre-crop size is the
    # model size scaled up by 1/crop_ratio.
    IMG_HEIGHT = IMG_WIDTH = round(MODEL_IMG_SIZE / opts['crop_ratio'])
    print('Will pad input images to {}x{}, then crop them to {}x{}'.format(
        IMG_HEIGHT, IMG_WIDTH, MODEL_IMG_SIZE, MODEL_IMG_SIZE
    ))
    IMG_CHANNELS = 3

    train_dataset = create_preproc_dataset(
        os.path.join(opts['input_topdir'], 'train' + opts['pattern']),
        IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS
    ).batch(opts['batch_size'])
    eval_dataset = create_preproc_dataset(
        os.path.join(opts['input_topdir'], 'valid' + opts['pattern']),
        IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS
    ).batch(opts['batch_size'])

    # If a virtual-epoch size was specified, repeat the training dataset
    # indefinitely and cap each epoch at the corresponding number of steps.
    num_steps_per_epoch = None
    if (opts['num_training_examples'] > 0):
        train_dataset = train_dataset.repeat()
        num_steps_per_epoch = opts['num_training_examples'] // opts['batch_size']
        print("Will train for {} steps".format(num_steps_per_epoch))

    # Keep only the best checkpoint (by validation accuracy) and stop early
    # once val_accuracy fails to improve for 2 consecutive epochs.
    model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(opts['outdir'], 'chkpts'),
        monitor='val_accuracy', mode='max',
        save_best_only=True)
    early_stopping_cb = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', mode='max',
        patience=2)

    # Model creation and compilation must happen inside the strategy scope.
    with strategy.scope():
        model = create_model(opts, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=opts['lrate']),
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(
                          from_logits=False),
                      metrics=['accuracy']
                      )
    # Bug fix: Model.summary() prints the summary itself and returns None;
    # the original print(model.summary()) emitted a stray "None" line.
    model.summary()

    history = model.fit(train_dataset,
                        validation_data=eval_dataset,
                        epochs=opts['num_epochs'],
                        steps_per_epoch=num_steps_per_epoch,
                        callbacks=[model_checkpoint_cb, early_stopping_cb]
                        )
    training_plot(['loss', 'accuracy'], history,
                  os.path.join(opts['outdir'], 'training_plot.png'))

    # Export the trained model for serving.
    export_model(model,
                 opts['outdir'],
                 IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)

    # Report the best validation accuracy seen during training to the
    # hyperparameter tuning service (global_step = number of epochs run).
    hpt = hypertune.HyperTune()
    accuracy = np.max(history.history['val_accuracy'])  # highest encountered
    nepochs = len(history.history['val_accuracy'])
    hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='accuracy',
        metric_value=accuracy,
        global_step=nepochs)
    print("Reported hparam metric name=accuracy value={}".format(accuracy))
    return model
if __name__ == '__main__':
    def _parse_bool(x):
        # Drop-in replacement for distutils.util.strtobool: distutils was
        # deprecated by PEP 632 and removed in Python 3.12. Accepts the same
        # truthy/falsy strings and raises ValueError otherwise, but returns
        # a real bool (the original wrapped strtobool's int in bool()).
        value = str(x).strip().lower()
        if value in ('y', 'yes', 't', 'true', 'on', '1'):
            return True
        if value in ('n', 'no', 'f', 'false', 'off', '0'):
            return False
        raise ValueError('invalid truth value {!r}'.format(x))

    parser = argparse.ArgumentParser()

    ## Training parameters
    parser.add_argument(
        '--job-dir', help='Top-level output directory', required=True)
    parser.add_argument(
        '--input_topdir', help='Top-level directory of the TF Records',
        default='gs://practical-ml-vision-book-data/flowers_tfr'
    )
    parser.add_argument(
        '--pattern', help='Files in {input_topdir}/train to read',
        default='-0000[01]-*')
    parser.add_argument(
        '--num_epochs', help='How many times to iterate over training patterns',
        default=3, type=int)
    parser.add_argument(
        '--num_training_examples',
        help='Number of examples to term as a virtual epoch. If not specified, will use actual number of examples.',
        default=-1, type=int)
    parser.add_argument(
        '--distribute', default='gpus_one_machine',
        help="""
        Has to be one of:
          * cpu
          * gpus_one_machine
          * gpus_multiple_machines
          * tpu_colab
          * tpu_caip
          * the actual name of the cloud_tpu
        """)
    parser.add_argument('--resume', dest='resume', action='store_true',
                        help="Starts from checkpoints in output directory")

    ## model parameters
    parser.add_argument(
        '--batch_size', help='Number of records in a batch', default=32, type=int)
    parser.add_argument(
        '--l1', help='L1 regularization', default=0., type=float)
    parser.add_argument(
        '--l2', help='L2 regularization', default=0., type=float)
    parser.add_argument(
        '--lrate', help='Adam learning rate', default=0.001, type=float)
    parser.add_argument(
        '--num_hidden', help='Number of nodes in last but one layer', default=16, type=int)
    parser.add_argument(
        '--crop_ratio', help='Images are center-cropped to this ratio', default=0.5, type=float)
    parser.add_argument('--with_color_distort',
                        type=_parse_bool, nargs='?', const=True, default=True,
                        help="Specify True or False. Default is True.")

    # parse arguments, and set up outdir based on job-dir
    # job-dir is set by CAIP, but outdir is what our code wants.
    args = parser.parse_args()
    opts = vars(args)  # argparse turns --job-dir into the 'job_dir' key
    opts['outdir'] = opts['job_dir']
    print("Job Parameters={}".format(opts))

    # Unless resuming, wipe stale checkpoints so training starts fresh.
    if not opts['resume']:
        cleanup_dir(os.path.join(opts['outdir'], 'chkpts'))

    # Train, evaluate, export
    strategy = create_strategy(opts['distribute'])  # has to be first/early call in program
    train_and_evaluate(strategy, opts)