benchmarks/tf_benchmarks/execute_tensorflow

#!/usr/bin/env python
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import argparse
import itertools
import os

from sagemaker import Session
from sagemaker.estimator import Framework
from sagemaker.tensorflow import TensorFlow

# Bound method, called lazily when the ``--checkpoint-path`` default is built.
default_bucket = Session().default_bucket
# Directory containing this script; the training source dir is resolved from it.
dir_path = os.path.dirname(os.path.realpath(__file__))

# Baseline hyperparameters for every job; values supplied on the command line
# (the unrecognised "script args") override these in create_hyperparameters().
_DEFAULT_HYPERPARAMETERS = {
    "batch_size": 32,
    "model": "resnet32",
    "num_epochs": 10,
    "data_format": "NHWC",
    "summary_verbosity": 1,
    "save_summaries_steps": 10,
    "data_name": "cifar10",
}


class ScriptModeTensorFlow(Framework):
    """Temporary estimator used until the final version of Script Mode is released.

    Reuses ``TensorFlow.create_model`` and pins the framework name/version
    used by this benchmark.
    """

    __framework_name__ = "tensorflow-scriptmode-beta"

    create_model = TensorFlow.create_model

    def __init__(self, py_version="py3", **kwargs):
        """Initialise the estimator.

        Args:
            py_version: Python version label stored on the estimator.
            **kwargs: Forwarded unchanged to ``Framework.__init__``.
        """
        super(ScriptModeTensorFlow, self).__init__(**kwargs)
        self.py_version = py_version
        self.image_name = None
        self.framework_version = "1.10.0"


def get_args():
    """Parse the command line.

    Returns:
        The ``(namespace, remaining)`` pair produced by
        ``ArgumentParser.parse_known_args``: the recognised launcher options
        and the list of unrecognised tokens, which callers treat as
        hyperparameter overrides for the training script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t", "--instance-types", nargs="+", help="<Required> Set flag", required=True
    )
    parser.add_argument("-r", "--role", required=True)
    parser.add_argument("-w", "--wait", action="store_true")
    parser.add_argument("--region", default="us-west-2")
    parser.add_argument("--py-versions", nargs="+", help="<Required> Set flag", default=["py3"])
    parser.add_argument(
        "--checkpoint-path",
        default=os.path.join(default_bucket(), "benchmarks", "checkpoints"),
        help="The S3 location where the model checkpoints and tensorboard events are saved after training",
    )
    return parser.parse_known_args()


def _job_base_name(py_version, instance_type):
    """Build the ``<py_version>-<family>-<size>`` base name for one job.

    For dotted three-part instance types (e.g. ``ml.p3.2xlarge``) the family
    and size are taken from the second and third components, so families of
    any length (``p3dn``, ``c5n``) are handled. Any other shape falls back to
    the fixed-width slicing this script has always used.
    """
    parts = instance_type.split(".")
    if len(parts) == 3:
        family, size = parts[1], parts[2]
    else:
        family, size = instance_type[3:5], instance_type[6:]
    return "%s-%s-%s" % (py_version, family, size)


def main(args, script_args):
    """Launch one training job per (instance type, Python version) pair.

    Args:
        args: Parsed launcher options as returned by ``get_args`` (uses
            ``instance_types``, ``py_versions``, ``role``, ``region``,
            ``wait`` and ``checkpoint_path``).
        script_args: Unrecognised command-line tokens, forwarded to the
            training script as hyperparameter overrides.
    """
    # The training channel does not depend on the job, so compute it once.
    input_dir = "s3://sagemaker-sample-data-%s/spark/mnist/train/" % args.region

    for instance_type, py_version in itertools.product(args.instance_types, args.py_versions):
        base_name = _job_base_name(py_version, instance_type)
        model_dir = os.path.join(args.checkpoint_path, base_name)

        job_hps = create_hyperparameters(model_dir, script_args)

        print("hyperparameters:")
        print(job_hps)

        estimator = ScriptModeTensorFlow(
            entry_point="tf_cnn_benchmarks.py",
            # Honour the required --role argument instead of a hard-coded role.
            role=args.role,
            source_dir=os.path.join(dir_path, "tf_cnn_benchmarks"),
            base_job_name=base_name,
            train_instance_count=1,
            hyperparameters=job_hps,
            train_instance_type=instance_type,
            # Run the job with the Python version it is named after.
            py_version=py_version,
        )

        estimator.fit({"train": input_dir}, wait=args.wait)

    print("To use TensorBoard, execute the following command:")
    cmd = "S3_USE_HTTPS=0 S3_VERIFY_SSL=0 AWS_REGION=%s tensorboard --host localhost --port 6006 --logdir %s"
    print(cmd % (args.region, args.checkpoint_path))


def create_hyperparameters(model_dir, script_args):
    """Merge default hyperparameters with per-job paths and CLI overrides.

    Args:
        model_dir: Location stored under both ``train_dir`` and ``eval_dir``.
        script_args: Flat list read as alternating ``flag, value`` tokens
            (even positions are flags, odd positions are values). A leading
            ``--`` is stripped from a flag; otherwise its first character is
            dropped. A trailing flag without a value is ignored because
            ``zip`` stops at the shorter sequence.

    Returns:
        A new dict; ``_DEFAULT_HYPERPARAMETERS`` itself is not modified.
    """
    job_hps = _DEFAULT_HYPERPARAMETERS.copy()

    job_hps.update({"train_dir": model_dir, "eval_dir": model_dir})

    script_arg_keys_without_dashes = [
        key[2:] if key.startswith("--") else key[1:] for key in script_args[::2]
    ]
    script_arg_values = script_args[1::2]
    job_hps.update(dict(zip(script_arg_keys_without_dashes, script_arg_values)))

    return job_hps


if __name__ == "__main__":
    args, script_args = get_args()
    main(args, script_args)

benchmarks/tf_benchmarks/execute_tensorflow_training.py (74 lines of code) (raw):