def _run_ps(env, cluster)

in src/sagemaker_tensorflow_container/training.py [0:0]


def _run_ps(env, cluster):
    logger.info("Running distributed training job with parameter servers")

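    # Describe the cluster topology and derive this host's task index from its
    # position in the ordered host list.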
    cluster_spec = tf.train.ClusterSpec(cluster)
    task_index = env.hosts.index(env.current_host)
    # Force parameter server to run on cpu. Running multiple TensorFlow processes on the same
    # GPU is not safe:
    # https://stackoverflow.com/questions/46145100/is-it-unsafe-to-run-multiple-tensorflow-processes-on-the-same-gpu
    no_gpu_config = tf.compat.v1.ConfigProto(device_count={"GPU": 0})

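    # Start an in-process TensorFlow server for the "ps" job at this host's task index.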
    server = tf.distribute.Server(
        cluster_spec, job_name="ps", task_index=task_index, config=no_gpu_config
    )

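    # server.join() blocks indefinitely, so the parameter server is started in a
    # separate process to keep the calling process free (e.g. to run the worker).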
    multiprocessing.Process(target=lambda: server.join()).start()
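
For context, a minimal sketch of how a cluster dict of this shape might be built and passed to _run_ps. The host names, the ports, and the SimpleNamespace stand-in for the SageMaker environment object are illustrative assumptions, not part of the toolkit.

from types import SimpleNamespace

hosts = ["algo-1", "algo-2"]
cluster = {
    # One parameter server and one worker per host; the ports are assumed for illustration.
    "ps": ["{}:2223".format(host) for host in hosts],
    "worker": ["{}:2222".format(host) for host in hosts],
}
# Stand-in exposing only the attributes _run_ps reads: hosts and current_host.
env = SimpleNamespace(hosts=hosts, current_host="algo-1")

_run_ps(env, cluster)  # starts this host's parameter server (task_index 0) in the background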