in src/sagemaker_tensorflow_container/training.py [0:0]
def _run_ps(env, cluster):
    logger.info("Running distributed training job with parameter servers")

    cluster_spec = tf.train.ClusterSpec(cluster)
    task_index = env.hosts.index(env.current_host)

    # Force the parameter server to run on CPU. Running multiple TensorFlow processes on the
    # same GPU is not safe:
    # https://stackoverflow.com/questions/46145100/is-it-unsafe-to-run-multiple-tensorflow-processes-on-the-same-gpu
    no_gpu_config = tf.compat.v1.ConfigProto(device_count={"GPU": 0})

    server = tf.distribute.Server(
        cluster_spec, job_name="ps", task_index=task_index, config=no_gpu_config
    )

    # server.join() blocks forever, so start the parameter server in a background process
    # rather than blocking the caller.
    multiprocessing.Process(target=lambda: server.join()).start()
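
A minimal sketch of how _run_ps might be invoked. The host names, ports, and the SimpleNamespace stand-in for the SageMaker training environment are illustrative assumptions, not values taken from the toolkit; the cluster dict simply follows the standard tf.train.ClusterSpec layout with "worker" and "ps" job entries.

    from types import SimpleNamespace

    # Hypothetical two-host cluster layout (illustrative hosts/ports).
    cluster = {
        "worker": ["algo-1:2222", "algo-2:2222"],
        "ps": ["algo-1:2223", "algo-2:2223"],
    }

    # Stand-in for the training environment object, which exposes the host list
    # and the name of the host this process is running on.
    env = SimpleNamespace(hosts=["algo-1", "algo-2"], current_host="algo-1")

    # Starts the "ps" task for this host in a background process; the caller can
    # then continue with its own work (e.g., launching the worker task).
    _run_ps(env, cluster)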