in miscellaneous/distributed_tensorflow_mask_rcnn/container-script-mode/resources/train.py [0:0]
def wait_for_training_processes_to_appear_and_finish(
    proccess_id_string, worker, poll_interval_seconds=300
):
    """Wait for matching training processes to start, then to exit.

    The process table is polled with ``ps -elf | grep`` once per interval.
    The function never returns normally; it always ends the interpreter
    through ``sys.exit``.

    Args:
        proccess_id_string: Text searched for in the ``ps -elf`` output.
            It is interpolated unescaped into a shell command (and is
            treated by ``grep`` as a pattern), so it must come from a
            trusted source.
        worker: Identifier of this worker; used only in log messages.
        poll_interval_seconds: Seconds to sleep before every poll. It is
            also the time allowed for training to start, because the first
            poll decides whether training has begun. Defaults to 300.

    Raises:
        SystemExit: With code 0 once processes that were seen running are
            no longer found; with code 1 if no matching process is found
            on the first poll.
        subprocess.CalledProcessError: If a ``ps``/``grep`` pipeline exits
            with a non-zero status.
    """
    training_process_started = False
    while True:
        time.sleep(poll_interval_seconds)

        # Log the raw matching lines so the job output shows what was found.
        training_process_ps = subprocess.check_output(
            f'ps -elf | grep "{proccess_id_string}"', encoding="utf-8", shell=True
        )
        print(training_process_ps)

        training_process_count = subprocess.check_output(
            f'ps -elf | grep "{proccess_id_string}" | wc -l', encoding="utf-8", shell=True
        )
        training_process_count_str = training_process_count.replace("\n", "").strip()
        # NOTE(review): the "- 2" presumably discounts the `sh -c` wrapper and
        # the `grep` process itself, whose command lines both contain the
        # search string -- confirm on the target image.
        training_process_count = int(training_process_count_str) - 2
        training_process_running = training_process_count > 0

        if not training_process_started:
            if not training_process_running:
                print(
                    f"Worker {worker} exiting: training not started in "
                    f"{poll_interval_seconds} seconds."
                )
                sys.exit(1)
            # First sighting of the training processes: from the next poll
            # onwards we wait for them to disappear.
            training_process_started = True
            continue

        print(f"training processes running: {training_process_count}")
        if not training_process_running:
            print(f"Worker {worker} training completed.")
            # Short pause before exiting, kept from the original behaviour.
            time.sleep(5)
            sys.exit(0)