def wait_for_training_processes_to_appear_and_finish()

in miscellaneous/distributed_tensorflow_mask_rcnn/container-optimized-script-mode/resources/train.py [0:0]


def _list_matching_processes(pattern):
    """Return the lines of one ``ps -elf`` snapshot that match *pattern*.

    *pattern* is handed to ``grep -e`` unchanged, so it keeps grep's
    basic-regex semantics. Neither command goes through a shell, and
    ``grep`` is only started after ``ps`` has finished, so no helper
    process can show up in (and inflate) the result.

    Raises:
        subprocess.CalledProcessError: if ``ps`` fails, or if ``grep``
            reports an error (exit status greater than 1).
    """
    snapshot = subprocess.run(
        ["ps", "-elf"], stdout=subprocess.PIPE, encoding="utf-8", check=True
    ).stdout
    grep = subprocess.run(
        ["grep", "-e", pattern],
        input=snapshot,
        stdout=subprocess.PIPE,
        encoding="utf-8",
    )
    # grep exits 0 on a match and 1 on "no match"; only >1 is a real failure.
    if grep.returncode > 1:
        raise subprocess.CalledProcessError(
            grep.returncode, grep.args, output=grep.stdout
        )
    return grep.stdout.splitlines()


def wait_for_training_processes_to_appear_and_finish(
    proccess_id_string, worker, poll_interval=300
):
    """Wait for matching training processes to start, then to exit.

    Every ``poll_interval`` seconds the process table is inspected for
    lines matching ``proccess_id_string``. This function never returns
    normally; it always ends the interpreter via ``sys.exit``:

    * exit status 1 if no matching process is visible at the first poll;
    * exit status 0 once matching processes were seen at one poll and none
      are left at a later poll.

    Args:
        proccess_id_string: grep basic-regex pattern identifying the
            training processes in ``ps -elf`` output.
        worker: identifier of this worker; only used in log messages.
        poll_interval: seconds to sleep before each poll (default 300).

    Raises:
        SystemExit: always, with status 0 or 1 as described above.
        subprocess.CalledProcessError: if ``ps`` or ``grep`` fails.
    """
    training_process_started = False
    while True:
        time.sleep(poll_interval)
        matching_lines = _list_matching_processes(proccess_id_string)
        print("\n".join(matching_lines))
        training_process_count = len(matching_lines)

        if not training_process_started:
            if training_process_count == 0:
                print(
                    f"Worker {worker} exiting: training not started in "
                    f"{poll_interval} seconds."
                )
                sys.exit(1)
            training_process_started = True
            continue

        print(f"training processes running: {training_process_count}")
        if training_process_count == 0:
            print(f"Worker {worker} training completed.")
            # NOTE(review): short pause kept from the original; presumably
            # gives log output time to flush before exit -- confirm.
            time.sleep(5)
            sys.exit(0)