def _query_num_gpus()

in src/vw-serving/src/vw_serving/sagemaker/gpu.py [0:0]


def _query_num_gpus():
    """
    Returns the number of GPU devices on the host. Returns 0 if the host has no GPU devices.

    The count is obtained by running ``nvidia-smi -L`` through the shell and counting the
    lines that match ``GPU [0-9]``. A successfully parsed count is stored in the module-level
    ``_num_gpus`` so the command runs at most once; failures are not cached, so a later call
    retries.

    :return: the GPU count reported by the command, or 0 if the shell command cannot be
        launched or its output cannot be parsed as an integer.
    :raises TimeoutError: if the command has not finished within the timeout (75 seconds).
    """

    global _num_gpus
    if _num_gpus is None:
        COMMAND = 'nvidia-smi -L 2>/dev/null | grep \'GPU [0-9]\' | wc -l'
        TIMEOUT_SECONDS = 75
        STATUS_POLL_INTERVAL_SECONDS = 0.025

        try:
            # bufsize is left at its default: line buffering (bufsize=1) is only meaningful
            # for text-mode pipes and this pipe is read as bytes.
            proc = subprocess.Popen(COMMAND, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
        except (OSError, ValueError):
            logging.exception("Error launching /usr/bin/nvidia-smi.")
            return 0

        start_time = time.time()

        # Wait for the process to finish. Poll once before sleeping so a command that has
        # already exited does not cost an extra poll interval.
        exitcode = proc.poll()
        while exitcode is None and time.time() - start_time < TIMEOUT_SECONDS:
            time.sleep(STATUS_POLL_INTERVAL_SECONDS)
            exitcode = proc.poll()

        # Stop the process if not finished, releasing the pipe and reaping the child so
        # neither a file descriptor nor a zombie process is left behind.
        if exitcode is None:
            elapsed = time.time() - start_time
            logging.error("nvidia-smi timed out after %s secs", elapsed)
            # SIGKILL cannot be ignored, so the wait() below cannot block indefinitely.
            # Only the shell is signalled; commands it spawned may outlive it.
            proc.kill()
            proc.stdout.close()
            proc.wait()
            raise TimeoutError("nvidia-smi timed out after %s secs" % elapsed)

        # The process has exited, so communicate() returns straight away; it drains the
        # pipe, closes it and reaps the child.
        output, _ = proc.communicate()

        # stderr is merged into stdout, so diagnostics may precede the count. The count
        # printed by `wc -l` is the last line of output.
        lines = output.splitlines()
        try:
            gpu_count = int(lines[-1])
        except (IndexError, ValueError):
            logging.error("Unexpected output from nvidia-smi: %r", output)
            return 0

        _num_gpus = gpu_count
        logging.info("nvidia-smi took: %s secs to identify %d gpus", time.time() - start_time, _num_gpus)

    return _num_gpus