in src/vw-serving/src/vw_serving/sagemaker/gpu.py [0:0]
def _query_num_gpus():
    """
    Returns the number of GPU devices on the host. Returns 0 if the host has no GPU devices.

    The count is obtained by running ``nvidia-smi -L`` through the shell and counting the
    lines that look like ``GPU <n>``. A successfully parsed count is stored in the
    module-level ``_num_gpus`` so the command runs at most once per process; later calls
    return the cached value.

    Returns 0 without caching when the command cannot be launched or when its output
    cannot be parsed as an integer, so a later call retries the query.

    :return: number of GPU devices reported by nvidia-smi
    :raises TimeoutError: if the command does not finish within the timeout
    """
    global _num_gpus
    if _num_gpus is not None:
        return _num_gpus

    COMMAND = 'nvidia-smi -L 2>/dev/null | grep \'GPU [0-9]\' | wc -l'
    TIMEOUT_SECONDS = 75
    # Grace period for the shell to exit after SIGTERM before escalating to SIGKILL.
    REAP_TIMEOUT_SECONDS = 5

    # A monotonic clock keeps the timeout and the logged duration immune to
    # wall-clock adjustments.
    start_time = time.monotonic()
    try:
        # No bufsize=1 here: line buffering is not supported on binary pipes and
        # only produces a RuntimeWarning on newer Python versions.
        proc = subprocess.Popen(COMMAND, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    except (OSError, ValueError):
        logging.exception("Error launching /usr/bin/nvidia-smi.")
        return 0

    try:
        # communicate() waits for exit, drains the pipe and closes it, so neither a
        # file descriptor nor a zombie process is left behind on the normal path.
        output, _ = proc.communicate(timeout=TIMEOUT_SECONDS)
    except subprocess.TimeoutExpired:
        logging.error("nvidia-smi timed out after %s secs", time.monotonic() - start_time)
        proc.terminate()
        # Close our end of the pipe explicitly rather than reading to EOF: children of
        # the shell may still hold the write end open and would block a second read.
        proc.stdout.close()
        try:
            proc.wait(timeout=REAP_TIMEOUT_SECONDS)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
        raise TimeoutError("nvidia-smi timed out")

    # Only the first line is relevant: `wc -l` prints a single number.
    first_line = output.split(b"\n", 1)[0]
    try:
        num_gpus = int(first_line)
    except ValueError:
        # stderr of the shell is merged into stdout, so a missing grep/wc shows up here
        # as a non-numeric message. Treat it like a launch failure.
        logging.exception("Unexpected output from nvidia-smi: %r", first_line)
        return 0

    _num_gpus = num_gpus
    logging.info("nvidia-smi took: %s secs to identify %d gpus", time.monotonic() - start_time, _num_gpus)
    return _num_gpus