std::unordered_map<int, std::string> get_id_mapping()

in source/neuropod/internal/cuda_device_mapping.cc [141:214]


// Builds a mapping from CUDA device index to the UUID string NVML reports for
// that device.
//
// For every device index in [0, cudaGetDeviceCount), the PCI bus id is obtained
// from CUDA, translated into an NVML device handle, and the handle's UUID is
// stored in the result.
//
// Returns an empty map when:
//   - CUDA or NVML could not be loaded,
//   - the device count could not be queried or no devices are present,
//   - any per-device CUDA or NVML lookup fails (the mapping is all-or-nothing,
//     so callers never see a partially populated or garbage entry).
std::unordered_map<int, std::string> get_id_mapping()
{
    // Make sure our logging is initialized
    init_logging();

    if (!load_cuda() || !load_nvml())
    {
        // Couldn't load CUDA or NVML so we can't do anything else
        return {};
    }

    // Get device count
    // Based on https://github.com/pytorch/pytorch/blob/master/c10/cuda/CUDAFunctions.h#L19
    int device_count = 0;
    int err          = cudaGetDeviceCount(&device_count);

    // Check if CUDA gave us an error
    if (err != 0 /* cudaSuccess */)
    {
        // Clear out the error state, so we don't spuriously trigger someone else.
        cudaGetLastError();
        SPDLOG_DEBUG("Error when getting number of GPU devices");
        return {};
    }

    // Check if we have a GPU
    if (device_count <= 0)
    {
        SPDLOG_DEBUG("No GPUs available");
        return {};
    }

    std::unordered_map<int, std::string> id_mapping;
    for (int i = 0; i < device_count; i++)
    {
        // Get the UUID from the device ID

        // At most 13 chars according to
        // https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1gea264dad3d8c4898e0b82213c0253def
        // Zero-initialized so the buffer is always a valid C string.
        char pciBusId[13] = {};
        err               = cudaDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), i);
        if (err != 0 /* cudaSuccess */)
        {
            // Clear out the error state, so we don't spuriously trigger someone else.
            cudaGetLastError();
            SPDLOG_ERROR("Error when getting pciBusId for GPU {}", i);
            return {};
        }

        // Get an NVML device handle
        nvmlDevice_t device;
        err = nvmlDeviceGetHandleByPciBusId(pciBusId, &device);
        if (err != 0 /* NVML_SUCCESS */)
        {
            SPDLOG_ERROR("NVML error when getting device from pciBusId: {}", nvmlErrorString(err));

            // `device` was never populated; using it below would be undefined behavior.
            return {};
        }

        // Get a UUID from the handle
        // At most 80 chars according to
        // https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g84dca2d06974131ccec1651428596191
        // Zero-initialized so the buffer is always a valid C string.
        char uuid[80] = {};
        err           = nvmlDeviceGetUUID(device, uuid, sizeof(uuid));
        if (err != 0 /* NVML_SUCCESS */)
        {
            SPDLOG_ERROR("NVML error when getting uuid from device: {}", nvmlErrorString(err));

            // `uuid` holds no meaningful data; don't log or store it.
            return {};
        }

        SPDLOG_INFO("Found GPU {} with UUID {}", i, uuid);

        id_mapping.emplace(i, uuid);
    }

    return id_mapping;
}