gloo/cuda_private.h (130 lines of code) (raw):

/** * Copyright (c) 2017-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include <cstdint> #include <functional> #include <memory> #include <mutex> #ifdef __linux__ #include "gloo/common/linux.h" #endif #include "gloo/common/logging.h" #include "gloo/cuda.h" #include "gloo/transport/device.h" namespace gloo { #define CUDA_CHECK(condition) \ do { \ cudaError_t error = condition; \ GLOO_ENFORCE_EQ( \ error, \ cudaSuccess, \ "Error at: ", \ __FILE__, \ ":", \ __LINE__, \ ": ", \ cudaGetErrorString(error)); \ } while (0) inline int getCurrentGPUID() { int id = 0; CUDA_CHECK(cudaGetDevice(&id)); return id; } inline int getGPUIDForPointer(const void* ptr) { cudaPointerAttributes attr; CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); return attr.device; } inline int getDeviceCount() { int count; CUDA_CHECK(cudaGetDeviceCount(&count)); return count; } const std::string& getCudaPCIBusID(int device); template<typename T> int findCudaDevicePointerClosestToDevice( std::vector<CudaDevicePointer<T> >& ptrs, std::shared_ptr<transport::Device>& dev) { // Compute distance between every pointer auto devBusID = dev->getPCIBusID(); std::vector<int> distance(ptrs.size()); int minDistance = INT_MAX; int minDistanceCount = 0; for (auto i = 0; i < ptrs.size(); i++) { #ifdef __linux__ auto cudaBusID = getCudaPCIBusID(ptrs[i].getDeviceID()); distance[i] = pciDistance(devBusID, cudaBusID); #else distance[i] = 0; #endif if (distance[i] <= minDistance) { if (distance[i] < minDistance) { minDistance = distance[i]; minDistanceCount = 0; } minDistanceCount++; } } // Choose random pointer closest to device; auto minOffset = rand() % minDistanceCount; int minIndex = 0; for (auto i = 0; i < ptrs.size(); i++) { if (distance[i] == minDistance) { if (minOffset == 0) { minIndex = i; } minOffset--; } } return minIndex; } class CudaDeviceGuard { public: CudaDeviceGuard() : previous_(getCurrentGPUID()) { } ~CudaDeviceGuard() noexcept(false) { CUDA_CHECK(cudaSetDevice(previous_)); } private: int previous_; }; class CudaDeviceScope { public: explicit CudaDeviceScope(int device) : guard_() { CUDA_CHECK(cudaSetDevice(device)); } private: CudaDeviceGuard guard_; }; // Managed chunk of GPU memory. // Convenience class used for tests and benchmarks. template<typename T> class CudaMemory { public: explicit CudaMemory(size_t elements); CudaMemory(CudaMemory&&) noexcept; ~CudaMemory() noexcept(false); T* operator*() const { return ptr_; } const size_t elements; const size_t bytes; protected: CudaMemory(const CudaMemory&) = delete; CudaMemory& operator=(const CudaMemory&) = delete; int device_; T* ptr_; }; // Container class for a set of per-device streams class CudaDeviceStreams { public: CudaDeviceStreams() { const int numDevices = getDeviceCount(); streams_.reserve(numDevices); for (auto i = 0; i < numDevices; i++) { streams_.push_back(CudaStream(i)); } } cudaStream_t operator[](const int i) { GLOO_ENFORCE_LT(i, streams_.size()); return *streams_[i]; } protected: CudaDeviceStreams(const CudaDeviceStreams&) = delete; CudaDeviceStreams& operator=(const CudaDeviceStreams&) = delete; std::vector<CudaStream> streams_; }; } // namespace gloo