// cgoutils/memory/rmm_alloc.cu
// Copyright (c) 2017-2018 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <rmm/rmm.h>
#include <rmm/rmm_api.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <stdexcept>
#include <string>
#include "../memory.h"
// checkCUDAError checks the cuda error of last runtime calls and returns the
// pointer to the buffer of error message. This buffer needs to be released
// by caller or upper callers.
// checkCUDAError checks the cuda error of last runtime calls and returns the
// pointer to the buffer of error message. This buffer needs to be released
// by caller or upper callers.
//
// Returns NULL when the last runtime call succeeded. On failure returns a
// malloc-allocated message combining the caller-supplied context `message`
// with the CUDA error string; ownership transfers to the caller.
char *checkCUDAError(const char *message) {
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    char *buffer = reinterpret_cast<char *>(malloc(MAX_ERROR_LEN));
    // Guard against malloc failure: writing through a NULL buffer with
    // snprintf is undefined behavior. If even the error buffer cannot be
    // allocated there is no way to report details, so return NULL.
    if (buffer != NULL) {
      snprintf(buffer, MAX_ERROR_LEN,
               "ERROR when calling CUDA functions from host: %s: %s\n",
               message, cudaGetErrorString(error));
    }
    return buffer;
  }
  return NULL;
}
// checkRMMError checks the given RMM status code and returns the pointer to
// a buffer holding the error message, or NULL on RMM_SUCCESS. The buffer is
// malloc-allocated and must be released by the caller or upper callers.
char *checkRMMError(rmmError_t rmmError, const char* message) {
  if (rmmError != RMM_SUCCESS) {
    char *buffer = reinterpret_cast<char *>(malloc(MAX_ERROR_LEN));
    // Guard against malloc failure: snprintf into a NULL buffer is
    // undefined behavior (kept consistent with checkCUDAError).
    if (buffer != NULL) {
      snprintf(buffer, MAX_ERROR_LEN,
               "ERROR when calling RMM functions: %s: %s\n",
               message, rmmGetErrorString(rmmError));
    }
    return buffer;
  }
  return NULL;
}
// init_helper is purely for running some initializing code before main
// is running. If this logic is required in multiple place, we may consider
// wrap it with a macro.
// init_helper is purely for running some initializing code before main
// is running. If this logic is required in multiple place, we may consider
// wrap it with a macro.
struct init_helper {
  static int init_flag;
  // init initializes the RMM manager with default rmmOptions on every
  // visible CUDA device. Because this runs before main there is no caller
  // to hand an error to, so failures are thrown as std::runtime_error.
  static int init() {
    CGoCallResHandle resHandle = GetDeviceCount();
    if (resHandle.pStrErr != nullptr) {
      // Copy the message into the exception, then release the malloc'ed
      // buffer — throwing with the raw buffer would leak it.
      std::string err(resHandle.pStrErr);
      free(const_cast<char *>(resHandle.pStrErr));
      throw std::runtime_error(err);
    }
    size_t deviceCount = reinterpret_cast<size_t>(resHandle.res);
    for (size_t device = 0; device < deviceCount; device++) {
      cudaSetDevice(device);
      rmmOptions_t options = {
          CudaDefaultAllocation,  // Use PoolAllocation when RMM has improved
                                  // their sub allocator.
          0,      // Default to half of total memory.
          false   // Disable logging.
      };
      resHandle.pStrErr =
          checkRMMError(rmmInitialize(&options), "rmmInitialize");
      if (resHandle.pStrErr != nullptr) {
        // Same as above: avoid leaking the error buffer on throw.
        std::string err(resHandle.pStrErr);
        free(const_cast<char *>(resHandle.pStrErr));
        throw std::runtime_error(err);
      }
    }
    return 0;
  }
};
int init_helper::init_flag = init();
// GetFlags reports the capabilities of this memory implementation: device
// memory backed by the pooled (RMM) allocator, plus hash-reduction support
// when compiled with SUPPORT_HASH_REDUCTION.
DeviceMemoryFlags GetFlags() {
  DeviceMemoryFlags capabilities =
      DEVICE_MEMORY_IMPLEMENTATION_FLAG | POOLED_MEMORY_FLAG;
#ifdef SUPPORT_HASH_REDUCTION
  capabilities |= HASH_REDUCTION_SUPPORT;
#endif
  return capabilities;
}
// DeviceAllocate allocates `bytes` of device memory on `device` through the
// RMM allocator and zeroes it. On success res holds the device pointer; on
// failure pStrErr holds a malloc'ed error message owned by the caller.
CGoCallResHandle DeviceAllocate(size_t bytes, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  // Surface an invalid device index (previously ignored) before attempting
  // the allocation instead of failing opaquely inside RMM.
  resHandle.pStrErr = checkCUDAError("DeviceAllocate");
  if (resHandle.pStrErr != nullptr) {
    return resHandle;
  }
  // For now use default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of current execution pipeline for
  // allocation and free.
  resHandle.pStrErr = checkRMMError(RMM_ALLOC(&resHandle.res, bytes, 0),
                                    "DeviceAllocate");
  if (resHandle.pStrErr == nullptr) {
    // Zero the freshly allocated buffer only after a successful allocation.
    cudaMemset(resHandle.res, 0, bytes);
    resHandle.pStrErr = checkCUDAError("DeviceAllocate");
  }
  return resHandle;
}
// DeviceFree returns device memory previously obtained from DeviceAllocate
// on `device` back to the RMM allocator.
CGoCallResHandle DeviceFree(void *p, int device) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaSetDevice(device);
  // For now use default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of current execution pipeline for
  // allocation and free.
  handle.pStrErr = checkRMMError(RMM_FREE(p, 0), "DeviceFree");
  return handle;
}
// All following function implementation is the same as cuda_malloc.cu.
// We might remove cuda_malloc.cu file after RMM is proven to be working
// in production environment
// HostAlloc allocates `bytes` of pinned (page-locked) host memory and zeroes
// it. On failure res stays NULL and pStrErr carries the error message.
CGoCallResHandle HostAlloc(size_t bytes) {
  CGoCallResHandle resHandle = {NULL, NULL};
  // cudaHostAllocPortable makes sure that the allocation is associated with all
  // devices, not just the current device.
  cudaHostAlloc(&resHandle.res, bytes, cudaHostAllocPortable);
  resHandle.pStrErr = checkCUDAError("Allocate");
  // Only zero the buffer after confirming the allocation succeeded. The
  // previous order (memset before the error check) dereferenced a NULL
  // pointer whenever cudaHostAlloc failed.
  if (resHandle.pStrErr == nullptr) {
    memset(resHandle.res, 0, bytes);
  }
  return resHandle;
}
// HostFree releases a pinned host buffer previously returned by HostAlloc.
CGoCallResHandle HostFree(void *p) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaFreeHost(p);
  handle.pStrErr = checkCUDAError("Free");
  return handle;
}
// HostMemCpy copies `bytes` bytes between two host buffers via memcpy and
// sanity-checks the returned pointer. (memcpy is specified to return dst, so
// the error branch is defensive and effectively unreachable.)
CGoCallResHandle HostMemCpy(void *dst, const void* src, size_t bytes) {
  CGoCallResHandle handle = {nullptr, nullptr};
  if (memcpy(dst, src, bytes) != dst) {
    handle.pStrErr =
        fmtError("HostMemCpy", "Returned pointer does not match destination");
  }
  return handle;
}
// CreateCudaStream creates a new CUDA stream on `device` and returns its
// handle in res.
CGoCallResHandle CreateCudaStream(int device) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaSetDevice(device);
  cudaStream_t stream = nullptr;
  cudaStreamCreate(&stream);
  handle.res = reinterpret_cast<void *>(stream);
  handle.pStrErr = checkCUDAError("CreateCudaStream");
  return handle;
}
// WaitForCudaStream blocks the host until all work queued on stream `s`
// (on `device`) has completed.
CGoCallResHandle WaitForCudaStream(void *s, int device) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaSetDevice(device);
  cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(s));
  handle.pStrErr = checkCUDAError("WaitForCudaStream");
  return handle;
}
// DestroyCudaStream destroys a stream previously created by
// CreateCudaStream on `device`.
CGoCallResHandle DestroyCudaStream(void *s, int device) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaSetDevice(device);
  cudaStreamDestroy(reinterpret_cast<cudaStream_t>(s));
  handle.pStrErr = checkCUDAError("DestroyCudaStream");
  return handle;
}
// AsyncCopyHostToDevice enqueues an asynchronous host-to-device copy of
// `bytes` bytes on the given stream of `device`.
CGoCallResHandle AsyncCopyHostToDevice(
    void *dst, void *src, size_t bytes, void *stream, int device) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaSetDevice(device);
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice,
                  reinterpret_cast<cudaStream_t>(stream));
  handle.pStrErr = checkCUDAError("AsyncCopyHostToDevice");
  return handle;
}
// AsyncCopyDeviceToDevice enqueues an asynchronous device-to-device copy of
// `bytes` bytes on the given stream of `device`.
CGoCallResHandle AsyncCopyDeviceToDevice(
    void *dst, void *src, size_t bytes, void *stream, int device) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaSetDevice(device);
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice,
                  reinterpret_cast<cudaStream_t>(stream));
  handle.pStrErr = checkCUDAError("AsyncCopyDeviceToDevice");
  return handle;
}
// AsyncCopyDeviceToHost enqueues an asynchronous device-to-host copy of
// `bytes` bytes on the given stream of `device`.
CGoCallResHandle AsyncCopyDeviceToHost(
    void *dst, void *src, size_t bytes, void *stream, int device) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaSetDevice(device);
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost,
                  reinterpret_cast<cudaStream_t>(stream));
  handle.pStrErr = checkCUDAError("AsyncCopyDeviceToHost");
  return handle;
}
// GetDeviceCount returns the number of CUDA devices, encoded as an integer
// in the res pointer slot (callers read it via reinterpret_cast<size_t>).
CGoCallResHandle GetDeviceCount() {
  CGoCallResHandle resHandle = {NULL, NULL};
  int deviceCount = 0;
  cudaGetDeviceCount(&deviceCount);
  // Widen through size_t instead of the previous
  // reinterpret_cast<int *>(&resHandle.res), which aliased only the low
  // bytes of the pointer's storage and was correct solely on
  // little-endian hosts.
  resHandle.res = reinterpret_cast<void *>(static_cast<size_t>(deviceCount));
  resHandle.pStrErr = checkCUDAError("GetDeviceCount");
  return resHandle;
}
// GetDeviceGlobalMemoryInMB returns the total global memory of `device`,
// in megabytes, encoded into the res pointer slot.
CGoCallResHandle GetDeviceGlobalMemoryInMB(int device) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  size_t totalMB = prop.totalGlobalMem / (1024 * 1024);
  handle.res = reinterpret_cast<void *>(totalMB);
  handle.pStrErr = checkCUDAError("GetDeviceGlobalMemoryInMB");
  return handle;
}
// CudaProfilerStart enables profiler data collection for this process.
CGoCallResHandle CudaProfilerStart() {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaProfilerStart();
  handle.pStrErr = checkCUDAError("cudaProfilerStart");
  return handle;
}
// CudaProfilerStop drains outstanding GPU work (cudaDeviceSynchronize) and
// then disables profiler data collection.
CGoCallResHandle CudaProfilerStop() {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaDeviceSynchronize();
  cudaProfilerStop();
  handle.pStrErr = checkCUDAError("cudaProfilerStop");
  return handle;
}
// GetDeviceMemoryInfo writes the free and total memory sizes of `device`
// (as seen by RMM) into *freeSize and *totalSize.
CGoCallResHandle GetDeviceMemoryInfo(size_t *freeSize, size_t *totalSize,
                                     int device) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaSetDevice(device);
  rmmError_t err = rmmGetInfo(freeSize, totalSize, 0);
  handle.pStrErr = checkRMMError(err, "GetDeviceMemoryInfo");
  return handle;
}
// deviceMalloc allocates `size` bytes on the current device through RMM and
// stores the resulting pointer in *devPtr.
CGoCallResHandle deviceMalloc(void **devPtr, size_t size) {
  CGoCallResHandle handle = {nullptr, nullptr};
  // For now use default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of current execution pipeline for
  // allocation and free.
  rmmError_t err = RMM_ALLOC(devPtr, size, 0);
  handle.pStrErr = checkRMMError(err, "deviceMalloc");
  return handle;
}
// deviceFree returns device memory previously allocated with deviceMalloc
// back to RMM.
CGoCallResHandle deviceFree(void *devPtr) {
  CGoCallResHandle handle = {nullptr, nullptr};
  // For now use default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of current execution pipeline for
  // allocation and free.
  rmmError_t err = RMM_FREE(devPtr, 0);
  handle.pStrErr = checkRMMError(err, "deviceFree");
  return handle;
}
// deviceMemset fills `count` bytes of device memory at devPtr with `value`.
CGoCallResHandle deviceMemset(void *devPtr, int value, size_t count) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaMemset(devPtr, value, count);
  handle.pStrErr = checkCUDAError("deviceMemset");
  return handle;
}
// asyncCopyHostToDevice enqueues an asynchronous host-to-device copy of
// `count` bytes on the given stream (current device).
CGoCallResHandle asyncCopyHostToDevice(void* dst, const void* src,
                                       size_t count, void* stream) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaMemcpyAsync(dst, src, count, cudaMemcpyHostToDevice,
                  reinterpret_cast<cudaStream_t>(stream));
  handle.pStrErr = checkCUDAError("asyncCopyHostToDevice");
  return handle;
}
// asyncCopyDeviceToHost enqueues an asynchronous device-to-host copy of
// `count` bytes on the given stream (current device).
CGoCallResHandle asyncCopyDeviceToHost(void* dst, const void* src,
                                       size_t count, void* stream) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToHost,
                  reinterpret_cast<cudaStream_t>(stream));
  handle.pStrErr = checkCUDAError("asyncCopyDeviceToHost");
  return handle;
}
// waitForCudaStream blocks the host until all work queued on `stream` has
// completed.
CGoCallResHandle waitForCudaStream(void *stream) {
  CGoCallResHandle handle = {nullptr, nullptr};
  cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream));
  handle.pStrErr = checkCUDAError("waitForCudaStream");
  return handle;
}