in src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java [251:434]
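/**
 * Allocate size bytes of device memory for the given opcode, trying progressively more
 * expensive strategies: (1) reuse an exact-size rmvar pointer, (2) fresh cudaMalloc,
 * (3) recycle an exact-size lineage-cache intermediate, (4) reuse a non-exact rmvar pointer,
 * (5) eagerly free all rmvar pointers, (6) free non-exact lineage-cache intermediates,
 * (7) evict one sufficiently large unlocked GPU object, (8) evict unlocked objects one-by-one
 * according to the configured eviction policy, and (9) clear all unlocked objects to combat
 * fragmentation. Throws a DMLRuntimeException if none of these steps yields a usable pointer.
 *
 * Illustrative call (hypothetical call site; the accessor name is an assumption):
 *   Pointer dst = gpuContext.getMemoryManager().malloc(opcode, numBytes, true);
 */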
public Pointer malloc(String opcode, long size, boolean initialize) {
if(size < 0) {
throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size));
}
if(DEBUG_MEMORY_LEAK) {
LOG.info("GPU Memory info during malloc:" + toString());
}
// Step 1: First try reusing an exact-size match from rmvarGPUPointers to avoid creating holes in the GPU memory
Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, size);
Pointer tmpA = (A == null) ? new Pointer() : null;
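// tmpA is a scratch Pointer object used as the cudaMalloc target whenever no rmvar pointer
// could be reused; it is handed to cudaMallocNoWarn in the steps below.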
// Step 2: Allocate a new pointer in the GPU memory (since memory is available)
// Step 4 has the potential to create holes as well as to limit future reuse, hence perform this step (fresh allocation) before step 4.
if(A == null && allocator.canAllocate(size)) {
// This can fail in case of fragmented memory, so don't issue any warning
A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer");
}
// Step 3: Recycle gpu intermediates from lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
// Recycle a cached pointer if it exactly matches the required size
LineageCacheEntry le = LineageGPUCacheEviction.pollFirstFreeEntry(size);
if (le != null) {
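// If device-to-host eviction is disabled, the cached device pointer is handed out directly.
// Otherwise the entry is first backed up into the CPU lineage cache, and the device buffer
// that held it is removed from the device cache and reused for this request.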
if(!LineageCacheConfig.GPU2HOSTEVICTION) {
A = le.getGPUPointer(); //recycle
//LineageGPUCacheEviction.removeFromDeviceCache(le, le.getGPUPointer(), true);
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuRecycle();
}
else {
// Copy from device cache to CPU lineage cache
// TODO: Copy conditionally (if score > theta)
Pointer copiedPtr = LineageGPUCacheEviction.copyToHostCache(le);
LineageGPUCacheEviction.removeFromDeviceCache(le, copiedPtr, false);
A = copiedPtr;
if(DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuSyncEvicts();
}
}
// TODO: Handle live (dirty) objects separately. Copy them back to the host
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
}
// Step 4: Try reusing non-exact match entry of rmvarGPUPointers
if(A == null) {
A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
if(A != null) {
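// The matched pointer may be larger than requested, so it is freed and a right-sized buffer
// is allocated in its place (presumably to avoid handing out an oversized buffer); this
// free/malloc pair is what can create the holes mentioned in the step-2 comment.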
guardedCudaFree(A);
A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
if(A == null)
LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers.");
}
}
// Step 5: Eagerly free up all rmvarGPUPointers and check if memory is available on the GPU.
// Evicting matrix blocks is more expensive than doing cuda free/malloc/memset, as evicted
// blocks may even be written to disk when the CPU budget is small. So, rmvar-ing all cached
// blocks (this step) is preferred over eviction (steps 6, 7, 8).
if(A == null) {
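// clearAll() cuda-frees every cached rmvar pointer, returning that memory to the allocator
// before a fresh allocation is attempted below.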
lazyCudaFreeMemoryManager.clearAll();
if(allocator.canAllocate(size)) {
// This can fail in case of fragmented memory, so don't issue any warning
A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
}
}
// Step 6: Free gpu intermediates from lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
long freedSize = 0;
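// Free non-exact-size cached entries one at a time until the cumulative freed bytes cover
// the request, then retry cudaMalloc. Because the freed regions need not be contiguous,
// the retry can still fail and the loop keeps freeing further entries.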
while (A == null && !LineageGPUCacheEviction.isGPUCacheFreeQEmpty()) {
// Deallocate a non-exact matched entry from the cached free lists
LineageCacheEntry le = LineageGPUCacheEviction.pollFistFreeNotExact(size);
if(le != null) {
freedSize += getSizeAllocatedGPUPointer(le.getGPUPointer());
if(!LineageCacheConfig.GPU2HOSTEVICTION) {
//LineageGPUCacheEviction.removeFromDeviceCache(le, le.getGPUPointer(), true);
guardedCudaFree(le.getGPUPointer()); //free
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuDel();
}
else {
// Copy from device cache to CPU lineage cache
Pointer copiedPtr = LineageGPUCacheEviction.copyToHostCache(le);
LineageGPUCacheEviction.removeFromDeviceCache(le, copiedPtr, false);
guardedCudaFree(copiedPtr); //free
if(DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuSyncEvicts();
}
if (freedSize >= size)
A = cudaMallocNoWarn(tmpA, size, "recycle non-exact match of lineage cache");
// Else, deallocate another free pointer. We call pollFistFreeNotExact with the same size
// (not with the remaining size) to reduce the potential for creating holes.
}
// FIXME: performance improvement. Slow due to looping and holes.
}
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
if (A == null)
LOG.warn("cudaMalloc failed after Lineage GPU cache eviction.");
}
// Step 7: Try evicting/clearing exactly one GPU object that is large enough for the request (size restriction)
if(A == null) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
synchronized (matrixMemoryManager.gpuObjects) {
Optional<GPUObject> sizeBasedUnlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
.filter(gpuObj -> !gpuObj.isLocked()
&& matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size)
.min((o1, o2) -> worstCaseContiguousMemorySizeCompare(o1, o2));
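// Among all unlocked objects that are large enough, the one with the smallest worst-case
// contiguous region is chosen, so a single eviction suffices while freeing as little
// cached data as possible.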
if(sizeBasedUnlockedGPUObjects.isPresent()) {
evictOrClear(sizeBasedUnlockedGPUObjects.get(), opcode);
A = cudaMallocNoWarn(tmpA, size, null);
if(A == null)
LOG.warn("cudaMalloc failed after clearing/evicting based on size.");
if(DMLScript.STATISTICS) {
long totalTime = System.nanoTime() - t0;
GPUStatistics.cudaEvictTime.add(totalTime);
GPUStatistics.cudaEvictSizeTime.add(totalTime);
GPUStatistics.cudaEvictCount.increment();
GPUStatistics.cudaEvictSizeCount.increment();
}
}
}
}
// Step 8: Try evicting/clearing unlocked GPU objects one-by-one based on the configured eviction policy, without a size restriction
if(A == null) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
long currentAvailableMemory = allocator.getAvailableMemory();
boolean canFit = false;
// ---------------------------------------------------------------
// Evict unlocked GPU objects one-by-one and try malloc
synchronized(matrixMemoryManager.gpuObjects) {
List<GPUObject> unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
.filter(gpuObj -> !gpuObj.isLocked()).collect(Collectors.toList());
Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size));
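// The comparator orders candidates according to the configured eviction policy; objects are
// then evicted from the tail of the sorted list until enough memory has been released.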
while(A == null && unlockedGPUObjects.size() > 0) {
GPUObject evictedGPUObject = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
evictOrClear(evictedGPUObject, opcode);
if(!canFit) {
currentAvailableMemory += evictedGPUObject.getSizeOnDevice();
if(currentAvailableMemory >= size)
canFit = true;
}
if(canFit) {
// Checking before invoking cudaMalloc reduces the time spent in unnecessary cudaMalloc.
// This was the bottleneck for ResNet200 experiments with batch size > 32 on P100+Intel
A = cudaMallocNoWarn(tmpA, size, null);
}
if(DMLScript.STATISTICS)
GPUStatistics.cudaEvictCount.increment();
}
}
if(DMLScript.STATISTICS) {
long totalTime = System.nanoTime() - t0;
GPUStatistics.cudaEvictTime.add(totalTime);
}
}
// Step 9: Handle defragmentation
if(A == null) {
LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString());
matrixMemoryManager.clearAllUnlocked(opcode);
CudaMemoryAllocator.resetUnusableFreeMemory();
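// Resetting the allocator's unusable-free-memory estimate (accumulated on earlier failed
// allocations due to fragmentation) presumably avoids stale bookkeeping rejecting future
// allocations now that everything unlocked has been cleared.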
LOG.info("GPU Memory info after evicting all unlocked matrices:" + toString());
A = cudaMallocNoWarn(tmpA, size, null);
}
if(A == null) {
throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + byteCountToDisplaySize(size) + ". \n "
+ toString());
}
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
if(initialize)
cudaMemset(A, 0, size);
addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
return A;
}