public Pointer malloc(String opcode, long size, boolean initialize)

in src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java [251:434]


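	/**
	 * Allocates {@code size} bytes on the GPU for the given opcode, trying a
	 * sequence of increasingly expensive strategies (steps 1-9 below): recycle an
	 * exact-size rmvar'd pointer, allocate fresh memory, recycle or free
	 * lineage-cached intermediates, free all rmvar'd pointers, evict or clear
	 * unlocked matrix objects, and finally clear everything unlocked to combat
	 * fragmentation.
	 *
	 * @param opcode the instruction opcode requesting the allocation (used for statistics)
	 * @param size the number of bytes to allocate
	 * @param initialize if true, zero the allocated memory via cudaMemset
	 * @return a pointer to the allocated GPU memory
	 * @throws DMLRuntimeException if the size is negative or all allocation steps fail
	 */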
	public Pointer malloc(String opcode, long size, boolean initialize) {
		if(size < 0) {
			throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size));
		}
		if(DEBUG_MEMORY_LEAK) {
			LOG.info("GPU Memory info during malloc:" + toString());
		}

		// Step 1: First try reusing exact match in rmvarGPUPointers to avoid holes in the GPU memory
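		// (The lazy CUDA free manager retains rmvar'd pointers instead of freeing them eagerly,
		// so an exact-size hit here avoids both a cudaFree and a cudaMalloc.)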
		Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, size);

		Pointer tmpA = (A == null) ? new Pointer() : null;
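		// tmpA is created only when no recycled pointer was found, since only a fresh cudaMalloc needs a new Pointer object.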
		// Step 2: Allocate a new pointer in the GPU memory (since memory is available)
		// Step 4 can create holes in the GPU memory and step 3 limits future lineage reuse, hence perform this step before steps 3 and 4.
		if(A == null && allocator.canAllocate(size)) {
			// This can fail in case of fragmented memory, so don't issue any warning
			A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer");
		}

		// Step 3: Recycle gpu intermediates from lineage cache
		if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
			long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
			// Recycle a cached pointer if exactly matches the required size
			LineageCacheEntry le = LineageGPUCacheEviction.pollFirstFreeEntry(size);
			if (le != null) {
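				// GPU2HOSTEVICTION controls whether the cached device pointer is recycled in place
				// or first backed up to the CPU lineage cache before reuse.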
				if(!LineageCacheConfig.GPU2HOSTEVICTION) {
					A = le.getGPUPointer(); //recycle
					//LineageGPUCacheEviction.removeFromDeviceCache(le, le.getGPUPointer(), true);
					if (DMLScript.STATISTICS)
						LineageCacheStatistics.incrementGpuRecycle();
				}
				else {
					// Copy from device cache to CPU lineage cache
					// TODO: Copy conditionally (if score > theta)
					Pointer copiedPtr = LineageGPUCacheEviction.copyToHostCache(le);
					LineageGPUCacheEviction.removeFromDeviceCache(le, copiedPtr, false);
					A = copiedPtr;
					if(DMLScript.STATISTICS)
						LineageCacheStatistics.incrementGpuSyncEvicts();
				}
			}
			// TODO: Handle live (dirty) objects separately. Copy them back to the host

			if (DMLScript.STATISTICS)
				LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
		}
		
		// Step 4: Try reusing non-exact match entry of rmvarGPUPointers
		if(A == null) {
			A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
			if(A != null) {
				guardedCudaFree(A);
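				// The freed pointer held at least 'size' bytes, so the retry below should succeed unless the memory is fragmented.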
				A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
				if(A == null)
					LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers.");
			}
		}
		
		// Step 5: Eagerly free up rmvarGPUPointers and check if memory is available on GPU
		// Evicting matrix blocks is more expensive than cuda free/malloc/memset, as eviction may write the blocks
		// to disk when the CPU memory budget is small. Hence, rmvar-ing every block (steps 4 and 5) is preferred
		// over eviction (steps 6, 7, 8).
		if(A == null) {
			lazyCudaFreeMemoryManager.clearAll();
			if(allocator.canAllocate(size)) {
				// This can fail in case of fragmented memory, so don't issue any warning
				A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
			}
		}
		
		// Step 6: Free gpu intermediates from lineage cache
		if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
			long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
			long freedSize = 0;
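			// Cumulative number of bytes released from the lineage cache in this loop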
			while (A == null && !LineageGPUCacheEviction.isGPUCacheFreeQEmpty()) {
				// Deallocate a non-exact matched entry from the cached free lists
				LineageCacheEntry le = LineageGPUCacheEviction.pollFistFreeNotExact(size);
				if(le != null) {
					freedSize += getSizeAllocatedGPUPointer(le.getGPUPointer());
					if(!LineageCacheConfig.GPU2HOSTEVICTION) {
						//LineageGPUCacheEviction.removeFromDeviceCache(le, le.getGPUPointer(), true);
						guardedCudaFree(le.getGPUPointer()); //free
						if (DMLScript.STATISTICS)
							LineageCacheStatistics.incrementGpuDel();
					}
					else {
						// Copy from device cache to CPU lineage cache
						Pointer copiedPtr = LineageGPUCacheEviction.copyToHostCache(le);
						LineageGPUCacheEviction.removeFromDeviceCache(le, copiedPtr, false);
						guardedCudaFree(copiedPtr); //free
						if(DMLScript.STATISTICS)
							LineageCacheStatistics.incrementGpuSyncEvicts();
					}
					if (freedSize >= size)
						A = cudaMallocNoWarn(tmpA, size, "recycle non-exact match of lineage cache");
					// Else, deallocate another free pointer. We call pollFistFreeNotExact with the
					// same size (not with size-freedSize) to reduce the potential for creating holes.
				}
				// FIXME: performance improvement. Slow due to looping and holes.
			}
			if (DMLScript.STATISTICS)
				LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
			if (A == null)
				LOG.warn("cudaMalloc failed after Lineage GPU cache eviction.");
		}

		// Step 7: Try evicting/clearing exactly one unlocked object that is large enough (size-restricted selection)
		if(A == null) {
			long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
			synchronized (matrixMemoryManager.gpuObjects) {
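				// Among unlocked objects whose worst-case contiguous footprint can satisfy the request,
				// pick the smallest one so that a single eviction suffices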
				Optional<GPUObject> sizeBasedUnlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
					.filter(gpuObj -> !gpuObj.isLocked()
						&& matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size)
					.min((o1, o2) -> worstCaseContiguousMemorySizeCompare(o1, o2));
				if(sizeBasedUnlockedGPUObjects.isPresent()) {
					evictOrClear(sizeBasedUnlockedGPUObjects.get(), opcode);
					A = cudaMallocNoWarn(tmpA, size, null);
					if(A == null)
						LOG.warn("cudaMalloc failed after clearing/evicting based on size.");
					if(DMLScript.STATISTICS) {
						long totalTime = System.nanoTime() - t0;
						GPUStatistics.cudaEvictTime.add(totalTime);
						GPUStatistics.cudaEvictSizeTime.add(totalTime);
						GPUStatistics.cudaEvictCount.increment();
						GPUStatistics.cudaEvictSizeCount.increment();
					}
				}
			}
		}
		
		// Step 8: Try eviction/clearing one-by-one based on the given policy without size restriction
		if(A == null) {
			long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
			long currentAvailableMemory = allocator.getAvailableMemory();
			boolean canFit = false;
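			// canFit tracks whether the memory freed so far, plus what was already available, could satisfy the request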
			// ---------------------------------------------------------------
			// Evict unlocked GPU objects one-by-one and try malloc
			synchronized(matrixMemoryManager.gpuObjects) {
				List<GPUObject> unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
						.filter(gpuObj -> !gpuObj.isLocked()).collect(Collectors.toList());
				Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size));
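				// Eviction candidates are taken from the tail of the policy-sorted list (remove(size()-1) below)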
				while(A == null && unlockedGPUObjects.size() > 0) {
					GPUObject evictedGPUObject = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
					evictOrClear(evictedGPUObject, opcode);
					if(!canFit) {
						currentAvailableMemory += evictedGPUObject.getSizeOnDevice();
						if(currentAvailableMemory >= size)
							canFit = true;
					}
					if(canFit) {
						// Checking before invoking cudaMalloc reduces the time spent in unnecessary cudaMalloc.
						// This was the bottleneck for ResNet200 experiments with batch size > 32 on P100+Intel
						A = cudaMallocNoWarn(tmpA, size, null); 
					}
					if(DMLScript.STATISTICS) 
						GPUStatistics.cudaEvictCount.increment();
				}
			}
			if(DMLScript.STATISTICS) {
				long totalTime = System.nanoTime() - t0;
				GPUStatistics.cudaEvictTime.add(totalTime);
			}
		}
		
		// Step 9: Handle defragmentation
		if(A == null) {
			LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
			LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString());
			matrixMemoryManager.clearAllUnlocked(opcode);
			CudaMemoryAllocator.resetUnusableFreeMemory();
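			// Reset the allocator's estimate of free memory rendered unusable by fragmentation, now that all unlocked objects are cleared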
			LOG.info("GPU Memory info after evicting all unlocked matrices:" + toString());
			A = cudaMallocNoWarn(tmpA, size, null);
		}
		
		if(A == null) {
			throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + byteCountToDisplaySize(size) + ". \n "
					+ toString());
		}
		
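		// Optionally zero-initialize the new allocation; the misc timer below records the memset time and count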
		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
		if(initialize)
			cudaMemset(A, 0, size);
		addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
		return A;
	}
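
The nine steps above form a chain of allocation strategies tried in order of increasing cost, where the first strategy to produce a non-null pointer wins. Below is a minimal, self-contained sketch of that pattern; the names in it (StagedAllocator, firstNonNull) are hypothetical illustrations, not SystemDS API.

import java.util.List;
import java.util.function.Supplier;

// Hypothetical sketch of the staged-fallback pattern implemented by malloc() above:
// cheap strategies (pointer reuse) run first, expensive ones (evict everything) last.
public class StagedAllocator {

	// Returns the result of the first strategy that yields a non-null value,
	// mirroring the repeated "if (A == null) { ... }" blocks in malloc().
	static <T> T firstNonNull(List<Supplier<T>> strategies) {
		for (Supplier<T> strategy : strategies) {
			T result = strategy.get();
			if (result != null)
				return result;
		}
		throw new IllegalStateException("all allocation strategies failed");
	}

	public static void main(String[] args) {
		List<Supplier<Long>> strategies = List.of(
			() -> null,  // e.g. step 1: exact-size rmvar reuse (miss)
			() -> null,  // e.g. step 2: fresh cudaMalloc (out of memory)
			() -> 42L);  // e.g. step 9: clear all unlocked objects, then retry
		Long handle = firstNonNull(strategies);
		System.out.println("allocated handle: " + handle);
	}
}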