at::DataPtr TorchCudaAllocator::allocate()

in maga_transformer/cpp/core/torch_utils/torch_cuda_allocator.cc [18:187]


#ifdef UNDER_TORCH_2_6
at::DataPtr TorchCudaAllocator::allocate(size_t size) {
#else
at::DataPtr TorchCudaAllocator::allocate(size_t size) const {
#endif
    // Route the request through the device's buffer manager, then hand torch
    // a DataPtr whose context owns a heap-allocated BufferPtr, keeping the
    // underlying memory alive until torch drops the DataPtr.
    auto buffer = device_->allocateBuffer({size}, {"torch_allocated"});
    auto buffer_ctx = new BufferPtr(buffer);
    const auto ptr = buffer->data();
    const auto ctx_deleter = [](void* ctx_ptr) {
        // Deleting the BufferPtr releases its reference to the buffer.
        delete static_cast<BufferPtr*>(ctx_ptr);
    };
    return at::DataPtr(ptr, buffer_ctx, ctx_deleter, torch_device_);
}
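
The returned at::DataPtr owns the BufferPtr through its context, so the device memory lives exactly as long as torch holds the DataPtr. A minimal lifetime sketch, assuming an already-constructed TorchCudaAllocator instance (the helper function and its name are hypothetical):

void lifetime_sketch(TorchCudaAllocator& allocator) {
    at::DataPtr data = allocator.allocate(1024);  // goes through device_->allocateBuffer
    void* dev_ptr = data.get();                   // raw device pointer, usable in kernels
    (void)dev_ptr;
}   // ~DataPtr runs ctx_deleter here, deleting the BufferPtr and freeing the memory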

void TorchCudaAllocator::malloc(void** devPtr, TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device, size_t size, cudaStream_t stream) {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::free(void** ptr) {
    throw std::runtime_error("not implemented.");
}

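// The UNDER_TORCH_2_6 guard selects between torch allocator-API generations:
// newer builds get the pool/tracking hooks below, while older ones expose the
// stream-to-pool variants in the #else branch.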
#ifdef UNDER_TORCH_2_6
void TorchCudaAllocator::copy_data(void* dest, const void* src, size_t count) const {
    throw std::runtime_error("not implemented.");
}

double TorchCudaAllocator::getMemoryFraction(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device) {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::enable(bool value) {
    throw std::runtime_error("not implemented.");
}

bool TorchCudaAllocator::isEnabled() const {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::beginAllocateToPool(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device, at::cuda::MempoolId_t mempool_id, std::function<bool(cudaStream_t)> filter) {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::endAllocateToPool(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device, at::cuda::MempoolId_t mempool_id) {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::attachAllocatorTraceTracker(c10::cuda::CUDACachingAllocator::AllocatorTraceTracker tracker) {
    throw std::runtime_error("not implemented.");
}

c10::cuda::CUDACachingAllocator::ShareableHandle TorchCudaAllocator::shareIpcHandle(void* ptr) {
    throw std::runtime_error("not implemented.");
}
#else
void TorchCudaAllocator::beginAllocateStreamToPool(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device, cudaStream_t stream, at::cuda::MempoolId_t mempool_id) {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::endAllocateStreamToPool(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device, cudaStream_t stream) {
    throw std::runtime_error("not implemented.");
}
#endif

void TorchCudaAllocator::setMemoryFraction(double fraction, TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device) {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::recordHistory(bool                                            enabled,
                                       at::cuda::CUDACachingAllocator::CreateContextFn context_recorder,
                                       size_t                                          alloc_trace_max_entries,
                                       at::cuda::CUDACachingAllocator::RecordContext   when) {
    throw std::runtime_error("not implemented.");
}

bool TorchCudaAllocator::isHistoryEnabled() {
    // Allocation-history recording is not supported by this allocator.
    return false;
}

bool TorchCudaAllocator::checkPoolLiveAllocations(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device,
                                                  at::cuda::MempoolId_t mempool_id,
                                                  const std::unordered_set<void*>& expected_live_allocations) {
    // Pool tracking is not implemented, so every expected allocation is
    // reported as live.
    return true;
}

void TorchCudaAllocator::attachOutOfMemoryObserver(at::cuda::CUDACachingAllocator::OutOfMemoryObserver observer) {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::emptyCache() {
    // No internal cache is kept here; memory is managed by the device's
    // buffer manager, so there is nothing to flush.
}

void* TorchCudaAllocator::getBaseAllocation(void* ptr, size_t* outSize) {
    // Allocations are never split, so the pointer is its own base allocation;
    // outSize is left unmodified.
    return ptr;
}

void TorchCudaAllocator::recordStream(const at::DataPtr& ptr, at::cuda::CUDAStream stream) {
    // No-op: this allocator does not track per-stream usage of allocations.
}

at::cuda::CUDACachingAllocator::SnapshotInfo TorchCudaAllocator::snapshot() {
    throw std::runtime_error("not implemented.");
}

std::shared_ptr<at::cuda::CUDACachingAllocator::AllocatorState>
TorchCudaAllocator::getCheckpointState(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device, at::cuda::MempoolId_t id) {
    throw std::runtime_error("not implemented.");
}

at::cuda::CUDACachingAllocator::CheckpointDelta
TorchCudaAllocator::setCheckpointPoolState(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device, std::shared_ptr<at::cuda::CUDACachingAllocator::AllocatorState> as) {
    // Checkpointing is unsupported; return an empty delta.
    at::cuda::CUDACachingAllocator::CheckpointDelta cpd;
    return cpd;
}

at::DeleterFnPtr TorchCudaAllocator::raw_deleter() const {
    throw std::runtime_error("not implemented.");
}

void* TorchCudaAllocator::raw_alloc(size_t nbytes) {
    throw std::runtime_error("not implemented.");
}

void* TorchCudaAllocator::raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
    throw std::runtime_error("not implemented.");
}

cudaError_t TorchCudaAllocator::memcpyAsync(
    void* dst, int dstDevice, const void* src, int srcDevice,
    size_t count, cudaStream_t stream, bool p2p_enabled)
{
    RTP_LLM_CHECK_WITH_INFO((srcDevice == dstDevice) || p2p_enabled,
                            "p2p is required to copy across devices.");
    // cudaMemcpyDefault lets the driver infer the copy direction from the
    // pointer values (requires unified virtual addressing).
    return cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
}
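
A hedged call-site sketch for the one data-movement hook with a real implementation; the helper function, pointers, byte count, and device indices are placeholders, and the stream comes from torch's current-stream helper:

cudaError_t copy_sketch(TorchCudaAllocator& allocator, void* dst, const void* src, size_t nbytes) {
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    // Same-device copy, so p2p_enabled may safely be false.
    return allocator.memcpyAsync(dst, /*dstDevice=*/0, src, /*srcDevice=*/0,
                                 nbytes, stream, /*p2p_enabled=*/false);
}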

void TorchCudaAllocator::raw_delete(void* ptr) {
    throw std::runtime_error("not implemented.");
}

std::shared_ptr<void> TorchCudaAllocator::getIpcDevPtr(std::string handle) {
    // IPC sharing is not supported; callers receive an empty pointer.
    return nullptr;
}

std::string TorchCudaAllocator::name() {
    return "torch_cuda_allocator";
}

void TorchCudaAllocator::releasePool(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device, at::cuda::MempoolId_t mempool_id) {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::enablePeerAccess(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE dev, TORCH_CUDA_ALLOCATOR_INDEX_DTYPE dev_to_access) {
    throw std::runtime_error("not implemented.");
}

// Both are no-ops: there are no cache statistics to report, and device
// validation is left to the underlying device implementation.
void TorchCudaAllocator::cacheInfo(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device, size_t* largestBlock) {}

void TorchCudaAllocator::assertValidDevice(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device) {}

at::cuda::CUDACachingAllocator::DeviceStats TorchCudaAllocator::getDeviceStats(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device) {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::resetAccumulatedStats(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device) {
    throw std::runtime_error("not implemented.");
}

void TorchCudaAllocator::resetPeakStats(TORCH_CUDA_ALLOCATOR_INDEX_DTYPE device) {
    throw std::runtime_error("not implemented.");
}
}  // namespace rtp_llm
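
For context, an allocator like this takes effect only once installed as torch's CUDA allocator; that wiring is outside this section. One possible hookup, sketched under the assumption of a reachable device pointer (the constructor arguments are hypothetical):

// Sketch: route torch's CUDA allocations through the custom allocator.
// `device` and the constructor signature are assumptions, not from this file.
static rtp_llm::TorchCudaAllocator torch_allocator(device);
c10::cuda::CUDACachingAllocator::allocator.store(&torch_allocator);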