diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 812da076..0241d09a 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -399,13 +399,14 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { // pool with virtual memory #if defined(GGML_USE_VMM) struct ggml_cuda_pool_vmm : public ggml_cuda_pool { - static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB + static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB default int device; CUdeviceptr pool_addr = 0; size_t pool_used = 0; size_t pool_size = 0; size_t granularity; + size_t max_pool_size; #if defined(GGML_USE_HIP) std::vector> mappings; #endif @@ -413,6 +414,16 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { explicit ggml_cuda_pool_vmm(int device) : device(device), granularity(ggml_cuda_info().devices[device].vmm_granularity) { + // Get actual GPU memory and set a reasonable max pool size + size_t free_mem, total_mem; + ggml_cuda_set_device(device); + CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem)); + + // Use 90% of total GPU memory as max, or default 32GB, whichever is smaller + max_pool_size = std::min(CUDA_POOL_VMM_MAX_SIZE, (size_t)(total_mem * 0.9)); + + // CRITICAL: Align max_pool_size to granularity to avoid CUDA_ERROR_INVALID_VALUE + max_pool_size = ((max_pool_size + granularity - 1) / granularity) * granularity; } ~ggml_cuda_pool_vmm() { @@ -425,7 +436,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { #else CU_CHECK(cuMemUnmap(pool_addr, pool_size)); #endif - CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE)); + CU_CHECK(cuMemAddressFree(pool_addr, max_pool_size)); } } @@ -441,7 +452,22 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { size_t reserve_size = size - avail; reserve_size = granularity * ((reserve_size + granularity - 1) / granularity); - GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE); + GGML_ASSERT(pool_size + reserve_size <= max_pool_size); + + // Check if we have enough free memory before attempting allocation + size_t free_mem, total_mem; + ggml_cuda_set_device(device); + CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem)); + + if (reserve_size > free_mem) { + // Not enough free memory, reduce reserve_size to what's available + reserve_size = (free_mem / granularity) * granularity; // round down to granularity + if (reserve_size == 0) { + GGML_LOG_WARN("%s: Not enough free GPU memory on device %d (requested: %zu, available: %zu)\n", + __func__, device, size, free_mem); + return nullptr; + } + } // allocate more physical memory CUmemAllocationProp prop = {}; @@ -453,7 +479,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { // reserve virtual address space (if not already reserved) if (pool_addr == 0) { - CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0)); + CU_CHECK(cuMemAddressReserve(&pool_addr, max_pool_size, 0, 0, 0)); } // map at the end of the pool