Mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-10 15:57:04 +00:00)
Fix Tesla K80 VMM pool crash by aligning to granularity
- Fix CUDA_ERROR_INVALID_VALUE from cuMemAddressReserve by aligning max_pool_size to the GPU's allocation granularity
- Set max_pool_size dynamically to 90% of actual GPU memory instead of a static 32 GB
- Add a free-memory check before allocation to prevent OOM
- Tested on a Tesla K80 dual-GPU setup with successful model loading and chat completions

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
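The crash comes down to the round-up-to-multiple idiom the patch applies to max_pool_size: the CUDA VMM entry points reject sizes that are not multiples of the device's allocation granularity. A minimal standalone sketch of the idiom (align_up and the example numbers are illustrative, not from the patch):

    #include <cstddef>

    // cuMemAddressReserve and cuMemCreate report CUDA_ERROR_INVALID_VALUE when
    // given a size that is not a multiple of the device allocation granularity.
    // Rounding UP preserves the requested budget while satisfying the driver.
    static size_t align_up(size_t size, size_t granularity) {
        return ((size + granularity - 1) / granularity) * granularity;
    }

    // Example: with a typical 2 MiB (2,097,152-byte) granularity,
    // align_up(10'000'000'000, 2'097'152) == 10'001'317'888.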
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu (vendored, 34 changed lines)
@@ -399,13 +399,14 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 // pool with virtual memory
 #if defined(GGML_USE_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
-    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
+    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB default
 
     int device;
     CUdeviceptr pool_addr = 0;
     size_t pool_used = 0;
     size_t pool_size = 0;
     size_t granularity;
+    size_t max_pool_size;
 #if defined(GGML_USE_HIP)
     std::vector<std::pair<CUdeviceptr, size_t>> mappings;
 #endif
@@ -413,6 +414,16 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     explicit ggml_cuda_pool_vmm(int device) :
         device(device),
         granularity(ggml_cuda_info().devices[device].vmm_granularity) {
+        // Get actual GPU memory and set a reasonable max pool size
+        size_t free_mem, total_mem;
+        ggml_cuda_set_device(device);
+        CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
+
+        // Use 90% of total GPU memory as max, or default 32GB, whichever is smaller
+        max_pool_size = std::min(CUDA_POOL_VMM_MAX_SIZE, (size_t)(total_mem * 0.9));
+
+        // CRITICAL: Align max_pool_size to granularity to avoid CUDA_ERROR_INVALID_VALUE
+        max_pool_size = ((max_pool_size + granularity - 1) / granularity) * granularity;
     }
 
     ~ggml_cuda_pool_vmm() {
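The granularity used in the constructor's initializer list is cached per device by ggml_cuda_info(); the value itself comes from the CUDA driver. A sketch of querying it directly (assuming an initialized driver context; ggml may request the RECOMMENDED rather than MINIMUM granularity):

    #include <cuda.h>

    // Minimum size multiple for VMM reservations and physical allocations
    // targeting a given device; typically 2 MiB on current hardware.
    static size_t query_vmm_granularity(int device) {
        CUmemAllocationProp prop = {};
        prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id   = device;

        size_t granularity = 0;
        cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
        return granularity;
    }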
@@ -425,7 +436,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 #else
             CU_CHECK(cuMemUnmap(pool_addr, pool_size));
 #endif
-            CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
+            CU_CHECK(cuMemAddressFree(pool_addr, max_pool_size));
         }
     }
 
@@ -441,7 +452,22 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
             size_t reserve_size = size - avail;
             reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
 
-            GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
+            GGML_ASSERT(pool_size + reserve_size <= max_pool_size);
 
+            // Check if we have enough free memory before attempting allocation
+            size_t free_mem, total_mem;
+            ggml_cuda_set_device(device);
+            CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
+
+            if (reserve_size > free_mem) {
+                // Not enough free memory, reduce reserve_size to what's available
+                reserve_size = (free_mem / granularity) * granularity; // round down to granularity
+                if (reserve_size == 0) {
+                    GGML_LOG_WARN("%s: Not enough free GPU memory on device %d (requested: %zu, available: %zu)\n",
+                                  __func__, device, size, free_mem);
+                    return nullptr;
+                }
+            }
+
             // allocate more physical memory
             CUmemAllocationProp prop = {};
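One caveat about the new check: cudaMemGetInfo reports device-wide free memory at a single instant, so the clamp is best-effort; another allocation can land between the query and the subsequent cuMemCreate. Expressed as a hypothetical helper (clamp_to_free is illustrative, not part of the patch):

    #include <cstddef>

    // Clamp a requested reservation to currently reported free memory,
    // rounded DOWN to the granularity; returns 0 when nothing fits.
    // Best-effort only: the free-memory snapshot is device-wide and racy.
    static size_t clamp_to_free(size_t reserve_size, size_t free_mem, size_t granularity) {
        if (reserve_size <= free_mem) {
            return reserve_size;
        }
        return (free_mem / granularity) * granularity;
    }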
@@ -453,7 +479,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
             // reserve virtual address space (if not already reserved)
             if (pool_addr == 0) {
-                CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+                CU_CHECK(cuMemAddressReserve(&pool_addr, max_pool_size, 0, 0, 0));
             }
 
             // map at the end of the pool
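For context, the driver-level flow this pool wraps, reduced to a minimal sketch with error handling elided (vmm_demo and aligned_size are illustrative; aligned_size is assumed to already be a granularity multiple, which is exactly what the fix guarantees for max_pool_size):

    #include <cuda.h>

    void vmm_demo(int device, size_t aligned_size) {
        CUmemAllocationProp prop = {};
        prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id   = device;

        // 1. Reserve virtual address space; an unaligned size fails with
        //    CUDA_ERROR_INVALID_VALUE, which is the crash this commit fixes.
        CUdeviceptr va = 0;
        cuMemAddressReserve(&va, aligned_size, 0, 0, 0);

        // 2. Create physical memory of the same (aligned) size.
        CUmemGenericAllocationHandle handle;
        cuMemCreate(&handle, aligned_size, &prop, 0);

        // 3. Map the physical allocation into the reserved range.
        cuMemMap(va, aligned_size, 0, handle, 0);

        // 4. Grant the device read/write access to the mapping.
        CUmemAccessDesc access = {};
        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        access.location.id   = device;
        access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        cuMemSetAccess(va, aligned_size, &access, 1);

        // Teardown mirrors the pool destructor: unmap, release, free the VA range.
        cuMemUnmap(va, aligned_size);
        cuMemRelease(handle);
        cuMemAddressFree(va, aligned_size);
    }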