Mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-10 15:57:04 +00:00)
Fix Tesla K80 VMM pool crash by aligning to granularity
- Fix CUDA_ERROR_INVALID_VALUE from cuMemAddressReserve by aligning max_pool_size to the GPU's allocation granularity
- Set max_pool_size dynamically to 90% of actual GPU memory instead of a static 32 GB
- Add a free-memory check before allocation to prevent OOM
- Tested on a Tesla K80 dual-GPU setup with successful model loading and chat completions

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
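The crash comes down to the round-up-to-multiple idiom the patch applies to max_pool_size: the CUDA VMM entry points reject sizes that are not multiples of the device's allocation granularity. A minimal standalone sketch of the idiom (align_up and the example numbers are illustrative, not from the patch):

    #include <cstddef>

    // cuMemAddressReserve and cuMemCreate report CUDA_ERROR_INVALID_VALUE when
    // given a size that is not a multiple of the device allocation granularity.
    // Rounding UP preserves the requested budget while satisfying the driver.
    static size_t align_up(size_t size, size_t granularity) {
        return ((size + granularity - 1) / granularity) * granularity;
    }

    // Example: with a typical 2 MiB (2,097,152-byte) granularity,
    // align_up(10'000'000'000, 2'097'152) == 10'001'317'888.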
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu (vendored, 34 changed lines)
@@ -399,13 +399,14 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 // pool with virtual memory
 #if defined(GGML_USE_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
-    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
+    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB default
 
     int device;
     CUdeviceptr pool_addr = 0;
     size_t pool_used = 0;
     size_t pool_size = 0;
     size_t granularity;
+    size_t max_pool_size;
 #if defined(GGML_USE_HIP)
     std::vector<std::pair<CUdeviceptr, size_t>> mappings;
 #endif
@@ -413,6 +414,16 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     explicit ggml_cuda_pool_vmm(int device) :
         device(device),
         granularity(ggml_cuda_info().devices[device].vmm_granularity) {
+        // Get actual GPU memory and set a reasonable max pool size
+        size_t free_mem, total_mem;
+        ggml_cuda_set_device(device);
+        CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
+
+        // Use 90% of total GPU memory as max, or default 32GB, whichever is smaller
+        max_pool_size = std::min(CUDA_POOL_VMM_MAX_SIZE, (size_t)(total_mem * 0.9));
+
+        // CRITICAL: Align max_pool_size to granularity to avoid CUDA_ERROR_INVALID_VALUE
+        max_pool_size = ((max_pool_size + granularity - 1) / granularity) * granularity;
     }
 
     ~ggml_cuda_pool_vmm() {
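The granularity used in the constructor's initializer list is cached per device by ggml_cuda_info(); the value itself comes from the CUDA driver. A sketch of querying it directly (assuming an initialized driver context; ggml may request the RECOMMENDED rather than MINIMUM granularity):

    #include <cuda.h>

    // Minimum size multiple for VMM reservations and physical allocations
    // targeting a given device; typically 2 MiB on current hardware.
    static size_t query_vmm_granularity(int device) {
        CUmemAllocationProp prop = {};
        prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id   = device;

        size_t granularity = 0;
        cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
        return granularity;
    }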
@@ -425,7 +436,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 #else
             CU_CHECK(cuMemUnmap(pool_addr, pool_size));
 #endif
-            CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
+            CU_CHECK(cuMemAddressFree(pool_addr, max_pool_size));
         }
     }
 
@@ -441,7 +452,22 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
             size_t reserve_size = size - avail;
             reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
 
-            GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
+            GGML_ASSERT(pool_size + reserve_size <= max_pool_size);
 
+            // Check if we have enough free memory before attempting allocation
+            size_t free_mem, total_mem;
+            ggml_cuda_set_device(device);
+            CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
+
+            if (reserve_size > free_mem) {
+                // Not enough free memory, reduce reserve_size to what's available
+                reserve_size = (free_mem / granularity) * granularity; // round down to granularity
+                if (reserve_size == 0) {
+                    GGML_LOG_WARN("%s: Not enough free GPU memory on device %d (requested: %zu, available: %zu)\n",
+                                  __func__, device, size, free_mem);
+                    return nullptr;
+                }
+            }
+
             // allocate more physical memory
             CUmemAllocationProp prop = {};
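One caveat about the new check: cudaMemGetInfo reports device-wide free memory at a single instant, so the clamp is best-effort; another allocation can land between the query and the subsequent cuMemCreate. Expressed as a hypothetical helper (clamp_to_free is illustrative, not part of the patch):

    #include <cstddef>

    // Clamp a requested reservation to currently reported free memory,
    // rounded DOWN to the granularity; returns 0 when nothing fits.
    // Best-effort only: the free-memory snapshot is device-wide and racy.
    static size_t clamp_to_free(size_t reserve_size, size_t free_mem, size_t granularity) {
        if (reserve_size <= free_mem) {
            return reserve_size;
        }
        return (free_mem / granularity) * granularity;
    }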
@@ -453,7 +479,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
             // reserve virtual address space (if not already reserved)
             if (pool_addr == 0) {
-                CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+                CU_CHECK(cuMemAddressReserve(&pool_addr, max_pool_size, 0, 0, 0));
             }
 
             // map at the end of the pool
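For context, the driver-level flow this pool wraps, reduced to a minimal sketch with error handling elided (vmm_demo and aligned_size are illustrative; aligned_size is assumed to already be a granularity multiple, which is exactly what the fix guarantees for max_pool_size):

    #include <cuda.h>

    void vmm_demo(int device, size_t aligned_size) {
        CUmemAllocationProp prop = {};
        prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id   = device;

        // 1. Reserve virtual address space; an unaligned size fails with
        //    CUDA_ERROR_INVALID_VALUE, which is the crash this commit fixes.
        CUdeviceptr va = 0;
        cuMemAddressReserve(&va, aligned_size, 0, 0, 0);

        // 2. Create physical memory of the same (aligned) size.
        CUmemGenericAllocationHandle handle;
        cuMemCreate(&handle, aligned_size, &prop, 0);

        // 3. Map the physical allocation into the reserved range.
        cuMemMap(va, aligned_size, 0, handle, 0);

        // 4. Grant the device read/write access to the mapping.
        CUmemAccessDesc access = {};
        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        access.location.id   = device;
        access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        cuMemSetAccess(va, aligned_size, &access, 1);

        // Teardown mirrors the pool destructor: unmap, release, free the VA range.
        cuMemUnmap(va, aligned_size);
        cuMemRelease(handle);
        cuMemAddressFree(va, aligned_size);
    }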