mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
This commit represents a complete rework after pulling the latest changes from the official ollama/ollama repository and re-applying the Tesla K80 compatibility patches.

## Key Changes

### CUDA Compute Capability 3.7 Support (Tesla K80)
- Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt
- Updated CMakePresets.json to include compute 3.7 in the "CUDA 11" preset
- Using 37-virtual (PTX with JIT compilation) for maximum compatibility (see the verification sketch after this message)

### Legacy Toolchain Compatibility
- **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80)
- **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7)
- **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h)

### CPU Architecture Trade-offs
Because of the GCC 10.5 limitation, newer CPU optimizations are sacrificed:
- Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+)
- Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA
- Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility)

### Build System Updates
- Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7
- Added the -Wno-deprecated-gpu-targets flag to suppress warnings
- Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI

### Upstream Sync
Merged the latest llama.cpp changes, including:
- Enhanced KV cache management with ISWA and hybrid memory support
- Improved multi-modal support (mtmd framework)
- New model architectures (Gemma3, Llama4, Qwen3, etc.)
- GPU backend improvements for CUDA, Metal, and ROCm
- Updated quantization support and GGUF format handling

### Documentation
- Updated CLAUDE.md with comprehensive build instructions
- Documented toolchain constraints and CPU architecture trade-offs
- Removed outdated CI/CD workflows (tesla-k80-*.yml)
- Cleaned up temporary development artifacts

## Rationale

This fork maintains Tesla K80 GPU support (compute 3.7), which was dropped in official Ollama because of the K80's legacy driver/CUDA requirements. The toolchain constraint creates a deadlock:

- K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI

We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12 GB VRAM per GPU).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
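Not part of the commit itself: a small stand-alone sketch (hypothetical file name `check_cc.cu`) showing one way to confirm that a Tesla K80 is visible with compute capability 3.7 under the toolchain described above. The nvcc invocation in the header comment is an assumption based on the CUDA 11.4 / compute 3.7 setup in this message, not a documented build step of this fork.

```c
/* check_cc.cu -- stand-alone verification sketch, not part of this commit.
 * One possible build line under the toolchain above (assumption):
 *   nvcc -gencode arch=compute_37,code=sm_37 -o check_cc check_cc.cu
 */
#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    for (int i = 0; i < count; i++) {
        struct cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) {
            continue;
        }
        // A Tesla K80 (Kepler GK210) reports compute capability 3.7.
        printf("device %d: %s, compute %d.%d\n", i, prop.name, prop.major, prop.minor);
    }
    return 0;
}
```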
142 lines
6.7 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Wed, 27 Aug 2025 14:39:48 -0700
Subject: [PATCH] ggml: Enable resetting backend devices

Touching a CUDA device causes the allocation of a primary context
with CUDA data structures (~300 MB of VRAM). If a device is
unused then it can be reset to free these data structures.
---
 ggml/include/ggml-backend.h      |  1 +
 ggml/src/ggml-backend-impl.h     |  4 ++++
 ggml/src/ggml-backend.cpp        |  8 ++++++++
 ggml/src/ggml-cuda/ggml-cuda.cu  | 16 +++++++++++++++-
 ggml/src/ggml-cuda/vendors/hip.h |  1 +
 src/llama.cpp                    |  4 +++-
 6 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 1ff53ed03..ba181d09d 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -178,6 +178,7 @@ extern "C" {
     GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
     GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
     GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+    GGML_API void ggml_backend_dev_reset(ggml_backend_dev_t device);
     GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
     GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
     GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 3c3f22fc0..43c91d9f2 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -195,6 +195,10 @@ extern "C" {
     ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
     void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
     void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
+
+    // (optional) reset device, clearing existing allocations and context
+    // the caller must ensure that there are no outstanding buffers, as these will become invalid
+    void (*reset)(ggml_backend_dev_t dev);
 };

 struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 6ef5eeafa..0b757af59 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
     return device->iface.init_backend(device, params);
 }

+void ggml_backend_dev_reset(ggml_backend_dev_t device) {
+    if (device->iface.reset == NULL) {
+        return;
+    }
+
+    device->iface.reset(device);
+}
+
 ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
     GGML_ASSERT(device);
     return device->iface.get_buffer_type(device);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 811462c79..87c6c34a4 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
     return id;
 }

+void ggml_cuda_reset_device(int device) {
+    ggml_cuda_set_device(device);
+    CUDA_CHECK(cudaDeviceReset());
+}
+
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
     cudaError_t err;
@@ -3515,7 +3520,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
     props->id = ggml_backend_cuda_device_get_id(dev);
     props->type = ggml_backend_cuda_device_get_type(dev);
     props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
-    ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
+    // If you need the memory data, call ggml_backend_dev_memory() explicitly.
+    props->memory_total = props->memory_free = 0;

     bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
 #ifdef GGML_CUDA_NO_PEER_COPY
@@ -3948,6 +3956,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
     CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
 }

+static void ggml_backend_cuda_device_reset(ggml_backend_dev_t dev) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    ggml_cuda_reset_device(ctx->device);
+}
+
 static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
     /* .get_name = */ ggml_backend_cuda_device_get_name,
     /* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -3964,6 +3977,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
     /* .event_new = */ ggml_backend_cuda_device_event_new,
     /* .event_free = */ ggml_backend_cuda_device_event_free,
     /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
+    /* .reset = */ ggml_backend_cuda_device_reset,
 };

 // backend reg
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 890c10364..1f06be80e 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -45,6 +45,7 @@
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 #define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceReset hipDeviceReset
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
diff --git a/src/llama.cpp b/src/llama.cpp
index fe5a7a835..d821a96a0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -267,10 +267,12 @@ static struct llama_model * llama_model_load_from_file_impl(
     for (auto * dev : model->devices) {
         ggml_backend_dev_props props;
         ggml_backend_dev_get_props(dev, &props);
+        size_t memory_free, memory_total;
+        ggml_backend_dev_memory(dev, &memory_free, &memory_total);
         LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
             ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
             props.device_id ? props.device_id : "unknown id",
-            props.memory_free/1024/1024);
+            memory_free/1024/1024);
     }

     const int status = llama_model_load(path_model, splits, *model, params);
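The patch above adds ggml_backend_dev_reset() and stops ggml_backend_cuda_device_get_props() from touching the device, so callers that want memory numbers must ask for them explicitly. Below is a minimal caller-side sketch of how those two pieces fit together. It is illustrative only: the file name `reset_unused_gpus.c` and the "reset every GPU" policy are assumptions for the example, not what llama.cpp or Ollama actually does, and it assumes the patched ggml headers and library are available to compile and link against.

```c
/* reset_unused_gpus.c -- illustrative sketch only, not part of the patch above. */
#include <stdio.h>
#include "ggml-backend.h"   // patched header declaring ggml_backend_dev_reset()

int main(void) {
    // Load dynamically built backends, if any (no-op for statically linked builds).
    ggml_backend_load_all();

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        // Querying memory touches the device; on CUDA this allocates the ~300 MB
        // primary context, which is why the patch removed the query from get_props().
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_dev_memory(dev, &free_mem, &total_mem);
        printf("%s: %zu MiB free / %zu MiB total\n",
               ggml_backend_dev_name(dev),
               free_mem / 1024 / 1024, total_mem / 1024 / 1024);

        // Illustrative policy (assumption): this sketch never allocates buffers on the
        // GPUs, so every GPU counts as "unused" and is reset to release the primary
        // context created by the query above. ggml_backend_dev_reset() is a no-op for
        // backends that do not implement the optional reset hook.
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            ggml_backend_dev_reset(dev);
        }
    }
    return 0;
}
```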