mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
This commit represents a complete rework after pulling the latest changes from official ollama/ollama repository and re-applying Tesla K80 compatibility patches. ## Key Changes ### CUDA Compute Capability 3.7 Support (Tesla K80) - Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt - Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset - Using 37-virtual (PTX with JIT compilation) for maximum compatibility ### Legacy Toolchain Compatibility - **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80) - **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7) - **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h) ### CPU Architecture Trade-offs Due to GCC 10.5 limitation, sacrificed newer CPU optimizations: - Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+) - Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA - Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility) ### Build System Updates - Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7 - Added -Wno-deprecated-gpu-targets flag to suppress warnings - Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI ### Upstream Sync Merged latest llama.cpp changes including: - Enhanced KV cache management with ISWA and hybrid memory support - Improved multi-modal support (mtmd framework) - New model architectures (Gemma3, Llama4, Qwen3, etc.) - GPU backend improvements for CUDA, Metal, and ROCm - Updated quantization support and GGUF format handling ### Documentation - Updated CLAUDE.md with comprehensive build instructions - Documented toolchain constraints and CPU architecture trade-offs - Removed outdated CI/CD workflows (tesla-k80-*.yml) - Cleaned up temporary development artifacts ## Rationale This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in official Ollama due to legacy driver/CUDA requirements. The toolchain constraint creates a deadlock: - K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
70 lines
3.5 KiB
Diff
70 lines
3.5 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: jmorganca <jmorganca@gmail.com>
|
|
Date: Tue, 8 Apr 2025 20:35:53 -0700
|
|
Subject: [PATCH] fix string arr kv loading
|
|
|
|
certain models would error when loading
|
|
kv metadata fields that contain an array of strings
|
|
such as vocab fields
|
|
---
|
|
ggml/include/gguf.h | 1 +
|
|
ggml/src/gguf.cpp | 7 +++++--
|
|
src/llama-vocab.cpp | 4 +---
|
|
3 files changed, 7 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
|
|
index 79ee2020..3efb22f0 100644
|
|
--- a/ggml/include/gguf.h
|
|
+++ b/ggml/include/gguf.h
|
|
@@ -114,6 +114,7 @@ extern "C" {
|
|
// get raw pointer to the first element of the array with the given key_id
|
|
// for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
|
|
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id);
|
|
+ GGML_API size_t gguf_get_arr_data_n(const struct gguf_context * ctx, int64_t key_id);
|
|
|
|
// get ith C string from array with given key_id
|
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
|
|
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
|
|
index 8cc4ef1c..d950dbdf 100644
|
|
--- a/ggml/src/gguf.cpp
|
|
+++ b/ggml/src/gguf.cpp
|
|
@@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
|
|
|
|
const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) {
|
|
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
|
- GGML_ASSERT(ctx->kv[key_id].get_type() != GGUF_TYPE_STRING);
|
|
return ctx->kv[key_id].data.data();
|
|
}
|
|
|
|
+size_t gguf_get_arr_data_n(const struct gguf_context * ctx, int64_t key_id) {
|
|
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
|
+ return ctx->kv[key_id].data.size();
|
|
+}
|
|
+
|
|
const char * gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) {
|
|
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
|
GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING);
|
|
@@ -902,7 +906,6 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) {
|
|
const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) {
|
|
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
|
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
|
|
- GGML_ASSERT(ctx->kv[key_id].get_type() != GGUF_TYPE_STRING);
|
|
return ctx->kv[key_id].data.data();
|
|
}
|
|
|
|
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
|
index 3de95c67..217ede47 100644
|
|
--- a/src/llama-vocab.cpp
|
|
+++ b/src/llama-vocab.cpp
|
|
@@ -1768,9 +1768,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
|
if (precompiled_charsmap_keyidx != -1) {
|
|
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
|
|
- GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
|
|
-
|
|
- const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
|
|
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
|
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
|
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
|
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|