Sync with upstream ollama/ollama and restore Tesla K80 (compute 3.7) support

This commit represents a complete rework after pulling the latest changes from official ollama/ollama repository and re-applying Tesla K80 compatibility patches. ## Key Changes ### CUDA Compute Capability 3.7 Support (Tesla K80) - Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt - Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset - Using 37-virtual (PTX with JIT compilation) for maximum compatibility ### Legacy Toolchain Compatibility - **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80) - **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7) - **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h) ### CPU Architecture Trade-offs Due to GCC 10.5 limitation, sacrificed newer CPU optimizations: - Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+) - Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA - Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility) ### Build System Updates - Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7 - Added -Wno-deprecated-gpu-targets flag to suppress warnings - Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI ### Upstream Sync Merged latest llama.cpp changes including: - Enhanced KV cache management with ISWA and hybrid memory support - Improved multi-modal support (mtmd framework) - New model architectures (Gemma3, Llama4, Qwen3, etc.) - GPU backend improvements for CUDA, Metal, and ROCm - Updated quantization support and GGUF format handling ### Documentation - Updated CLAUDE.md with comprehensive build instructions - Documented toolchain constraints and CPU architecture trade-offs - Removed outdated CI/CD workflows (tesla-k80-*.yml) - Cleaned up temporary development artifacts ## Rationale This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in official Ollama due to legacy driver/CUDA requirements. The toolchain constraint creates a deadlock: - K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-19 04:07:01 +00:00 · 2025-11-05 14:03:05 +08:00
parent fabe2c5cb7
commit ef14fb5b26
817 changed files with 241634 additions and 70888 deletions
--- a/llama/llama.cpp/common/json-schema-to-grammar.cpp
+++ b/llama/llama.cpp/common/json-schema-to-grammar.cpp
@@ -1,8 +1,9 @@
 #include "json-schema-to-grammar.h"
 #include "common.h"

+#include <nlohmann/json.hpp>
+
 #include <algorithm>
-#include <fstream>
 #include <map>
 #include <regex>
 #include <sstream>
@@ -40,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    return result;
 }

-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-public:
-    string_view(const std::string & str, size_t start = 0, size_t end  = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
-    size_t size() const {
-        return _end - _start;
-    }
-
-    size_t length() const {
-        return size();
-    }
-
-    operator std::string() const {
-        return str();
-    }
-
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
-
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
    auto has_min = min_value != std::numeric_limits<int>::min();
    auto has_max = max_value != std::numeric_limits<int>::max();
@@ -111,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
        }
        out << "}";
    };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+        [&](const std::string_view & from, const std::string_view & to) {
            size_t i = 0;
            while (i < from.length() && i < to.length() && from[i] == to[i]) {
                i++;
            }
            if (i > 0) {
-                out << "\"" << from.substr(0, i).str() << "\"";
+                out << "\"" << from.substr(0, i) << "\"";
            }
            if (i < from.length() && i < to.length()) {
                if (i > 0) {
@@ -299,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };

 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES;
-    if (RESERVED_NAMES.empty()) {
-        RESERVED_NAMES.insert("root");
-        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
-    }
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> s;
+        s.insert("root");
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+        return s;
+    }();
    return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }

@@ -885,9 +844,10 @@ public:
                _build_object_rule(
                    properties, required, name,
                    schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
            std::unordered_set<std::string> required;
            std::vector<std::pair<std::string, json>> properties;
+            std::map<std::string, size_t> enum_values;
            std::string hybrid_name = name;
            std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                if (comp_schema.contains("$ref")) {
@@ -899,6 +859,14 @@ public:
                            required.insert(prop.key());
                        }
                    }
+                } else if (comp_schema.contains("enum")) {
+                    for (const auto & v : comp_schema["enum"]) {
+                        const auto rule = _generate_constant_rule(v);
+                        if (enum_values.find(rule) == enum_values.end()) {
+                            enum_values[rule] = 0;
+                        }
+                        enum_values[rule] += 1;
+                    }
                } else {
                  // todo warning
                }
@@ -912,6 +880,17 @@ public:
                    add_component(t, true);
                }
            }
+            if (!enum_values.empty()) {
+                std::vector<std::string> enum_intersection;
+                for (const auto & p : enum_values) {
+                    if (p.second == schema["allOf"].size()) {
+                        enum_intersection.push_back(p.first);
+                    }
+                }
+                if (!enum_intersection.empty()) {
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+                }
+            }
            return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
        } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
            json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];