next build (#8539)

* add build to .dockerignore * test: only build one arch * add build to .gitignore * fix ccache path * filter amdgpu targets * only filter if autodetecting * Don't clobber gpu list for default runner This ensures the GPU specific environment variables are set properly * explicitly set CXX compiler for HIP * Update build_windows.ps1 This isn't complete, but is close. Dependencies are missing, and it only builds the "default" preset. * build: add ollama subdir * add .git to .dockerignore * docs: update development.md * update build_darwin.sh * remove unused scripts * llm: add cwd and build/lib/ollama to library paths * default DYLD_LIBRARY_PATH to LD_LIBRARY_PATH in runner on macOS * add additional cmake output vars for msvc * interim edits to make server detection logic work with dll directories like lib/ollama/cuda_v12 * remove unnecessary filepath.Dir, cleanup * add hardware-specific directory to path * use absolute server path * build: linux arm * cmake install targets * remove unused files * ml: visit each library path once * build: skip cpu variants on arm * build: install cpu targets * build: fix workflow * shorter names * fix rocblas install * docs: clean up development.md * consistent build dir removal in development.md * silence -Wimplicit-function-declaration build warnings in ggml-cpu * update readme * update development readme * llm: update library lookup logic now that there is one runner (#8587) * tweak development.md * update docs * add windows cuda/rocm tests --------- Co-authored-by: jmorganca <jmorganca@gmail.com> Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
2025-12-10 07:46:59 +00:00 · 2025-01-29 15:03:38 -08:00
parent 2ef3c803a1
commit dcfb7a105c
542 changed files with 5796 additions and 11469 deletions
--- a/llama/patches/0001-cuda.patch
+++ b/llama/patches/0001-cuda.patch
@@ -4,39 +4,44 @@ Date: Thu, 6 Jun 2024 23:55:47 -0700
 Subject: [PATCH] cuda

 ---
- ggml/src/ggml-backend.cpp       | 5 +++++
- ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++++
- 2 files changed, 9 insertions(+)
+ ggml/src/ggml-backend.cpp        | 1 -
+ ggml/src/ggml-cuda/ggml-cuda.cu  | 1 +
+ ggml/src/ggml-metal/ggml-metal.m | 1 +
+ 3 files changed, 2 insertions(+), 1 deletion(-)

 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index e2d6c405..1b62c056 100644
+index e2d6c405..a12172dc 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
-@@ -106,7 +106,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
-+
-+// TODO: this needs to be freed in cuda and hip backends because
-+// the cuda backend implementation compiled with msvc
-+#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIP)
-     delete buffer;
-+#endif
+-    delete buffer;
 }
 
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 0b06be72..0a6ae325 100644
+index 0b06be72..be29e979 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context {
+@@ -424,6 +424,7 @@ struct ggml_backend_cuda_buffer_context {
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     delete ctx;
-+
-+    // TODO: this needs to be freed in cuda and hipblas backends because
-+    // the cuda backend implementation compiled with msvc
-+    free(buffer);
+    delete buffer;
 }
 
 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
+diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
+index a85502ee..cd8ef741 100644
+--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
+@@ -4187,6 +4187,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+     }
+ 
+     free(ctx);
+    free(buffer);
+ }
+ 
+ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
--- a/llama/patches/0006-conditional-fattn.patch
+++ b/llama/patches/0006-conditional-fattn.patch
@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
 1 file changed, 2 insertions(+)

 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 0a6ae325..bb425ee8 100644
+index be29e979..aaa79ea4 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2159,9 +2159,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
--- a/llama/patches/0007-add-mllama-support.patch
+++ b/llama/patches/0007-add-mllama-support.patch
--- a/llama/patches/0007-blas.patch
+++ b/llama/patches/0007-blas.patch
@@ -1,26 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jesse Gross <jesse@ollama.com>
-Date: Mon, 30 Sep 2024 16:31:04 -0700
-Subject: [PATCH] blas
-
---
- ggml/src/ggml-blas/ggml-blas.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
-index ec158dfa..b3ac1fa4 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
-+++ b/ggml/src/ggml-blas/ggml-blas.cpp
-@@ -1,3 +1,5 @@
-+#ifdef GGML_USE_BLAS
-+
- #include "ggml-impl.h"
- #include "ggml-blas.h"
- #include "ggml-backend-impl.h"
-@@ -515,3 +517,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) {
- }
- 
- GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
-+
-+#endif // GGML_USE_BLAS
-\ No newline at end of file
--- a/llama/patches/0008-add-unpad-operator.patch
+++ b/llama/patches/0008-add-unpad-operator.patch
@@ -126,10 +126,10 @@ index b7fefb9d..b307d554 100644
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index bb425ee8..1e7c2a22 100644
+index aaa79ea4..9286f866 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2082,6 +2082,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_PAD:
             ggml_cuda_op_pad(ctx, dst);
             break;
@@ -139,7 +139,7 @@ index bb425ee8..1e7c2a22 100644
         case GGML_OP_ARANGE:
             ggml_cuda_op_arange(ctx, dst);
             break;
-@@ -3013,6 +3016,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
+@@ -3010,6 +3013,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
@@ -211,10 +211,10 @@ index 8fd386b0..e2ededc3 100644
 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index a85502ee..84e027eb 100644
+index cd8ef741..318addec 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -311,6 +311,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
+@@ -311,6 +311,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
     GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
@@ -222,7 +222,7 @@ index a85502ee..84e027eb 100644
     GGML_METAL_KERNEL_TYPE_ARANGE_F32,
     GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
-@@ -910,6 +911,7 @@ @implementation GGMLMetalClass
+@@ -910,6 +911,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                   upscale_f32,                    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                       pad_f32,                        true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,            pad_reflect_1d_f32,             true);
--- a/llama/patches/0009-fix-deepseek-deseret-regex.patch
+++ b/llama/patches/0009-fix-deepseek-deseret-regex.patch
--- a/llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch
+++ b/llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch
--- a/llama/patches/0011-fix-missing-arg-in-static-assert-on-windows.patch
+++ b/llama/patches/0011-fix-missing-arg-in-static-assert-on-windows.patch
--- a/llama/patches/0011-relative-include-paths.patch
+++ b/llama/patches/0011-relative-include-paths.patch
@@ -1,51 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca <jmorganca@gmail.com>
-Date: Tue, 3 Dec 2024 21:30:51 -0800
-Subject: [PATCH] relative include paths
-
---
- ggml/src/ggml-cpu/ggml-cpu.c   | 2 +-
- ggml/src/ggml-cpu/ggml-cpu.cpp | 3 +--
- ggml/src/ggml-quants.c         | 2 +-
- 3 files changed, 3 insertions(+), 4 deletions(-)
-
-diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index b307d554..4eb39c52 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
-+++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -10,7 +10,7 @@
- #include "ggml-quants.h"
- #include "ggml-cpu-quants.h"
- #include "ggml-threading.h"
-#include "amx/amx.h"
-+#include "amx.h"
- #include "ggml.h"
- 
- #if defined(_MSC_VER) || defined(__MINGW32__)
-diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
-index f11399cc..2a8b40ce 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
-+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
-@@ -4,8 +4,7 @@
- #include "ggml-cpu-aarch64.h"
- #include "ggml-cpu-traits.h"
- #include "ggml-impl.h"
-#include "amx/amx.h"
-
-+#include "amx.h"
- #include <cctype>
- #include <string>
- #include <vector>
-diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
-index 7918388a..e2ed84e4 100644
--- a/ggml/src/ggml-quants.c
-+++ b/ggml/src/ggml-quants.c
-@@ -3,7 +3,7 @@
- 
- #include "ggml-quants.h"
- #include "ggml-impl.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
-+#include "ggml-cpu-impl.h"
- #include "ggml-cpu.h"
- 
- #include <math.h>
--- a/llama/patches/0012-llama-Ensure-KV-cache-is-fully-defragmented.patch
+++ b/llama/patches/0012-llama-Ensure-KV-cache-is-fully-defragmented.patch
--- a/llama/patches/0013-re-enable-gpu-for-clip.patch
+++ b/llama/patches/0013-re-enable-gpu-for-clip.patch
--- a/llama/patches/0014-sort-devices-by-score.patch
+++ b/llama/patches/0014-sort-devices-by-score.patch
@@ -0,0 +1,82 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Tue, 14 Jan 2025 12:01:24 -0800
+Subject: [PATCH] sort devices by score
+
+---
+ ggml/src/ggml-backend-reg.cpp | 21 +++++++++++++--------
+ 1 file changed, 13 insertions(+), 8 deletions(-)
+
+diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
+index 899d16f2..ac5cda07 100644
+--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
+@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
+ 
+ struct ggml_backend_registry {
+     std::vector<ggml_backend_reg_entry> backends;
+-    std::vector<ggml_backend_dev_t> devices;
+    std::vector<std::pair<ggml_backend_dev_t, int>> devices;
+ 
+     ggml_backend_registry() {
+ #ifdef GGML_USE_CUDA
+@@ -195,7 +195,7 @@ struct ggml_backend_registry {
+         }
+     }
+ 
+-    void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
+    void register_backend(ggml_backend_reg_t reg, int score = -1, dl_handle_ptr handle = nullptr) {
+         if (!reg) {
+             return;
+         }
+@@ -206,15 +206,15 @@ struct ggml_backend_registry {
+ #endif
+         backends.push_back({ reg, std::move(handle) });
+         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
+-            register_device(ggml_backend_reg_dev_get(reg, i));
+            register_device(ggml_backend_reg_dev_get(reg, i), score);
+         }
+     }
+ 
+-    void register_device(ggml_backend_dev_t device) {
+    void register_device(ggml_backend_dev_t device, int score = -1) {
+ #ifndef NDEBUG
+         GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+ #endif
+-        devices.push_back(device);
+        devices.push_back({device, score});
+     }
+ 
+     ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
+@@ -257,7 +257,7 @@ struct ggml_backend_registry {
+ 
+         GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
+ 
+-        register_backend(reg, std::move(handle));
+        register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
+ 
+         return reg;
+     }
+@@ -280,7 +280,7 @@ struct ggml_backend_registry {
+         // remove devices
+         devices.erase(
+             std::remove_if(devices.begin(), devices.end(),
+-                            [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
+                            [reg](std::pair<ggml_backend_dev_t, int> dev) { return ggml_backend_dev_backend_reg(dev.first) == reg; }),
+             devices.end());
+ 
+         // remove backend
+@@ -338,7 +338,12 @@ size_t ggml_backend_dev_count() {
+ 
+ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
+     GGML_ASSERT(index < ggml_backend_dev_count());
+-    return get_reg().devices[index];
+    auto devices = get_reg().devices;
+    if (!std::is_heap(devices.begin(), devices.end())) {
+        std::make_heap(devices.begin(), devices.end(), [](const auto & a, const auto & b) { return a.second < b.second; });
+    }
+
+    return devices[index].first;
+ }
+ 
+ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
--- a/llama/patches/0015-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
+++ b/llama/patches/0015-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
@@ -0,0 +1,29 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Tue, 14 Jan 2025 15:59:04 -0800
+Subject: [PATCH] add phony target ggml-cpu for all cpu variants
+
+---
+ ggml/src/CMakeLists.txt | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
+index 84101c32..72b488dd 100644
+--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
+@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
+     endforeach()
+ 
+     ggml_add_cpu_backend_variant_impl(${tag_name})
+    add_dependencies(ggml-cpu ggml-cpu-${tag_name})
+ endfunction()
+ 
+ ggml_add_backend(CPU)
+@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
+     if (NOT GGML_BACKEND_DL)
+         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+     endif()
+    add_custom_target(ggml-cpu)
+     ggml_add_cpu_backend_variant(sandybridge    AVX)
+     ggml_add_cpu_backend_variant(haswell        AVX F16C AVX2 FMA)
+     ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 FMA AVX512)