llama: Decouple patching script from submodule (#7139)
* Refine llama.cpp vendoring workflow tools

Switch from the sync.sh script to make-based tooling

* Run the new make sync and patch flow
@@ -1,3 +1,14 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Thu, 6 Jun 2024 23:55:47 -0700
+Subject: [PATCH] cuda
+
+---
+ ggml/include/ggml-cuda.h | 2 ++
+ ggml/src/ggml-backend.c  | 5 +++++
+ ggml/src/ggml-cuda.cu    | 6 ++++--
+ 3 files changed, 11 insertions(+), 2 deletions(-)
+
 diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
 index 71bb6dcf..08be0895 100644
 --- a/ggml/include/ggml-cuda.h
@@ -1,3 +1,12 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Mon, 16 Sep 2024 15:53:13 -0700
+Subject: [PATCH] pretokenizer
+
+---
+ src/llama.cpp | 14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
 diff --git a/src/llama.cpp b/src/llama.cpp
 index 4c0a1bb6..800dfb95 100644
 --- a/src/llama.cpp
@@ -1,3 +1,12 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Mon, 16 Sep 2024 15:53:12 -0700
+Subject: [PATCH] metal
+
+---
+ ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
+ 1 file changed, 13 insertions(+), 17 deletions(-)
+
 diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
 index 9da08fe2..3a433703 100644
 --- a/ggml/src/ggml-metal.m
@@ -1,3 +1,12 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Wed, 12 Jun 2024 12:18:40 -0700
+Subject: [PATCH] ggml-metal
+
+---
+ ggml/src/ggml-metal.m | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
 diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
 index 3a433703..829c5e39 100644
 --- a/ggml/src/ggml-metal.m
@@ -1,28 +1,36 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Mon, 16 Sep 2024 15:53:14 -0700
+Subject: [PATCH] embeddings
+
+---
+ src/llama.cpp | 15 +++++++++------
+ 1 file changed, 9 insertions(+), 6 deletions(-)
+
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 4c0a1bb6..17e5bc2a 100644
+index 800dfb95..a639522d 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -16928,7 +16928,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+@@ -16920,7 +16920,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
      const auto n_embd = hparams.n_embd;
 
      // TODO: use a per-batch flag for logits presence instead
 -    const bool has_logits = !cparams.embeddings;
 +    const bool has_logits =  cparams.causal_attn;
      const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
      const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-@@ -17200,20 +17200,23 @@ static int llama_decode_internal(
+@@ -17192,20 +17192,23 @@ static int llama_decode_internal(
          // no output
          res  = nullptr;
          embd = nullptr;
 -    } else if (cparams.embeddings) {
 -        res  = nullptr; // do not extract logits for embedding case
 -        embd = nullptr;
 -        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
 +    }
 +
 +    if (cparams.embeddings) {
 +        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
          for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
 +            embd = ggml_graph_node(gf, i);
          if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
 -            embd = ggml_graph_node(gf, i);
@@ -39,5 +47,5 @@ index 4c0a1bb6..17e5bc2a 100644
 +        res = nullptr; // do not extract logits when not needed
 +    }
      // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
      ggml_backend_sched_alloc_graph(lctx.sched, gf);
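Taken together, the two hunks above decouple logit extraction from embedding extraction in llama.cpp: logit buffers are reserved whenever the model runs with causal attention, and pooled embeddings are located independently by scanning the graph. A minimal sketch of the resulting control flow, condensed from the hunks above (the guard on the final branch is assumed to mirror the has_logits condition, which is not fully visible in this excerpt):

    // Sketch of the post-patch extraction logic in llama_decode_internal.
    // Logits and embeddings are now independent: either, both, or neither
    // can be extracted from the same decode call.
    if (cparams.embeddings) {
        // walk the graph backwards to find the pooled-embeddings output
        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
            embd = ggml_graph_node(gf, i);
            if (strcmp(embd->name, "result_embd_pooled") == 0) {
                break;
            }
        }
    } else {
        embd = nullptr; // do not extract embeddings when not needed
    }

    if (!cparams.causal_attn) { // assumed guard, matching has_logits above
        res = nullptr; // do not extract logits when not needed
    }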
@@ -1,3 +1,12 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Mon, 16 Sep 2024 15:53:15 -0700
+Subject: [PATCH] clip-unicode
+
+---
+ examples/llava/clip.cpp | 40 +++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 39 insertions(+), 1 deletion(-)
+
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
 index 14e02c8d..6e849d8e 100644
 --- a/examples/llava/clip.cpp
@@ -1,5 +1,21 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Mon, 16 Sep 2024 15:53:16 -0700
+Subject: [PATCH] solar-pro
+
+solar-pro introduces block skip connections where blocks are connected
+to other, non-sequential blocks with a scale multiple
+
+this change adds 4 new keys to store the skip connections and one new
+tensor to store the scalar. the scalar is implemented as a 1-dimensional
+tensor with 2 elements derived from the model's bskcn_tv configuration.
+in general, the values are (bskcn_tv, 1 - bskcn_tv)
+---
+ src/llama.cpp | 269 +++++++++++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 255 insertions(+), 14 deletions(-)
+
 diff --git a/src/llama.cpp b/src/llama.cpp
-index bdad28b3..1fe6189a 100644
+index a639522d..83b80b59 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
 @@ -217,6 +217,7 @@ enum llm_arch {
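The description above implies each skip connection mixes a saved block output into the current hidden state using the two-element scale tensor. A hedged sketch of that blend in ggml terms, assuming the scale tensor holds { bskcn_tv, 1 - bskcn_tv } (the variable names are illustrative, since the patch body is truncated here):

    // cur:   current hidden state
    // skip:  saved output of an earlier, non-sequential block
    // scale: 1-D tensor with 2 elements { bskcn_tv, 1 - bskcn_tv }
    struct ggml_tensor * s0 = ggml_view_1d(ctx0, scale, 1, 0);
    struct ggml_tensor * s1 = ggml_view_1d(ctx0, scale, 1, ggml_element_size(scale));
    cur = ggml_add(ctx0,
            ggml_mul(ctx0, skip, s0),   // scaled skip input
            ggml_mul(ctx0, cur,  s1));  // scaled current state

ggml_mul broadcasts the single-element views across the hidden state, so the blend needs no explicit repeat.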
@@ -1,8 +1,17 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Wed, 9 Oct 2024 17:26:23 -0700
+Subject: [PATCH] conditional-fattn
+
+---
+ ggml/src/ggml-cuda.cu | 2 ++
+ 1 file changed, 2 insertions(+)
+
 diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
-index 8a844b02..61d61542 100644
+index 809d6ab1..fe77b81c 100644
 --- a/ggml/src/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda.cu
-@@ -2310,9 +2310,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2347,9 +2347,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
          case GGML_OP_ARGSORT:
              ggml_cuda_op_argsort(ctx, dst);
              break;
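The diffstat shows exactly two inserted lines in ggml_cuda_compute_forward, consistent with the subject conditional-fattn: wrapping a single dispatch case in a preprocessor guard. A hedged sketch of the likely shape (the guarded case and the macro name are assumptions; GGML_CUDA_FLASH_ATTN is illustrative):

    case GGML_OP_ARGSORT:
        ggml_cuda_op_argsort(ctx, dst);
        break;
    #ifdef GGML_CUDA_FLASH_ATTN  // hypothetical flag; makes flash attention opt-in at build time
    case GGML_OP_FLASH_ATTN_EXT:
        ggml_cuda_flash_attn_ext(ctx, dst);
        break;
    #endif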
@@ -1,3 +1,12 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Mon, 30 Sep 2024 16:31:04 -0700
+Subject: [PATCH] blas
+
+---
+ ggml/src/ggml-blas.cpp | 4 ++++
+ 1 file changed, 4 insertions(+)
+
 diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
 index 6d99c6be..8e1ab99d 100644
 --- a/ggml/src/ggml-blas.cpp