llama: update to commit 71e90e88 (#10192)

2025-12-11 08:17:03 +00:00 · 2025-04-16 18:14:01 -04:00
parent 369de832cd
commit 943464ccb8
160 changed files with 42219 additions and 33080 deletions
--- a/llama/patches/0003-embeddings.patch
+++ b/llama/patches/0003-embeddings.patch
@@ -1,52 +1,43 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:14 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 15:28:34 -0700
 Subject: [PATCH] embeddings

+allow a loaded model in llama.cpp to be used for
+both embeddings and causal attention text generation
+instead of forcing one or the other
 ---
- src/llama-context.cpp | 2 +-
- src/llama.cpp         | 6 ++++--
- 2 files changed, 5 insertions(+), 3 deletions(-)
+ src/llama-context.cpp | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 671d2a81..47e79ed4 100644
+index 4735e98e..65135172 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
+@@ -1232,7 +1232,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+     int64_t n_outputs_all = 0;
+ 
+     // count outputs
+-    if (batch.logits && !embd_pooled) {
++    if (batch.logits) {
+         for (uint32_t i = 0; i < n_tokens_all; ++i) {
+             n_outputs_all += batch.logits[i] != 0;
+         }
+@@ -1344,7 +1344,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
+         //}
+ 
+-        auto * t_logits = cparams.embeddings ? nullptr         : res->get_logits();
++        auto * t_logits = cparams.causal_attn ? res->get_logits() : nullptr;
+         auto * t_embd   = cparams.embeddings ? res->get_embd() : nullptr;
+ 
+         if (t_embd && res->get_embd_pooled()) {
+@@ -1488,7 +1488,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = !cparams.embeddings;
-+    const bool has_logits =  cparams.causal_attn;
-     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+-    bool has_logits = !cparams.embeddings;
++    bool has_logits =  cparams.causal_attn;
+     bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
-     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 607f2786..ac85bfed 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
-             res  = nullptr;
-             embd = nullptr;
-         } else if (cparams.embeddings) {
--            res  = nullptr; // do not extract logits for embedding case
-             embd = nullptr;
-             for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
-                 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
-@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
-                     break;
-                 }
-             }
--            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
-         } else {
-             embd = nullptr; // do not extract embeddings when not needed
-             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
-         }
- 
-+        if (!cparams.causal_attn) {
-+            res = nullptr; // do not extract logits when not needed
-+        }
-+
-         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
- 
-         ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
+     // TODO: hacky enc-dec support