llama: update to commit 71e90e88 (#10192)

This commit is contained in:
Jeffrey Morgan
2025-04-16 18:14:01 -04:00
committed by GitHub
parent 369de832cd
commit 943464ccb8
160 changed files with 42219 additions and 33080 deletions


@@ -24,10 +24,10 @@ problem.
9 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index dba7be33..65e150d6 100644
index 273075f4..dd11f304 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer->iface.free_buffer != NULL) {
buffer->iface.free_buffer(buffer);
}
@@ -35,7 +35,7 @@ index dba7be33..65e150d6 100644
}
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -542,6 +541,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -544,6 +543,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
free(ctx->buffers);
free(ctx);
@@ -43,7 +43,7 @@ index dba7be33..65e150d6 100644
}
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -1865,6 +1865,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_aligned_free(buffer->context, buffer->size);
@@ -55,7 +55,7 @@ index dba7be33..65e150d6 100644
}
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -1912,7 +1917,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
@@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
};
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -65,7 +65,7 @@ index dba7be33..65e150d6 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index d410c024..a207ab1e 100644
index cec36b36..4b057973 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@@ -76,7 +76,7 @@ index d410c024..a207ab1e 100644
}
/**
@@ -1198,6 +1199,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
@@ -1199,6 +1200,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -85,10 +85,10 @@ index d410c024..a207ab1e 100644
/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ebb2ccae..dfff21a2 100644
index fafe9633..59a49560 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -533,6 +533,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
@@ -96,7 +96,7 @@ index ebb2ccae..dfff21a2 100644
}
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -783,6 +784,7 @@ struct ggml_backend_cuda_split_buffer_context {
@@ -788,6 +789,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx;
@@ -104,7 +104,7 @@ index ebb2ccae..dfff21a2 100644
}
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1055,6 +1057,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
@@ -1061,6 +1063,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index c550142a..fd9a4e77 100644
index 9f1c6c6c..310afe8a 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -4641,6 +4641,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}
free(ctx);
@@ -137,10 +137,10 @@ index c550142a..fd9a4e77 100644
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index f5906246..062e93b8 100644
index b8b5cbd3..14d4561b 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -1203,6 +1203,7 @@ static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
@@ -1443,6 +1443,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
@@ -149,10 +149,10 @@ index f5906246..062e93b8 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 97873acc..893ee0b9 100644
index 862b9b66..34536681 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -419,6 +419,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -443,6 +443,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status);
delete ctx;
@@ -161,10 +161,10 @@ index 97873acc..893ee0b9 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 792e0569..5e233e8b 100644
index 3e48a924..a3d182fc 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -311,6 +311,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
ggml_sycl_set_device(ctx->device);
delete ctx;
@@ -172,7 +172,7 @@ index 792e0569..5e233e8b 100644
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -720,6 +721,7 @@ struct ggml_backend_sycl_split_buffer_context {
@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx;
@@ -180,7 +180,7 @@ index 792e0569..5e233e8b 100644
}
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1053,6 +1055,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context);
@@ -189,10 +189,10 @@ index 792e0569..5e233e8b 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index abe3e790..1dad714b 100644
index 783a0ff8..8ac1e07e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -7914,6 +7914,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -8639,6 +8639,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
@@ -200,7 +200,7 @@ index abe3e790..1dad714b 100644
}
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -8056,6 +8057,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -8782,6 +8783,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
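
One plausible reading of the hunk renumbering above (ggml_backend_buffer_free drops a line while every backend's *_buffer_free_buffer gains a few) is that ownership of the buffer object moves into the backend that allocated it, so new and delete run in the same module. A minimal standalone sketch of that pattern, using simplified stand-in types rather than the real ggml structs:

#include <cstddef>
#include <cstdio>
#include <cstdlib>

struct backend_buffer;

struct backend_buffer_iface {
    // releases the backend context AND the buffer object itself
    void (*free_buffer)(backend_buffer * buf);
};

struct backend_buffer {
    backend_buffer_iface iface;
    void *               context;
};

// hypothetical CPU-style backend: it allocated the buffer, so it deletes it too
static void cpu_buffer_free(backend_buffer * buf) {
    std::free(buf->context);
    delete buf; // freed in the same module that new'ed it
}

static backend_buffer * cpu_buffer_alloc(size_t size) {
    backend_buffer * buf    = new backend_buffer();
    buf->iface.free_buffer  = cpu_buffer_free;
    buf->context            = std::malloc(size);
    return buf;
}

// core-library side: it only dispatches and no longer deletes the buffer itself
static void buffer_free(backend_buffer * buf) {
    if (buf != NULL && buf->iface.free_buffer != NULL) {
        buf->iface.free_buffer(buf);
    }
}

int main() {
    backend_buffer * buf = cpu_buffer_alloc(1024);
    buffer_free(buf);
    std::printf("buffer released by its own backend\n");
    return 0;
}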


@@ -3,15 +3,17 @@ From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] pretokenizer
allow for an unset pretokenizer with a warning in the
logs instead of throwing an error
---
src/llama-vocab.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index ad9ffe66..a4eee9b8 100644
index 464ff01e..0125ee53 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1468,16 +1468,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
@@ -29,9 +31,9 @@ index ad9ffe66..a4eee9b8 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -1593,7 +1584,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "megrez") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
clean_spaces = false;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
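
The hunk above trades a hard error for a logged warning plus a fall back to the default pre-tokenizer. A minimal sketch of the same lookup-with-fallback pattern, with illustrative names rather than the real llama_vocab internals:

#include <cstdio>
#include <string>
#include <unordered_map>

enum pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_LLAMA3, PRE_TYPE_QWEN2 };

static pre_type resolve_pre_tokenizer(const std::string & tokenizer_pre) {
    static const std::unordered_map<std::string, pre_type> known = {
        {"default", PRE_TYPE_DEFAULT},
        {"llama3",  PRE_TYPE_LLAMA3},
        {"qwen2",   PRE_TYPE_QWEN2},
    };

    auto it = known.find(tokenizer_pre);
    if (it == known.end()) {
        // previously this path threw std::runtime_error("unknown pre-tokenizer type ...")
        std::fprintf(stderr, "%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
        return PRE_TYPE_DEFAULT;
    }
    return it->second;
}

int main() {
    resolve_pre_tokenizer("qwen2");        // known: resolved normally
    resolve_pre_tokenizer("not-a-thing");  // unknown: warns and falls back to default
    return 0;
}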


@@ -1,52 +1,43 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:14 -0700
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 15:28:34 -0700
Subject: [PATCH] embeddings
allow a loaded model in llama.cpp to be used for
both embeddings and causal attention text generation
instead of forcing one or the other
---
src/llama-context.cpp | 2 +-
src/llama.cpp | 6 ++++--
2 files changed, 5 insertions(+), 3 deletions(-)
src/llama-context.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 671d2a81..47e79ed4 100644
index 4735e98e..65135172 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
@@ -1232,7 +1232,7 @@ int llama_context::decode(llama_batch & inp_batch) {
int64_t n_outputs_all = 0;
// count outputs
- if (batch.logits && !embd_pooled) {
+ if (batch.logits) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0;
}
@@ -1344,7 +1344,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}
- auto * t_logits = cparams.embeddings ? nullptr : res->get_logits();
+ auto * t_logits = cparams.causal_attn ? res->get_logits() : nullptr;
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
if (t_embd && res->get_embd_pooled()) {
@@ -1488,7 +1488,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
- const bool has_logits = !cparams.embeddings;
+ const bool has_logits = cparams.causal_attn;
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
- bool has_logits = !cparams.embeddings;
+ bool has_logits = cparams.causal_attn;
bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index 607f2786..ac85bfed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
break;
}
}
- GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
} else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
// TODO: hacky enc-dec support
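
The change above keys logits on causal attention rather than on embeddings mode, so a single context can reserve space for both kinds of output. A small sketch of that bookkeeping, using simplified stand-in types (cparams_t, plan_outputs) rather than the llama.cpp ones:

#include <cstddef>
#include <cstdio>

struct cparams_t {
    bool embeddings;   // request pooled/sequence embeddings
    bool causal_attn;  // causal attention => logits are meaningful
    bool pooling_none; // stand-in for pooling_type == LLAMA_POOLING_TYPE_NONE
};

static void plan_outputs(const cparams_t & cp, int n_vocab, int n_embd, int n_outputs) {
    // before the patch: has_logits = !cp.embeddings;
    const bool has_logits = cp.causal_attn;
    const bool has_embd   = cp.embeddings && cp.pooling_none;

    const size_t logits_size = has_logits ? (size_t) n_vocab * n_outputs : 0;
    const size_t embd_size   = has_embd   ? (size_t) n_embd  * n_outputs : 0;

    std::printf("logits buffer: %zu floats, embeddings buffer: %zu floats\n",
                logits_size, embd_size);
}

int main() {
    // embeddings + causal attention: both buffers are reserved
    plan_outputs({ /*embeddings=*/true, /*causal_attn=*/true, /*pooling_none=*/true },
                 32000, 4096, 8);
    return 0;
}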


@@ -1,19 +1,21 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:15 -0700
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 15:34:37 -0700
Subject: [PATCH] clip-unicode
fixes loading vision models in llama.cpp on windows
filesystems for paths that include wide characters
---
examples/llava/clip.cpp | 40 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 39 insertions(+), 1 deletion(-)
examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 76d4a785..205af1eb 100644
index 49c90b75..4b72ea9f 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -58,6 +58,19 @@
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)
@@ -28,6 +28,19 @@
#include <cinttypes>
#include <limits>
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
@@ -28,49 +30,48 @@ index 76d4a785..205af1eb 100644
+#endif
+#endif
+
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS
@@ -1429,7 +1442,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
// RGB uint8 image
@@ -1402,8 +1415,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
-
+#ifdef _WIN32
+ int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
+ if (!wlen) {
+ return NULL;
+ }
+ wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+ wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
+ if (!wlen) {
+ free(wbuf);
+ return NULL;
+ }
+ int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
+ if (!wlen) {
+ throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__));
+ }
+ wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+ wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wbuf, wlen);
+ if (!wlen) {
+ free(wbuf);
+ throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__));
+ }
+#if __GLIBCXX__
+ int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
+ __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
+ std::istream fin(&buffer);
+ int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
+ __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
+ std::istream fin(&buffer);
+#else // MSVC
+ // unused in our current build
+ auto fin = std::ifstream(wbuf, std::ios::binary);
+ // unused in our current build
+ auto fin = std::ifstream(wbuf, std::ios::binary);
+#endif
+ free(wbuf);
+ free(wbuf);
+#else
auto fin = std::ifstream(fname, std::ios::binary);
auto fin = std::ifstream(fname, std::ios::binary);
+#endif
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
@@ -1443,7 +1477,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -1456,7 +1491,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
}
+#if defined(_WIN32) && defined(__GLIBCXX__)
+ close(fd);
+ close(fd);
+#else
fin.close();
fin.close();
+#endif
}
// vision model
LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
}
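
The patch above converts the UTF-8 path to UTF-16 with MultiByteToWideChar and then opens the file through a wide-character API. A simplified, self-contained sketch of that idea follows; open_utf8_path and the file name are illustrative, and note that the real patch routes GCC's libstdc++ through _wopen plus __gnu_cxx::stdio_filebuf because that library lacks a wide-path ifstream constructor before C++17:

#include <fstream>
#include <stdexcept>
#include <string>

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

static std::ifstream open_utf8_path(const std::string & fname) {
#if defined(_WIN32)
    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
    if (!wlen) {
        throw std::runtime_error("failed to convert filename to a wide string");
    }
    std::wstring wname(wlen, L'\0');
    if (!MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, &wname[0], wlen)) {
        throw std::runtime_error("failed to convert filename to a wide string");
    }
    // MSVC (and C++17 libraries) accept a wide-character path directly;
    // the patch itself goes through _wopen + __gnu_cxx::stdio_filebuf for GCC
    return std::ifstream(wname.c_str(), std::ios::binary);
#else
    return std::ifstream(fname, std::ios::binary);
#endif
}

int main() {
    auto fin = open_utf8_path("model.gguf"); // hypothetical file name
    return fin ? 0 : 1;
}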


@@ -1,47 +1,40 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:16 -0700
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 16:03:51 -0700
Subject: [PATCH] solar-pro
solar-pro introduces block skip connections where blocks are connected
to other, non-sequential blocks with a scale multiple.
this change adds 4 new keys to store the skip connections and one new
tensor to store the scalar. the scalar is implemented as a 1-dimensional
tensor with 2 elements derived from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
adds support for the Solar Pro architecture
---
src/llama-arch.cpp | 21 +++++
src/llama-arch.cpp | 21 ++++
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 ++
src/llama-hparams.h | 5 +
src/llama-model-loader.cpp | 1 +
src/llama-model.cpp | 44 +++++++++++
src/llama-model.cpp | 207 +++++++++++++++++++++++++++++++++++++
src/llama-model.h | 3 +
src/llama.cpp | 152 ++++++++++++++++++++++++++++++++++++-
8 files changed, 236 insertions(+), 1 deletion(-)
7 files changed, 248 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 97a1e7e5..a1e0ebcc 100644
index a6fddc7f..0b0fedcd 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1478,6 +1480,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -66,7 +59,7 @@ index 97a1e7e5..a1e0ebcc 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1671,6 +1691,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -75,18 +68,18 @@ index 97a1e7e5..a1e0ebcc 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 122fdceb..77919578 100644
index 2c2099b3..74aa3dd0 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -65,6 +65,7 @@ enum llm_arch {
@@ -72,6 +72,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
+ LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN,
};
@@ -129,6 +130,7 @@ enum llm_kv {
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
@@ -144,6 +145,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
@@ -94,7 +87,7 @@ index 122fdceb..77919578 100644
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -311,6 +313,7 @@ enum llm_tensor {
@@ -340,6 +342,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -103,14 +96,13 @@ index 122fdceb..77919578 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index ea87b295..f3955de9 100644
index 90dfe7a7..8a667960 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
// corresponds to Mamba's ssm_states size
@@ -70,6 +70,14 @@ uint32_t llama_hparams::n_embd_v_s() const {
return ssm_d_state * ssm_d_inner;
}
+
+bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
+ if (il < n_layer) {
+ return n_bskcn_arr[n][il] > 0;
@@ -118,12 +110,15 @@ index ea87b295..f3955de9 100644
+
+ GGML_ABORT("fatal error");
+}
\ No newline at end of file
+
bool llama_hparams::is_swa(uint32_t il) const {
if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 1fe45410..1bdcdfd5 100644
index 4e0b5719..c3147cbc 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -50,6 +50,8 @@ struct llama_hparams {
@@ -51,6 +51,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -132,18 +127,18 @@ index 1fe45410..1bdcdfd5 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -133,6 +135,9 @@ struct llama_hparams {
@@ -149,6 +151,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
+
+ // Block skip connection
+ bool n_bskcn(uint32_t n, uint32_t il) const;
+
bool is_swa(uint32_t il) const;
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 05d58ad9..1252aca1 100644
index ea73a8a7..a012aeae 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -439,6 +439,7 @@ namespace GGUFMeta {
@@ -155,10 +150,10 @@ index 05d58ad9..1252aca1 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 36a0a009..ad1315c6 100644
index b74dd72c..5fbd0055 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1372,6 +1372,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -180,7 +175,7 @@ index 36a0a009..ad1315c6 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -3701,6 +3716,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -215,54 +210,12 @@ index 36a0a009..ad1315c6 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
@@ -12244,6 +12287,165 @@ struct llm_build_chameleon : public llm_graph_context {
}
};
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..1afb0024 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -55,6 +55,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -281,6 +282,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7953,9 +7953,155 @@ struct llm_build_context {
cb(img_logits, "img_logits", -1);
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
cb(cur, "result_output", -1);
-
ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+struct llm_build_solar : public llm_graph_context {
+ llm_build_solar(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -270,13 +223,15 @@ index ac85bfed..6d320ea4 100644
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ struct ggml_tensor * bskcn_1;
+ struct ggml_tensor * bskcn_2;
@@ -305,88 +260,94 @@ index ac85bfed..6d320ea4 100644
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
@@ -394,25 +355,64 @@ index ac85bfed..6d320ea4 100644
+ }
+
+ cur = inpL;
+ cur = llm_build_norm(ctx0, cur, hparams,
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
return gf;
}
@@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph(
+ }
+};
+
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
@@ -12993,6 +13195,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
result = llm.build_chameleon();
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ result = llm.build_solar();
+ llm = std::make_unique<llm_build_solar>(*this, params, gf);
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
result = llm.build_wavtokenizer_dec();
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13139,6 +13345,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h
index 0f18dac1..e08d4ae4 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -62,6 +62,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -305,6 +306,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
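
The graph code above blends a previously saved activation back into the residual stream using the two weights stored in bskcn_tv, mirroring ggml_add(ggml_mul(bskcn, tv[0]), ggml_mul(inpSA, tv[1])). A small numeric sketch of that mix, with plain vectors standing in for ggml tensors and apply_bskcn as an illustrative name:

#include <cstdio>
#include <vector>

static std::vector<float> apply_bskcn(const std::vector<float> & skip_src, // activation saved at an earlier block
                                      const std::vector<float> & inp,      // current layer input (residual stream)
                                      float bskcn_tv) {
    // the patch stores this as a 2-element tensor: {bskcn_tv, 1 - bskcn_tv}
    const float tv[2] = { bskcn_tv, 1.0f - bskcn_tv };

    std::vector<float> out(inp.size());
    for (size_t i = 0; i < inp.size(); ++i) {
        out[i] = tv[0] * skip_src[i] + tv[1] * inp[i];
    }
    return out;
}

int main() {
    std::vector<float> saved = { 1.0f, 2.0f, 3.0f };
    std::vector<float> cur   = { 4.0f, 5.0f, 6.0f };
    std::vector<float> mixed = apply_bskcn(saved, cur, 0.25f);
    std::printf("%.2f %.2f %.2f\n", mixed[0], mixed[1], mixed[2]); // 3.25 4.25 5.25
    return 0;
}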


@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index dfff21a2..1b0d074b 100644
index 59a49560..b70c6a32 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2284,9 +2284,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2338,9 +2338,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst);
break;

File diff suppressed because it is too large


@@ -1,24 +1,27 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 17 Oct 2024 17:19:25 -0700
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 13 Apr 2025 22:10:06 -0400
Subject: [PATCH] add unpad operator
adds the unpad operator to GGML
---
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 58 ++++++++++++++++++++++++++++
ggml/src/ggml-cpu/ggml-cpu.c | 5 +++
ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++
ggml/src/ggml-cpu/ops.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++
ggml/src/ggml.c | 25 +++++++++++-
8 files changed, 220 insertions(+), 2 deletions(-)
ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
ggml/src/ggml.c | 25 ++++++++++++-
10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index dd0c6a96..8d269a9c 100644
index 8fcc16df..d19fc167 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -487,6 +487,7 @@ extern "C" {
@@ -488,6 +488,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
@@ -26,7 +29,7 @@ index dd0c6a96..8d269a9c 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1743,6 +1744,15 @@ extern "C" {
@@ -1757,6 +1758,15 @@ extern "C" {
int p0,
int p1);
@@ -43,13 +46,38 @@ index dd0c6a96..8d269a9c 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 72325349..2f606d82 100644
index 50400328..432942bf 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10844,6 +10844,59 @@ static void ggml_compute_forward_pad_reflect_1d(
@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 6050147b..66b8da68 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}
+// ggml_compute_forward_unpad
+
+static void ggml_compute_forward_unpad_f32(
+ const struct ggml_compute_params *params,
+ struct ggml_tensor *dst) {
@@ -85,7 +113,7 @@ index 72325349..2f606d82 100644
+ }
+}
+
+static void ggml_compute_forward_unpad(
+void ggml_compute_forward_unpad(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
@@ -106,30 +134,23 @@ index 72325349..2f606d82 100644
// ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32(
@@ -13137,6 +13190,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -13484,6 +13541,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 410a3720..3eca1cf8 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 1b0d074b..c7a957c8 100644
index b70c6a32..67208cba 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2200,6 +2200,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2245,6 +2245,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
@@ -139,16 +160,16 @@ index 1b0d074b..c7a957c8 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3199,6 +3202,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return ggml_is_contiguous(op->src[0]);
@@ -3223,6 +3226,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index aba539e8..b4b87409 100644
index 77432b04..7d45a7e1 100644
--- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -212,10 +233,10 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index fd9a4e77..e4c093f9 100644
index 310afe8a..b121ab9e 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -331,6 +331,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
@@ -223,23 +244,23 @@ index fd9a4e77..e4c093f9 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -946,6 +947,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1254,6 +1256,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE:
@@ -998,6 +999,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1339,6 +1341,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -3469,6 +3472,36 @@ static void ggml_metal_encode_node(
case GGML_OP_LEAKY_RELU:
@@ -3669,6 +3672,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
@@ -277,10 +298,10 @@ index fd9a4e77..e4c093f9 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index d092a169..f38909d0 100644
index b08666e2..e3185e5b 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2953,6 +2953,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -2968,6 +2968,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}
@@ -331,12 +352,12 @@ index d092a169..f38909d0 100644
+
kernel void kernel_arange_f32(
device char * dst,
constant int64_t & ne0,
constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7fc06724..635aa299 100644
index 950772c7..2276b631 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -962,6 +962,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
@@ -344,16 +365,16 @@ index 7fc06724..635aa299 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -996,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1059,6 +1060,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
@@ -361,16 +382,16 @@ index 7fc06724..635aa299 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1093,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4225,6 +4227,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}
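
Most of the CPU kernel body is elided in the hunks above, but assuming unpad is the inverse of pad, i.e. it drops trailing padding along each dimension and copies the remaining elements from the same coordinates, the operation can be sketched like this (unpad_2d is an illustrative stand-in, not the ggml API):

#include <cstdio>
#include <vector>

static std::vector<float> unpad_2d(const std::vector<float> & src,
                                   int src_w, int src_h,   // padded size
                                   int dst_w, int dst_h) { // original (unpadded) size
    std::vector<float> dst((size_t) dst_w * dst_h);
    for (int y = 0; y < dst_h; ++y) {
        for (int x = 0; x < dst_w; ++x) {
            // each destination element comes from the same coordinates in the source
            dst[(size_t) y * dst_w + x] = src[(size_t) y * src_w + x];
        }
    }
    return dst;
}

int main() {
    // a 4x3 padded tensor whose meaningful region is the top-left 2x2 block
    std::vector<float> padded = {
        1, 2, 0, 0,
        3, 4, 0, 0,
        0, 0, 0, 0,
    };
    std::vector<float> out = unpad_2d(padded, 4, 3, 2, 2);
    std::printf("%g %g\n%g %g\n", out[0], out[1], out[2], out[3]); // 1 2 / 3 4
    return 0;
}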


@@ -1,20 +1,21 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Fri, 25 Oct 2024 16:25:18 -0700
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 19:43:06 -0700
Subject: [PATCH] fix deepseek deseret regex
On Windows compiled with gcc, the C++ regex library failed to handle
the characters
on some systems, deepseek's regex would throw an error
on windows due to the deseret characters in the matching
regex
---
src/llama-vocab.cpp | 2 +-
src/unicode.cpp | 22 ++++++++++++++++++++++
2 files changed, 23 insertions(+), 1 deletion(-)
src/unicode.cpp | 21 +++++++++++++++++++++
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a4eee9b8..1ca827eb 100644
index 0125ee53..d74919d2 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -295,7 +295,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
"[\r\n]",
@@ -24,7 +25,7 @@ index a4eee9b8..1ca827eb 100644
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index e63bb4ab..9dd53b9a 100644
index e63bb4ab..73cb2b1a 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
@@ -39,7 +40,7 @@ index e63bb4ab..9dd53b9a 100644
#include "unicode.h"
#include "unicode-data.h"
@@ -200,6 +205,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
@@ -200,6 +205,21 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -58,11 +59,10 @@ index e63bb4ab..9dd53b9a 100644
+ free(wbuf);
+ return ret;
+#else
+
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
@@ -213,6 +234,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -213,6 +233,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif
return conv.from_bytes(s);
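
The unicode.cpp hunk adds a Windows branch that converts UTF-8 to a wide string via MultiByteToWideChar instead of std::codecvt (which the surrounding pragmas already flag as deprecated). A simplified sketch of that conversion, with wstring_from_utf8 as an illustrative name:

#include <codecvt>   // deprecated in C++17, as the pragmas in the original file note
#include <locale>
#include <stdexcept>
#include <string>

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

static std::wstring wstring_from_utf8(const std::string & s) {
    if (s.empty()) {
        return std::wstring();
    }
#if defined(_WIN32)
    int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), NULL, 0);
    if (wlen <= 0) {
        throw std::runtime_error("MultiByteToWideChar failed");
    }
    std::wstring ret(wlen, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), &ret[0], wlen);
    return ret;
#else
    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
    return conv.from_bytes(s);
#endif
}

int main() {
    // U+10400 DESERET CAPITAL LETTER LONG I, encoded as UTF-8
    std::wstring w = wstring_from_utf8("\xF0\x90\x90\x80");
    return w.empty() ? 1 : 0;
}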


@@ -1,14 +1,14 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: ParthSareen <parth.sareen@ollama.com>
Date: Wed, 11 Dec 2024 15:37:32 -0800
Subject: [PATCH] Maintain ordering for rules for grammar
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 19:43:40 -0700
Subject: [PATCH] maintain ordering for rules for grammar
---
common/json-schema-to-grammar.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 3ebcc3d9..30c28808 100644
index 90679822..56043678 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -346,7 +346,7 @@ private:


@@ -0,0 +1,361 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 15 Apr 2025 14:27:40 -0400
Subject: [PATCH] ensure KV cache is fully defragmented
Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.
In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
src/llama-context.cpp | 105 +++++++++++++----------------------------
src/llama-context.h | 4 +-
src/llama-kv-cache.cpp | 39 +++------------
src/llama-kv-cache.h | 9 +++-
4 files changed, 51 insertions(+), 106 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index afe6f552..d6e7b3af 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -590,13 +590,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const {
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
auto res = std::make_unique<llm_graph_result>();
const auto & hparams = model.hparams;
- const auto & ids = kv_self->defrag_info.ids;
-
#if 0
// CPU defrag
//
@@ -668,32 +667,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
-
- if (i == id || id == ids.size()) {
- continue;
- }
-
- uint32_t nm = 1;
-
- while (i + nm < ids.size() && ids[i + nm] == id + nm) {
- nm++;
- }
-
+ for (const auto & move : moves) {
for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -701,34 +688,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
- ggml_row_size(kv_self->v_l[il]->type, i));
+ ggml_row_size(kv_self->v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
- ggml_row_size(kv_self->v_l[il]->type, id));
+ ggml_row_size(kv_self->v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
}
-
- i += nm - 1;
}
-
- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif
return res;
@@ -737,8 +720,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
- bool need_reserve = false;
-
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -759,8 +740,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -775,49 +754,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!kv->defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return;
+ }
- if (kv->defrag_prepare(graph_max_nodes())) {
- ggml_backend_sched_reset(sched.get());
+ for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
+ chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
+ ggml_backend_sched_reset(sched.get());
auto * gf = graph_init();
-
- auto res = build_kv_self_defrag(ctx_compute.get(), gf);
-
+ auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
ggml_backend_sched_alloc_graph(sched.get(), gf);
-
res->set_inputs(nullptr);
-
graph_compute(gf, false);
-
- need_reserve = true;
}
kv->do_defrag = false;
}
-
- // reserve a worst case graph if needed
- if (need_reserve) {
- LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
-
- // build worst-case graph
- uint32_t n_seqs = 1; // TODO: worst-case number of sequences
- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- // simulate full KV cache
- kv_self->n = kv_self->size;
-
- llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
- auto * gf = graph_init();
- graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
-
- // initialize scheduler with the worst-case graph
- ggml_backend_sched_reset(sched.get());
- if (!ggml_backend_sched_reserve(sched.get(), gf)) {
- LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
- }
- }
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1301,9 +1259,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag();
+ kv_self_update();
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index baa03276..a59ff8fd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
+#include "llama-kv-cache.h"
#include "ggml-cpp.h"
@@ -180,7 +181,8 @@ private:
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const;
// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 9310f262..5c941e7c 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
assert(n_used <= n_kv);
- //const int64_t t_start = ggml_time_us();
-
- // number of cells moved
- uint32_t n_moves = 0;
-
- // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
- // - source view, destination view, copy operation
- // - x2 for keys and values
- //const uint32_t max_moves = max_nodes()/(6*n_layer);
- // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
- const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
+ defrag_info.moves.clear();
// determine which KV cells to move where
//
@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
- auto & ids = defrag_info.ids;
-
- ids.clear();
- ids.resize(n_kv, n_kv);
+ std::vector<uint32_t> ids(n_kv, n_kv);
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0];
@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory?
bool cont = false;
- // should we stop searching for the next move?
- bool stop = false;
-
// go back and move the nf cells to the hole
for (; i1 < n_kv; ++i1) {
auto & cell1 = cells[i1];
if (cell1.is_empty() || ids[i1] != n_kv) {
- if (n_moves == max_moves) {
- stop = true;
- break;
- }
-
cont = false;
continue;
}
@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used;
if (!cont) {
- n_moves++;
+ defrag_info.moves.push_back({i1, i0 + nf, 1});
cont = true;
+ } else {
+ defrag_info.moves.back().len++;
}
nf++;
@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
}
}
- if (stop || n_moves == max_moves) {
- break;
- }
-
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
i0 += nh - 1;
}
- if (n_moves == 0) {
+ if (defrag_info.moves.size() == 0) {
return false;
}
- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
+ // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
return true;
}
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 56c74035..25cbcb56 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -43,6 +43,13 @@ private:
llama_kv_cache * kv;
};
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+ uint32_t src;
+ uint32_t dst;
+ uint32_t len;
+};
+
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
@@ -131,7 +138,7 @@ public:
// defrag
struct {
- std::vector<uint32_t> ids;
+ std::vector<llama_kv_defrag_move> moves;
} defrag_info;
// return true if cells have been moved

View File

@@ -1,242 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 13 Dec 2024 16:11:59 -0800
Subject: [PATCH] llama: Ensure KV cache is fully defragmented.
Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.
In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
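
As an illustration of the batching described above, here is a minimal host-side C++ sketch; the struct mirrors the one introduced by this patch, while defrag_in_chunks and build_and_compute are placeholders for the graph build/schedule/compute sequence shown in the diff below, not a real API:

#include <algorithm>
#include <cstdint>
#include <functional>
#include <vector>

// block of KV cells to relocate (mirrors the struct introduced by this patch)
struct llama_kv_defrag_move {
    uint32_t src;
    uint32_t dst;
    uint32_t len;
};

// sketch: apply the full move list in chunks so each graph stays under the
// node budget (each move costs roughly 6 tensors per layer, plus overhead)
static void defrag_in_chunks(
        const std::vector<llama_kv_defrag_move> & moves,
        uint32_t n_max_nodes,
        uint32_t n_layer,
        const std::function<void(const std::vector<llama_kv_defrag_move> &)> & build_and_compute) {
    const size_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
    if (max_moves == 0) {
        return; // graph too small to hold even one move
    }
    for (size_t i = 0; i < moves.size(); i += max_moves) {
        const size_t end = std::min(i + max_moves, moves.size());
        std::vector<llama_kv_defrag_move> chunk(moves.begin() + i, moves.begin() + end);
        build_and_compute(chunk); // one defrag graph per chunk, until all moves are applied
    }
}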
---
src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
1 file changed, 46 insertions(+), 53 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 8f7902df..01854fce 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1054,6 +1054,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
}
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+ uint32_t src;
+ uint32_t dst;
+ uint32_t len;
+};
+
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -1230,35 +1237,23 @@ struct llm_build_context {
return gf;
}
- struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+ struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
-
- if (i == id || id == ids.size()) {
- continue;
- }
-
- uint32_t nm = 1;
-
- while (i + nm < ids.size() && ids[i + nm] == id + nm) {
- nm++;
- }
-
+ for (const auto & move : moves) {
for (int il = 0; il < n_layer; ++il) {
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -1266,31 +1261,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, i));
+ ggml_row_size(kv_self.v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, id));
+ ggml_row_size(kv_self.v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
}
-
- i += nm - 1;
}
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -8508,7 +8501,7 @@ struct llm_build_context {
}
};
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
llama_ubatch dummy = {};
dummy.equal_seqs = true;
@@ -8518,7 +8511,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
llm.init();
- struct ggml_cgraph * result = llm.build_defrag(ids);
+ struct ggml_cgraph * result = llm.build_defrag(moves);
llm.free();
@@ -8956,7 +8949,12 @@ static int llama_prepare_ubatch(
kv_self.head = 0;
}
- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
@@ -9431,8 +9429,8 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
//const int64_t t_start = ggml_time_us();
- // number of cells moved
- uint32_t n_moves = 0;
+ // groups of cells moved
+ std::vector<struct llama_kv_defrag_move> moves;
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -9496,19 +9494,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;
- // should we stop searching for the next move?
- bool stop = false;
-
// go back and move the nf cells to the hole
for (; i1 < n_kv; ++i1) {
auto & cell1 = kv_self.cells[i1];
if (cell1.is_empty() || ids[i1] != n_kv) {
- if (n_moves == max_moves) {
- stop = true;
- break;
- }
-
cont = false;
continue;
}
@@ -9524,8 +9514,10 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
kv_self.head = n_used;
if (!cont) {
- n_moves++;
+ moves.push_back({i1, i0 + nf, 1});
cont = true;
+ } else {
+ moves.back().len++;
}
nf++;
@@ -9535,22 +9527,16 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
}
}
- if (stop || n_moves == max_moves) {
- break;
- }
-
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
i0 += nh - 1;
}
- if (n_moves == 0) {
+ if (moves.size() == 0) {
return;
}
- //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
- //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+ //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
#if 0
// CPU defrag
@@ -9625,11 +9611,18 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
#else
// ggml_graph defrag
- ggml_backend_sched_reset(lctx.sched.get());
+ for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, moves.size());
+ chunk.assign(moves.begin() + i, moves.begin() + end);
- ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+ ggml_backend_sched_reset(lctx.sched.get());
+
+ //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+ ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+ }
#endif
//const int64_t t_end = ggml_time_us();

View File

@@ -1,17 +1,20 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 Jan 2025 12:01:24 -0800
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:31:38 -0700
Subject: [PATCH] sort devices by score
in the ggml backend loading code, devices
are now sorted by score, ensuring the device
with the fastest acceleration is used first
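
A rough sketch of the ordering policy, assuming a simple name/score pair in place of the real device handles (the actual change is in the diff below):

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// stand-in for the real (device handle, score) entries kept by the registry
using scored_device = std::pair<std::string, int>;

// sketch: keep the device list ordered by score so the highest-scoring
// (fastest) device is always at the front
static void register_device(std::vector<scored_device> & devices, scored_device dev) {
    devices.push_back(std::move(dev));
    std::stable_sort(devices.begin(), devices.end(),
            [](const scored_device & a, const scored_device & b) {
                return a.second > b.second; // higher score first
            });
}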
---
ggml/src/ggml-backend-reg.cpp | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 95036ef8..98d5e14d 100644
index 82ae1b5b..1487f322 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
@@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {
struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends;
@@ -20,7 +23,7 @@ index 95036ef8..98d5e14d 100644
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
@@ -195,7 +195,7 @@ struct ggml_backend_registry {
@@ -202,7 +202,7 @@ struct ggml_backend_registry {
}
}
@@ -29,7 +32,7 @@ index 95036ef8..98d5e14d 100644
if (!reg) {
return;
}
@@ -206,15 +206,20 @@ struct ggml_backend_registry {
@@ -213,15 +213,20 @@ struct ggml_backend_registry {
#endif
backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
@@ -52,17 +55,17 @@ index 95036ef8..98d5e14d 100644
+ );
}
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
@@ -257,7 +262,7 @@ struct ggml_backend_registry {
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
@@ -265,7 +270,7 @@ struct ggml_backend_registry {
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
- register_backend(reg, std::move(handle));
+ register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
return reg;
}
@@ -280,7 +285,7 @@ struct ggml_backend_registry {
@@ -288,7 +293,7 @@ struct ggml_backend_registry {
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
@@ -71,7 +74,7 @@ index 95036ef8..98d5e14d 100644
devices.end());
// remove backend
@@ -338,7 +343,7 @@ size_t ggml_backend_dev_count() {
@@ -346,7 +351,7 @@ size_t ggml_backend_dev_count() {
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count());

View File

@@ -1,102 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 4 Jan 2025 22:52:48 -0800
Subject: [PATCH] use dynamic backend loading for clip
---
examples/llava/clip.cpp | 74 +++++++++++++++--------------------------
1 file changed, 27 insertions(+), 47 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 205af1eb..560021c7 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -9,25 +9,25 @@
#include "ggml-backend.h"
#include "gguf.h"
-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//#include "ggml-sycl.h"
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//#include "ggml-metal.h"
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//#include "ggml-cann.h"
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//#include "ggml-vulkan.h"
-//#endif
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -1309,35 +1309,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}
-//#ifdef GGML_USE_CUDA
-// new_clip->backend = ggml_backend_cuda_init(0);
-// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_METAL
-// new_clip->backend = ggml_backend_metal_init();
-// LOG_INF("%s: CLIP using Metal backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_CANN
-// new_clip->backend = ggml_backend_cann_init(0);
-// LOG_INF("%s: CLIP using CANN backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-// new_clip->backend = ggml_backend_vk_init(0);
-// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-// new_clip->backend = ggml_backend_sycl_init(0);
-// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-//#endif
-
- if (!new_clip->backend) {
- new_clip->backend = ggml_backend_cpu_init();
- LOG_INF("%s: CLIP using CPU backend\n", __func__);
+ ggml_backend_t backend = ggml_backend_init_best();
+ if (backend == nullptr) {
+ LOG_ERR("%s: failed to initialize backend\n", __func__);
+ clip_free(new_clip);
+ gguf_free(ctx);
+ return nullptr;
}
+ LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend));
+ new_clip->backend = backend;
// model size and capabilities
{

View File

@@ -1,6 +1,6 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 Jan 2025 15:59:04 -0800
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:32:07 -0700
Subject: [PATCH] add phony target ggml-cpu for all cpu variants
---
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 0002ac18..0a8d1092 100644
index f00700da..91d6a7d5 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -297,6 +297,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,11 +19,11 @@ index 0002ac18..0a8d1092 100644
endfunction()
ggml_add_backend(CPU)
@@ -305,6 +306,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
+ add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)

View File

@@ -0,0 +1,25 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:33:01 -0700
Subject: [PATCH] remove amx
disable amx as it reduces performance on some systems
---
ggml/src/CMakeLists.txt | 4 ----
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 91d6a7d5..d6b393a2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
- if (NOT MSVC)
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
endif()

View File

@@ -1,8 +1,11 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Wed, 5 Mar 2025 17:41:07 -0800
Date: Tue, 8 Apr 2025 20:35:53 -0700
Subject: [PATCH] fix string arr kv loading
certain models would fail to load when reading
kv metadata fields that contain an array of strings,
such as vocab fields
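
For context, a hedged sketch of how such a string-array field can be read element by element with the public gguf accessors (the accessor names appear in the headers touched below; the key name is only an example):

#include <cstdint>
#include <string>
#include <vector>

#include "gguf.h"

// sketch: read a string-array metadata field element by element with the
// public accessors instead of treating it as raw array data;
// "tokenizer.ggml.tokens" is only an example key
static std::vector<std::string> read_string_array(const struct gguf_context * ctx, const char * key) {
    std::vector<std::string> out;
    const int64_t kid = gguf_find_key(ctx, key);
    if (kid < 0 || gguf_get_arr_type(ctx, kid) != GGUF_TYPE_STRING) {
        return out; // missing key or not a string array
    }
    const size_t n = gguf_get_arr_n(ctx, kid);
    out.reserve(n);
    for (size_t i = 0; i < n; ++i) {
        out.emplace_back(gguf_get_arr_str(ctx, kid, i));
    }
    return out;
}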
---
ggml/include/gguf.h | 1 +
ggml/src/gguf.cpp | 7 +++++--
@@ -22,7 +25,7 @@ index 79ee2020..3efb22f0 100644
// get ith C string from array with given key_id
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ab13669c..f75b923f 100644
index 381a9c7d..e45b453d 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -777,10 +777,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
@@ -50,10 +53,10 @@ index ab13669c..f75b923f 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index c7ff28be..7a185443 100644
index d74919d2..c90f636c 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1443,7 +1443,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {

View File

@@ -1,369 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 16 Feb 2025 20:00:22 -0500
Subject: [PATCH] use std::filesystem::path instead of wstring
---
ggml/src/ggml-backend-reg.cpp | 199 +++++++++++++++-------------------
1 file changed, 88 insertions(+), 111 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 98d5e14d..799af5f3 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -66,26 +66,6 @@
#include "ggml-kompute.h"
#endif
-// disable C++17 deprecation warning for std::codecvt_utf8
-#if defined(__clang__)
-# pragma clang diagnostic push
-# pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-static std::wstring utf8_to_utf16(const std::string & str) {
- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
- return converter.from_bytes(str);
-}
-
-static std::string utf16_to_utf8(const std::wstring & str) {
- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
- return converter.to_bytes(str);
-}
-
-#if defined(__clang__)
-# pragma clang diagnostic pop
-#endif
-
#ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>;
@@ -96,7 +76,7 @@ struct dl_handle_deleter {
}
};
-static dl_handle * dl_load_library(const std::wstring & path) {
+static dl_handle * dl_load_library(const std::filesystem::path & path) {
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -129,8 +109,8 @@ struct dl_handle_deleter {
}
};
-static void * dl_load_library(const std::wstring & path) {
- dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::filesystem::path & path) {
+ dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
return handle;
}
@@ -141,6 +121,25 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
#endif
+static std::string path_to_string(const std::filesystem::path & path)
+{
+#ifdef _WIN32
+ const std::wstring wstr = path.wstring();
+ const int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, nullptr, 0, nullptr, nullptr);
+ if (size_needed <= 0) {
+ return std::string();
+ }
+
+ // size_needed includes the null terminator
+ std::string str(size_needed - 1, '\0');
+ WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, str.data(), size_needed, nullptr, nullptr);
+ return str;
+#else
+ return path.string();
+#endif
+}
+
+
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
struct ggml_backend_reg_entry {
@@ -222,11 +221,11 @@ struct ggml_backend_registry {
);
}
- ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
+ ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
dl_handle_ptr handle { dl_load_library(path) };
if (!handle) {
if (!silent) {
- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(path).c_str());
}
return nullptr;
}
@@ -234,7 +233,7 @@ struct ggml_backend_registry {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn && score_fn() == 0) {
if (!silent) {
- GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
+ GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path_to_string(path).c_str());
}
return nullptr;
}
@@ -242,7 +241,7 @@ struct ggml_backend_registry {
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
if (!backend_init_fn) {
if (!silent) {
- GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
+ GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path_to_string(path).c_str());
}
return nullptr;
}
@@ -251,16 +250,16 @@ struct ggml_backend_registry {
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) {
if (!reg) {
- GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
+ GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path_to_string(path).c_str());
} else {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
- __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
+ __func__, path_to_string(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
}
}
return nullptr;
}
- GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
+ GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_to_string(path).c_str());
register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
@@ -396,14 +395,14 @@ ggml_backend_t ggml_backend_init_best(void) {
// Dynamic loading
ggml_backend_reg_t ggml_backend_load(const char * path) {
- return get_reg().load_backend(utf8_to_utf16(path), false);
+ return get_reg().load_backend(path, false);
}
void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true);
}
-static std::wstring get_executable_path() {
+static std::filesystem::path get_executable_path() {
#if defined(__APPLE__)
// get executable path
std::vector<char> path;
@@ -415,15 +414,9 @@ static std::wstring get_executable_path() {
}
path.resize(size);
}
- std::string base_path(path.data(), size);
- // remove executable name
- auto last_slash = base_path.find_last_of('/');
- if (last_slash != std::string::npos) {
- base_path = base_path.substr(0, last_slash);
- }
- return utf8_to_utf16(base_path + "/");
+
+ return std::filesystem::path(path.data()).parent_path();
#elif defined(__linux__) || defined(__FreeBSD__)
- std::string base_path = ".";
std::vector<char> path(1024);
while (true) {
// get executable path
@@ -436,76 +429,55 @@ static std::wstring get_executable_path() {
break;
}
if (len < (ssize_t) path.size()) {
- base_path = std::string(path.data(), len);
- // remove executable name
- auto last_slash = base_path.find_last_of('/');
- if (last_slash != std::string::npos) {
- base_path = base_path.substr(0, last_slash);
- }
- break;
+ return std::filesystem::path(path.data()).parent_path();
}
path.resize(path.size() * 2);
}
-
- return utf8_to_utf16(base_path + "/");
#elif defined(_WIN32)
std::vector<wchar_t> path(MAX_PATH);
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
if (len == 0) {
return {};
}
- std::wstring base_path(path.data(), len);
- // remove executable name
- auto last_slash = base_path.find_last_of('\\');
- if (last_slash != std::string::npos) {
- base_path = base_path.substr(0, last_slash);
- }
- return base_path + L"\\";
-#else
- return {};
-#endif
-}
-static std::wstring backend_filename_prefix() {
-#ifdef _WIN32
- return L"ggml-";
-#else
- return L"libggml-";
+ return std::filesystem::path(path.data()).parent_path();
#endif
+ return {};
}
-static std::wstring backend_filename_suffix() {
+static std::string backend_filename_prefix() {
#ifdef _WIN32
- return L".dll";
+ return "ggml-";
#else
- return L".so";
+ return "libggml-";
#endif
}
-static std::wstring path_separator() {
+static std::string backend_filename_suffix() {
#ifdef _WIN32
- return L"\\";
+ return ".dll";
#else
- return L"/";
+ return ".so";
#endif
}
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths
- std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
- std::vector<std::wstring> search_paths;
+ namespace fs = std::filesystem;
+ std::string file_prefix = backend_filename_prefix() + name + "-";
+ std::vector<fs::path> search_paths;
+
if (user_search_path == nullptr) {
- search_paths.push_back(L"." + path_separator());
+ search_paths.push_back(fs::current_path());
search_paths.push_back(get_executable_path());
} else {
- search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
+ search_paths.push_back(fs::u8path(user_search_path));
}
int best_score = 0;
- std::wstring best_path;
+ fs::path best_path;
- namespace fs = std::filesystem;
for (const auto & search_path : search_paths) {
if (!fs::exists(search_path)) {
continue;
@@ -513,29 +485,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) {
- std::wstring filename = entry.path().filename().wstring();
- std::wstring ext = entry.path().extension().wstring();
+ std::string filename = entry.path().filename().string();
+ std::string ext = entry.path().extension().string();
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
- dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
- if (!handle && !silent) {
- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
+ dl_handle_ptr handle { dl_load_library(entry.path()) };
+ if (!handle) {
+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(entry.path()).c_str());
+ continue;
}
- if (handle) {
- auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
- if (score_fn) {
- int s = score_fn();
-#ifndef NDEBUG
- GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
-#endif
- if (s > best_score) {
- best_score = s;
- best_path = entry.path().wstring();
- }
- } else {
- if (!silent) {
- GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
- }
- }
+
+ auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+ if (!score_fn) {
+ GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, path_to_string(entry.path()).c_str());
+ continue;
+ }
+
+ int s = score_fn();
+ GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_to_string(entry.path()).c_str(), s);
+ if (s > best_score) {
+ best_score = s;
+ best_path = entry.path();
}
}
}
@@ -545,7 +514,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if (best_score == 0) {
// try to load the base backend
for (const auto & search_path : search_paths) {
- std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
+ fs::path path = fs::path(search_path) / (backend_filename_prefix() + name + backend_filename_suffix());
if (fs::exists(path)) {
return get_reg().load_backend(path, silent);
}
@@ -560,6 +529,14 @@ void ggml_backend_load_all() {
ggml_backend_load_all_from_path(nullptr);
}
+static void ggml_backend_try_load_best(const char * name, bool silent, const char * user_search_path) {
+ try {
+ ggml_backend_load_best(name, silent, user_search_path);
+ } catch (const std::exception & e) {
+ GGML_LOG_DEBUG("%s: failed to load %s: %s\n", __func__, name, e.what());
+ }
+}
+
void ggml_backend_load_all_from_path(const char * dir_path) {
#ifdef NDEBUG
bool silent = true;
@@ -567,18 +544,18 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
bool silent = false;
#endif
- ggml_backend_load_best("blas", silent, dir_path);
- ggml_backend_load_best("cann", silent, dir_path);
- ggml_backend_load_best("cuda", silent, dir_path);
- ggml_backend_load_best("hip", silent, dir_path);
- ggml_backend_load_best("kompute", silent, dir_path);
- ggml_backend_load_best("metal", silent, dir_path);
- ggml_backend_load_best("rpc", silent, dir_path);
- ggml_backend_load_best("sycl", silent, dir_path);
- ggml_backend_load_best("vulkan", silent, dir_path);
- ggml_backend_load_best("opencl", silent, dir_path);
- ggml_backend_load_best("musa", silent, dir_path);
- ggml_backend_load_best("cpu", silent, dir_path);
+ ggml_backend_try_load_best("blas", silent, dir_path);
+ ggml_backend_try_load_best("cann", silent, dir_path);
+ ggml_backend_try_load_best("cuda", silent, dir_path);
+ ggml_backend_try_load_best("hip", silent, dir_path);
+ ggml_backend_try_load_best("kompute", silent, dir_path);
+ ggml_backend_try_load_best("metal", silent, dir_path);
+ ggml_backend_try_load_best("rpc", silent, dir_path);
+ ggml_backend_try_load_best("sycl", silent, dir_path);
+ ggml_backend_try_load_best("vulkan", silent, dir_path);
+ ggml_backend_try_load_best("opencl", silent, dir_path);
+ ggml_backend_try_load_best("musa", silent, dir_path);
+ ggml_backend_try_load_best("cpu", silent, dir_path);
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
const char * backend_path = std::getenv("GGML_BACKEND_PATH");
if (backend_path) {

View File

@@ -1,6 +1,6 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Sun, 9 Mar 2025 14:44:16 -0700
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:36:41 -0700
Subject: [PATCH] ollama debug tensor
---
@@ -8,11 +8,11 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 2f606d82..ec60e8fc 100644
index 432942bf..6d4abe4c 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -11,6 +11,8 @@
#include "ggml-threading.h"
@@ -15,6 +15,8 @@
#include "ops.h"
#include "ggml.h"
+#include "ollama-debug.h"
@@ -20,7 +20,7 @@ index 2f606d82..ec60e8fc 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -14103,6 +14105,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);

View File

@@ -1,24 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 18 Feb 2025 14:47:21 -0800
Subject: [PATCH] remove amx
---
ggml/src/CMakeLists.txt | 4 ----
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 0a8d1092..4564df91 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -312,10 +312,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
- if (NOT MSVC)
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
endif()

View File

@@ -0,0 +1,96 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:39:32 -0700
Subject: [PATCH] add model quantizations
a temporary patch to add model quantization for
models not supported in llama.cpp
---
src/llama-arch.cpp | 17 +++++++++++++++++
src/llama-arch.h | 1 +
src/llama-model.cpp | 2 ++
src/llama-quant.cpp | 4 ++++
4 files changed, 24 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c1f78618..bdf3d898 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
+ { LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -1582,6 +1583,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
+ {
+ LLM_ARCH_MISTRAL3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ }
+ },
{
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index f987844d..ee081fbf 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -75,6 +75,7 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
+ LLM_ARCH_MISTRAL3,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d5ad466e..cd1d239c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1423,6 +1423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_MISTRAL3: break;
default: throw std::runtime_error("unsupported model architecture");
}
@@ -13652,6 +13653,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
+ case LLM_ARCH_MISTRAL3:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 223e1f3f..8ae6dde8 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+ // don't quantize vision stuff
+ quantize &= name.find("v.") == std::string::npos;
+ quantize &= name.find("mm.") == std::string::npos;
+
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);

View File

@@ -1,36 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 25 Feb 2025 19:14:51 -0800
Subject: [PATCH] fix-clip-compiler-error
---
examples/llava/clip.cpp | 2 +-
examples/llava/clip.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 560021c7..54265beb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1788,7 +1788,7 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
}
}
-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img) {
img->nx = nx;
img->ny = ny;
img->buf.resize(3 * nx * ny);
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index ce6f6194..f9f80d7d 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -75,7 +75,7 @@ CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
-CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img);
+CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

View File

@@ -1,18 +1,19 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Wed, 2 Apr 2025 15:26:15 -0700
Subject: [PATCH] metal: add op_neg
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:41:24 -0700
Subject: [PATCH] add op_neg
adds the neg operator to ggml
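
The operator itself is simple element-wise negation; a minimal CPU reference of the intended semantics, for illustration only (the Metal kernel added below is the real change):

#include <cstddef>

// reference semantics of the new NEG unary op on contiguous f32 data:
// dst[i] = -src[i] for every element
static void neg_f32(const float * src, float * dst, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        dst[i] = -src[i];
    }
}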
---
ggml/src/ggml-metal/ggml-metal.m | 15 +++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 7 +++++++
2 files changed, 22 insertions(+)
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index e4c093f9..d8422f1b 100644
index b121ab9e..fea50521 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -423,6 +423,7 @@ enum ggml_metal_kernel_type {
@@ -461,6 +461,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_SQRT,
GGML_METAL_KERNEL_TYPE_SIN,
GGML_METAL_KERNEL_TYPE_COS,
@@ -20,23 +21,23 @@ index e4c093f9..d8422f1b 100644
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
@@ -1039,6 +1040,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
@@ -1202,6 +1204,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1119,6 +1120,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
@@ -1280,6 +1282,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_ELU:
+ case GGML_UNARY_OP_NEG:
return ggml_is_contiguous(op->src[0]);
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
default:
return false;
@@ -1873,6 +1876,18 @@ static void ggml_metal_encode_node(
@@ -1966,6 +1969,18 @@ static void ggml_metal_encode_node(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
@@ -56,10 +57,10 @@ index e4c093f9..d8422f1b 100644
{
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index f38909d0..bb0ff668 100644
index e3185e5b..ede9d1e6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -945,6 +945,13 @@ kernel void kernel_cos(
@@ -949,6 +949,13 @@ kernel void kernel_cos(
dst[tpig] = cos(src0[tpig]);
}

View File

@@ -1,80 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Thu, 27 Feb 2025 15:12:26 -0800
Subject: [PATCH] add phi4 support
---
include/llama.h | 1 +
src/llama-model.cpp | 10 +++++++---
src/llama-vocab.cpp | 11 +++++++++++
3 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/include/llama.h b/include/llama.h
index cc948005..16774711 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -105,6 +105,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+ LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
};
enum llama_rope_type {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 21819080..ab1a07d1 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2283,7 +2283,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
@@ -2298,8 +2302,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
}
} break;
case LLM_ARCH_PHIMOE:
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 1ca827eb..c7ff28be 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -392,6 +392,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+ // original regex from tokenizer.json
+ // [^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+
+ regex_exprs = {
+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ };
+ break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@@ -1583,6 +1590,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} else if (
tokenizer_pre == "megrez") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "gpt-4o") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+ clean_spaces = false;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

View File

@@ -0,0 +1,39 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:49:50 -0700
Subject: [PATCH] fix compiler error in clip.h
fixes an error that occurs in clip.h when compiling
using CGo
---
examples/llava/clip.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index cc133a58..5fc45d3e 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -30,12 +30,13 @@ struct clip_image_size {
int height;
};
+struct clip_image_f32;
struct clip_image_u8_batch;
struct clip_image_f32_batch;
struct clip_context_params {
bool use_gpu;
- ggml_log_level verbosity;
+ enum ggml_log_level verbosity;
};
// deprecated, use clip_init
@@ -84,7 +85,7 @@ CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
-CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
/**
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.

View File

@@ -0,0 +1,600 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 12 Apr 2025 13:06:57 -0700
Subject: [PATCH] Revert "Simplify and improve CUDA graphs through use of
indirect copy pointers (#9017)"
this commit in llama.cpp causes errors when running llama 3.2
vision models, so revert it temporarily
This reverts commit 3f9da22c2b21a2cef216de50006436ef1cab8764.
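
For context, the reverted mechanism routed each copy kernel's destination through a per-node pointer table so captured CUDA graphs could be updated without re-capture; a host-side C++ sketch of that selection logic (the same expression appears in the kernels removed below; the helper name is illustrative only):

// sketch of the destination selection used by the reverted change: with
// indirection enabled, each copy kernel reads its destination from a
// device-side table indexed by its position in the graph, which lets a
// captured CUDA graph be retargeted without re-capturing it
static char * select_copy_dst(char * cdst_direct, char ** cdst_indirect, int graph_cpynode_index) {
    return (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct;
}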
---
ggml/src/ggml-cuda/common.cuh | 8 +-
ggml/src/ggml-cuda/cpy.cu | 149 ++++++++++++--------------------
ggml/src/ggml-cuda/cpy.cuh | 2 -
ggml/src/ggml-cuda/ggml-cuda.cu | 93 +++++++++++++++-----
4 files changed, 124 insertions(+), 128 deletions(-)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 8284a001..a718b6a1 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -729,13 +729,7 @@ struct ggml_cuda_graph {
bool disable_due_to_failed_graph_capture = false;
int number_consecutive_updates = 0;
std::vector<ggml_graph_node_properties> ggml_graph_properties;
- bool use_cpy_indirection = false;
- std::vector<char *> cpy_dest_ptrs;
- char ** dest_ptrs_d;
- int dest_ptrs_size = 0;
- // Index to allow each cpy kernel to be aware of it's position within the graph
- // relative to other cpy nodes.
- int graph_cpynode_index = -1;
+ std::vector<char **> updated_kernel_arg;
#endif
};
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 4f4faa3e..8396df28 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -39,18 +39,16 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
}
template <cpy_kernel_t cpy_1>
-static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) {
return;
}
- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
// then combine those indices with the corresponding byte offsets to get the total offsets
const int64_t i03 = i/(ne00 * ne01 * ne02);
@@ -297,18 +295,16 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
}
template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
if (i >= ne) {
return;
}
- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
const int i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
@@ -325,18 +321,16 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int
}
template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
if (i >= ne) {
return;
}
- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
const int i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
@@ -352,97 +346,76 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
cpy_blck(cx + x_offset, cdst + dst_offset);
}
-// Copy destination pointers to GPU to be available when pointer indirection is in use
-
-void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
- if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
- CUDA_CHECK(cudaStreamSynchronize(stream));
- if (cuda_graph->dest_ptrs_d != nullptr) {
- CUDA_CHECK(cudaFree(cuda_graph->dest_ptrs_d));
- }
- CUDA_CHECK(cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *)));
- cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
- }
- // copy destination pointers to GPU
- CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
- cuda_graph->graph_cpynode_index = 0; // reset index
-#else
- GGML_UNUSED(cuda_graph); GGML_UNUSED(host_dest_ptrs);
- GGML_UNUSED(host_dest_ptrs_size); GGML_UNUSED(stream);
-#endif
-}
-
static void ggml_cpy_f16_f32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_f32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_bf16_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_f16_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q8_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK8_0 == 0);
const int num_blocks = ne / QK8_0;
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q8_0_f32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q4_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_0 == 0);
const int num_blocks = ne / QK4_0;
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q4_0_f32_cuda(
@@ -451,22 +424,22 @@ static void ggml_cpy_q4_0_f32_cuda(
const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12,
const int nb10, const int nb11, const int nb12, const int nb13,
- cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ cudaStream_t stream) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q4_1_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_1 == 0);
const int num_blocks = ne / QK4_1;
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q4_1_f32_cuda(
@@ -475,22 +448,22 @@ static void ggml_cpy_q4_1_f32_cuda(
const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12,
const int nb10, const int nb11, const int nb12, const int nb13,
- cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ cudaStream_t stream) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q5_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK5_0 == 0);
const int num_blocks = ne / QK5_0;
cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q5_0_f32_cuda(
@@ -499,22 +472,22 @@ static void ggml_cpy_q5_0_f32_cuda(
const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12,
const int nb10, const int nb11, const int nb12, const int nb13,
- cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ cudaStream_t stream) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q5_1_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK5_1 == 0);
const int num_blocks = ne / QK5_1;
cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q5_1_f32_cuda(
@@ -523,32 +496,32 @@ static void ggml_cpy_q5_1_f32_cuda(
const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12,
const int nb10, const int nb11, const int nb12, const int nb13,
- cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ cudaStream_t stream) {
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_iq4_nl_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_NL == 0);
const int num_blocks = ne / QK4_NL;
cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f16_f16_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
@@ -585,62 +558,48 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
char * src0_ddc = (char *) src0->data;
char * src1_ddc = (char *) src1->data;
- char ** dest_ptrs_d = nullptr;
- int graph_cpynode_index = -1;
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
- if(ctx.cuda_graph->use_cpy_indirection) {
- dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
- graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
- }
-#endif
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
- ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
- ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
- ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
- ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
- ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
- ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
- nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
- ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
- nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
- ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
- nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
- ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
- ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
- ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
- ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
- ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
}
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
- if(ctx.cuda_graph->use_cpy_indirection) {
- ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
- }
-#endif
-
}
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh
index 6bed0564..28b06cdd 100644
--- a/ggml/src/ggml-cuda/cpy.cuh
+++ b/ggml/src/ggml-cuda/cpy.cuh
@@ -7,5 +7,3 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
-
-void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
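For context on what this revert removes: the deleted ggml_cuda_cpy_dest_ptrs_copy helper kept a device-side table of destination pointers so that copy kernels recorded inside a captured CUDA graph could pick up a fresh destination each token without the graph being re-captured. The following is a minimal, hypothetical sketch of that indirection pattern; the kernel name and arguments are illustrative only, not the original ggml kernels:

    // hypothetical illustration of destination-pointer indirection (not the original ggml kernel)
    __global__ void copy_indirect(const char * src, char * dst, char ** dst_table, int node_index, int nbytes) {
        if (dst_table != nullptr) {
            dst = dst_table[node_index]; // per-token destination, refreshed on the host each step
        }
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < nbytes) {
            dst[i] = src[i];
        }
    }

    // host side, once per token and outside the captured graph:
    // cudaMemcpyAsync(dst_table_d, dst_table_h, n * sizeof(char *), cudaMemcpyHostToDevice, stream);

With this patch, the destination is passed to the kernels directly again, and per-token updates are instead applied by patching the captured graph's kernel node parameters (see the maintain_cuda_graph hunk below).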
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 67208cba..a44788db 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2477,11 +2477,10 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
#ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
- bool use_cuda_graph) {
+ std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
- cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
-
+ cuda_ctx->cuda_graph->updated_kernel_arg.clear();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@@ -2513,11 +2512,8 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
}
if (node->op == GGML_OP_CPY) {
-
- // Store the pointers which are updated for each token, such that these can be sent
- // to the device and accessed using indirection from CUDA graph
- cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
-
+ // store the copy op parameter which changes with each token.
+ cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
// store a pointer to each copy op CUDA kernel to identify it later
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
if (!ptr) {
@@ -2525,6 +2521,10 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
#endif
+ } else {
+ if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+ ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+ }
}
}
@@ -2533,12 +2533,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
}
}
- if (use_cuda_graph) {
- cuda_ctx->cuda_graph->use_cpy_indirection = true;
- // copy pointers to GPU so they can be accessed via indirection within CUDA graph
- ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
- }
-
return use_cuda_graph;
}
@@ -2593,6 +2587,51 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
return true;
}
+static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
+
+ if (cuda_graph_update_required) {
+ // Extract nodes from graph
+ // First call with null argument gets number of nodes in graph
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+ // Subsequent call with non-null argument gets nodes
+ cuda_ctx->cuda_graph->nodes.clear();
+ cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+ cuda_ctx->cuda_graph->params.clear();
+ cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+ if (cuda_ctx->cuda_graph->num_nodes > 0) {
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+ // Loop over nodes, and extract kernel parameters from each node
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ cudaGraphNodeType node_type;
+ CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+ if (node_type == cudaGraphNodeTypeKernel) {
+ cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+ if (stat == cudaErrorInvalidDeviceFunction) {
+ // This fails due to incorrect handling of CUDA BLAS nodes by the CUDA runtime.
+ // We don't need to update BLAS nodes, so clear the error and move on.
+ (void)cudaGetLastError();
+ } else {
+ GGML_ASSERT(stat == cudaSuccess);
+ }
+ }
+ }
+ }
+ } else {
+ // One of the arguments to the copy kernel is updated for each token, so we need to
+ // replace that argument with the updated value in the CUDA graph.
+ // On update steps the live parameters have already been re-extracted above, so no patching is needed there.
+ int k = 0;
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
+ char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+ *(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr;
+ CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+ }
+ }
+ }
+}
+
static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
bool cuda_graph_update_required = false;
@@ -2652,7 +2691,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
#endif
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
- bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+ [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
+ bool & cuda_graph_update_required) {
while (!graph_evaluated_or_captured) {
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2702,9 +2742,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
}
- if (cuda_graph_update_required) { // Update graph executable
- update_cuda_graph_executable(cuda_ctx);
- }
+
+ // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+ maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
+
+ // Update graph executable
+ update_cuda_graph_executable(cuda_ctx);
+
// Launch graph
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
#else
@@ -2718,6 +2762,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
ggml_cuda_set_device(cuda_ctx->device);
+ // vector of pointers to CUDA cpy kernels, which are required to identify
+ // the kernel parameters that need to be updated in the graph for each token
+ std::vector<void *> ggml_cuda_cpy_fn_ptrs;
+
#ifdef USE_CUDA_GRAPH
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
@@ -2751,7 +2799,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
- use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
+ use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
+ ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {
@@ -2772,10 +2821,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
}
- if (!use_cuda_graph) {
- cuda_ctx->cuda_graph->use_cpy_indirection = false;
- }
-
#else
bool use_cuda_graph = false;
bool cuda_graph_update_required = false;
@@ -2783,7 +2828,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
bool graph_evaluated_or_captured = false;
- evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+ evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
return GGML_STATUS_SUCCESS;
}
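
Taken together, this file's changes swap pointer indirection for direct parameter patching: after the graph is captured, the destination argument of each copy kernel node is rewritten in place every token, and the executable graph is refreshed via update_cuda_graph_executable before cudaGraphLaunch. Below is a minimal standalone sketch of that patching mechanism, assuming (as the code above does) that the destination pointer is kernelParams[1]; patch_copy_destinations and its arguments are illustrative names, not part of ggml:

    #include <cuda_runtime.h>
    #include <vector>

    // minimal sketch: rewrite one kernel argument in every matching node of a captured graph
    // (error handling mostly omitted for brevity)
    static void patch_copy_destinations(cudaGraph_t graph, void * copy_kernel_fn, void * new_dst) {
        size_t n = 0;
        cudaGraphGetNodes(graph, nullptr, &n);       // first call: query the node count
        std::vector<cudaGraphNode_t> nodes(n);
        cudaGraphGetNodes(graph, nodes.data(), &n);  // second call: fetch the nodes

        for (size_t i = 0; i < n; i++) {
            cudaGraphNodeType type;
            cudaGraphNodeGetType(nodes[i], &type);
            if (type != cudaGraphNodeTypeKernel) {
                continue;
            }
            cudaKernelNodeParams params;
            if (cudaGraphKernelNodeGetParams(nodes[i], &params) != cudaSuccess) {
                (void) cudaGetLastError();           // e.g. BLAS nodes: nothing to patch, clear the error
                continue;
            }
            if (params.func == copy_kernel_fn) {
                *(void **) params.kernelParams[1] = new_dst;   // overwrite the destination argument
                cudaGraphKernelNodeSetParams(nodes[i], &params);
            }
        }
    }

The diff identifies which nodes are copy kernels by comparing params.func against the pointers collected in ggml_cuda_cpy_fn_ptrs, which is why check_node_graph_compatibility_and_refresh_copy_ops now records the return values of ggml_cuda_cpy_fn.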

View File

@@ -1,173 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Patrick Devine <patrick@infrahq.com>
Date: Fri, 14 Mar 2025 16:33:23 -0700
Subject: [PATCH] add model quantizations
- gemma3
- mistral3
---
src/llama-arch.cpp | 36 ++++++++++++++++++++++++++++++++++++
src/llama-arch.h | 2 ++
src/llama-model.cpp | 10 ++++++++++
src/llama-quant.cpp | 4 ++++
4 files changed, 52 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b6f20286..13a0a988 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -37,6 +37,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MINICPM3, "minicpm3" },
{ LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_GEMMA2, "gemma2" },
+ { LLM_ARCH_GEMMA3, "gemma3" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -64,6 +65,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_CHAMELEON, "chameleon" },
{ LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+ { LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -804,6 +806,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
+ {
+ LLM_ARCH_GEMMA3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
@@ -1352,6 +1372,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
},
},
+ {
+ LLM_ARCH_MISTRAL3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ }
+ },
{
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index ec742224..8476ae0a 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -41,6 +41,7 @@ enum llm_arch {
LLM_ARCH_MINICPM3,
LLM_ARCH_GEMMA,
LLM_ARCH_GEMMA2,
+ LLM_ARCH_GEMMA3,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
@@ -68,6 +69,7 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
+ LLM_ARCH_MISTRAL3,
LLM_ARCH_UNKNOWN,
};
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ab1a07d1..db4f2685 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -878,6 +878,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_GEMMA3:
+ {
+ } break;
case LLM_ARCH_STARCODER2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1274,6 +1277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
} break;
+ case LLM_ARCH_MISTRAL3: break;
default: throw std::runtime_error("unsupported model architecture");
}
@@ -2537,6 +2541,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
}
} break;
+ case LLM_ARCH_GEMMA3:
+ {
+ } break;
case LLM_ARCH_STARCODER2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3531,6 +3538,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
} break;
+ case LLM_ARCH_MISTRAL3: break;
default:
throw std::runtime_error("unknown architecture");
}
@@ -4009,6 +4017,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
+ case LLM_ARCH_MISTRAL3:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
@@ -4029,6 +4038,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHIMOE:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
+ case LLM_ARCH_GEMMA3:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_OPENELM:
case LLM_ARCH_GPTNEOX:
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 6eb1da08..ebcbafa1 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -737,6 +737,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+ // don't quantize vision model or multimodal projector tensors
+ quantize &= name.find("v.") == std::string::npos;
+ quantize &= name.find("mm.") == std::string::npos;
+
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
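
The new filter is purely name-based. As a minimal sketch of the combined check in isolation (should_quantize is a hypothetical helper; the assumption is GGUF-style names where vision-tower tensors carry a "v." prefix and multimodal-projector tensors an "mm." prefix):

    #include <string>

    // hypothetical helper mirroring the name-based filter above
    static bool should_quantize(const std::string & name, int n_dims) {
        bool quantize = name.rfind("weight") == name.size() - 6;  // ends with "weight"
        quantize &= name.find("v.")  == std::string::npos;        // skip vision tensors
        quantize &= name.find("mm.") == std::string::npos;        // skip projector tensors
        quantize &= n_dims >= 2;                                  // only 2D and 3D tensors (experts)
        return quantize;
    }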

View File

@@ -0,0 +1,45 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 12 Apr 2025 21:13:44 -0400
Subject: [PATCH] remove ggml git build info
---
ggml/CMakeLists.txt | 25 -------------------------
1 file changed, 25 deletions(-)
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index d33f843b..a6c59f22 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -287,31 +287,6 @@ if (GGML_STANDALONE)
DESTINATION share/pkgconfig)
endif()
-#
-# Create CMake package
-#
-
-# Generate version info based on git commit.
-
-if(NOT DEFINED GGML_BUILD_NUMBER)
- find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
- execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- OUTPUT_VARIABLE GGML_BUILD_NUMBER
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
-
- if(GGML_BUILD_NUMBER EQUAL 1)
- message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
- endif()
-
- execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- OUTPUT_VARIABLE GGML_BUILD_COMMIT
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
-endif()
-
# Capture variables prefixed with GGML_.

View File

@@ -1,103 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Saman <saman.khatir@amd.com>
Date: Wed, 19 Mar 2025 14:02:26 -0700
Subject: [PATCH] add rdna4 support
---
ggml/src/ggml-cuda/common.cuh | 6 ++++--
ggml/src/ggml-cuda/mmq.cu | 2 +-
ggml/src/ggml-cuda/mmq.cuh | 4 ++--
ggml/src/ggml-cuda/mmvq.cu | 4 ++--
ggml/src/ggml-cuda/vendors/hip.h | 4 ++++
5 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index adf0d3ec..b24593fc 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -61,11 +61,13 @@
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
@@ -386,7 +388,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
+#elif defined(RDNA3) || defined(RDNA4)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
int tmp1;
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 10f2ebb1..933d945c 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -149,5 +149,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
- return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+ return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 0451c65f..66ce2bc9 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2577,9 +2577,9 @@ static __device__ void mul_mat_q_process_tile(
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
__launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
__launch_bounds__(WARP_SIZE*nwarps, 1)
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 4fb466ca..23ae7abc 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4))
constexpr int nwarps = 1;
constexpr int rows_per_cuda_block = 1;
#else
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4))
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
const int row0 = rows_per_cuda_block*blockIdx.x;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 81964611..a62544b5 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -150,6 +150,10 @@
#define CDNA
#endif
+#if defined(__gfx1200__) || defined(__gfx1201__)
+#define RDNA4
+#endif
+
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3