llama: update to commit 2016f07b (#10352)
@@ -65,10 +65,10 @@ index 273075f4..dd11f304 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cec36b36..4b057973 100644
index e2617b06..242e50a7 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@@ -800,6 +800,7 @@ static void ggml_backend_cann_buffer_free_buffer(
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
delete ctx;
@@ -76,7 +76,7 @@ index cec36b36..4b057973 100644
}

/**
@@ -1199,6 +1200,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
@@ -1472,6 +1473,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -85,10 +85,10 @@ index cec36b36..4b057973 100644

/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fafe9633..59a49560 100644
index a7febef7..31750b6f 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -533,6 +533,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
@@ -96,7 +96,7 @@ index fafe9633..59a49560 100644
}

static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -788,6 +789,7 @@ struct ggml_backend_cuda_split_buffer_context {
@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx;
@@ -104,7 +104,7 @@ index fafe9633..59a49560 100644
}

static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1061,6 +1063,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_

static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644

static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 9f1c6c6c..310afe8a 100644
index 266d8af4..12886cd3 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4641,6 +4641,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}

free(ctx);
@@ -137,10 +137,10 @@ index 9f1c6c6c..310afe8a 100644

static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index b8b5cbd3..14d4561b 100644
index 05a2f4e6..392cc18d 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -1443,6 +1443,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -1940,6 +1940,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
@@ -149,10 +149,10 @@ index b8b5cbd3..14d4561b 100644

static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 862b9b66..34536681 100644
index a0667b7d..bd83adc5 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -443,6 +443,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status);
delete ctx;
@@ -161,7 +161,7 @@ index 862b9b66..34536681 100644

static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 3e48a924..a3d182fc 100644
index 1de34c96..4600f61e 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -189,10 +189,10 @@ index 3e48a924..a3d182fc 100644

static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 783a0ff8..8ac1e07e 100644
index 39f3cd34..c569a8a5 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -8639,6 +8639,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
@@ -200,7 +200,7 @@ index 783a0ff8..8ac1e07e 100644
}

static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -8782,6 +8783,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
@@ -10,7 +10,7 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 464ff01e..0125ee53 100644
index 48060517..a35b498c 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

@@ -11,10 +11,10 @@ instead of forcing one or the error
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4735e98e..65135172 100644
index 983385f8..32f59819 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1232,7 +1232,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
int64_t n_outputs_all = 0;

// count outputs
@@ -23,7 +23,7 @@ index 4735e98e..65135172 100644
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0;
}
@@ -1344,7 +1344,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}

@@ -32,7 +32,7 @@ index 4735e98e..65135172 100644
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;

if (t_embd && res->get_embd_pooled()) {
@@ -1488,7 +1488,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 49c90b75..4b72ea9f 100644
index 75970615..d57b4bd6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -28,6 +28,19 @@
#include <cinttypes>
@@ -29,6 +29,19 @@
#include <limits>
#include <array>

+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
@@ -33,7 +33,7 @@ index 49c90b75..4b72ea9f 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};

//#define CLIP_DEBUG_FUNCTIONS
@@ -1429,7 +1442,29 @@ struct clip_model_loader {
@@ -1430,7 +1443,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;

@@ -63,7 +63,7 @@ index 49c90b75..4b72ea9f 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -1456,7 +1491,11 @@ struct clip_model_loader {
@@ -1457,7 +1492,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
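The hunks above splice Windows-specific handling into clip.cpp so model files open correctly when the path contains wide (non-ASCII) characters. A minimal sketch of the usual technique, converting the UTF-8 path to UTF-16 with MultiByteToWideChar before opening the stream; the helper name and its exact placement in clip.cpp are assumptions, not the patch's verbatim code:

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <fstream>
#include <string>

// Hypothetical helper: convert a UTF-8 path to UTF-16 so MSVC's wide
// std::ifstream overload can open files under non-ASCII directories.
static std::wstring utf8_to_utf16(const std::string & s) {
    int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), NULL, 0);
    std::wstring w(n, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), &w[0], n);
    return w;
}

// usage sketch: std::ifstream fin(utf8_to_utf16(fname), std::ios::binary);
#endif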

@@ -1,6 +1,6 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 16:03:51 -0700
Date: Sun, 20 Apr 2025 16:11:09 -0700
Subject: [PATCH] solar-pro

adds support for the Solar Pro architecture
@@ -15,7 +15,7 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+)
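Solar Pro's distinguishing feature is the block skip connection, carried by the new %s.attention.block_skip_connection key and the n_bskcn_arr tables added below. A hedged sketch of how the n_bskcn predicate might be implemented; the names come from this patch, but the body is an illustrative guess, not the patch's verbatim code:

bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
    // n selects one of the four skip-connection tables, il is the layer
    // index; a nonzero entry is read here as "connection active at layer il"
    if (il < n_layer) {
        return n_bskcn_arr[n][il] > 0;
    }
    GGML_ABORT("fatal error");
}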

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a6fddc7f..0b0fedcd 100644
index 62e1480b..f754bc8f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -31,10 +31,10 @@ index a6fddc7f..0b0fedcd 100644
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1478,6 +1480,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -59,7 +59,7 @@ index a6fddc7f..0b0fedcd 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1671,6 +1691,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,7 +68,7 @@ index a6fddc7f..0b0fedcd 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 2c2099b3..74aa3dd0 100644
index 98ca00a1..439aaeab 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -72,6 +72,7 @@ enum llm_arch {
@@ -84,10 +84,10 @@ index 2c2099b3..74aa3dd0 100644
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -340,6 +342,7 @@ enum llm_tensor {
@@ -344,6 +346,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -115,10 +115,10 @@ index 90dfe7a7..8a667960 100644
if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 4e0b5719..c3147cbc 100644
index 80fcd65d..6e278945 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -51,6 +51,8 @@ struct llama_hparams {
@@ -55,6 +55,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

@@ -127,7 +127,7 @@ index 4e0b5719..c3147cbc 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -149,6 +151,9 @@ struct llama_hparams {
@@ -153,6 +155,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;

@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b74dd72c..5fbd0055 100644
index 6b7bfecf..aba42819 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1372,6 +1372,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -175,7 +175,7 @@ index b74dd72c..5fbd0055 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3701,6 +3716,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -210,7 +210,7 @@ index b74dd72c..5fbd0055 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12244,6 +12287,165 @@ struct llm_build_chameleon : public llm_graph_context {
@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
}
};

@@ -309,14 +309,14 @@ index b74dd72c..5fbd0055 100644
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
@@ -376,7 +376,7 @@ index b74dd72c..5fbd0055 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
@@ -12993,6 +13195,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break;
@@ -387,7 +387,7 @@ index b74dd72c..5fbd0055 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13139,6 +13345,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
@@ -396,7 +396,7 @@ index b74dd72c..5fbd0055 100644
return LLAMA_ROPE_TYPE_NORM;

diff --git a/src/llama-model.h b/src/llama-model.h
index 0f18dac1..e08d4ae4 100644
index fd82d106..5865d5e9 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -62,6 +62,7 @@ enum llm_type {
@@ -407,7 +407,7 @@ index 0f18dac1..e08d4ae4 100644
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -305,6 +306,8 @@ struct llama_layer {
@@ -307,6 +308,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;


@@ -1,6 +1,6 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 19:27:12 -0700
Date: Sun, 20 Apr 2025 16:12:36 -0700
Subject: [PATCH] add mllama support

adds support for the llama 3.2 vision architecture
@@ -28,7 +28,7 @@ adds support for the llama 3.2 vision architecture
20 files changed, 475 insertions(+), 22 deletions(-)
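The core addition is the %s.attention.cross_attention_layers key: the list of layer indices that cross-attend to the vision encoder, with every other layer left as ordinary self-attention. The patch's own membership test (its std::find line appears verbatim in a later hunk) amounts to the following sketch; the fixed capacity here stands in for LLAMA_MAX_LAYERS:

#include <algorithm>
#include <array>
#include <cstdint>

// A layer cross-attends iff its index appears in the cross_attention_layers
// array loaded from the GGUF metadata.
static bool is_cross_attn_layer(const std::array<uint32_t, 512> & cross_attn_layers,
                                uint32_t il) {
    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il)
        != cross_attn_layers.end();
}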

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 91a07e2a..13127c7b 100644
index 3d566475..654d1358 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@ struct decode_embd_batch {
@@ -79,10 +79,10 @@ index 03a22cbb..5eb40bcd 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 114c274b..a0e649ad 100644
index 3fd5bebc..f0cec596 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -213,7 +213,7 @@ struct decode_embd_batch {
@@ -233,7 +233,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
@@ -91,7 +91,7 @@ index 114c274b..a0e649ad 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -225,6 +225,7 @@ struct decode_embd_batch {
@@ -245,6 +245,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -99,9 +99,9 @@ index 114c274b..a0e649ad 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -291,7 +292,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,

int32_t n_tokens = chunk.tokens_image->n_tokens();
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
@@ -158,7 +158,7 @@ index 5657fbf0..f91896e4 100644
LLAMA_API void llama_free(struct llama_context * ctx);

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 0b0fedcd..c1f78618 100644
index f754bc8f..0568565f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
@@ -174,10 +174,10 @@ index 0b0fedcd..c1f78618 100644
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -269,6 +271,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -271,6 +273,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
@@ -218,7 +218,7 @@ index 0b0fedcd..c1f78618 100644
{
LLM_ARCH_DECI,
{
@@ -1692,6 +1728,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1681,6 +1717,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -234,7 +234,7 @@ index 0b0fedcd..c1f78618 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 74aa3dd0..f987844d 100644
index 439aaeab..6a989034 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -11,6 +11,7 @@
@@ -250,10 +250,10 @@ index 74aa3dd0..f987844d 100644
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -343,6 +345,14 @@ enum llm_tensor {
@@ -347,6 +349,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
@@ -297,10 +297,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 65135172..afe6f552 100644
index 32f59819..0343ba8a 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -858,7 +858,7 @@ float * llama_context::get_logits_ith(int32_t i) {
@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}

@@ -309,7 +309,7 @@ index 65135172..afe6f552 100644
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -979,6 +979,10 @@ void llama_context::set_warmup(bool value) {
@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}

@@ -320,7 +320,7 @@ index 65135172..afe6f552 100644
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1054,7 +1058,7 @@ int llama_context::encode(llama_batch & inp_batch) {
@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) {

const int64_t n_embd = hparams.n_embd;

@@ -329,7 +329,7 @@ index 65135172..afe6f552 100644

const llama_ubatch ubatch = sbatch.split_simple(n_tokens);

@@ -1194,10 +1198,9 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) {

const llama_batch & batch = batch_allocr.batch;

@@ -341,7 +341,7 @@ index 65135172..afe6f552 100644

const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -1245,7 +1248,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) {

const bool logits_all = n_outputs_all == n_tokens_all;

@@ -350,7 +350,7 @@ index 65135172..afe6f552 100644
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);

@@ -1479,12 +1482,11 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) {

int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
@@ -364,7 +364,7 @@ index 65135172..afe6f552 100644
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -1554,7 +1556,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
@@ -373,7 +373,7 @@ index 65135172..afe6f552 100644
const uint32_t n_embd = model.hparams.n_embd;

GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2061,7 +2063,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);

@@ -382,7 +382,7 @@ index 65135172..afe6f552 100644

io.write(&logits_size, sizeof(logits_size));

@@ -2244,6 +2246,7 @@ llama_context_params llama_context_default_params() {
@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
@@ -390,7 +390,7 @@ index 65135172..afe6f552 100644
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -2371,6 +2374,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
}

@@ -426,7 +426,7 @@ index 30e550f0..85ad91b9 100644

enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index cd955d63..83f3c5a8 100644
index a85e9728..d740c120 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
@@ -442,7 +442,7 @@ index cd955d63..83f3c5a8 100644
//
// llm_graph_context
//
@@ -1495,6 +1501,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}

@@ -469,7 +469,7 @@ index cd955d63..83f3c5a8 100644
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 5b6618f9..51993998 100644
index d192dc14..260a2af2 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
@@ -518,7 +518,7 @@ index 8a667960..6a02de03 100644
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index c3147cbc..4567a0e9 100644
index 6e278945..c8a34d52 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@
@@ -536,9 +536,9 @@ index c3147cbc..4567a0e9 100644
uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0;

// for WavTokenizer
struct llama_hparams_posnet posnet;
@@ -52,6 +55,7 @@ struct llama_hparams {
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla = 0;
@@ -56,6 +59,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
@@ -546,7 +546,7 @@ index c3147cbc..4567a0e9 100644

uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -154,6 +158,9 @@ struct llama_hparams {
@@ -158,6 +162,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;

@@ -557,7 +557,7 @@ index c3147cbc..4567a0e9 100644
};

diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index dbf5f118..9310f262 100644
index 7c9d46d8..69f8d35a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
@@ -593,7 +593,7 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5fbd0055..d5ad466e 100644
index aba42819..d051696c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -650,7 +650,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1548,7 +1562,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -659,7 +659,7 @@ index 5fbd0055..d5ad466e 100644
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1801,6 +1815,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
@@ -712,7 +712,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4665,6 +4725,246 @@ struct llm_build_llama : public llm_graph_context {
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
}
};

@@ -893,14 +893,14 @@ index 5fbd0055..d5ad466e 100644
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
@@ -959,7 +959,7 @@ index 5fbd0055..d5ad466e 100644
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12965,6 +13265,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
@@ -970,7 +970,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13325,6 +13629,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
@@ -979,7 +979,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index e08d4ae4..21c4617b 100644
index 5865d5e9..72bab5be 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
@@ -998,7 +998,7 @@ index e08d4ae4..21c4617b 100644
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -308,6 +310,16 @@ struct llama_layer {
@@ -310,6 +312,16 @@ struct llama_layer {

struct ggml_tensor * bskcn_tv = nullptr;
@@ -1,25 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Wed, 9 Oct 2024 17:26:23 -0700
Subject: [PATCH] conditional-fattn

---
ggml/src/ggml-cuda/ggml-cuda.cu | 2 ++
1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 59a49560..b70c6a32 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2338,9 +2338,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst);
break;
+#if !defined(GGML_DISABLE_FLASH_ATTN)
case GGML_OP_FLASH_ATTN_EXT:
ggml_cuda_flash_attn_ext(ctx, dst);
break;
+#endif
case GGML_OP_CROSS_ENTROPY_LOSS:
ggml_cuda_cross_entropy_loss(ctx, dst);
break;
@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index b70c6a32..67208cba 100644
index 31750b6f..0fef9522 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2245,6 +2245,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
@@ -160,7 +160,7 @@ index b70c6a32..67208cba 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3223,6 +3226,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 310afe8a..b121ab9e 100644
index 12886cd3..b2e95a66 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
@@ -244,7 +244,7 @@ index 310afe8a..b121ab9e 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -998,6 +999,7 @@ @implementation GGMLMetalClass
@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
@@ -252,7 +252,7 @@ index 310afe8a..b121ab9e 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1339,6 +1341,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -260,7 +260,7 @@ index 310afe8a..b121ab9e 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU:
@@ -3669,6 +3672,36 @@ static void ggml_metal_encode_node(
@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node(

const int nth = MIN(1024, ne0);

@@ -298,10 +298,10 @@ index 310afe8a..b121ab9e 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index b08666e2..e3185e5b 100644
index 8d6e99e6..71f0f97f 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2968,6 +2968,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}

@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 0125ee53..d74919d2 100644
index a35b498c..032019c9 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
4 files changed, 51 insertions(+), 106 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index afe6f552..d6e7b3af 100644
index 0343ba8a..4b3e6a83 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -590,13 +590,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(

llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_context * ctx0,
@@ -41,7 +41,7 @@ index afe6f552..d6e7b3af 100644
#if 0
// CPU defrag
//
@@ -668,32 +667,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
@@ -79,7 +79,7 @@ index afe6f552..d6e7b3af 100644

ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -701,34 +688,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
@@ -122,7 +122,7 @@ index afe6f552..d6e7b3af 100644
#endif

return res;
@@ -737,8 +720,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;

@@ -131,7 +131,7 @@ index afe6f552..d6e7b3af 100644
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -759,8 +740,6 @@ void llama_context::kv_self_update() {
@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);

graph_compute(gf, false);
@@ -140,7 +140,7 @@ index afe6f552..d6e7b3af 100644
}

{
@@ -775,49 +754,28 @@ void llama_context::kv_self_update() {
@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
@@ -202,7 +202,7 @@ index afe6f552..d6e7b3af 100644
}

enum llama_pooling_type llama_context::pooling_type() const {
@@ -1301,9 +1259,12 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
@@ -241,7 +241,7 @@ index baa03276..a59ff8fd 100644
// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 9310f262..5c941e7c 100644
index 69f8d35a..35a750d3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644
}

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d74919d2..c90f636c 100644
index 032019c9..ba37df35 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -13,7 +13,7 @@ models not supported in llama.cpp
4 files changed, 24 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c1f78618..bdf3d898 100644
index 0568565f..dd01df60 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -24,7 +24,7 @@ index c1f78618..bdf3d898 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -1582,6 +1583,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1586,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
@@ -48,7 +48,7 @@ index c1f78618..bdf3d898 100644
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index f987844d..ee081fbf 100644
index 6a989034..b6227eeb 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -75,6 +75,7 @@ enum llm_arch {
@@ -60,10 +60,10 @@ index f987844d..ee081fbf 100644
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d5ad466e..cd1d239c 100644
index d051696c..c8374159 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1423,6 +1423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -71,7 +71,7 @@ index d5ad466e..cd1d239c 100644
default: throw std::runtime_error("unsupported model architecture");
}

@@ -13652,6 +13653,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
@@ -1,76 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:41:24 -0700
Subject: [PATCH] add op_neg

adds the neg operator to ggml
---
ggml/src/ggml-metal/ggml-metal.m | 15 +++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 7 +++++++
2 files changed, 22 insertions(+)
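At the user level the new kernel is reached through ggml's existing unary op. A minimal sketch, assuming the standard ggml C API of this vintage (ggml_init, ggml_new_tensor_1d, ggml_neg, ggml_new_graph, ggml_build_forward_expand, with the CPU compute entry point living in ggml-cpu.h in recent trees); the Metal backend picks up GGML_UNARY_OP_NEG through the additions below:

#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // x = [1, 2, 3, 4]
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    for (int i = 0; i < 4; ++i) {
        ((float *) x->data)[i] = (float)(i + 1);
    }

    // y = -x, the unary op this patch wires into the Metal backend
    struct ggml_tensor * y = ggml_neg(ctx, x);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1); // CPU reference path

    // y->data now holds [-1, -2, -3, -4]
    ggml_free(ctx);
    return 0;
}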

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index b121ab9e..fea50521 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -461,6 +461,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_SQRT,
GGML_METAL_KERNEL_TYPE_SIN,
GGML_METAL_KERNEL_TYPE_COS,
+ GGML_METAL_KERNEL_TYPE_NEG,
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
@@ -1119,6 +1120,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
@@ -1280,6 +1282,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_ELU:
+ case GGML_UNARY_OP_NEG:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
default:
return false;
@@ -1966,6 +1969,18 @@ static void ggml_metal_encode_node(

[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
+ case GGML_UNARY_OP_NEG:
+ {
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NEG].pipeline;
+
+ [encoder setComputePipelineState:pipeline];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+ const int64_t n = ggml_nelements(dst);
+
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+ } break;
default:
{
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index e3185e5b..ede9d1e6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -949,6 +949,13 @@ kernel void kernel_cos(
dst[tpig] = cos(src0[tpig]);
}

+kernel void kernel_neg(
+ device const float * src0,
+ device float * dst,
+ uint tpig[[thread_position_in_grid]]) {
+ dst[tpig] = -src0[tpig];
+}
+
kernel void kernel_sum_rows(
device const float * src0,
device float * dst,
@@ -1,39 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:49:50 -0700
Subject: [PATCH] fix compiler error in clip.h

fixes an error that occurs in clip.h when compiling
using CGo
---
examples/llava/clip.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
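The root cause: C, unlike C++, does not treat struct and enum tags as standalone type names, and CGo compiles this header as C. A minimal reproduction sketch (the enum body is a stand-in, not ggml's real definition):

#include <stdbool.h>

enum ggml_log_level { GGML_LOG_LEVEL_INFO = 2 }; // stand-in definition

struct clip_context_params {
    bool use_gpu;
    // "ggml_log_level verbosity;" compiles as C++ but a C compiler rejects it:
    //   error: unknown type name 'ggml_log_level'
    enum ggml_log_level verbosity; // the patched spelling works in both C and C++
};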

diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index cc133a58..5fc45d3e 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -30,12 +30,13 @@ struct clip_image_size {
int height;
};

+struct clip_image_f32;
struct clip_image_u8_batch;
struct clip_image_f32_batch;

struct clip_context_params {
bool use_gpu;
- ggml_log_level verbosity;
+ enum ggml_log_level verbosity;
};

// deprecated, use clip_init
@@ -84,7 +85,7 @@ CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
-CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data

/**
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
@@ -1,600 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 12 Apr 2025 13:06:57 -0700
Subject: [PATCH] Revert "Simplify and improve CUDA graphs through use of
indirect copy pointers (#9017)"

this commit in llama.cpp causes errors when running llama 3.2
vision - temporarily revert it

This reverts commit 3f9da22c2b21a2cef216de50006436ef1cab8764.
---
ggml/src/ggml-cuda/common.cuh | 8 +-
ggml/src/ggml-cuda/cpy.cu | 149 ++++++++++++--------------------
ggml/src/ggml-cuda/cpy.cuh | 2 -
ggml/src/ggml-cuda/ggml-cuda.cu | 93 +++++++++++++++-----
4 files changed, 124 insertions(+), 128 deletions(-)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 8284a001..a718b6a1 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -729,13 +729,7 @@ struct ggml_cuda_graph {
bool disable_due_to_failed_graph_capture = false;
int number_consecutive_updates = 0;
std::vector<ggml_graph_node_properties> ggml_graph_properties;
- bool use_cpy_indirection = false;
- std::vector<char *> cpy_dest_ptrs;
- char ** dest_ptrs_d;
- int dest_ptrs_size = 0;
- // Index to allow each cpy kernel to be aware of it's position within the graph
- // relative to other cpy nodes.
- int graph_cpynode_index = -1;
+ std::vector<char **> updated_kernel_arg;
#endif
};
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 4f4faa3e..8396df28 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -39,18 +39,16 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
}

template <cpy_kernel_t cpy_1>
-static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= ne) {
return;
}

- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
// then combine those indices with the corresponding byte offsets to get the total offsets
const int64_t i03 = i/(ne00 * ne01 * ne02);
@@ -297,18 +295,16 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
}

template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

if (i >= ne) {
return;
}

- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
const int i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
@@ -325,18 +321,16 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int
}

template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

if (i >= ne) {
return;
}

- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
const int i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
@@ -352,97 +346,76 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
cpy_blck(cx + x_offset, cdst + dst_offset);
}
-// Copy destination pointers to GPU to be available when pointer indirection is in use
|
||||
-
|
||||
-void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
|
||||
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
- if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
|
||||
- CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
- if (cuda_graph->dest_ptrs_d != nullptr) {
|
||||
- CUDA_CHECK(cudaFree(cuda_graph->dest_ptrs_d));
|
||||
- }
|
||||
- CUDA_CHECK(cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *)));
|
||||
- cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
|
||||
- }
|
||||
- // copy destination pointers to GPU
|
||||
- CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
|
||||
- cuda_graph->graph_cpynode_index = 0; // reset index
|
||||
-#else
|
||||
- GGML_UNUSED(cuda_graph); GGML_UNUSED(host_dest_ptrs);
|
||||
- GGML_UNUSED(host_dest_ptrs_size); GGML_UNUSED(stream);
|
||||
-#endif
|
||||
-}
|
||||
-
|
||||
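
Note: ggml_cuda_cpy_dest_ptrs_copy, deleted above, was the host half of that indirection: it grew a device-side pointer array on demand (synchronizing the stream before cudaFree/cudaMalloc) and then uploaded the fresh destination pointers with cudaMemcpyAsync before every replay. Roughly how it was driven once per token, with collect_cpy_dst_pointers a hypothetical stand-in for the gathering done in check_node_graph_compatibility_and_refresh_copy_ops:

// hypothetical per-token sequence under the old indirection scheme
std::vector<char *> dst_ptrs = collect_cpy_dst_pointers(cgraph);                  // hypothetical helper
ggml_cuda_cpy_dest_ptrs_copy(cuda_graph, dst_ptrs.data(), (int) dst_ptrs.size(), stream);
CUDA_CHECK(cudaGraphLaunch(instance, stream));                                    // kernels read the fresh table
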
static void ggml_cpy_f16_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_bf16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q8_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK8_0 == 0);
    const int num_blocks = ne / QK8_0;
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q8_0_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_0 == 0);
    const int num_blocks = ne / QK4_0;
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q4_0_f32_cuda(
@@ -451,22 +424,22 @@ static void ggml_cpy_q4_0_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_1 == 0);
    const int num_blocks = ne / QK4_1;
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q4_1_f32_cuda(
@@ -475,22 +448,22 @@ static void ggml_cpy_q4_1_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q5_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_0 == 0);
    const int num_blocks = ne / QK5_0;
    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q5_0_f32_cuda(
@@ -499,22 +472,22 @@ static void ggml_cpy_q5_0_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q5_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_1 == 0);
    const int num_blocks = ne / QK5_1;
    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q5_1_f32_cuda(
@@ -523,32 +496,32 @@ static void ggml_cpy_q5_1_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_iq4_nl_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_NL == 0);
    const int num_blocks = ne / QK4_NL;
    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f16_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
@@ -585,62 +558,48 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
    char * src0_ddc = (char *) src0->data;
    char * src1_ddc = (char *) src1->data;

-    char ** dest_ptrs_d = nullptr;
-    int graph_cpynode_index = -1;
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection) {
-        dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
-        graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
-    }
-#endif
    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
-        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
-        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
-        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else {
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
    }
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection) {
-        ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
-    }
-#endif
-
}

void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
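
Note: with indirection gone from cpy.cu, every ggml_cpy_*_cuda wrapper bakes the real src1->data pointer into the kernel launch. The consequence, handled by the ggml-cuda.cu changes below, is that an instantiated CUDA graph keeps replaying the pointer value captured at build time until the kernel node's parameters are rewritten. A sketch of the patching primitive involved (function and argument names hypothetical; CUDA_CHECK as defined in this backend):

// sketch: refresh the cdst argument of a captured copy-kernel node
static void patch_cpy_dst(cudaGraphNode_t node, void * new_dst) {
    cudaKernelNodeParams p;
    CUDA_CHECK(cudaGraphKernelNodeGetParams(node, &p)); // params recorded at capture
    *(void **) p.kernelParams[1] = new_dst;             // argument 1 is cdst in these kernels
    CUDA_CHECK(cudaGraphKernelNodeSetParams(node, &p)); // commit the refreshed argument
}
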
diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh
index 6bed0564..28b06cdd 100644
--- a/ggml/src/ggml-cuda/cpy.cuh
+++ b/ggml/src/ggml-cuda/cpy.cuh
@@ -7,5 +7,3 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
-
-void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 67208cba..a44788db 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2477,11 +2477,10 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {

#ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    bool use_cuda_graph) {
+    std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {

    // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
-
+    cuda_ctx->cuda_graph->updated_kernel_arg.clear();
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

@@ -2513,11 +2512,8 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
        }

        if (node->op == GGML_OP_CPY) {
-
-            // Store the pointers which are updated for each token, such that these can be sent
-            // to the device and accessed using indirection from CUDA graph
-            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
-
+            // store the copy op parameter which changes with each token.
+            cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
            // store a pointer to each copy op CUDA kernel to identify it later
            void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
            if (!ptr) {
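
Note: updated_kernel_arg stores (char **) &(node->src[1]->data), i.e. the address of the tensor's data field rather than its current value, so dereferencing it at launch time always yields the destination for the token being processed. A minimal illustration of the double-pointer trick, with a hypothetical stand-in struct:

struct tensor_like { void * data; };     // hypothetical stand-in for ggml_tensor

static void double_pointer_demo() {
    static char buf_a[16], buf_b[16];
    tensor_like t;
    t.data = buf_a;                      // value at graph-capture time
    char ** slot = (char **) &t.data;    // what updated_kernel_arg keeps
    t.data = buf_b;                      // the allocator re-points the tensor next token
    char * fresh = *slot;                // == buf_b: the dereference sees the update
    (void) fresh;
}
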
@@ -2525,6 +2521,10 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
#ifndef NDEBUG
                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
#endif
+            } else {
+                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                }
            }
        }

@@ -2533,12 +2533,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
        }
    }

-    if (use_cuda_graph) {
-        cuda_ctx->cuda_graph->use_cpy_indirection = true;
-        // copy pointers to GPU so they can be accessed via indirection within CUDA graph
-        ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
-    }
-
    return use_cuda_graph;
}

@@ -2593,6 +2587,51 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
    return true;
}

+static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
+
+    if (cuda_graph_update_required) {
+        // Extract nodes from graph
+        // First call with null argument gets number of nodes in graph
+        CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+        // Subsequent call with non-null argument gets nodes
+        cuda_ctx->cuda_graph->nodes.clear();
+        cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+        cuda_ctx->cuda_graph->params.clear();
+        cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+        if (cuda_ctx->cuda_graph->num_nodes > 0) {
+            CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+            // Loop over nodes, and extract kernel parameters from each node
+            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                cudaGraphNodeType node_type;
+                CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+                if (node_type == cudaGraphNodeTypeKernel) {
+                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+                    if (stat == cudaErrorInvalidDeviceFunction) {
+                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+                        // We don't need to update blas nodes, so clear error and move on.
+                        (void)cudaGetLastError();
+                    } else {
+                        GGML_ASSERT(stat == cudaSuccess);
+                    }
+                }
+            }
+        }
+    } else {
+        // One of the arguments to the copy kernel is updated for each token, hence we need to
+        // replace that argument with the updated value in the CUDA graph
+        // on update steps, the live parameters will already be captured
+        int k = 0;
+        for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+            if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
+                char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+                *(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr;
+                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+            }
+        }
+    }
+}
+
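
Note: maintain_cuda_graph identifies copy nodes by comparing cudaKernelNodeParams::func, filled in by cudaGraphKernelNodeGetParams, against the kernel handles collected earlier via ggml_cuda_cpy_fn; only matching nodes get kernelParams[1] (the cdst argument) overwritten and committed. The unqualified count(...) above still resolves to std::count through argument-dependent lookup on the std::vector iterators. The membership test reduces to this sketch (hypothetical helper over the same data):

// sketch: is this captured node one of the known copy kernels?
static bool is_cpy_kernel_node(const std::vector<void *> & fn_ptrs,
                               const cudaKernelNodeParams & params) {
    return std::find(fn_ptrs.begin(), fn_ptrs.end(), params.func) != fn_ptrs.end();
}
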
static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {

    bool cuda_graph_update_required = false;
@@ -2652,7 +2691,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
#endif

static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+    [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
+    bool & cuda_graph_update_required) {

    while (!graph_evaluated_or_captured) {
        // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2702,9 +2742,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
        }
-        if (cuda_graph_update_required) { // Update graph executable
-            update_cuda_graph_executable(cuda_ctx);
-        }
+
+        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+        maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
+
+        // Update graph executable
+        update_cuda_graph_executable(cuda_ctx);
+
        // Launch graph
        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
#else
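
Note: update_cuda_graph_executable now runs for every token rather than only when cuda_graph_update_required is set, because maintain_cuda_graph rewrites kernel parameters in the captured graph each token and the instantiated executable must be re-synced before launch. The per-token sequence above therefore reduces to:

// per-token flow after this change (calls as in the hunk above)
maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
update_cuda_graph_executable(cuda_ctx);
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
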
@@ -2718,6 +2762,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,

    ggml_cuda_set_device(cuda_ctx->device);

+    // vector of pointers to CUDA cpy kernels, which are required to identify
+    // kernel parameters which need updated in the graph for each token
+    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
+
#ifdef USE_CUDA_GRAPH
    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

@@ -2751,7 +2799,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
    if (use_cuda_graph) {
        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);

-        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
+        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
+            ggml_cuda_cpy_fn_ptrs, use_cuda_graph);

        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
        if (use_cuda_graph && cuda_graph_update_required) {
@@ -2772,10 +2821,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
    }

-    if (!use_cuda_graph) {
-        cuda_ctx->cuda_graph->use_cpy_indirection = false;
-    }
-
#else
    bool use_cuda_graph = false;
    bool cuda_graph_update_required = false;
@@ -2783,7 +2828,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,

    bool graph_evaluated_or_captured = false;

-    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);

    return GGML_STATUS_SUCCESS;
}