llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)
@@ -10,7 +10,7 @@ Subject: [PATCH] cuda
3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index e2d6c405..a12172dc 100644
+index dba7be33..1ca40b2c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -22,10 +22,10 @@ index e2d6c405..a12172dc 100644

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 0b06be72..be29e979 100644
+index ebb2ccae..b094929b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -424,6 +424,7 @@ struct ggml_backend_cuda_buffer_context {
+@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
@@ -34,10 +34,10 @@ index 0b06be72..be29e979 100644

static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index a85502ee..cd8ef741 100644
+index c550142a..fd9a4e77 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4187,6 +4187,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}

free(ctx);
@@ -4,17 +4,17 @@ Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] pretokenizer

---
-src/llama-model.cpp | 14 +++-----------
+src/llama-vocab.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 405e0528..00b80c52 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1249,16 +1249,7 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index ad9ffe66..a4eee9b8 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1468,16 +1468,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
- if (tokenizer_pre.empty()) {
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
@@ -23,19 +23,19 @@ index 405e0528..00b80c52 100644
- LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
-- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (tokenizer_pre == "default") {
+ if (tokenizer_pre == "default") {
-vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
-@@ -1373,7 +1364,8 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
+@@ -1593,7 +1584,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "megrez") {
-vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
-+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
++ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (type == LLAMA_VOCAB_TYPE_SPM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
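Taken together, the hunks above change how the BPE pre-tokenizer is selected. A condensed sketch of the resulting logic in llama_vocab::impl::load(), paraphrased from the hunks rather than copied verbatim from the file:

    // After the patch: an unknown or missing pre-tokenizer name no longer throws;
    // it logs a warning and falls back to the default pre-tokenizer.
    if (type == LLAMA_VOCAB_TYPE_BPE) {
        add_space_prefix = false;
        clean_spaces = true;
        if (tokenizer_pre == "default") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        } else if (tokenizer_pre == "llama3" /* || ... the other recognized names ... */) {
            // each recognized name selects its matching LLAMA_VOCAB_PRE_TYPE_* value
        } else {
            // upstream behavior: throw std::runtime_error("unknown pre-tokenizer type: ...")
            LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }
    } else if (type == LLAMA_VOCAB_TYPE_SPM) {
        pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }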
@@ -9,10 +9,10 @@ Subject: [PATCH] embeddings
2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 38a55fb2..b9c4a5bf 100644
+index 671d2a81..47e79ed4 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
-@@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
+@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -22,10 +22,10 @@ index 38a55fb2..b9c4a5bf 100644

const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
diff --git a/src/llama.cpp b/src/llama.cpp
-index ea78ea48..4eb3f6b9 100644
+index 607f2786..ac85bfed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -10876,7 +10876,6 @@ static int llama_decode_internal(
+@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
@@ -33,7 +33,7 @@ index ea78ea48..4eb3f6b9 100644
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
-@@ -10884,12 +10883,15 @@ static int llama_decode_internal(
+@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
break;
}
}
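The llama.cpp hunk above locates the pooled-embeddings output by name rather than by graph position. A minimal sketch of the lookup it relies on (the assignment into embd is implied by the surrounding hunk, not shown verbatim here):

    // Walk the graph from the end and find the pooled-embeddings tensor by name.
    struct ggml_tensor * embd = nullptr;
    for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
        if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
            embd = ggml_graph_node(gf, i);
            break;
        }
    }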
@@ -8,10 +8,10 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 3cd0d2fa..b3c1829f 100644
+index 76d4a785..205af1eb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
-@@ -56,6 +56,19 @@
+@@ -58,6 +58,19 @@
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)

@@ -31,7 +31,7 @@ index 3cd0d2fa..b3c1829f 100644
//#define CLIP_DEBUG_FUNCTIONS

// RGB uint8 image
-@@ -1322,8 +1335,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1402,8 +1415,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
@@ -62,7 +62,7 @@ index 3cd0d2fa..b3c1829f 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
-@@ -1363,7 +1397,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1443,7 +1477,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
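Only the hunk boundaries of the clip-unicode patch are visible above; the inserted lines themselves are elided. Assuming the patch keeps its usual purpose (opening model paths that contain non-ASCII characters on Windows), the core of it looks roughly like this sketch; the helper name is illustrative, not taken from the patch:

    #ifdef _WIN32
    #include <windows.h>
    #include <string>
    // Convert a UTF-8 path to UTF-16 so the model file can be opened on Windows.
    static std::wstring utf8_to_utf16(const std::string & s) {
        int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
        std::wstring w(n, 0);
        MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, &w[0], n);
        return w;
    }
    #endif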
@@ -11,21 +11,21 @@ tensor to store the scalar. the scalar is implemented a 1-dimensional
tensor with 2 elements dervied from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
-src/llama-arch.cpp | 53 +++++++----
+src/llama-arch.cpp | 21 +++++
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
-src/llama-hparams.h | 5 +
+src/llama-hparams.h | 5 ++
src/llama-model-loader.cpp | 1 +
-src/llama-model.cpp | 16 ++++
+src/llama-model.cpp | 44 +++++++++++
src/llama-model.h | 3 +
-src/llama.cpp | 185 +++++++++++++++++++++++++++++++++++++
-8 files changed, 258 insertions(+), 16 deletions(-)
+src/llama.cpp | 152 ++++++++++++++++++++++++++++++++++++-
+8 files changed, 236 insertions(+), 1 deletion(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 007d79f8..5b376c5e 100644
+index 97a1e7e5..a1e0ebcc 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
-@@ -59,6 +59,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -33,48 +33,16 @@ index 007d79f8..5b376c5e 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -106,22 +107,23 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
-@@ -1240,6 +1242,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
-{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
+@@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
+ {
@@ -96,9 +64,9 @@ index 007d79f8..5b376c5e 100644
+ },
+ },
{
-LLM_ARCH_UNKNOWN,
+LLM_ARCH_WAVTOKENIZER_DEC,
{
-@@ -1372,6 +1392,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -107,10 +75,10 @@ index 007d79f8..5b376c5e 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 45e458bb..eac7055b 100644
+index 122fdceb..77919578 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
-@@ -63,6 +63,7 @@ enum llm_arch {
+@@ -65,6 +65,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -118,7 +86,7 @@ index 45e458bb..eac7055b 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN,
};
-@@ -126,6 +127,7 @@ enum llm_kv {
+@@ -129,6 +130,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
@@ -126,7 +94,7 @@ index 45e458bb..eac7055b 100644

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
-@@ -305,6 +307,7 @@ enum llm_tensor {
+@@ -311,6 +313,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -135,7 +103,7 @@ index 45e458bb..eac7055b 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index c4053469..450738da 100644
+index ea87b295..f3955de9 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
@@ -152,10 +120,10 @@ index c4053469..450738da 100644
+}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index a29f20ec..fd898e27 100644
+index 1fe45410..1bdcdfd5 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
-@@ -52,6 +52,8 @@ struct llama_hparams {
+@@ -50,6 +50,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

@@ -164,7 +132,7 @@ index a29f20ec..fd898e27 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
-@@ -134,6 +136,9 @@ struct llama_hparams {
+@@ -133,6 +135,9 @@ struct llama_hparams {

// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
@@ -175,23 +143,23 @@ index a29f20ec..fd898e27 100644

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index 7743b465..422524a8 100644
+index 05d58ad9..1252aca1 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
-@@ -364,6 +364,7 @@ namespace GGUFMeta {
+@@ -439,6 +439,7 @@ namespace GGUFMeta {
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);

-llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
-int trace = 0;
+llama_model_loader::llama_model_loader(
+const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 00b80c52..306c557d 100644
+index 36a0a009..ad1315c6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
-@@ -1091,6 +1091,21 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
-default: model.type = e_model::MODEL_UNKNOWN;
+@@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
@@ -200,52 +168,19 @@ index 00b80c52..306c557d 100644
+ for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr[i];
+ bskcn.fill(0);
-+ auto kv = LLM_KV(model.arch);
++ auto kv = LLM_KV(arch);
+ ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
-+ case 64: model.type = e_model::MODEL_22B; break;
-+ default: model.type = e_model::MODEL_UNKNOWN;
++ case 64: type = LLM_TYPE_22B; break;
++ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2065,6 +2080,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index ce038932..c1b9c0a1 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -54,6 +54,7 @@ enum llm_type {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -275,6 +276,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index 4eb3f6b9..7dec50ae 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -2206,6 +2206,35 @@ static bool llm_load_tensors(
+@@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -256,16 +191,16 @@ index 4eb3f6b9..7dec50ae 100644
+ } break;
+ case LLM_ARCH_SOLAR:
+ {
-+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
++ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
-+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
++ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
++ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
-+ auto & layer = model.layers[i];
++ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
@@ -277,16 +212,53 @@ index 4eb3f6b9..7dec50ae 100644
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -10226,6 +10255,158 @@ struct llm_build_context {
return gf;
}
@@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..1afb0024 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -55,6 +55,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -281,6 +282,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7953,9 +7953,155 @@ struct llm_build_context {
cb(img_logits, "img_logits", -1);
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
cb(cur, "result_output", -1);
-
ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
@@ -333,7 +305,7 @@ index 4eb3f6b9..7dec50ae 100644
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+

+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
@@ -422,25 +394,18 @@ index 4eb3f6b9..7dec50ae 100644
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
return gf;
}

-@@ -10660,6 +10841,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
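The commit message at the top of this patch describes the skip-connection scalar: a 2-element tensor holding (bskcn_tv, 1 - bskcn_tv). The hunks above apply it as in this condensed sketch (tensor names follow the patch body shown above):

    // Blend a hidden state saved at an earlier layer (bskcn_2) with the
    // current hidden state (inpSA): out = tv[0]*bskcn_2 + tv[1]*inpSA,
    // where tv = model.layers[il].bskcn_tv holds (bskcn_tv, 1 - bskcn_tv).
    struct ggml_tensor * tv  = model.layers[il].bskcn_tv;
    struct ggml_tensor * out = ggml_add(ctx0,
        ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, tv, 1, 0)),
        ggml_mul(ctx0, inpSA,   ggml_view_1d(ctx0, tv, 1, ggml_element_size(tv))));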
@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index be29e979..aaa79ea4 100644
+index b094929b..36165840 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2159,9 +2159,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2282,9 +2282,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst);
break;
@@ -15,27 +15,27 @@ remaining is to implement the cross attention mask
examples/llava/llava.cpp | 5 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 +
-src/llama-arch.cpp | 44 +++++
+src/llama-arch.cpp | 44 ++++++
src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 +
-src/llama-context.cpp | 19 ++-
+src/llama-context.cpp | 28 ++--
src/llama-context.h | 2 +
src/llama-cparams.h | 1 +
-src/llama-hparams.cpp | 8 +-
-src/llama-hparams.h | 4 +
-src/llama-kv-cache.cpp | 33 ++++
+src/llama-hparams.cpp | 6 +
+src/llama-hparams.h | 5 +
+src/llama-kv-cache.cpp | 13 +-
src/llama-model-loader.cpp | 2 +
-src/llama-model.cpp | 59 ++-----
-src/llama-model.h | 51 ++++++
+src/llama-model.cpp | 65 ++++++++-
+src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
-src/llama.cpp | 307 +++++++++++++++++++++++++++++++++-
-17 files changed, 508 insertions(+), 56 deletions(-)
+src/llama.cpp | 262 +++++++++++++++++++++++++++++++++-
+17 files changed, 452 insertions(+), 22 deletions(-)

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
-index 16f30c56..0f0f3f62 100644
+index 518aad3f..f0e484a1 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
-@@ -429,7 +429,7 @@ struct llava_embd_batch {
+@@ -445,7 +445,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
@@ -44,7 +44,7 @@ index 16f30c56..0f0f3f62 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
-@@ -441,6 +441,7 @@ struct llava_embd_batch {
+@@ -457,6 +457,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -52,7 +52,7 @@ index 16f30c56..0f0f3f62 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
-@@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
+@@ -480,7 +481,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
@@ -62,7 +62,7 @@ index 16f30c56..0f0f3f62 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 7ddd178b..899d16f2 100644
+index 955ed505..95036ef8 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -171,9 +171,9 @@ struct ggml_backend_registry {
@@ -79,10 +79,10 @@ index 7ddd178b..899d16f2 100644
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
-index a0d5ba5d..9f411960 100644
+index 47919602..cc948005 100644
--- a/include/llama.h
+++ b/include/llama.h
-@@ -250,6 +250,7 @@ extern "C" {
+@@ -249,6 +249,7 @@ extern "C" {

llama_token * token;
float * embd;
@@ -90,7 +90,7 @@ index a0d5ba5d..9f411960 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
-@@ -347,6 +348,7 @@ extern "C" {
+@@ -343,6 +344,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
@@ -98,9 +98,9 @@ index a0d5ba5d..9f411960 100644

// Abort callback
// if it returns true, execution of llama_decode() will be aborted
-@@ -426,6 +428,10 @@ extern "C" {
-struct llama_model * model,
-struct llama_context_params params);
+@@ -443,6 +445,10 @@ extern "C" {
+struct llama_context_params params),
+"use llama_init_from_model instead");

+ // TODO (jmorganca): this should most likely be passed in as part of a batch
+ // and not set on the context for all batches.
@@ -110,7 +110,7 @@ index a0d5ba5d..9f411960 100644
LLAMA_API void llama_free(struct llama_context * ctx);

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 5b376c5e..b35aeb31 100644
+index a1e0ebcc..b6f20286 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
@@ -121,15 +121,15 @@ index 5b376c5e..b35aeb31 100644
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
-@@ -124,6 +125,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
-{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
-{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
-+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
+@@ -127,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
++ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
-@@ -220,6 +222,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -225,6 +227,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
@@ -170,7 +170,7 @@ index 5b376c5e..b35aeb31 100644
{
LLM_ARCH_DECI,
{
-@@ -1393,6 +1429,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1450,6 +1486,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -186,7 +186,7 @@ index 5b376c5e..b35aeb31 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
-index eac7055b..e8235ae0 100644
+index 77919578..ec742224 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -10,6 +10,7 @@
@@ -197,7 +197,7 @@ index eac7055b..e8235ae0 100644
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
-@@ -128,6 +129,7 @@ enum llm_kv {
+@@ -131,6 +132,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
@@ -205,7 +205,7 @@ index eac7055b..e8235ae0 100644

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
-@@ -308,6 +310,14 @@ enum llm_tensor {
+@@ -314,6 +316,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
@@ -249,10 +249,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index b9c4a5bf..9d0e7ca3 100644
+index 47e79ed4..7b22fe13 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
-@@ -71,10 +71,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
+@@ -74,10 +74,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
}

if (ubatch.embd) {
@@ -275,7 +275,30 @@ index b9c4a5bf..9d0e7ca3 100644
}

if (ubatch.pos && lctx.inp_pos) {
@@ -653,6 +662,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -470,12 +479,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto & cparams = lctx.cparams;
const auto & hparams = lctx.model.hparams;
- const auto & vocab = lctx.model.vocab;

const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);

const auto n_batch = cparams.n_batch;
- const auto n_vocab = vocab.n_tokens();
+ const auto n_vocab = hparams.n_vocab;
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -542,7 +550,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
void llama_output_reorder(struct llama_context & ctx) {
std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = ctx.model.vocab.n_tokens();
+ const uint32_t n_vocab = ctx.model.hparams.n_vocab;
const uint32_t n_embd = ctx.model.hparams.n_embd;

const int32_t n_outputs = ctx.n_outputs;
@@ -657,6 +665,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}

@@ -286,8 +309,26 @@ index b9c4a5bf..9d0e7ca3 100644
void llama_synchronize(struct llama_context * ctx) {
ggml_backend_sched_synchronize(ctx->sched.get());

@@ -726,7 +738,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
}

- return ctx->logits + j*ctx->model.vocab.n_tokens();
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -886,7 +898,7 @@ struct llama_data_write {
}

void write_logits(const struct llama_context * ctx) {
- const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
+ const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);

write(&logits_size, sizeof(logits_size));

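The recurring substitution in this file, vocab.n_tokens() to hparams.n_vocab, decouples output-buffer sizing from the tokenizer: n_vocab now comes from the GGUF header key (read in llama-model.cpp below), so a model whose embedded token list disagrees with its output dimension, as with mllama's padded embeddings, still gets correctly sized logits buffers. In sketch form:

    const auto n_vocab_before = lctx.model.vocab.n_tokens();   // tokenizer-derived count
    const auto n_vocab_after  = lctx.model.hparams.n_vocab;    // header-declared vocabulary size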
diff --git a/src/llama-context.h b/src/llama-context.h
-index 0d163c47..4980a60e 100644
+index a9268b29..cf12c9d7 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -107,6 +107,8 @@ struct llama_context {
@@ -312,7 +353,7 @@ index 252012f3..9681e5a0 100644
enum llama_pooling_type pooling_type;

diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index 450738da..42f8a58f 100644
+index f3955de9..0b841028 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -2,6 +2,8 @@
@@ -328,18 +369,25 @@ index 450738da..42f8a58f 100644
}

GGML_ABORT("fatal error");
-}
\ No newline at end of file
+}
+
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index fd898e27..f826cd9a 100644
+index 1bdcdfd5..05383046 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
-@@ -53,6 +53,7 @@ struct llama_hparams {
+@@ -41,6 +41,7 @@ struct llama_hparams {
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0;

// for WavTokenizer
struct llama_hparams_posnet posnet;
@@ -51,6 +52,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
@@ -347,65 +395,45 @@ index fd898e27..f826cd9a 100644

uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
-@@ -139,6 +140,9 @@ struct llama_hparams {
+@@ -138,6 +140,9 @@ struct llama_hparams {

// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
+
+ // cross attention layers
+ bool cross_attention_layers(uint32_t il) const;
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
-index 53379253..cf814dbe 100644
+index feffdf0d..b541c5a3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
-@@ -72,6 +72,39 @@ bool llama_kv_cache_init(
-cache.v_l.reserve(n_layer);
+@@ -91,8 +91,17 @@ bool llama_kv_cache_init(
+return false;
+}

for (int i = 0; i < n_layer; i++) {
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ ggml_tensor * k, *v;
+
+ // for cross attention layers
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+ const llama_model::buft_list_t * buft_list;
+ if (offload) {
+ buft_list = model.dev_layer.at(i).buft_list;
+ } else {
+ buft_list = &model.cpu_buft_list;
+ }
+ ggml_backend_buffer_type_t buft = select_buft(*buft_list,
+ [&](ggml_context * ctx) {
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
+ return k;
+ }
+ ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+ return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
+ });
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+ return false;
+ }
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ ggml_format_name(k, "cache_k_l%d", i);
+ ggml_format_name(v, "cache_v_l%d", i);
+ cache.k_l.push_back(k);
+ cache.v_l.push_back(v);
+ continue;
+ k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ } else {
+ k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ }
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
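Condensing the updated llama_kv_cache_init() hunk, the resulting loop shape is as follows (the 6404 sequence length is taken verbatim from the patch; it appears to be the fixed length reserved for the vision cross-attention tokens):

    for (int i = 0; i < n_layer; i++) {
        ggml_tensor * k, * v;
        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
            // cross-attention layers: fixed-shape F32 K/V tensors for the image tokens
            k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
            v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
        } else {
            // all other layers: the usual 1-D (possibly quantized) cache tensors
            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
            v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
        }
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        cache.k_l.push_back(k);
        cache.v_l.push_back(v);
    }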
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index 422524a8..b12d6566 100644
+index 1252aca1..45d08721 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
-@@ -240,6 +240,8 @@ namespace GGUFMeta {
+@@ -315,6 +315,8 @@ namespace GGUFMeta {
return true;
}

@@ -415,80 +443,47 @@ index 422524a8..b12d6566 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 306c557d..4f9bbf90 100644
+index ad1315c6..21819080 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
-@@ -146,46 +146,6 @@ std::string llama_model_ftype_name(const llama_model & model) {
-return llama_model_ftype_name(model.ftype);
-}
+@@ -401,6 +401,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- ggml_context_ptr ctx { ggml_init(params) };
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
-
- ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
- ggml_tensor * op_tensor = fn(ctx.get());
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- if (op_tensor->src[i] != nullptr) {
- assert(op_tensor->src[i]->buffer == nullptr);
- op_tensor->src[i]->buffer = buf.get();
- }
- }
-
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
- return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (buft_supported(cur_buft, cur_dev, fn)) {
- return cur_buft;
- }
- }
-
- throw std::runtime_error(format("no suitable buffer type found"));
-}
-
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
return select_buft(
*model.dev_layer.at(il).buft_list,
@@ -312,9 +272,11 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);

// everything past this point is not vocab-related
if (hparams.vocab_only) {
@@ -412,6 +413,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);

if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -435,9 +437,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);

- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);

// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
-@@ -363,7 +325,7 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
+@@ -486,7 +490,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

-- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
-+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
+- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
++ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
-@@ -405,6 +367,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
+@@ -530,6 +534,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
}
} break;
@@ -497,145 +492,44 @@ index 306c557d..4f9bbf90 100644
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
-+ case 40: model.type = e_model::MODEL_11B; break;
-+ case 100: model.type = e_model::MODEL_90B; break;
-+ default: model.type = e_model::MODEL_UNKNOWN;
++ case 40: type = LLM_TYPE_11B; break;
++ case 100: type = LLM_TYPE_90B; break;
++ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2062,6 +2034,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {

// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index c1b9c0a1..5b23e2ba 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include "ggml-cpp.h"

#include <vector>
+#include <stdexcept>

// available models
// TODO: this enum does not follow the enum naming convention
@@ -62,6 +63,7 @@ enum llm_type {
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_671B,
@@ -278,6 +280,16 @@ struct llama_layer {

struct ggml_tensor * bskcn_tv = nullptr;

+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
@@ -376,6 +388,45 @@ std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);

+template<typename F>
+bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context_ptr ctx { ggml_init(params) };
+ if (!ctx) {
+ throw std::runtime_error("failed to create ggml context");
+ }
+
+ ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+ ggml_tensor * op_tensor = fn(ctx.get());
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op_tensor->src[i] != nullptr) {
+ op_tensor->src[i]->buffer = buf.get();
+ }
+ }
+
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+ return op_supported;
+}
+
+template<typename F>
+ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+ for (const auto & cur : buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (buft_supported(cur_buft, cur_dev, fn)) {
+ return cur_buft;
+ }
+ }
+
+ throw std::runtime_error("no suitable buffer type found");
+}
+
// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 42974f8f..27def6fd 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -629,7 +629,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}

size_t total_size_org = 0;
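Both the old and new revisions of the quantization hunk make the same change: the hard assert on the attn_v count becomes a warning, so architectures where not every layer carries a regular attention value weight (mllama's cross-attention layers, for instance) can still be quantized:

    // was: GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
    if (qs.n_attention_wv != n_attn_layer) {
        LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n",
                __func__, n_attn_layer, qs.n_attention_wv);
    }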
diff --git a/src/llama.cpp b/src/llama.cpp
index 7dec50ae..bac66c24 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -563,6 +563,52 @@ static bool llm_load_tensors(
|
||||
@@ -1398,7 +1412,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_ff = hparams.n_ff();
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
- const int64_t n_vocab = vocab.n_tokens();
|
||||
+ const int64_t n_vocab = hparams.n_vocab;
|
||||
const int64_t n_token_types = vocab.n_token_types();
|
||||
const int64_t n_rot = hparams.n_rot;
|
||||
const int64_t n_expert = hparams.n_expert;
|
||||
@@ -1581,6 +1595,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
} break;
|
||||
+ case LLM_ARCH_MLLAMA:
|
||||
+ {
|
||||
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
|
||||
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+
+ // output
+ {
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+ auto & layer = layers[i];
+
+ if (hparams.cross_attention_layers(i)) {
+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
@@ -667,17 +561,72 @@ index 7dec50ae..bac66c24 100644
+ } break;
case LLM_ARCH_DECI:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2514,7 +2560,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3925,6 +3985,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {

if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
- throw std::runtime_error("vocab size mismatch");
+ LLAMA_LOG_WARN("%s: vocab mismatch %u !- %zu ...\n", __func__, model.hparams.n_vocab, model.vocab.id_to_token.size());
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 1afb0024..7cf57587 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include <string>
#include <unordered_map>
#include <vector>
+#include <stdexcept>

struct llama_model_loader;

@@ -63,6 +64,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -284,6 +286,16 @@ struct llama_layer {

struct ggml_tensor * bskcn_tv = nullptr;

+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb798265..6eb1da08 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -632,7 +632,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}

if (params.vocab_only) {
@@ -2598,6 +2644,21 @@ static struct ggml_tensor * llm_build_inp_embd(
size_t total_size_org = 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index 6d320ea4..8f7902df 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -154,6 +154,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}

@@ -699,7 +648,7 @@ index 7dec50ae..bac66c24 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -3593,6 +3654,7 @@ struct llm_build_context {
@@ -1157,6 +1172,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
@@ -707,12 +656,12 @@ index 7dec50ae..bac66c24 100644
}

void free() {
@@ -4074,6 +4136,240 @@ struct llm_build_context {
@@ -1639,6 +1655,240 @@ struct llm_build_context {
return gf;
}

+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
@@ -946,9 +895,9 @@ index 7dec50ae..bac66c24 100644
+ }
+
struct ggml_cgraph * build_deci() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

@@ -10646,6 +10942,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -8344,6 +8594,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
@@ -959,16 +908,33 @@ index 7dec50ae..bac66c24 100644
case LLM_ARCH_DECI:
{
result = llm.build_deci();
@@ -10971,7 +11271,7 @@ static int llama_decode_internal(
@@ -8634,7 +8888,7 @@ static int llama_prepare_sbatch(
n_outputs = 1;
}

- lctx.sbatch.from_batch(batch, n_embd,
+ lctx.sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self.recurrent,
/* simple_split */ !lctx.kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);

@@ -11282,7 +11582,7 @@ static int llama_encode_internal(
@@ -8749,7 +9003,6 @@ static int llama_decode_impl(
const llama_batch & batch = batch_allocr.batch;

const auto & model = lctx.model;
- const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;

@@ -8760,7 +9013,7 @@ static int llama_decode_impl(
llama_kv_slot_restorer kv_slot_restorer(kv_self);

const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_vocab = hparams.n_vocab;

uint32_t n_outputs = 0;
uint32_t n_outputs_prev = 0;
@@ -9025,7 +9278,7 @@ static int llama_encode_impl(

const int64_t n_embd = hparams.n_embd;

@@ -977,7 +943,7 @@ index 7dec50ae..bac66c24 100644

const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);

@@ -11775,6 +12075,7 @@ struct llama_context_params llama_context_default_params() {
@@ -9511,6 +9764,7 @@ struct llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,

@@ -15,10 +15,10 @@ Subject: [PATCH] add unpad operator
8 files changed, 220 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index c714fc8c..1bc50fca 100644
index dd0c6a96..8d269a9c 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
@@ -487,6 +487,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
@@ -26,7 +26,7 @@ index c714fc8c..1bc50fca 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1735,6 +1736,15 @@ extern "C" {
@@ -1743,6 +1744,15 @@ extern "C" {
int p0,
int p1);

@@ -43,10 +43,10 @@ index c714fc8c..1bc50fca 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b7fefb9d..b307d554 100644
index 72325349..2f606d82 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
@@ -10844,6 +10844,59 @@ static void ggml_compute_forward_pad_reflect_1d(
}
}

@@ -106,7 +106,7 @@ index b7fefb9d..b307d554 100644
// ggml_compute_forward_arange

static void ggml_compute_forward_arange_f32(
@@ -12690,6 +12743,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
@@ -13137,6 +13190,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
@@ -117,7 +117,7 @@ index b7fefb9d..b307d554 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -13033,6 +13090,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
@@ -13484,6 +13541,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -126,10 +126,10 @@ index b7fefb9d..b307d554 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index aaa79ea4..9286f866 100644
index 36165840..1adf08fa 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2082,6 +2082,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2198,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
@@ -139,8 +139,8 @@ index aaa79ea4..9286f866 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3010,6 +3013,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return ggml_is_contiguous(op->src[0]);
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
@@ -148,7 +148,7 @@ index aaa79ea4..9286f866 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index aba539e8..39fd4b16 100644
index aba539e8..b4b87409 100644
--- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -201,6 +201,7 @@ index aba539e8..39fd4b16 100644
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
\ No newline at end of file
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
index 8fd386b0..e2ededc3 100644
--- a/ggml/src/ggml-cuda/pad.cuh
@@ -211,10 +212,10 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
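
The unpad operator declared above is the inverse of GGML_OP_PAD: it crops a padded tensor back down to a smaller target shape. The CUDA kernel body is elided from this view; purely as an illustration (a plain C++ sketch under that assumption, not the vendored kernel), the 2D case reduces to copying the leading region of the source and dropping the padded tail:

    #include <cassert>
    #include <cstdint>

    // copy the top-left dst_ne0 x dst_ne1 region of src into dst,
    // discarding the trailing (padded) rows and columns
    static void unpad_f32_2d(const float * src, int64_t src_ne0, int64_t src_ne1,
                             float * dst, int64_t dst_ne0, int64_t dst_ne1) {
        assert(dst_ne0 <= src_ne0 && dst_ne1 <= src_ne1);
        for (int64_t i1 = 0; i1 < dst_ne1; ++i1) {
            for (int64_t i0 = 0; i0 < dst_ne0; ++i0) {
                dst[i1 * dst_ne0 + i0] = src[i1 * src_ne0 + i0];
            }
        }
    }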
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index cd8ef741..318addec 100644
index fd9a4e77..e4c093f9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -311,6 +311,7 @@ enum ggml_metal_kernel_type {
@@ -331,6 +331,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
@@ -222,7 +223,7 @@ index cd8ef741..318addec 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -910,6 +911,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
@@ -946,6 +947,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
@@ -230,7 +231,7 @@ index cd8ef741..318addec 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1145,6 +1147,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1254,6 +1256,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -238,7 +239,7 @@ index cd8ef741..318addec 100644
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -3348,6 +3351,36 @@ static void ggml_metal_encode_node(
@@ -3469,6 +3472,36 @@ static void ggml_metal_encode_node(

const int nth = MIN(1024, ne0);

@@ -276,10 +277,10 @@ index cd8ef741..318addec 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8ba43904..204c93e6 100644
index d092a169..f38909d0 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2944,6 +2944,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -2953,6 +2953,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}

@@ -332,10 +333,10 @@ index 8ba43904..204c93e6 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 2bbe5f48..7ffcd907 100644
index 7fc06724..635aa299 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -962,6 +962,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
@@ -343,16 +344,16 @@ index 2bbe5f48..7ffcd907 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -996,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1059,6 +1060,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
@@ -360,16 +361,16 @@ index 2bbe5f48..7ffcd907 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1093,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4214,6 +4216,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4225,6 +4227,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}


@@ -11,10 +11,10 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 3fcfcaa3..8f44705a 100644
index a4eee9b8..1ca827eb 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -375,7 +375,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -295,7 +295,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
"[\r\n]",
@@ -24,7 +24,7 @@ index 3fcfcaa3..8f44705a 100644
"\\s+$",
"[一-龥ࠀ-一가-]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 7aca6544..6155da80 100644
index e63bb4ab..9dd53b9a 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
@@ -39,7 +39,7 @@ index 7aca6544..6155da80 100644
#include "unicode.h"
#include "unicode-data.h"

@@ -201,6 +206,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
@@ -200,6 +205,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}

static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -62,7 +62,7 @@ index 7aca6544..6155da80 100644
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
@@ -214,6 +235,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -213,6 +234,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif

return conv.from_bytes(s);

@@ -8,11 +8,11 @@ Subject: [PATCH] Maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index dadc18c8..2a8dbd22 100644
index 3ebcc3d9..30c28808 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -391,7 +391,7 @@ class SchemaConverter {
private:
@@ -346,7 +346,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
- std::map<std::string, std::string> _rules;
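
The removed member is the whole story of the "Maintain ordering" patch: std::map iterates keys in sorted order, so grammar rules were being emitted alphabetically instead of in definition order. The replacement container is not visible in this hunk; the sketch below uses a vector of pairs purely as an assumed stand-in to show the difference:

    // Illustrative only: std::map reorders keys lexicographically, while a
    // paired vector (an assumption, not quoted from the patch) preserves
    // the order rules were inserted in.
    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        std::map<std::string, std::string> sorted_rules;
        std::vector<std::pair<std::string, std::string>> ordered_rules;

        for (auto name : {"root", "array", "value"}) {
            sorted_rules[name] = "...";
            ordered_rules.emplace_back(name, "...");
        }

        for (const auto & [k, v] : sorted_rules)  std::cout << k << ' '; // array root value
        std::cout << '\n';
        for (const auto & [k, v] : ordered_rules) std::cout << k << ' '; // root array value
        std::cout << '\n';
    }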

@@ -1,22 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 14 Dec 2024 12:54:00 -0800
Subject: [PATCH] fix missing arg in static assert on windows

---
ggml/src/ggml-cuda/concat.cu | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
index 2f42b8a9..5eb9f08d 100644
--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
- static_assert(dim >= 0 && dim <= 3);
+ static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");

const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
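
The one-line fix above is needed because the single-argument form of static_assert only became legal in C++17; when this CUDA file is compiled under a C++14 host toolchain (as older MSVC setups on Windows are), the message argument is mandatory. A minimal standalone illustration:

    // C++11/14: message argument required; C++17 and later: optional.
    // The two-argument form compiles under every standard, so it is the
    // portable choice for code that must also build with older MSVC hosts.
    template <int dim>
    constexpr int checked_dim() {
        static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");
        return dim;
    }

    int main() {
        return checked_dim<2>(); // checked_dim<5>() would fail to compile
    }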
@@ -19,10 +19,10 @@ multiple batches of processing until everything is complete.
1 file changed, 46 insertions(+), 53 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index bac66c24..c95da45d 100644
index 8f7902df..01854fce 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3536,6 +3536,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
@@ -1054,6 +1054,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
}

@@ -36,13 +36,13 @@ index bac66c24..c95da45d 100644
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -3712,35 +3719,23 @@ struct llm_build_context {
@@ -1230,35 +1237,23 @@ struct llm_build_context {
return gf;
}

- struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+ struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
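
Note the signature change: build_defrag now receives one descriptor per contiguous block to relocate, rather than one id per KV cell, so the graph size tracks the number of blocks instead of the cache size. The struct definition sits outside this hunk; a plausible shape (an assumption for illustration, not quoted from the patch) is:

    #include <cstdint>

    // one entry per contiguous run of KV cells to relocate
    struct llama_kv_defrag_move {
        uint32_t src; // first source cell
        uint32_t dst; // first destination cell
        uint32_t len; // number of cells in the run
    };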
@@ -78,7 +78,7 @@ index bac66c24..c95da45d 100644

ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -3748,31 +3743,29 @@ struct llm_build_context {
@@ -1266,31 +1261,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
@@ -118,7 +118,7 @@ index bac66c24..c95da45d 100644
}

//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -10856,7 +10849,7 @@ struct llm_build_context {
@@ -8508,7 +8501,7 @@ struct llm_build_context {
}
};

@@ -127,7 +127,7 @@ index bac66c24..c95da45d 100644
llama_ubatch dummy = {};
dummy.equal_seqs = true;

@@ -10866,7 +10859,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
@@ -8518,7 +8511,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const

llm.init();

@@ -136,21 +136,21 @@ index bac66c24..c95da45d 100644

llm.free();

@@ -11329,7 +11322,12 @@ static int llama_decode_internal(
kv_self.head = 0;
}
@@ -8956,7 +8949,12 @@ static int llama_prepare_ubatch(
kv_self.head = 0;
}

- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
@@ -11735,8 +11733,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
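
Both hunks above apply the same pattern: rather than failing the decode outright when no contiguous KV slot is free, the cache is defragmented once and the lookup retried. Stripped of llama.cpp specifics, the control flow is (a hedged sketch; find_slot and defrag stand in for the real calls):

    #include <optional>

    template <typename Cache, typename Batch>
    bool place_batch(Cache & cache, const Batch & batch) {
        std::optional<int> slot = cache.find_slot(batch);
        if (!slot) {
            cache.defrag();              // compact free space, then try once more
            slot = cache.find_slot(batch);
        }
        return slot.has_value();         // caller returns an error if still full
    }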
@@ -9431,8 +9429,8 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {

//const int64_t t_start = ggml_time_us();

@@ -161,7 +161,7 @@ index bac66c24..c95da45d 100644

// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -11800,19 +11798,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9496,19 +9494,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;

@@ -181,7 +181,7 @@ index bac66c24..c95da45d 100644
cont = false;
continue;
}
@@ -11828,8 +11818,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9524,8 +9514,10 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
kv_self.head = n_used;

if (!cont) {
@@ -193,7 +193,7 @@ index bac66c24..c95da45d 100644
}

nf++;
@@ -11839,22 +11831,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9535,22 +9527,16 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
}
}

@@ -218,7 +218,7 @@ index bac66c24..c95da45d 100644

#if 0
// CPU defrag
@@ -11929,11 +11915,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9625,11 +9611,18 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
#else
// ggml_graph defrag
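
The commit message at the top of this patch ("multiple batches of processing until everything is complete") describes the other half of the change: since each move costs roughly 6*n_layer graph nodes, moves are applied in graph-sized chunks rather than all at once. A schematic version, with assumed helper names rather than the vendored code:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct move { uint32_t src, dst, len; };

    // apply_batch stands in for building and computing one defrag graph
    template <typename ApplyFn>
    void defrag_in_batches(const std::vector<move> & moves,
                           std::size_t max_moves_per_graph, ApplyFn apply_batch) {
        for (std::size_t i = 0; i < moves.size(); i += max_moves_per_graph) {
            const std::size_t end = std::min(moves.size(), i + max_moves_per_graph);
            apply_batch(&moves[i], end - i); // one node-bounded graph per batch
        }
    }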

@@ -8,12 +8,12 @@ Subject: [PATCH] use dynamic backend loading for clip
1 file changed, 27 insertions(+), 47 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index b3c1829f..86b91d5c 100644
index 205af1eb..560021c7 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -8,25 +8,25 @@
#include "ggml-alloc.h"
@@ -9,25 +9,25 @@
#include "ggml-backend.h"
#include "gguf.h"

-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
@@ -56,7 +56,7 @@ index b3c1829f..86b91d5c 100644

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1309,35 +1309,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}

@@ -8,7 +8,7 @@ Subject: [PATCH] sort devices by score
1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 899d16f2..135f7df0 100644
index 95036ef8..98d5e14d 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 84101c32..72b488dd 100644
index 0002ac18..0a8d1092 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -297,6 +297,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()

ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,7 +19,7 @@ index 84101c32..72b488dd 100644
endfunction()

ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -305,6 +306,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
@@ -8,7 +8,7 @@ Subject: [PATCH] try/catch backend load
1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 135f7df0..84b21dd8 100644
index 98d5e14d..1c19129a 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -512,32 +512,33 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
@@ -1,55 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 9 Feb 2025 17:22:15 -0800
Subject: [PATCH] remove sgemm global variables

removes the 'iq4nlt' global variable in sgemm.cpp that causes
a runtime crash when calling dlopen on ggml-cpu libraries as
its initialization depends on AVX instructions the host machine
may not have
---
ggml/src/ggml-cpu/llamafile/sgemm.cpp | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index 8fce576c..3f260ce5 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -279,14 +279,6 @@ template <> inline __m256bh load(const float *p) {
}
#endif

-////////////////////////////////////////////////////////////////////////////////////////////////////
-// CONSTANTS
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
-static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
-#endif
-
////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION

@@ -613,6 +605,14 @@ class tinyBLAS_Q0_AVX {
TC *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+ const int8_t kvalues_iq4nl[16] = {
+ -127, -104, -83, -65,
+ -49, -35, -22, -10,
+ 1, 13, 25, 38,
+ 53, 69, 89, 113
+ };
+
+ iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}

void matmul(int64_t m, int64_t n) {
@@ -1037,6 +1037,7 @@ class tinyBLAS_Q0_AVX {
const int64_t ldc;
const int ith;
const int nth;
+ __m128i iq4nlt;
};
#endif // __AVX__
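
The crash this patch removes is a static-initialization hazard: a namespace-scope __m128i is initialized by _mm_loadu_si128 when the shared object is loaded, that is, inside dlopen, before any runtime CPU check can intervene. Because the AVX build variants of sgemm.cpp are compiled with AVX enabled, that initializer is emitted as AVX-encoded instructions, which fault on hosts without AVX. Moving the load into the constructor defers it until the loader has already picked an AVX-capable variant. In miniature (illustrative only, not the vendored code):

    #include <cstdint>
    #include <immintrin.h>

    // BAD: a file-scope initializer like
    //     static const __m128i table = _mm_loadu_si128((const __m128i *) values);
    // runs at library-load time, inside dlopen, before any CPU-feature check.

    struct uses_table {
        __m128i table; // now a member, loaded only when an object is constructed
        explicit uses_table(const int8_t (&values)[16])
            : table(_mm_loadu_si128((const __m128i *) values)) {}
    };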

@@ -8,7 +8,7 @@ Subject: [PATCH] use std::filesystem::path instead of wstring
1 file changed, 58 insertions(+), 86 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 84b21dd8..e35a6936 100644
index 1c19129a..c854e6bb 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -66,26 +66,6 @@
@@ -8,10 +8,10 @@ Subject: [PATCH] remove amx
1 file changed, 4 deletions(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 72b488dd..50828717 100644
index 0a8d1092..4564df91 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -312,10 +312,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
@@ -19,6 +19,6 @@ index 72b488dd..50828717 100644
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
else ()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
endif()
llama/patches/0018-fix-clip-compiler-error.patch (new file, 36 lines)
@@ -0,0 +1,36 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 25 Feb 2025 19:14:51 -0800
Subject: [PATCH] fix-clip-compiler-error

---
examples/llava/clip.cpp | 2 +-
examples/llava/clip.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 560021c7..54265beb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1788,7 +1788,7 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
}
}

-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img) {
img->nx = nx;
img->ny = ny;
img->buf.resize(3 * nx * ny);
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index ce6f6194..f9f80d7d 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -75,7 +75,7 @@ CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
-CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img);
+CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);

CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);