llama: update vendored code to commit 40c6d79f (#7875)
@@ -4,47 +4,33 @@ Date: Thu, 6 Jun 2024 23:55:47 -0700
Subject: [PATCH] cuda

---
ggml/include/ggml-cuda.h | 2 ++
ggml/src/ggml-backend.c | 5 +++++
ggml/src/ggml-cuda.cu | 6 ++++--
3 files changed, 11 insertions(+), 2 deletions(-)
ggml/src/ggml-backend.cpp | 5 +++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++++
2 files changed, 9 insertions(+)

diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index 71bb6dcf..08be0895 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -34,6 +34,8 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_typ
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

+GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();
+
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index ba280e06..d5c3fe49 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -83,7 +83,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index fdb4b986..9b80fe07 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer->iface.free_buffer != NULL) {
buffer->iface.free_buffer(buffer);
}
+
+// TODO: this needs to be freed in cuda and hipblas backends because
+// TODO: this needs to be freed in cuda and hip backends because
+// the cuda backend implementation compiled with msvc
+#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIPBLAS)
free(buffer);
+#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIP)
delete buffer;
+#endif
}

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 6efdab14..809d6ab1 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -469,6 +469,10 @@ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer)
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d6e4bfdd..52aec229 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
+
@@ -53,13 +39,4 @@ index 6efdab14..809d6ab1 100644
+ free(buffer);
}

GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -3204,8 +3208,6 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params,
GGML_UNUSED(params);
}

-extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
-
GGML_CALL int ggml_backend_cuda_reg_devices() {
int device_count = ggml_backend_cuda_get_device_count();
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {

@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 4c0a1bb6..800dfb95 100644
index 6a6f4c2a..fa09f3b3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6287,16 +6287,7 @@ static void llm_load_vocab(
@@ -6362,16 +6362,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
@@ -29,7 +29,7 @@ index 4c0a1bb6..800dfb95 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6398,7 +6389,8 @@ static void llm_load_vocab(
@@ -6473,7 +6464,8 @@ static void llm_load_vocab(
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
} else {

@@ -4,14 +4,14 @@ Date: Mon, 16 Sep 2024 15:53:14 -0700
Subject: [PATCH] embeddings

---
src/llama.cpp | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
src/llama.cpp | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 800dfb95..a639522d 100644
index fa09f3b3..d1791af0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16920,7 +16920,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
@@ -17398,7 +17398,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -20,20 +20,15 @@ index 800dfb95..a639522d 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -17192,20 +17192,23 @@ static int llama_decode_internal(
// no output
@@ -17693,7 +17693,6 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
- } else if (cparams.embeddings) {
} else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = nullptr;
+ }
+
+ if (cparams.embeddings) {
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+ embd = ggml_graph_node(gf, i);
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
- embd = ggml_graph_node(gf, i);
@@ -17701,11 +17700,15 @@ static int llama_decode_internal(
break;
}
}
@@ -46,6 +41,7 @@ index 800dfb95..a639522d 100644
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

ggml_backend_sched_alloc_graph(lctx.sched, gf);
ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -1,54 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:12 -0700
Subject: [PATCH] metal

---
ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 9da08fe2..3a433703 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -1720,27 +1720,23 @@ static void ggml_metal_encode_node(
// to the matrix-vector kernel
int ne11_mm_min = 1;

-#if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models
// these numbers do not translate to other devices or model sizes
// TODO: need to find a better approach
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
- switch (src0t) {
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
- case GGML_TYPE_Q5_0: // not tested yet
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
- default: ne11_mm_min = 1; break;
- }
+ switch (src0t) {
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+ case GGML_TYPE_Q5_0: // not tested yet
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
+ default: ne11_mm_min = 1; break;
}
-#endif

// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
@@ -8,12 +8,12 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 14e02c8d..6e849d8e 100644
index d7c94352..427d5e02 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -44,6 +44,19 @@
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
@@ -56,6 +56,19 @@
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)

+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
@@ -31,7 +31,7 @@ index 14e02c8d..6e849d8e 100644
//#define CLIP_DEBUG_FUNCTIONS

// RGB uint8 image
@@ -1225,8 +1238,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1242,8 +1255,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
@@ -62,7 +62,7 @@ index 14e02c8d..6e849d8e 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
@@ -1266,7 +1300,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1283,7 +1317,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
@@ -1,24 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Wed, 12 Jun 2024 12:18:40 -0700
Subject: [PATCH] ggml-metal

---
ggml/src/ggml-metal.m | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 3a433703..829c5e39 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -392,8 +392,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
#if GGML_METAL_EMBED_LIBRARY
GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);

- extern const char ggml_metallib_start[];
- extern const char ggml_metallib_end[];
+ extern const char *ggml_metallib_start;
+ extern const char *ggml_metallib_end;

NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else
@@ -11,14 +11,14 @@ tensor to store the scalar. the scalar is implemented a 1-dimensional
tensor with 2 elements dervied from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
src/llama.cpp | 269 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 255 insertions(+), 14 deletions(-)
src/llama.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 253 insertions(+), 14 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index a639522d..83b80b59 100644
index d1791af0..b01770d0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
@@ -195,6 +195,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -26,7 +26,7 @@ index a639522d..83b80b59 100644
LLM_ARCH_UNKNOWN,
};

@@ -270,6 +271,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -34,7 +34,7 @@ index a639522d..83b80b59 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -327,6 +329,7 @@ enum llm_kv {
|
||||
@@ -306,6 +308,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
@@ -42,7 +42,7 @@ index a639522d..83b80b59 100644
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_FREQ_BASE,
|
||||
@@ -421,20 +424,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
|
||||
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
|
||||
|
||||
@@ -76,17 +76,17 @@ index a639522d..83b80b59 100644
|
||||
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
@@ -608,6 +612,7 @@ enum llm_tensor {
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
@@ -603,6 +607,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ENC_OUTPUT_NORM,
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
+ LLM_TENSOR_BSKCN_TV,
|
||||
};
|
||||
|
||||
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
||||
@@ -1527,6 +1532,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
|
||||
@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
@@ -111,15 +111,15 @@ index a639522d..83b80b59 100644
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -2360,6 +2383,7 @@ enum e_model {
|
||||
@@ -2401,6 +2424,7 @@ enum e_model {
|
||||
MODEL_15B,
|
||||
MODEL_16B,
|
||||
MODEL_20B,
|
||||
+ MODEL_22B,
|
||||
MODEL_30B,
|
||||
MODEL_32B,
|
||||
MODEL_34B,
|
||||
MODEL_35B,
|
||||
@@ -2409,6 +2433,8 @@ struct llama_hparams {
|
||||
@@ -2451,6 +2475,8 @@ struct llama_hparams {
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
|
||||
|
||||
@@ -128,7 +128,7 @@ index a639522d..83b80b59 100644
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
uint32_t n_lora_kv = 0;
|
||||
@@ -2479,6 +2505,7 @@ struct llama_hparams {
|
||||
@@ -2521,6 +2547,7 @@ struct llama_hparams {
|
||||
if (this->n_head_arr != other.n_head_arr) return true;
|
||||
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
|
||||
if (this->n_ff_arr != other.n_ff_arr) return true;
|
||||
@@ -136,7 +136,7 @@ index a639522d..83b80b59 100644
|
||||
|
||||
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
|
||||
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
|
||||
@@ -2588,6 +2615,14 @@ struct llama_hparams {
|
||||
@@ -2630,6 +2657,14 @@ struct llama_hparams {
|
||||
return ssm_d_state * ssm_d_inner;
|
||||
}
|
||||
}
|
||||
@@ -151,7 +151,7 @@ index a639522d..83b80b59 100644
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
@@ -2769,6 +2804,8 @@ struct llama_layer {
|
||||
@@ -2816,6 +2851,8 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_gate_scale;
|
||||
struct ggml_tensor * ffn_up_scale;
|
||||
struct ggml_tensor * ffn_down_scale;
|
||||
@@ -160,7 +160,7 @@ index a639522d..83b80b59 100644
|
||||
};
|
||||
|
||||
// very similar to llama_batch,
|
||||
@@ -6134,6 +6171,21 @@ static void llm_load_hparams(
|
||||
@@ -6209,6 +6246,21 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -182,46 +182,51 @@ index a639522d..83b80b59 100644
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@@ -8831,6 +8883,38 @@ static bool llm_load_tensors(
|
||||
@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
|
||||
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
|
||||
};
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
// checks if the weight tensor can be used with the specified buffer type and device
|
||||
@@ -9205,6 +9258,35 @@ static bool llm_load_tensors(
|
||||
|
||||
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ }
|
||||
+ } break;
|
||||
+ case LLM_ARCH_SOLAR:
|
||||
+ {
|
||||
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
+
|
||||
+ // output
|
||||
+ {
|
||||
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
+ }
|
||||
+
|
||||
+ for (int i = 0; i < n_layer; ++i) {
|
||||
+ ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
+ ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
+
|
||||
+ auto & layer = model.layers[i];
|
||||
+
|
||||
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
+
|
||||
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
|
||||
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
||||
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
||||
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
|
||||
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
+
|
||||
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
+
|
||||
+ layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
+
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
@@ -16179,6 +16263,158 @@ struct llm_build_context {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -16652,6 +16734,158 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
@@ -239,7 +244,7 @@ index a639522d..83b80b59 100644
|
||||
+ struct ggml_tensor * cur;
|
||||
+ struct ggml_tensor * inpL;
|
||||
+
|
||||
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||
+
|
||||
+ // inp_pos - contains the positions
|
||||
+ struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
@@ -380,7 +385,7 @@ index a639522d..83b80b59 100644
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
||||
@@ -16443,6 +16679,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_chameleon();
|
||||
} break;
|
||||
@@ -391,7 +396,7 @@ index a639522d..83b80b59 100644
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -19589,6 +19829,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
@@ -4,14 +4,14 @@ Date: Wed, 9 Oct 2024 17:26:23 -0700
|
||||
Subject: [PATCH] conditional-fattn
|
||||
|
||||
---
|
||||
ggml/src/ggml-cuda.cu | 2 ++
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
|
||||
index 809d6ab1..fe77b81c 100644
|
||||
--- a/ggml/src/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda.cu
|
||||
@@ -2347,9 +2347,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 52aec229..cbf4fddf 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_ARGSORT:
|
||||
ggml_cuda_op_argsort(ctx, dst);
|
||||
break;
|
||||
llama/patches/0007-blas.patch (new file, 26 lines)
@@ -0,0 +1,26 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Jesse Gross <jesse@ollama.com>
|
||||
Date: Mon, 30 Sep 2024 16:31:04 -0700
|
||||
Subject: [PATCH] blas
|
||||
|
||||
---
|
||||
ggml/src/ggml-blas/ggml-blas.cpp | 4 ++++
|
||||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
|
||||
index ec158dfa..b3ac1fa4 100644
|
||||
--- a/ggml/src/ggml-blas/ggml-blas.cpp
|
||||
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
|
||||
@@ -1,3 +1,5 @@
|
||||
+#ifdef GGML_USE_BLAS
|
||||
+
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-blas.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
@@ -515,3 +517,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) {
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
|
||||
+
|
||||
+#endif // GGML_USE_BLAS
|
||||
\ No newline at end of file
|
||||
@@ -12,29 +12,46 @@ kv cache once per run
|
||||
|
||||
remaining is to implement the cross attention mask
|
||||
---
|
||||
examples/llava/llava.cpp | 2 +-
|
||||
examples/llava/llava.cpp | 5 +-
|
||||
include/llama.h | 5 +
|
||||
src/llama.cpp | 447 +++++++++++++++++++++++++++++++++++++--
|
||||
3 files changed, 436 insertions(+), 18 deletions(-)
|
||||
src/llama.cpp | 477 +++++++++++++++++++++++++++++++++++++--
|
||||
3 files changed, 467 insertions(+), 20 deletions(-)
|
||||
|
||||
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
|
||||
index 8558c6bd..37b2f2e2 100644
|
||||
index 4ca53a0b..d56644a8 100644
|
||||
--- a/examples/llava/llava.cpp
|
||||
+++ b/examples/llava/llava.cpp
|
||||
@@ -409,7 +409,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
if (n_eval > n_batch) {
|
||||
@@ -412,7 +412,7 @@ struct llava_embd_batch {
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -424,6 +424,7 @@ struct llava_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
+ /*n_embd =*/ n_embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
@@ -447,7 +448,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
n_eval = n_batch;
|
||||
}
|
||||
- llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||
+ llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||
if (llama_decode(ctx_llama, batch)) {
|
||||
float * embd = image_embed->embed+i*n_embd;
|
||||
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
|
||||
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
|
||||
if (llama_decode(ctx_llama, llava_batch.batch)) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
diff --git a/include/llama.h b/include/llama.h
|
||||
index 7cae1bbe..aca09310 100644
|
||||
index e85f459f..aba85f86 100644
|
||||
--- a/include/llama.h
|
||||
+++ b/include/llama.h
|
||||
@@ -240,6 +240,7 @@ extern "C" {
|
||||
@@ -245,6 +245,7 @@ extern "C" {
|
||||
|
||||
llama_token * token;
|
||||
float * embd;
|
||||
@@ -42,7 +59,7 @@ index 7cae1bbe..aca09310 100644
|
||||
llama_pos * pos;
|
||||
int32_t * n_seq_id;
|
||||
llama_seq_id ** seq_id;
|
||||
@@ -423,6 +424,10 @@ extern "C" {
|
||||
@@ -419,6 +420,10 @@ extern "C" {
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params);
|
||||
|
||||
@@ -54,10 +71,10 @@ index 7cae1bbe..aca09310 100644
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 83b80b59..35748488 100644
|
||||
index b01770d0..46881642 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -169,6 +169,7 @@ static std::string format(const char * fmt, ...) {
|
||||
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
|
||||
|
||||
enum llm_arch {
|
||||
LLM_ARCH_LLAMA,
|
||||
@@ -65,7 +82,7 @@ index 83b80b59..35748488 100644
|
||||
LLM_ARCH_FALCON,
|
||||
LLM_ARCH_BAICHUAN,
|
||||
LLM_ARCH_GROK,
|
||||
@@ -223,6 +224,7 @@ enum llm_arch {
|
||||
@@ -201,6 +202,7 @@ enum llm_arch {
|
||||
|
||||
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_LLAMA, "llama" },
|
||||
@@ -73,7 +90,7 @@ index 83b80b59..35748488 100644
|
||||
{ LLM_ARCH_FALCON, "falcon" },
|
||||
{ LLM_ARCH_GROK, "grok" },
|
||||
{ LLM_ARCH_GPT2, "gpt2" },
|
||||
@@ -330,6 +332,7 @@ enum llm_kv {
|
||||
@@ -309,6 +311,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
@@ -81,15 +98,15 @@ index 83b80b59..35748488 100644
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_FREQ_BASE,
|
||||
@@ -439,6 +442,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
@@ -426,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
|
||||
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
@@ -613,6 +617,14 @@ enum llm_tensor {
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
@@ -608,6 +612,14 @@ enum llm_tensor {
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
LLM_TENSOR_BSKCN_TV,
|
||||
@@ -103,8 +120,8 @@ index 83b80b59..35748488 100644
|
||||
+ LLM_TENSOR_CROSS_ATTN_MLP_GATE,
|
||||
};
|
||||
|
||||
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
||||
@@ -642,6 +654,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
|
||||
@@ -637,6 +649,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
@@ -145,7 +162,7 @@ index 83b80b59..35748488 100644
|
||||
{
|
||||
LLM_ARCH_BAICHUAN,
|
||||
{
|
||||
@@ -2390,6 +2436,7 @@ enum e_model {
|
||||
@@ -2432,6 +2478,7 @@ enum e_model {
|
||||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
@@ -153,7 +170,7 @@ index 83b80b59..35748488 100644
|
||||
MODEL_236B,
|
||||
MODEL_314B,
|
||||
MODEL_SMALL,
|
||||
@@ -2434,6 +2481,7 @@ struct llama_hparams {
|
||||
@@ -2476,6 +2523,7 @@ struct llama_hparams {
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
|
||||
|
||||
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
|
||||
@@ -161,7 +178,7 @@ index 83b80b59..35748488 100644
|
||||
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
@@ -2502,10 +2550,11 @@ struct llama_hparams {
|
||||
@@ -2544,10 +2592,11 @@ struct llama_hparams {
|
||||
if (this->n_expert != other.n_expert) return true;
|
||||
if (this->n_expert_used != other.n_expert_used) return true;
|
||||
|
||||
@@ -169,15 +186,15 @@ index 83b80b59..35748488 100644
|
||||
- if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
|
||||
- if (this->n_ff_arr != other.n_ff_arr) return true;
|
||||
- if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
|
||||
+ if (this->n_head_arr != other.n_head_arr) return true;
|
||||
+ if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
|
||||
+ if (this->n_ff_arr != other.n_ff_arr) return true;
|
||||
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
|
||||
+ if (this->n_head_arr != other.n_head_arr) return true;
|
||||
+ if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
|
||||
+ if (this->n_ff_arr != other.n_ff_arr) return true;
|
||||
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
|
||||
+ if (this->cross_attn_layers != other.cross_attn_layers) return true;
|
||||
|
||||
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
|
||||
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
|
||||
@@ -2623,6 +2672,10 @@ struct llama_hparams {
|
||||
@@ -2665,6 +2714,10 @@ struct llama_hparams {
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -188,7 +205,7 @@ index 83b80b59..35748488 100644
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
@@ -2652,6 +2705,9 @@ struct llama_cparams {
|
||||
@@ -2694,6 +2747,9 @@ struct llama_cparams {
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
bool no_perf;
|
||||
@@ -198,7 +215,7 @@ index 83b80b59..35748488 100644
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
@@ -2806,6 +2862,16 @@ struct llama_layer {
|
||||
@@ -2853,6 +2909,16 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_down_scale;
|
||||
|
||||
struct ggml_tensor * bskcn_tv;
|
||||
@@ -215,7 +232,7 @@ index 83b80b59..35748488 100644
|
||||
};
|
||||
|
||||
// very similar to llama_batch,
|
||||
@@ -3452,6 +3518,8 @@ struct llama_context {
|
||||
@@ -3439,6 +3505,8 @@ struct llama_context {
|
||||
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
|
||||
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
|
||||
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
|
||||
@@ -224,13 +241,34 @@ index 83b80b59..35748488 100644
|
||||
};
|
||||
|
||||
struct llama_lora_weight {
|
||||
@@ -3686,6 +3754,18 @@ static bool llama_kv_cache_init(
|
||||
@@ -3577,6 +3645,39 @@ static bool llama_kv_cache_init(
|
||||
cache.v_l.reserve(n_layer);
|
||||
|
||||
for (int i = 0; i < (int) n_layer; i++) {
|
||||
+ // for cross attention layers
|
||||
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
|
||||
+ struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
||||
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||
+ const llama_model::buft_list_t * buft_list;
|
||||
+ if (offload) {
|
||||
+ buft_list = model.dev_layer.at(i).buft_list;
|
||||
+ } else {
|
||||
+ buft_list = &model.cpu_buft_list;
|
||||
+ }
|
||||
+ ggml_backend_buffer_type_t buft = select_buft(*buft_list,
|
||||
+ [&](ggml_context * ctx) {
|
||||
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||
+ if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
|
||||
+ return k;
|
||||
+ }
|
||||
+ ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
|
||||
+ return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
|
||||
+ });
|
||||
+ ggml_context * ctx = ctx_for_buft(buft);
|
||||
+
|
||||
+ if (!ctx) {
|
||||
+ LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
|
||||
+ return false;
|
||||
+ }
|
||||
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
|
||||
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
|
||||
+ ggml_format_name(k, "cache_k_l%d", i);
|
||||
@@ -243,17 +281,17 @@ index 83b80b59..35748488 100644
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
|
||||
|
||||
@@ -5460,12 +5540,14 @@ static void llm_load_hparams(
|
||||
@@ -5520,12 +5621,14 @@ static void llm_load_hparams(
|
||||
}
|
||||
|
||||
// zero-out the per-layer hparams
|
||||
- std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
- std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
- std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
|
||||
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
|
||||
|
||||
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
|
||||
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
|
||||
@@ -263,7 +301,7 @@ index 83b80b59..35748488 100644
|
||||
|
||||
// n_head_kv is optional, default to n_head
|
||||
hparams.n_head_kv_arr = hparams.n_head_arr;
|
||||
@@ -5514,7 +5596,7 @@ static void llm_load_hparams(
|
||||
@@ -5574,7 +5677,7 @@ static void llm_load_hparams(
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||||
|
||||
@@ -272,7 +310,7 @@ index 83b80b59..35748488 100644
|
||||
if (hparams.n_rot != hparams.n_embd_head_k) {
|
||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
||||
}
|
||||
@@ -5554,6 +5636,16 @@ static void llm_load_hparams(
|
||||
@@ -5614,6 +5717,16 @@ static void llm_load_hparams(
|
||||
}
|
||||
}
|
||||
} break;
|
||||
@@ -289,63 +327,78 @@ index 83b80b59..35748488 100644
|
||||
case LLM_ARCH_MINICPM:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -7249,6 +7341,55 @@ static bool llm_load_tensors(
|
||||
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
@@ -7250,7 +7363,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
|
||||
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
- {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
|
||||
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
};
|
||||
|
||||
// checks if the weight tensor can be used with the specified buffer type and device
|
||||
@@ -7754,6 +7875,53 @@ static bool llm_load_tensors(
|
||||
}
|
||||
}
|
||||
} break;
|
||||
+ case LLM_ARCH_MLLAMA:
|
||||
+ {
|
||||
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8});
|
||||
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
|
||||
+
|
||||
+ // output
|
||||
+ {
|
||||
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
+
|
||||
+ // if output is NULL, init from the input tok embed
|
||||
+ if (model.output == NULL) {
|
||||
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
||||
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ for (int i = 0; i < n_layer; ++i) {
|
||||
+ ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
+ ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
+
|
||||
+ auto & layer = model.layers[i];
|
||||
+
|
||||
+ if (hparams.cross_attention_layers(i)) {
|
||||
+ layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128});
|
||||
+ layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024});
|
||||
+ layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd});
|
||||
+ layer.cross_attn_q_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128});
|
||||
+ layer.cross_attn_q_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd});
|
||||
+ layer.cross_attn_v_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024});
|
||||
+ layer.cross_attn_attn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1});
|
||||
+ layer.cross_attn_mlp_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1});
|
||||
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
|
||||
+ layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
|
||||
+ layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
|
||||
+ layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
|
||||
+ layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
|
||||
+ layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
|
||||
+ layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
|
||||
+ layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
|
||||
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
+ } else {
|
||||
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
|
||||
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
||||
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
||||
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
|
||||
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ }
|
||||
+ }
|
||||
+ } break;
|
||||
case LLM_ARCH_GROK:
|
||||
case LLM_ARCH_MINICPM3:
|
||||
{
|
||||
if (n_expert == 0) {
|
||||
@@ -9093,7 +9234,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
@@ -9463,7 +9631,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
|
||||
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
|
||||
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
||||
@@ -354,7 +407,7 @@ index 83b80b59..35748488 100644
|
||||
}
|
||||
|
||||
if (params.vocab_only) {
|
||||
@@ -9193,6 +9334,21 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||
@@ -9546,6 +9714,21 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
@@ -376,7 +429,7 @@ index 83b80b59..35748488 100644
|
||||
static void llm_build_kv_store(
|
||||
struct ggml_context * ctx,
|
||||
const llama_hparams & hparams,
|
||||
@@ -10167,6 +10323,7 @@ struct llm_build_context {
|
||||
@@ -10513,6 +10696,7 @@ struct llm_build_context {
|
||||
lctx.inp_pos_bucket = nullptr;
|
||||
lctx.inp_embd_enc = nullptr;
|
||||
lctx.inp_KQ_mask_cross = nullptr;
|
||||
@@ -384,18 +437,10 @@ index 83b80b59..35748488 100644
|
||||
}
|
||||
|
||||
void free() {
|
||||
@@ -10754,6 +10911,239 @@ struct llm_build_context {
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
@@ -10992,6 +11176,240 @@ struct llm_build_context {
|
||||
return gf;
|
||||
}
|
||||
|
||||
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
+ cb(cur, "result_output", -1);
|
||||
+
|
||||
+ ggml_build_forward_expand(gf, cur);
|
||||
+
|
||||
+ return gf;
|
||||
+ }
|
||||
+
|
||||
+ struct ggml_cgraph * build_mllama() {
|
||||
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
+
|
||||
@@ -410,7 +455,7 @@ index 83b80b59..35748488 100644
|
||||
+ struct ggml_tensor * inpL;
|
||||
+ struct ggml_tensor * inpCAS;
|
||||
+
|
||||
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||
+ inpCAS = llm_build_inp_cross_attn_state(ctx0, lctx, hparams, cb);
|
||||
+
|
||||
+ // inp_pos - contains the positions
|
||||
@@ -429,7 +474,7 @@ index 83b80b59..35748488 100644
|
||||
+ cb(cur, "attn_norm", il);
|
||||
+
|
||||
+ if (hparams.cross_attention_layers(il)) {
|
||||
+ if (!batch.embd && !cparams.cross_attn) {
|
||||
+ if (!ubatch.embd && !cparams.cross_attn) {
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
@@ -447,7 +492,7 @@ index 83b80b59..35748488 100644
|
||||
+ cb(Qcur, "Qcur", il);
|
||||
+
|
||||
+ struct ggml_tensor * Kcur, * Vcur;
|
||||
+ if (batch.embd) {
|
||||
+ if (ubatch.embd) {
|
||||
+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
|
||||
+ cb(Kcur, "Kcur", il);
|
||||
+
|
||||
@@ -621,10 +666,19 @@ index 83b80b59..35748488 100644
|
||||
+ LLM_NORM_RMS, cb, -1);
|
||||
+ cb(cur, "result_norm", -1);
|
||||
+
|
||||
// lm_head
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
@@ -16501,6 +16891,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
+ // lm_head
|
||||
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
+ cb(cur, "result_output", -1);
|
||||
+
|
||||
+ ggml_build_forward_expand(gf, cur);
|
||||
+
|
||||
+ return gf;
|
||||
+ }
|
||||
+
|
||||
struct ggml_cgraph * build_baichuan() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
@@ -16973,6 +17391,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_llama();
|
||||
} break;
|
||||
@@ -635,14 +689,14 @@ index 83b80b59..35748488 100644
|
||||
case LLM_ARCH_BAICHUAN:
|
||||
{
|
||||
result = llm.build_baichuan();
|
||||
@@ -16761,10 +17155,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
|
||||
@@ -17237,10 +17659,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
|
||||
}
|
||||
|
||||
if (batch.embd) {
|
||||
if (ubatch.embd) {
|
||||
- const int64_t n_embd = hparams.n_embd;
|
||||
- const int64_t n_tokens = batch.n_tokens;
|
||||
- const int64_t n_tokens = ubatch.n_tokens;
|
||||
+ if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
|
||||
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
|
||||
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
|
||||
+ // zero out inp_embd since it's not used
|
||||
+ float * inp_embd_data = (float *)lctx.inp_embd->data;
|
||||
+ for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
|
||||
@@ -650,24 +704,24 @@ index 83b80b59..35748488 100644
|
||||
+ }
|
||||
+ } else {
|
||||
+ const int64_t n_embd = hparams.n_embd;
|
||||
+ const int64_t n_tokens = batch.n_tokens;
|
||||
+ const int64_t n_tokens = ubatch.n_tokens;
|
||||
|
||||
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
||||
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
||||
- ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
}

if (batch.pos && lctx.inp_pos) {
@@ -17345,7 +17748,7 @@ static int llama_decode_internal(
if (ubatch.pos && lctx.inp_pos) {
@@ -17841,7 +18272,7 @@ static int llama_decode_internal(
n_outputs = 1;
}

- lctx.sbatch.from_batch(batch_all, n_embd,
+ lctx.sbatch.from_batch(batch_all, batch_all.n_embd,
- lctx.sbatch.from_batch(batch, n_embd,
+ lctx.sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);

@@ -17638,7 +18041,7 @@ static int llama_encode_internal(
@@ -18151,7 +18582,7 @@ static int llama_encode_internal(

const int64_t n_embd = hparams.n_embd;

@@ -676,7 +730,7 @@ index 83b80b59..35748488 100644

const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);

@@ -18648,7 +19051,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
@@ -19189,7 +19620,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
@@ -687,7 +741,7 @@ index 83b80b59..35748488 100644
}

size_t total_size_org = 0;
@@ -19814,6 +20219,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20355,6 +20788,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {

// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
@@ -695,7 +749,7 @@ index 83b80b59..35748488 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@@ -21230,6 +21636,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -21782,6 +22216,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}

@@ -705,8 +759,8 @@ index 83b80b59..35748488 100644
+
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens,
@@ -21239,6 +21649,7 @@ struct llama_batch llama_batch_get_one(
int32_t n_tokens) {
@@ -21789,6 +22227,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
@@ -714,7 +768,7 @@ index 83b80b59..35748488 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21254,6 +21665,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21801,6 +22240,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
@@ -722,7 +776,7 @@ index 83b80b59..35748488 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21265,6 +21677,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21809,6 +22249,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
@@ -4,20 +4,21 @@ Date: Thu, 17 Oct 2024 17:19:25 -0700
Subject: [PATCH] add unpad operator

---
ggml/include/ggml.h | 10 ++++
ggml/src/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal.m | 33 ++++++++++++++
ggml/src/ggml-metal.metal | 45 ++++++++++++++++++
ggml/src/ggml.c | 93 +++++++++++++++++++++++++++++++++++++-
7 files changed, 230 insertions(+), 2 deletions(-)
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 57 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 ++++++++++++++++++++++
ggml/src/ggml.c | 25 +++++++++++-
8 files changed, 219 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index ce3d92cb..962cb5f7 100644
index 65cb92c4..acbcccc6 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -506,6 +506,7 @@ extern "C" {
@@ -499,6 +499,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
@@ -25,7 +26,7 @@ index ce3d92cb..962cb5f7 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1764,6 +1765,15 @@ extern "C" {
@@ -1695,6 +1696,15 @@ extern "C" {
int p2,
int p3);

@@ -41,11 +42,93 @@ index ce3d92cb..962cb5f7 100644
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index fe77b81c..6e84af56 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2270,6 +2270,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10..111ff3b0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10439,6 +10439,58 @@ static void ggml_compute_forward_pad(
}
}

+static void ggml_compute_forward_unpad_f32(
+    const struct ggml_compute_params *params,
+    struct ggml_tensor *dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float * dst_ptr = (float *) dst->data;
+
+    // TODO: optimize
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        dst_ptr[dst_idx] = *src_ptr;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_unpad(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_unpad_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}

// ggml_compute_forward_arange

@@ -12535,6 +12587,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12933,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cbf4fddf..9ca6cb77 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
@@ -55,7 +138,7 @@ index fe77b81c..6e84af56 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -2992,6 +2995,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
@@ -3012,6 +3015,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
@@ -126,35 +209,35 @@ index 8fd386b0..e2ededc3 100644

void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 829c5e39..25702d85 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -193,6 +193,7 @@
GGML_METAL_KERNEL_TYPE_IM2COL_F32,
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 093ae900..cb9a1307 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -689,6 +690,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
@@ -877,6 +878,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -846,6 +848,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
return false;
@@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -2655,6 +2658,36 @@ static void ggml_metal_encode_node(
@@ -3258,6 +3261,36 @@ static void ggml_metal_encode_node(

const int nth = MIN(1024, ne0);

@@ -191,11 +274,11 @@ index 829c5e39..25702d85 100644
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 2b200032..09887511 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -2029,6 +2029,51 @@ kernel void kernel_pad_f32(
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 5caa0846..47038c31 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2897,6 +2897,51 @@ kernel void kernel_pad_f32(
}
}

@@ -248,10 +331,10 @@ index 2b200032..09887511 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index bcbc32d9..f4864ac8 100644
index 1a9a7efa..ea2b259b 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2997,6 +2997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK",
"UPSCALE",
"PAD",
@@ -259,16 +342,16 @@ index bcbc32d9..f4864ac8 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -3030,7 +3031,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

-static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
+static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -3091,6 +3092,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)",
"upscale(x)",
"pad(x)",
@@ -276,16 +359,16 @@ index bcbc32d9..f4864ac8 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -3124,7 +3126,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};

-static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
+static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -6955,6 +6957,32 @@ struct ggml_tensor * ggml_pad(
@@ -4097,6 +4099,25 @@ struct ggml_tensor * ggml_pad(
return result;
}

@@ -295,12 +378,6 @@ index bcbc32d9..f4864ac8 100644
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int p0, int p1, int p2, int p3) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ABORT("fatal error"); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+        a->ne[0] - p0,
@@ -309,7 +386,6 @@ index bcbc32d9..f4864ac8 100644
+        a->ne[3] - p3);
+
+    result->op = GGML_OP_UNPAD;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
@@ -318,92 +394,3 @@ index bcbc32d9..f4864ac8 100644
// ggml_arange

struct ggml_tensor * ggml_arange(
@@ -15312,6 +15340,58 @@ static void ggml_compute_forward_pad(
}
}

+static void ggml_compute_forward_unpad_f32(
+    const struct ggml_compute_params *params,
+    struct ggml_tensor *dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float * dst_ptr = (float *) dst->data;
+
+    // TODO: optimize
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        dst_ptr[dst_idx] = *src_ptr;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_unpad(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_unpad_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}

// ggml_compute_forward_arange

@@ -17294,6 +17374,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -18369,6 +18453,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ABORT("fatal error"); // TODO: not implemented
}
+ case GGML_OP_UNPAD:
+ {
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_ARANGE:
{
GGML_ABORT("fatal error"); // TODO: not implemented
@@ -19165,6 +19253,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
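Note: the hunks above carry the whole unpad operator. A minimal usage sketch (tensor and context setup omitted; the tensor names and padding values are illustrative, only the ggml_unpad signature comes from the patch):

    // Given a 4-D tensor `a`, ggml_unpad trims p0..p3 elements from the end of each
    // dimension, so the result has shape (ne0 - p0, ne1 - p1, ne2 - p2, ne3 - p3).
    struct ggml_tensor * cropped = ggml_unpad(ctx, a, /*p0=*/2, /*p1=*/0, /*p2=*/0, /*p3=*/0);
    // The forward pass copies the surviving region (CPU, CUDA, Metal); the backward
    // pass is not implemented and aborts, as the ggml_compute_backward hunk shows.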
@@ -1,25 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Mon, 30 Sep 2024 16:31:04 -0700
Subject: [PATCH] blas

---
ggml/src/ggml-blas.cpp | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
index 6d99c6be..8e1ab99d 100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@@ -1,3 +1,5 @@
+#ifdef GGML_USE_BLAS
+
#include "ggml-impl.h"
#include "ggml-blas.h"
#include "ggml-backend-impl.h"
@@ -366,3 +368,5 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
ctx->n_threads = n_threads;
}
+
+#endif
@@ -7,11 +7,11 @@ On windows compiled with gcc the c++ regex library failed to handle
the characters
---
src/llama-vocab.cpp | 2 +-
src/unicode.cpp | 21 +++++++++++++++++++++
2 files changed, 22 insertions(+), 1 deletion(-)
src/unicode.cpp | 22 ++++++++++++++++++++++
2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d2f34ddd..3ef6af19 100644
index d1dc9627..05ef0e71 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -24,7 +24,7 @@ index d2f34ddd..3ef6af19 100644
"\\s+$",
"[一-龥ࠀ-一가-]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index f4e941cd..9d78ff16 100644
index 3d459263..51dd81fb 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
@@ -39,7 +39,7 @@ index f4e941cd..9d78ff16 100644
#include "unicode.h"
#include "unicode-data.h"

@@ -201,8 +206,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
@@ -201,6 +206,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}

static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -58,7 +58,13 @@ index f4e941cd..9d78ff16 100644
+ free(wbuf);
+ return ret;
+#else
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
@@ -214,6 +235,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif

return conv.from_bytes(s);
+#endif
}
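Note: on Windows builds with gcc, the patch above routes the UTF-8 conversion through the Win32 API instead of std::wstring_convert. A rough sketch of that conversion path, using the standard two-call MultiByteToWideChar pattern (the function name and buffer handling here are illustrative; the patch itself uses a malloc'd wbuf, as the free(wbuf) line shows):

    #include <windows.h>
    #include <string>

    static std::wstring wstring_from_utf8_win32(const std::string & s) {
        // First call measures the required length, second call performs the conversion.
        int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), NULL, 0);
        std::wstring ret(n, L'\0');
        MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), &ret[0], n);
        return ret;
    }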
llama/patches/0011-relative-include-paths.patch (new file, 64 lines)
@@ -0,0 +1,64 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 3 Dec 2024 21:30:51 -0800
Subject: [PATCH] relative include paths

---
ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.cpp | 2 +-
ggml/src/ggml-quants.c | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 11152385..bbf8934e 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -4,7 +4,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"

#include <math.h>
#include <string.h>
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 111ff3b0..df0bd3c6 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10,7 +10,7 @@
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
-#include "amx/amx.h"
+#include "amx.h"
#include "ggml.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 77e5d87a..91476ad0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -3,7 +3,7 @@
#include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-impl.h"
-#include "amx/amx.h"
+#include "amx.h"
#include <cctype>
#include <string>
#include <vector>
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7301a9c6..49ab3daf 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3,7 +3,7 @@

#include "ggml-quants.h"
#include "ggml-impl.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"

#include <math.h>