llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)
@@ -11,21 +11,21 @@ tensor to store the scalar. the scalar is implemented as a 1-dimensional
tensor with 2 elements derived from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
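Editor's note (not part of the patch): the two stored values act as blend
weights for the block skip connection, i.e. for a saved skip tensor s and the
current hidden state x the graph computes bskcn_tv * s + (1 - bskcn_tv) * x,
which is what the ggml_mul/ggml_add calls in the llama.cpp hunk below express.
A minimal standalone C++ sketch of that blend, with hypothetical names
(bskcn_blend is not a function in the patch), follows:

    #include <cstddef>
    #include <vector>

    // Hypothetical helper mirroring the 2-element tensor
    // (bskcn_tv, 1 - bskcn_tv) described above; the real graph does
    // the same element-wise via ggml_mul/ggml_add on ggml tensors.
    std::vector<float> bskcn_blend(const std::vector<float> & skip,
                                   const std::vector<float> & cur,
                                   float bskcn_tv) {
        const float tv[2] = {bskcn_tv, 1.0f - bskcn_tv};
        std::vector<float> out(cur.size());
        for (std::size_t i = 0; i < cur.size(); ++i) {
            out[i] = tv[0] * skip[i] + tv[1] * cur[i]; // per-element blend
        }
        return out;
    }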
---
src/llama-arch.cpp | 53 +++++++----
src/llama-arch.cpp | 21 +++++
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 +
src/llama-hparams.h | 5 ++
src/llama-model-loader.cpp | 1 +
src/llama-model.cpp | 16 ++++
src/llama-model.cpp | 44 +++++++++++
src/llama-model.h | 3 +
src/llama.cpp | 185 +++++++++++++++++++++++++++++++++++++
8 files changed, 258 insertions(+), 16 deletions(-)
src/llama.cpp | 152 ++++++++++++++++++++++++++++++++++++-
8 files changed, 236 insertions(+), 1 deletion(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 007d79f8..5b376c5e 100644
index 97a1e7e5..a1e0ebcc 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -59,6 +59,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -33,48 +33,16 @@ index 007d79f8..5b376c5e 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -106,22 +107,23 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1240,6 +1242,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
@@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
+ {
@@ -96,9 +64,9 @@ index 007d79f8..5b376c5e 100644
+ },
+ },
{
LLM_ARCH_UNKNOWN,
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1372,6 +1392,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -107,10 +75,10 @@ index 007d79f8..5b376c5e 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 45e458bb..eac7055b 100644
index 122fdceb..77919578 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -63,6 +63,7 @@ enum llm_arch {
@@ -65,6 +65,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -118,7 +86,7 @@ index 45e458bb..eac7055b 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN,
};
@@ -126,6 +127,7 @@ enum llm_kv {
@@ -129,6 +130,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
@@ -126,7 +94,7 @@ index 45e458bb..eac7055b 100644

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -305,6 +307,7 @@ enum llm_tensor {
@@ -311,6 +313,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -135,7 +103,7 @@ index 45e458bb..eac7055b 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index c4053469..450738da 100644
index ea87b295..f3955de9 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
@@ -152,10 +120,10 @@ index c4053469..450738da 100644
+}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index a29f20ec..fd898e27 100644
index 1fe45410..1bdcdfd5 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -52,6 +52,8 @@ struct llama_hparams {
@@ -50,6 +50,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

@@ -164,7 +132,7 @@ index a29f20ec..fd898e27 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -134,6 +136,9 @@ struct llama_hparams {
@@ -133,6 +135,9 @@ struct llama_hparams {

// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
@@ -175,23 +143,23 @@ index a29f20ec..fd898e27 100644

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 7743b465..422524a8 100644
index 05d58ad9..1252aca1 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -364,6 +364,7 @@ namespace GGUFMeta {
@@ -439,6 +439,7 @@ namespace GGUFMeta {
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);

llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
int trace = 0;
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 00b80c52..306c557d 100644
index 36a0a009..ad1315c6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1091,6 +1091,21 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
default: model.type = e_model::MODEL_UNKNOWN;
@@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
@@ -200,52 +168,19 @@ index 00b80c52..306c557d 100644
+ for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr[i];
+ bskcn.fill(0);
+ auto kv = LLM_KV(model.arch);
+ auto kv = LLM_KV(arch);
+ ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ case 64: type = LLM_TYPE_22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2065,6 +2080,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index ce038932..c1b9c0a1 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -54,6 +54,7 @@ enum llm_type {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -275,6 +276,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index 4eb3f6b9..7dec50ae 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2206,6 +2206,35 @@ static bool llm_load_tensors(
@@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -256,16 +191,16 @@ index 4eb3f6b9..7dec50ae 100644
+ } break;
+ case LLM_ARCH_SOLAR:
+ {
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
@@ -277,16 +212,53 @@ index 4eb3f6b9..7dec50ae 100644
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -10226,6 +10255,158 @@ struct llm_build_context {
return gf;
}
@@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..1afb0024 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -55,6 +55,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -281,6 +282,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7953,9 +7953,155 @@ struct llm_build_context {
cb(img_logits, "img_logits", -1);
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
cb(cur, "result_output", -1);
-
ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
@@ -333,7 +305,7 @@ index 4eb3f6b9..7dec50ae 100644
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+

+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
@@ -422,25 +394,18 @@ index 4eb3f6b9..7dec50ae 100644
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
return gf;
}

@@ -10660,6 +10841,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
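Editor's note: the final hunk is cut off after the chameleon case. Judging by
the hunk headers (four added lines in llama_build_graph), it presumably appends
the dispatch for the new graph builder. A hedged sketch, assuming the usual
shape of the switch in llama_build_graph, not the verbatim patch content:

    case LLM_ARCH_SOLAR:
        {
            // dispatch to the SOLAR graph builder added by this patch
            result = llm.build_solar();
        } break;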