llama: update vendor code to commit ba1cb19c (#8101)

Author: Jeffrey Morgan
Date:   2024-12-14 14:55:51 -08:00
Committed by: GitHub
Commit: 7a81daf026 (parent 60f75560a2)
273 changed files with 3194 additions and 1900 deletions
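For orientation before the file diff: the patch shown below carries support for a per-layer block skip connection (the %s.attention.block_skip_connection.%d key and the bskcn_tv mixing weights referenced in its hunks), and what this commit changes is mostly the patch's hunk offsets and surrounding context, refreshed against upstream commit ba1cb19c. The following is a minimal, self-contained sketch of the hyperparameter plumbing the patch appears to add to llama_hparams; the group count, the array bound, and the n_bskcn accessor name are assumptions, not the patch's verbatim code.

```cpp
// Hedged sketch of per-layer block-skip-connection metadata; NOT the patch's
// verbatim code. Assumed here: the number of skip groups (4), the layer bound
// (a stand-in for LLAMA_MAX_LAYERS), and the accessor name n_bskcn.
#include <array>
#include <cstdint>
#include <cstdio>

constexpr uint32_t MAX_LAYERS     = 512; // stand-in for LLAMA_MAX_LAYERS
constexpr uint32_t N_BSKCN_GROUPS = 4;   // assumed number of skip-connection groups

struct hparams_sketch {
    uint32_t n_layer = 0;

    // one row per skip group, one flag per layer, presumably filled from the
    // "%s.attention.block_skip_connection.%d" GGUF keys at load time
    std::array<std::array<uint32_t, MAX_LAYERS>, N_BSKCN_GROUPS> n_bskcn_arr{};

    // does skip group n fire at layer il?
    bool n_bskcn(uint32_t n, uint32_t il) const {
        return il < n_layer && n_bskcn_arr.at(n)[il] > 0;
    }
};

int main() {
    hparams_sketch hp;
    hp.n_layer = 32;
    hp.n_bskcn_arr[0][7] = 1; // pretend the model metadata enabled group 0 at layer 7

    std::printf("group 0, layer 7 -> %d\n", hp.n_bskcn(0, 7)); // 1
    std::printf("group 1, layer 7 -> %d\n", hp.n_bskcn(1, 7)); // 0
    return 0;
}
```

In the real patch the equivalent state lives in llama_hparams (see the hunks around struct llama_hparams and llm_load_hparams below) and is consulted by the graph builder; the sketch only models the shape of that lookup.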


@@ -15,10 +15,10 @@ in general, the values are (bskcn_tv, 1 - bskcn_tv)
1 file changed, 253 insertions(+), 14 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
-index d1791af0..b01770d0 100644
+index 9e292c4f..26be6254 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -195,6 +195,7 @@ enum llm_arch {
+@@ -196,6 +196,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -26,7 +26,7 @@ index d1791af0..b01770d0 100644
LLM_ARCH_UNKNOWN,
};
-@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -251,6 +252,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -34,15 +34,15 @@ index d1791af0..b01770d0 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
-@@ -306,6 +308,7 @@ enum llm_kv {
+@@ -308,6 +310,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
-@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
LLM_KV_ROPE_DIMENSION_SECTIONS,
+@@ -411,20 +414,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
@@ -77,8 +77,8 @@ index d1791af0..b01770d0 100644
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
-@@ -603,6 +607,7 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
+@@ -607,6 +611,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -86,7 +86,7 @@ index d1791af0..b01770d0 100644
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
-@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1564,6 +1569,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -111,7 +111,7 @@ index d1791af0..b01770d0 100644
{
LLM_ARCH_UNKNOWN,
{
-@@ -2401,6 +2424,7 @@ enum e_model {
+@@ -2425,6 +2448,7 @@ enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
@@ -119,7 +119,7 @@ index d1791af0..b01770d0 100644
MODEL_30B,
MODEL_32B,
MODEL_34B,
-@@ -2451,6 +2475,8 @@ struct llama_hparams {
+@@ -2475,6 +2499,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -128,7 +128,7 @@ index d1791af0..b01770d0 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
-@@ -2521,6 +2547,7 @@ struct llama_hparams {
+@@ -2546,6 +2572,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
@@ -136,7 +136,7 @@ index d1791af0..b01770d0 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2630,6 +2657,14 @@ struct llama_hparams {
+@@ -2658,6 +2685,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
@@ -151,7 +151,7 @@ index d1791af0..b01770d0 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2816,6 +2851,8 @@ struct llama_layer {
+@@ -2844,6 +2879,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
@@ -160,7 +160,7 @@ index d1791af0..b01770d0 100644
};
// very similar to llama_batch,
-@@ -6209,6 +6246,21 @@ static void llm_load_hparams(
+@@ -6247,6 +6284,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -182,7 +182,7 @@ index d1791af0..b01770d0 100644
default: (void)0;
}
-@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
+@@ -7239,6 +7291,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -190,7 +190,7 @@ index d1791af0..b01770d0 100644
};
// checks if the weight tensor can be used with the specified buffer type and device
-@@ -9205,6 +9258,35 @@ static bool llm_load_tensors(
+@@ -9253,6 +9306,35 @@ static bool llm_load_tensors(
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -226,7 +226,7 @@ index d1791af0..b01770d0 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-@@ -16652,6 +16734,158 @@ struct llm_build_context {
+@@ -16671,6 +16753,158 @@ struct llm_build_context {
return gf;
}
@@ -385,7 +385,7 @@ index d1791af0..b01770d0 100644
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -16942,6 +17176,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
@@ -396,7 +396,7 @@ index d1791af0..b01770d0 100644
default:
GGML_ABORT("fatal error");
}
-@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -20137,6 +20375,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
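The llm_build_context and llama_build_graph hunks above add the graph-building path for this architecture, and the patch header notes that, in general, the two inputs of a skip connection are mixed with the weights (bskcn_tv, 1 - bskcn_tv). Below is a tiny standalone ggml program showing that blend; it is not the patch's build code, and the tensor names, the bskcn_tv value, and where the blend sits in the real graph are assumptions.

```cpp
// Hedged sketch of the (bskcn_tv, 1 - bskcn_tv) blend; NOT the patch's graph code.
#include <cstdio>

#include "ggml.h"
#include "ggml-cpu.h" // ggml_graph_compute_with_ctx (older ggml revisions declare it in ggml.h)

int main() {
    // tiny CPU-only context, enough for a few small tensors
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int   n_embd   = 4;     // toy embedding width
    const float bskcn_tv = 0.25f; // assumed value; the real one comes from model metadata

    // stand-ins for the two activations being mixed
    struct ggml_tensor * skip = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // carried over the skip connection
    struct ggml_tensor * cur  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // current layer's output
    for (int i = 0; i < n_embd; i++) {
        ((float *) skip->data)[i] = 1.0f;
        ((float *) cur ->data)[i] = 3.0f;
    }

    // out = bskcn_tv * skip + (1 - bskcn_tv) * cur
    struct ggml_tensor * out = ggml_add(ctx,
            ggml_scale(ctx, skip, bskcn_tv),
            ggml_scale(ctx, cur,  1.0f - bskcn_tv));

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    std::printf("blended[0] = %.2f\n", ((float *) out->data)[0]); // 0.25*1 + 0.75*3 = 2.50

    ggml_free(ctx);
    return 0;
}
```

In the vendored llama.cpp itself the same expression would be built over the layer's hidden states inside the new build function, using the regular ggml scale/add operators, rather than over toy 1-D tensors.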