llama: update vendor code to commit ba1cb19c (#8101)

Author: Jeffrey Morgan
Date:   2024-12-14 14:55:51 -08:00
Committed by: GitHub
Commit: 7a81daf026 (parent 60f75560a2)
273 changed files with 3194 additions and 1900 deletions
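For orientation before the file diff: the patch shown below carries support for a per-layer block skip connection (the %s.attention.block_skip_connection.%d key and the bskcn_tv mixing weights referenced in its hunks), and what this commit changes is mostly the patch's hunk offsets and surrounding context, refreshed against upstream commit ba1cb19c. The following is a minimal, self-contained sketch of the hyperparameter plumbing the patch appears to add to llama_hparams; the group count, the array bound, and the n_bskcn accessor name are assumptions, not the patch's verbatim code.

```cpp
// Hedged sketch of per-layer block-skip-connection metadata; NOT the patch's
// verbatim code. Assumed here: the number of skip groups (4), the layer bound
// (a stand-in for LLAMA_MAX_LAYERS), and the accessor name n_bskcn.
#include <array>
#include <cstdint>
#include <cstdio>

constexpr uint32_t MAX_LAYERS     = 512; // stand-in for LLAMA_MAX_LAYERS
constexpr uint32_t N_BSKCN_GROUPS = 4;   // assumed number of skip-connection groups

struct hparams_sketch {
    uint32_t n_layer = 0;

    // one row per skip group, one flag per layer, presumably filled from the
    // "%s.attention.block_skip_connection.%d" GGUF keys at load time
    std::array<std::array<uint32_t, MAX_LAYERS>, N_BSKCN_GROUPS> n_bskcn_arr{};

    // does skip group n fire at layer il?
    bool n_bskcn(uint32_t n, uint32_t il) const {
        return il < n_layer && n_bskcn_arr.at(n)[il] > 0;
    }
};

int main() {
    hparams_sketch hp;
    hp.n_layer = 32;
    hp.n_bskcn_arr[0][7] = 1; // pretend the model metadata enabled group 0 at layer 7

    std::printf("group 0, layer 7 -> %d\n", hp.n_bskcn(0, 7)); // 1
    std::printf("group 1, layer 7 -> %d\n", hp.n_bskcn(1, 7)); // 0
    return 0;
}
```

In the real patch the equivalent state lives in llama_hparams (see the hunks around struct llama_hparams and llm_load_hparams below) and is consulted by the graph builder; the sketch only models the shape of that lookup.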


@@ -15,10 +15,10 @@ in general, the values are (bskcn_tv, 1 - bskcn_tv)
1 file changed, 253 insertions(+), 14 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
-index d1791af0..b01770d0 100644
+index 9e292c4f..26be6254 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -195,6 +195,7 @@ enum llm_arch {
+@@ -196,6 +196,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -26,7 +26,7 @@ index d1791af0..b01770d0 100644
LLM_ARCH_UNKNOWN,
};
-@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -251,6 +252,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -34,15 +34,15 @@ index d1791af0..b01770d0 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
-@@ -306,6 +308,7 @@ enum llm_kv {
+@@ -308,6 +310,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
-@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
LLM_KV_ROPE_DIMENSION_SECTIONS,
+@@ -411,20 +414,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
@@ -77,8 +77,8 @@ index d1791af0..b01770d0 100644
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
-@@ -603,6 +607,7 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
+@@ -607,6 +611,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -86,7 +86,7 @@ index d1791af0..b01770d0 100644
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
-@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1564,6 +1569,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -111,7 +111,7 @@ index d1791af0..b01770d0 100644
{
LLM_ARCH_UNKNOWN,
{
-@@ -2401,6 +2424,7 @@ enum e_model {
+@@ -2425,6 +2448,7 @@ enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
@@ -119,7 +119,7 @@ index d1791af0..b01770d0 100644
MODEL_30B,
MODEL_32B,
MODEL_34B,
-@@ -2451,6 +2475,8 @@ struct llama_hparams {
+@@ -2475,6 +2499,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -128,7 +128,7 @@ index d1791af0..b01770d0 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
-@@ -2521,6 +2547,7 @@ struct llama_hparams {
+@@ -2546,6 +2572,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
@@ -136,7 +136,7 @@ index d1791af0..b01770d0 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2630,6 +2657,14 @@ struct llama_hparams {
+@@ -2658,6 +2685,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
@@ -151,7 +151,7 @@ index d1791af0..b01770d0 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2816,6 +2851,8 @@ struct llama_layer {
+@@ -2844,6 +2879,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
@@ -160,7 +160,7 @@ index d1791af0..b01770d0 100644
};
// very similar to llama_batch,
-@@ -6209,6 +6246,21 @@ static void llm_load_hparams(
+@@ -6247,6 +6284,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -182,7 +182,7 @@ index d1791af0..b01770d0 100644
default: (void)0;
}
-@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
+@@ -7239,6 +7291,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -190,7 +190,7 @@ index d1791af0..b01770d0 100644
};
// checks if the weight tensor can be used with the specified buffer type and device
-@@ -9205,6 +9258,35 @@ static bool llm_load_tensors(
+@@ -9253,6 +9306,35 @@ static bool llm_load_tensors(
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -226,7 +226,7 @@ index d1791af0..b01770d0 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-@@ -16652,6 +16734,158 @@ struct llm_build_context {
+@@ -16671,6 +16753,158 @@ struct llm_build_context {
return gf;
}
@@ -385,7 +385,7 @@ index d1791af0..b01770d0 100644
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -16942,6 +17176,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
@@ -396,7 +396,7 @@ index d1791af0..b01770d0 100644
default:
GGML_ABORT("fatal error");
}
-@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -20137,6 +20375,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
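The llm_build_context and llama_build_graph hunks above add the graph-building path for this architecture, and the patch header notes that, in general, the two inputs of a skip connection are mixed with the weights (bskcn_tv, 1 - bskcn_tv). Below is a tiny standalone ggml program showing that blend; it is not the patch's build code, and the tensor names, the bskcn_tv value, and where the blend sits in the real graph are assumptions.

```cpp
// Hedged sketch of the (bskcn_tv, 1 - bskcn_tv) blend; NOT the patch's graph code.
#include <cstdio>

#include "ggml.h"
#include "ggml-cpu.h" // ggml_graph_compute_with_ctx (older ggml revisions declare it in ggml.h)

int main() {
    // tiny CPU-only context, enough for a few small tensors
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int   n_embd   = 4;     // toy embedding width
    const float bskcn_tv = 0.25f; // assumed value; the real one comes from model metadata

    // stand-ins for the two activations being mixed
    struct ggml_tensor * skip = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // carried over the skip connection
    struct ggml_tensor * cur  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // current layer's output
    for (int i = 0; i < n_embd; i++) {
        ((float *) skip->data)[i] = 1.0f;
        ((float *) cur ->data)[i] = 3.0f;
    }

    // out = bskcn_tv * skip + (1 - bskcn_tv) * cur
    struct ggml_tensor * out = ggml_add(ctx,
            ggml_scale(ctx, skip, bskcn_tv),
            ggml_scale(ctx, cur,  1.0f - bskcn_tv));

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    std::printf("blended[0] = %.2f\n", ((float *) out->data)[0]); // 0.25*1 + 0.75*3 = 2.50

    ggml_free(ctx);
    return 0;
}
```

In the vendored llama.cpp itself the same expression would be built over the layer's hidden states inside the new build function, using the regular ggml scale/add operators, rather than over toy 1-D tensors.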