llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)
@@ -11,21 +11,21 @@ tensor to store the scalar. the scalar is implemented as a 1-dimensional
tensor with 2 elements derived from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
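Editor's note (not part of the patch): the two stored values act as blend
weights for the block skip connection, i.e. for a saved skip tensor s and the
current hidden state x the graph computes bskcn_tv * s + (1 - bskcn_tv) * x,
which is what the ggml_mul/ggml_add calls in the llama.cpp hunk below express.
A minimal standalone C++ sketch of that blend, with hypothetical names
(bskcn_blend is not a function in the patch), follows:

    #include <cstddef>
    #include <vector>

    // Hypothetical helper mirroring the 2-element tensor
    // (bskcn_tv, 1 - bskcn_tv) described above; the real graph does
    // the same element-wise via ggml_mul/ggml_add on ggml tensors.
    std::vector<float> bskcn_blend(const std::vector<float> & skip,
                                   const std::vector<float> & cur,
                                   float bskcn_tv) {
        const float tv[2] = {bskcn_tv, 1.0f - bskcn_tv};
        std::vector<float> out(cur.size());
        for (std::size_t i = 0; i < cur.size(); ++i) {
            out[i] = tv[0] * skip[i] + tv[1] * cur[i]; // per-element blend
        }
        return out;
    }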
---
src/llama-arch.cpp | 53 +++++++----
src/llama-arch.cpp | 21 +++++
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 +
src/llama-hparams.h | 5 ++
src/llama-model-loader.cpp | 1 +
src/llama-model.cpp | 16 ++++
src/llama-model.cpp | 44 +++++++++++
src/llama-model.h | 3 +
src/llama.cpp | 185 +++++++++++++++++++++++++++++++++++++
8 files changed, 258 insertions(+), 16 deletions(-)
src/llama.cpp | 152 ++++++++++++++++++++++++++++++++++++-
8 files changed, 236 insertions(+), 1 deletion(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 007d79f8..5b376c5e 100644
index 97a1e7e5..a1e0ebcc 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -59,6 +59,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -33,48 +33,16 @@ index 007d79f8..5b376c5e 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -106,22 +107,23 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1240,6 +1242,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
@@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
+ {
@@ -96,9 +64,9 @@ index 007d79f8..5b376c5e 100644
+ },
+ },
{
LLM_ARCH_UNKNOWN,
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1372,6 +1392,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -107,10 +75,10 @@ index 007d79f8..5b376c5e 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 45e458bb..eac7055b 100644
index 122fdceb..77919578 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -63,6 +63,7 @@ enum llm_arch {
@@ -65,6 +65,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -118,7 +86,7 @@ index 45e458bb..eac7055b 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN,
};
@@ -126,6 +127,7 @@ enum llm_kv {
@@ -129,6 +130,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
@@ -126,7 +94,7 @@ index 45e458bb..eac7055b 100644

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -305,6 +307,7 @@ enum llm_tensor {
@@ -311,6 +313,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -135,7 +103,7 @@ index 45e458bb..eac7055b 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index c4053469..450738da 100644
index ea87b295..f3955de9 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
@@ -152,10 +120,10 @@ index c4053469..450738da 100644
+}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index a29f20ec..fd898e27 100644
index 1fe45410..1bdcdfd5 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -52,6 +52,8 @@ struct llama_hparams {
@@ -50,6 +50,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

@@ -164,7 +132,7 @@ index a29f20ec..fd898e27 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -134,6 +136,9 @@ struct llama_hparams {
@@ -133,6 +135,9 @@ struct llama_hparams {

// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
@@ -175,23 +143,23 @@ index a29f20ec..fd898e27 100644

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 7743b465..422524a8 100644
index 05d58ad9..1252aca1 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -364,6 +364,7 @@ namespace GGUFMeta {
@@ -439,6 +439,7 @@ namespace GGUFMeta {
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);

llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
int trace = 0;
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 00b80c52..306c557d 100644
index 36a0a009..ad1315c6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1091,6 +1091,21 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
default: model.type = e_model::MODEL_UNKNOWN;
@@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
@@ -200,52 +168,19 @@ index 00b80c52..306c557d 100644
+ for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr[i];
+ bskcn.fill(0);
+ auto kv = LLM_KV(model.arch);
+ auto kv = LLM_KV(arch);
+ ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ case 64: type = LLM_TYPE_22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2065,6 +2080,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index ce038932..c1b9c0a1 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -54,6 +54,7 @@ enum llm_type {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -275,6 +276,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index 4eb3f6b9..7dec50ae 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2206,6 +2206,35 @@ static bool llm_load_tensors(
@@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -256,16 +191,16 @@ index 4eb3f6b9..7dec50ae 100644
+ } break;
+ case LLM_ARCH_SOLAR:
+ {
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
@@ -277,16 +212,53 @@ index 4eb3f6b9..7dec50ae 100644
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -10226,6 +10255,158 @@ struct llm_build_context {
return gf;
}
@@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..1afb0024 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -55,6 +55,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -281,6 +282,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7953,9 +7953,155 @@ struct llm_build_context {
cb(img_logits, "img_logits", -1);
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
cb(cur, "result_output", -1);
-
ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
@@ -333,7 +305,7 @@ index 4eb3f6b9..7dec50ae 100644
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+

+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
@@ -422,25 +394,18 @@ index 4eb3f6b9..7dec50ae 100644
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
return gf;
}

@@ -10660,6 +10841,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
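Editor's note: the final hunk is cut off after the chameleon case. Judging by
the hunk headers (four added lines in llama_build_graph), it presumably appends
the dispatch for the new graph builder. A hedged sketch, assuming the usual
shape of the switch in llama_build_graph, not the verbatim patch content:

    case LLM_ARCH_SOLAR:
        {
            // dispatch to the SOLAR graph builder added by this patch
            result = llm.build_solar();
        } break;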