llama: update vendored code to commit 46e3556 (#8308)

Jeffrey Morgan
2025-01-08 11:22:01 -08:00
committed by GitHub
parent 57f038ec7b
commit 1deafd8254
305 changed files with 16048 additions and 12926 deletions


@@ -9,7 +9,7 @@ Subject: [PATCH] cuda
2 files changed, 9 insertions(+)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index fdb4b986..9b80fe07 100644
index e2d6c405..1b62c056 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -26,7 +26,7 @@ index fdb4b986..9b80fe07 100644
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c180adc8..000f1777 100644
index 0b06be72..0a6ae325 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context {


@@ -4,14 +4,14 @@ Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] pretokenizer
---
src/llama.cpp | 14 +++-----------
src/llama-model.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index abc1252e..626c3e3f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6400,16 +6400,7 @@ static void llm_load_vocab(
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 405e0528..00b80c52 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1249,16 +1249,7 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
@@ -29,9 +29,9 @@ index abc1252e..626c3e3f 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6514,7 +6505,8 @@ static void llm_load_vocab(
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
@@ -1373,7 +1364,8 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
tokenizer_pre == "megrez") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
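In effect, this change downgrades the unknown-pre-tokenizer error to a warning. A minimal sketch of the resulting control flow, not the full vendored function (the context lines above already initialize `type_pre` to the default before the name checks, so the warning path simply keeps that value):

```cpp
// Sketch only; mirrors the hunk above rather than the complete llm_load_vocab().
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;   // set before the name checks (context line above)

if (tokenizer_pre == "llama3" /* || ... other recognized names ... */) {
    vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
} else if (tokenizer_pre == "megrez") {
    vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
    // previously: throw std::runtime_error(format("unknown pre-tokenizer type: ..."));
    LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
}
```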


@@ -4,14 +4,15 @@ Date: Mon, 16 Sep 2024 15:53:14 -0700
Subject: [PATCH] embeddings
---
src/llama.cpp | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
src/llama-context.cpp | 2 +-
src/llama.cpp | 6 ++++--
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 626c3e3f..9e292c4f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17419,7 +17419,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 38a55fb2..b9c4a5bf 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -20,7 +21,11 @@ index 626c3e3f..9e292c4f 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -17714,7 +17714,6 @@ static int llama_decode_internal(
diff --git a/src/llama.cpp b/src/llama.cpp
index ea78ea48..4eb3f6b9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -10876,7 +10876,6 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
@@ -28,7 +33,7 @@ index 626c3e3f..9e292c4f 100644
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -17722,11 +17721,15 @@ static int llama_decode_internal(
@@ -10884,12 +10883,15 @@ static int llama_decode_internal(
break;
}
}
@@ -37,7 +42,7 @@ index 626c3e3f..9e292c4f 100644
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
+
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
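The net effect is that a context configured for non-causal embedding extraction no longer needs a `result_output` logits tensor. A hedged sketch from the caller's side (both setters are existing llama.h APIs; `ctx` is assumed to be an already-created context):

```cpp
// Sketch: put an existing context into embedding-only, non-causal mode.
llama_set_embeddings(ctx, true);      // request embeddings from llama_decode()
llama_set_causal_attn(ctx, false);    // non-causal attention for encoder-style pooling

// With this patch, the decode path sets `res = nullptr` when causal attention is
// off, so logits are not extracted and the "missing result_output tensor" assert
// is not reachable in this mode.
```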


@@ -8,7 +8,7 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index ba28c07c..46998e4c 100644
index 3cd0d2fa..b3c1829f 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -56,6 +56,19 @@


@@ -11,38 +11,29 @@ tensor to store the scalar. the scalar is implemented as a 1-dimensional
tensor with 2 elements derived from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
src/llama.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 253 insertions(+), 14 deletions(-)
src/llama-arch.cpp | 53 +++++++----
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 +
src/llama-model-loader.cpp | 1 +
src/llama-model.cpp | 16 ++++
src/llama-model.h | 3 +
src/llama.cpp | 185 +++++++++++++++++++++++++++++++++++++
8 files changed, 258 insertions(+), 16 deletions(-)
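As described in the patch message above, each block skip connection is blended through a two-element tensor. A hedged sketch of how that blend can be expressed with ggml — names such as `inp_skip` are placeholders, and the view offsets simply pick out the two stored elements:

```cpp
// Sketch only: out = bskcn_tv[0] * saved_input + bskcn_tv[1] * current_hidden,
// where the stored pair is (bskcn_tv, 1 - bskcn_tv) per the description above.
struct ggml_tensor * tv = model.layers[il].bskcn_tv;                         // F32 [2]
struct ggml_tensor * w0 = ggml_view_1d(ctx0, tv, 1, 0);                      // bskcn_tv
struct ggml_tensor * w1 = ggml_view_1d(ctx0, tv, 1, ggml_element_size(tv));  // 1 - bskcn_tv

cur = ggml_add(ctx0,
        ggml_mul(ctx0, inp_skip, w0),   // scaled saved input (skip connection)
        ggml_mul(ctx0, cur,      w1));  // scaled current hidden state
```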
diff --git a/src/llama.cpp b/src/llama.cpp
index 9e292c4f..26be6254 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -196,6 +196,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
+ LLM_ARCH_SOLAR,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 007d79f8..5b376c5e 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -59,6 +59,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -251,6 +252,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -308,6 +310,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -411,20 +414,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -106,22 +107,23 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
@@ -54,40 +45,36 @@ index 9e292c4f..26be6254 100644
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -607,6 +611,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_BSKCN_TV,
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -1564,6 +1569,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1240,6 +1242,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
},
},
+ {
@@ -111,7 +98,133 @@ index 9e292c4f..26be6254 100644
{
LLM_ARCH_UNKNOWN,
{
@@ -2425,6 +2448,7 @@ enum e_model {
@@ -1372,6 +1392,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 45e458bb..eac7055b 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -63,6 +63,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
+ LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN,
};
@@ -126,6 +127,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -305,6 +307,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_BSKCN_TV,
LLM_TENSOR_CONV1D,
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index c4053469..450738da 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
// corresponds to Mamba's ssm_states size
return ssm_d_state * ssm_d_inner;
}
+
+bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
+ if (il < n_layer) {
+ return n_bskcn_arr[n][il] > 0;
+ }
+
+ GGML_ABORT("fatal error");
+}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index a29f20ec..fd898e27 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -52,6 +52,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
+ std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
+
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -134,6 +136,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
+
+ // Block skip connection
+ bool n_bskcn(uint32_t n, uint32_t il) const;
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 7743b465..422524a8 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -364,6 +364,7 @@ namespace GGUFMeta {
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
int trace = 0;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 00b80c52..306c557d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1091,6 +1091,21 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr[i];
+ bskcn.fill(0);
+ auto kv = LLM_KV(model.arch);
+ ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2065,6 +2080,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index ce038932..c1b9c0a1 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -54,6 +54,7 @@ enum llm_type {
MODEL_15B,
MODEL_16B,
MODEL_20B,
@@ -119,78 +232,20 @@ index 9e292c4f..26be6254 100644
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -2475,6 +2499,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -275,6 +276,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -2546,6 +2572,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
struct llama_layer_posnet posnet;
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2658,6 +2685,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
+
+ bool n_bskcn(uint32_t n, uint32_t il = 0) const {
+ if (il < n_layer) {
+ return n_bskcn_arr[n][il] > 0;
+ }
+
+ GGML_ABORT("fatal error");
+ }
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2844,6 +2879,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
+
+ struct ggml_tensor * bskcn_tv;
};
// very similar to llama_batch,
@@ -6247,6 +6284,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ for (int i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr.at(i);
+ bskcn.fill(0);
+ ml.get_key_or_arr(::format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION), LLM_ARCH_NAMES.at(ml.llm_kv.arch), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ }
default: (void)0;
}
@@ -7239,6 +7291,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
};
// checks if the weight tensor can be used with the specified buffer type and device
@@ -9253,6 +9306,35 @@ static bool llm_load_tensors(
struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index 4eb3f6b9..7dec50ae 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2206,6 +2206,35 @@ static bool llm_load_tensors(
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -226,11 +281,10 @@ index 9e292c4f..26be6254 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -16671,6 +16753,158 @@ struct llm_build_context {
@@ -10226,6 +10255,158 @@ struct llm_build_context {
return gf;
}
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
@@ -382,10 +436,11 @@ index 9e292c4f..26be6254 100644
+
+ return gf;
+ }
};
+
struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16942,6 +17176,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -10660,6 +10841,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
@@ -393,14 +448,6 @@ index 9e292c4f..26be6254 100644
+ {
+ result = llm.build_solar();
+ } break;
default:
GGML_ABORT("fatal error");
}
@@ -20137,6 +20375,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
case LLM_ARCH_WAVTOKENIZER_DEC:
{
result = llm.build_wavtokenizer_dec();


@@ -8,7 +8,7 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 000f1777..8fd7c1a3 100644
index 0a6ae325..bb425ee8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg


@@ -12,10 +12,24 @@ kv cache once per run
remaining is to implement the cross attention mask
---
examples/llava/llava.cpp | 5 +-
include/llama.h | 5 +
src/llama.cpp | 477 +++++++++++++++++++++++++++++++++++++--
3 files changed, 467 insertions(+), 20 deletions(-)
examples/llava/llava.cpp | 5 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 +
src/llama-arch.cpp | 44 +++++
src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 +
src/llama-context.cpp | 19 ++-
src/llama-context.h | 2 +
src/llama-cparams.h | 1 +
src/llama-hparams.cpp | 8 +-
src/llama-hparams.h | 4 +
src/llama-kv-cache.cpp | 33 ++++
src/llama-model-loader.cpp | 2 +
src/llama-model.cpp | 59 ++-----
src/llama-model.h | 51 ++++++
src/llama-quant.cpp | 4 +-
src/llama.cpp | 307 +++++++++++++++++++++++++++++++++-
17 files changed, 508 insertions(+), 56 deletions(-)
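Before the per-file hunks, a hedged sketch of how the cross-attention plumbing added here might be driven from application code. The model path and embedding width are placeholders; `llama_set_cross_attention`, the `cross_attn` context flag, and `batch.n_embd` are the additions shown in the hunks below, while the remaining calls are existing llama.h APIs:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("mllama.gguf", mparams); // placeholder path

    llama_context_params cparams = llama_context_default_params();
    cparams.cross_attn = true;                       // new field added by this patch
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // Batches that carry vision state use the float `embd` buffer; the new
    // `n_embd` field records the per-token width of that buffer.
    llama_batch batch = llama_batch_init(/*n_tokens_alloc*/ 1, /*embd*/ 4096, /*n_seq_max*/ 1);
    // batch.n_embd == 4096 after this patch

    llama_set_cross_attention(ctx, true);            // new API added by this patch
    // ... fill batch.embd with the cross-attention state and call llama_decode(ctx, batch) ...

    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```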
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 16f30c56..0f0f3f62 100644
@@ -47,11 +61,28 @@ index 16f30c56..0f0f3f62 100644
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 7ddd178b..899d16f2 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -171,9 +171,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
-#ifdef GGML_USE_BLAS
- register_backend(ggml_backend_blas_reg());
-#endif
+// #ifdef GGML_USE_BLAS
+// register_backend(ggml_backend_blas_reg());
+// #endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
index c67988a3..0f266283 100644
index a0d5ba5d..9f411960 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -249,6 +249,7 @@ extern "C" {
@@ -250,6 +250,7 @@ extern "C" {
llama_token * token;
float * embd;
@@ -59,7 +90,15 @@ index c67988a3..0f266283 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -423,6 +424,10 @@ extern "C" {
@@ -347,6 +348,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
+ bool cross_attn; // whether to use cross attention
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@@ -426,6 +428,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);
@@ -70,58 +109,27 @@ index c67988a3..0f266283 100644
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
index 26be6254..4778a9ed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
enum llm_arch {
LLM_ARCH_LLAMA,
+ LLM_ARCH_MLLAMA,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK,
@@ -202,6 +203,7 @@ enum llm_arch {
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 5b376c5e..b35aeb31 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_MLLAMA, "mllama" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
@@ -311,6 +313,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
{ LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_MLLAMA, "mllama" },
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
@@ -124,6 +125,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -429,6 +432,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -612,6 +616,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
+ LLM_TENSOR_CROSS_ATTN_K_NORM,
+ LLM_TENSOR_CROSS_ATTN_K_PROJ,
+ LLM_TENSOR_CROSS_ATTN_O_PROJ,
+ LLM_TENSOR_CROSS_ATTN_Q_NORM,
+ LLM_TENSOR_CROSS_ATTN_Q_PROJ,
+ LLM_TENSOR_CROSS_ATTN_V_PROJ,
+ LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
+ LLM_TENSOR_CROSS_ATTN_MLP_GATE,
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -641,6 +653,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -220,6 +222,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
@@ -160,79 +168,129 @@ index 26be6254..4778a9ed 100644
+ },
+ },
{
LLM_ARCH_BAICHUAN,
LLM_ARCH_DECI,
{
@@ -2456,6 +2502,7 @@ enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
@@ -2500,6 +2547,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -1393,6 +1429,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index eac7055b..e8235ae0 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -10,6 +10,7 @@
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
+ std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;
enum llm_arch {
LLM_ARCH_LLAMA,
+ LLM_ARCH_MLLAMA,
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
@@ -128,6 +129,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -2569,10 +2617,11 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -308,6 +310,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
+ LLM_TENSOR_CROSS_ATTN_K_NORM,
+ LLM_TENSOR_CROSS_ATTN_K_PROJ,
+ LLM_TENSOR_CROSS_ATTN_O_PROJ,
+ LLM_TENSOR_CROSS_ATTN_Q_NORM,
+ LLM_TENSOR_CROSS_ATTN_Q_PROJ,
+ LLM_TENSOR_CROSS_ATTN_V_PROJ,
+ LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
+ LLM_TENSOR_CROSS_ATTN_MLP_GATE,
LLM_TENSOR_CONV1D,
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 01d5ca57..8682b0e6 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
+ /*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
+ /*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
- if (this->n_head_arr != other.n_head_arr) return true;
- if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
- if (this->n_ff_arr != other.n_ff_arr) return true;
- if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
+ if (this->n_head_arr != other.n_head_arr) return true;
+ if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
+ if (this->n_ff_arr != other.n_ff_arr) return true;
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
+ if (this->cross_attn_layers != other.cross_attn_layers) return true;
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2693,6 +2742,10 @@ struct llama_hparams {
GGML_ABORT("fatal error");
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+ batch.n_embd = embd;
} else {
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index b9c4a5bf..9d0e7ca3 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -71,10 +71,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
}
if (ubatch.embd) {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_tokens = ubatch.n_tokens;
+ if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+ // zero out inp_embd since it's not used
+ float * inp_embd_data = (float *)lctx.inp_embd->data;
+ for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
+ inp_embd_data[i] = 0.0f;
+ }
+ } else {
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_tokens = ubatch.n_tokens;
- ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
}
if (ubatch.pos && lctx.inp_pos) {
@@ -653,6 +662,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}
+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+ ctx->cparams.cross_attn = cross_attention;
+}
+
+ bool cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+ }
};
void llama_synchronize(struct llama_context * ctx) {
ggml_backend_sched_synchronize(ctx->sched.get());
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2722,6 +2775,9 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
+ // and not set on the context for all batches.
+ bool cross_attn = false;
enum llama_pooling_type pooling_type;
@@ -2881,6 +2937,16 @@ struct llama_layer {
struct ggml_tensor * ffn_down_scale;
struct ggml_tensor * bskcn_tv;
+
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm;
+ struct ggml_tensor * cross_attn_k_proj;
+ struct ggml_tensor * cross_attn_o_proj;
+ struct ggml_tensor * cross_attn_q_norm;
+ struct ggml_tensor * cross_attn_q_proj;
+ struct ggml_tensor * cross_attn_v_proj;
+ struct ggml_tensor * cross_attn_attn_gate;
+ struct ggml_tensor * cross_attn_mlp_gate;
};
// very similar to llama_batch,
@@ -3472,6 +3538,8 @@ struct llama_context {
diff --git a/src/llama-context.h b/src/llama-context.h
index 0d163c47..4980a60e 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -107,6 +107,8 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
@@ -240,11 +298,73 @@ index 26be6254..4778a9ed 100644
+ struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
};
struct llama_lora_weight {
@@ -3610,6 +3678,39 @@ static bool llama_kv_cache_init(
// TODO: make these methods of llama_context
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 252012f3..9681e5a0 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -29,6 +29,7 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
+ bool cross_attn;
enum llama_pooling_type pooling_type;
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 450738da..42f8a58f 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -2,6 +2,8 @@
#include "ggml.h"
+#include <algorithm>
+
uint32_t llama_hparams::n_head(uint32_t il) const {
if (il < n_layer) {
return n_head_arr[il];
@@ -76,4 +78,8 @@ bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
}
GGML_ABORT("fatal error");
-}
\ No newline at end of file
+}
+
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index fd898e27..f826cd9a 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -53,6 +53,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
+ std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -139,6 +140,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
+
+ // cross attention layers
+ bool cross_attention_layers(uint32_t il) const;
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 53379253..cf814dbe 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -72,6 +72,39 @@ bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
for (int i = 0; i < (int) n_layer; i++) {
for (int i = 0; i < n_layer; i++) {
+ // for cross attention layers
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
@@ -281,36 +401,94 @@ index 26be6254..4778a9ed 100644
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -5547,12 +5648,14 @@ static void llm_load_hparams(
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 422524a8..b12d6566 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -240,6 +240,8 @@ namespace GGUFMeta {
return true;
}
// zero-out the per-layer hparams
- std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
- std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
- std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
+
template<typename T, size_t N_MAX>
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 306c557d..4f9bbf90 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -146,46 +146,6 @@ std::string llama_model_ftype_name(const llama_model & model) {
return llama_model_ftype_name(model.ftype);
}
-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- ggml_context_ptr ctx { ggml_init(params) };
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
-
- ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
- ggml_tensor * op_tensor = fn(ctx.get());
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- if (op_tensor->src[i] != nullptr) {
- assert(op_tensor->src[i]->buffer == nullptr);
- op_tensor->src[i]->buffer = buf.get();
- }
- }
-
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
- return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (buft_supported(cur_buft, cur_dev, fn)) {
- return cur_buft;
- }
- }
-
- throw std::runtime_error(format("no suitable buffer type found"));
-}
-
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
return select_buft(
*model.dev_layer.at(il).buft_list,
@@ -312,9 +272,11 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5601,7 +5704,7 @@ static void llm_load_hparams(
@@ -363,7 +325,7 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_FALCON) {
- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -5641,6 +5744,16 @@ static void llm_load_hparams(
@@ -405,6 +367,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
}
}
} break;
@@ -324,27 +502,120 @@ index 26be6254..4778a9ed 100644
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_MINICPM:
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7291,7 +7404,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
- {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};
@@ -2062,6 +2034,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// checks if the weight tensor can be used with the specified buffer type and device
@@ -7801,6 +7922,53 @@ static bool llm_load_tensors(
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index c1b9c0a1..5b23e2ba 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include "ggml-cpp.h"
#include <vector>
+#include <stdexcept>
// available models
// TODO: this enum does not follow the enum naming convention
@@ -62,6 +63,7 @@ enum llm_type {
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_671B,
@@ -278,6 +280,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
@@ -376,6 +388,45 @@ std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);
+template<typename F>
+bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context_ptr ctx { ggml_init(params) };
+ if (!ctx) {
+ throw std::runtime_error("failed to create ggml context");
+ }
+
+ ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+ ggml_tensor * op_tensor = fn(ctx.get());
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op_tensor->src[i] != nullptr) {
+ op_tensor->src[i]->buffer = buf.get();
+ }
+ }
+
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+ return op_supported;
+}
+
+template<typename F>
+ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+ for (const auto & cur : buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (buft_supported(cur_buft, cur_dev, fn)) {
+ return cur_buft;
+ }
+ }
+
+ throw std::runtime_error("no suitable buffer type found");
+}
+
// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 42974f8f..27def6fd 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -629,7 +629,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
size_t total_size_org = 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index 7dec50ae..bac66c24 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -563,6 +563,52 @@ static bool llm_load_tensors(
}
}
} break;
@@ -364,7 +635,6 @@ index 26be6254..4778a9ed 100644
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+
+ auto & layer = model.layers[i];
+
+ if (hparams.cross_attention_layers(i)) {
@@ -395,10 +665,10 @@ index 26be6254..4778a9ed 100644
+ }
+ }
+ } break;
case LLM_ARCH_MINICPM3:
case LLM_ARCH_DECI:
{
const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9511,7 +9679,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2514,7 +2560,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
@@ -407,7 +677,7 @@ index 26be6254..4778a9ed 100644
}
if (params.vocab_only) {
@@ -9594,6 +9762,21 @@ static struct ggml_tensor * llm_build_inp_embd(
@@ -2598,6 +2644,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
@@ -429,7 +699,7 @@ index 26be6254..4778a9ed 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -10561,6 +10744,7 @@ struct llm_build_context {
@@ -3593,6 +3654,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
@@ -437,11 +707,11 @@ index 26be6254..4778a9ed 100644
}
void free() {
@@ -11040,6 +11224,240 @@ struct llm_build_context {
@@ -4074,6 +4136,240 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -675,10 +945,10 @@ index 26be6254..4778a9ed 100644
+ return gf;
+ }
+
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * build_deci() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -16993,6 +17411,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -10646,6 +10942,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
@@ -686,33 +956,10 @@ index 26be6254..4778a9ed 100644
+ {
+ result = llm.build_mllama();
+ } break;
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_DECI:
{
result = llm.build_baichuan();
@@ -17258,10 +17680,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
}
if (ubatch.embd) {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_tokens = ubatch.n_tokens;
+ if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+ // zero out inp_embd since it's not used
+ float * inp_embd_data = (float *)lctx.inp_embd->data;
+ for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
+ inp_embd_data[i] = 0.0f;
+ }
+ } else {
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_tokens = ubatch.n_tokens;
- ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
}
if (ubatch.pos && lctx.inp_pos) {
@@ -17862,7 +18293,7 @@ static int llama_decode_internal(
result = llm.build_deci();
@@ -10971,7 +11271,7 @@ static int llama_decode_internal(
n_outputs = 1;
}
@@ -721,7 +968,7 @@ index 26be6254..4778a9ed 100644
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
@@ -18172,7 +18603,7 @@ static int llama_encode_internal(
@@ -11282,7 +11582,7 @@ static int llama_encode_internal(
const int64_t n_embd = hparams.n_embd;
@@ -730,57 +977,11 @@ index 26be6254..4778a9ed 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
@@ -19203,7 +19634,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
size_t total_size_org = 0;
@@ -20360,6 +20793,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@@ -21790,6 +22224,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}
+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+ ctx->cparams.cross_attn = cross_attention;
+}
+
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens) {
@@ -21797,6 +22235,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
+ /*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21809,6 +22248,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
+ /*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21817,6 +22257,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+ batch.n_embd = embd;
} else {
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
@@ -11775,6 +12075,7 @@ struct llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
+ /*.cross_attn =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};


@@ -15,7 +15,7 @@ Subject: [PATCH] add unpad operator
8 files changed, 220 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index b0c1ac9c..091e6e6b 100644
index c714fc8c..1bc50fca 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
@@ -26,7 +26,7 @@ index b0c1ac9c..091e6e6b 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1718,6 +1719,15 @@ extern "C" {
@@ -1735,6 +1736,15 @@ extern "C" {
int p0,
int p1);
@@ -43,7 +43,7 @@ index b0c1ac9c..091e6e6b 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 67e67a08..bebff207 100644
index b7fefb9d..b307d554 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
@@ -126,7 +126,7 @@ index 67e67a08..bebff207 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 8fd7c1a3..7c351b89 100644
index bb425ee8..1e7c2a22 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -139,7 +139,7 @@ index 8fd7c1a3..7c351b89 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3012,6 +3015,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -3013,6 +3016,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
@@ -211,7 +211,7 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 28f590f9..787fc713 100644
index a85502ee..84e027eb 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -311,6 +311,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
@@ -332,7 +332,7 @@ index 8ba43904..204c93e6 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 51cc8566..0e74e554 100644
index 2bbe5f48..7ffcd907 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -369,7 +369,7 @@ index 51cc8566..0e74e554 100644
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4180,6 +4182,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4214,6 +4216,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}


@@ -11,10 +11,10 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8c9aaf5a..3e372dc3 100644
index 3fcfcaa3..8f44705a 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -375,7 +375,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
"[\r\n]",
@@ -24,7 +24,7 @@ index 8c9aaf5a..3e372dc3 100644
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 3d459263..51dd81fb 100644
index 7aca6544..6155da80 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@


@@ -10,7 +10,7 @@ Subject: [PATCH] relative include paths
3 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index bebff207..d6dd5600 100644
index b307d554..4eb39c52 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10,7 +10,7 @@
@@ -23,7 +23,7 @@ index bebff207..d6dd5600 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index c390957a..1af5f7eb 100644
index f11399cc..2a8b40ce 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -4,8 +4,7 @@


@@ -19,12 +19,12 @@ multiple batches of processing until everything is complete.
1 file changed, 46 insertions(+), 53 deletions(-)
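As a rough illustration of the batching described above — not the vendored code — the defrag work can be expressed as a list of `{src, dst, len}` moves applied in bounded chunks so each graph stays within its node budget; the sizes below are purely illustrative, and the 6-tensors-per-move-per-layer figure comes from the build_defrag comment later in this patch:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors the llama_kv_defrag_move struct added in the hunk below.
struct kv_defrag_move {
    uint32_t src;
    uint32_t dst;
    uint32_t len;
};

int main() {
    // Hypothetical moves gathered while scanning the KV cache for holes.
    std::vector<kv_defrag_move> moves = {{96, 0, 32}, {160, 32, 8}, {200, 40, 64}};

    const size_t n_layer         = 32;    // illustrative model depth
    const size_t max_graph_nodes = 8192;  // illustrative node budget
    const size_t moves_per_graph = std::max<size_t>(1, max_graph_nodes / (6 * n_layer));

    // Process the moves in multiple batches until everything is complete.
    for (size_t i = 0; i < moves.size(); i += moves_per_graph) {
        const size_t end = std::min(moves.size(), i + moves_per_graph);
        std::printf("defrag graph over moves [%zu, %zu)\n", i, end);
        // in the vendored code: build a defrag graph for this slice and compute it
    }
    return 0;
}
```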
diff --git a/src/llama.cpp b/src/llama.cpp
index 4778a9ed..654e32bc 100644
index bac66c24..c95da45d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3025,6 +3025,13 @@ struct llama_kv_cache {
}
};
@@ -3536,6 +3536,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
}
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
@@ -33,10 +33,10 @@ index 4778a9ed..654e32bc 100644
+ uint32_t len;
+};
+
struct llama_control_vector {
std::vector<struct ggml_tensor *> tensors; // per layer
std::vector<ggml_context_ptr> ctxs;
@@ -10802,35 +10809,23 @@ struct llm_build_context {
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -3712,35 +3719,23 @@ struct llm_build_context {
return gf;
}
@@ -78,7 +78,7 @@ index 4778a9ed..654e32bc 100644
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -10838,31 +10833,29 @@ struct llm_build_context {
@@ -3748,31 +3743,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
@@ -118,7 +118,7 @@ index 4778a9ed..654e32bc 100644
}
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -17325,7 +17318,7 @@ struct llm_build_context {
@@ -10856,7 +10849,7 @@ struct llm_build_context {
}
};
@@ -127,7 +127,7 @@ index 4778a9ed..654e32bc 100644
llama_ubatch dummy = {};
dummy.equal_seqs = true;
@@ -17335,7 +17328,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
@@ -10866,7 +10859,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
llm.init();
@@ -136,7 +136,7 @@ index 4778a9ed..654e32bc 100644
llm.free();
@@ -18351,7 +18344,12 @@ static int llama_decode_internal(
@@ -11329,7 +11322,12 @@ static int llama_decode_internal(
kv_self.head = 0;
}
@@ -150,7 +150,7 @@ index 4778a9ed..654e32bc 100644
if (!slot) {
return 1;
}
@@ -18756,8 +18754,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -11735,8 +11733,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
//const int64_t t_start = ggml_time_us();
@@ -161,7 +161,7 @@ index 4778a9ed..654e32bc 100644
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -18821,19 +18819,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -11800,19 +11798,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;
@@ -181,7 +181,7 @@ index 4778a9ed..654e32bc 100644
cont = false;
continue;
}
@@ -18849,8 +18839,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -11828,8 +11818,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
kv_self.head = n_used;
if (!cont) {
@@ -193,7 +193,7 @@ index 4778a9ed..654e32bc 100644
}
nf++;
@@ -18860,22 +18852,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -11839,22 +11831,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
}
}
@@ -218,7 +218,7 @@ index 4778a9ed..654e32bc 100644
#if 0
// CPU defrag
@@ -18950,11 +18936,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -11929,11 +11915,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
#else
// ggml_graph defrag


@@ -0,0 +1,113 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 4 Jan 2025 22:52:48 -0800
Subject: [PATCH] re-enable gpu for clip
---
examples/llava/clip.cpp | 86 ++++++++++++++++++++---------------------
1 file changed, 43 insertions(+), 43 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index b3c1829f..718052e1 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -8,25 +8,25 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//#include "ggml-sycl.h"
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//#include "ggml-metal.h"
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//#include "ggml-cann.h"
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//#include "ggml-vulkan.h"
-//#endif
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -1235,30 +1235,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}
-//#ifdef GGML_USE_CUDA
-// new_clip->backend = ggml_backend_cuda_init(0);
-// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_METAL
-// new_clip->backend = ggml_backend_metal_init();
-// LOG_INF("%s: CLIP using Metal backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_CANN
-// new_clip->backend = ggml_backend_cann_init(0);
-// LOG_INF("%s: CLIP using CANN backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-// new_clip->backend = ggml_backend_vk_init(0);
-// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-// new_clip->backend = ggml_backend_sycl_init(0);
-// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-//#endif
+#ifdef GGML_USE_CUDA
+ new_clip->backend = ggml_backend_cuda_init(0);
+ LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_METAL
+ new_clip->backend = ggml_backend_metal_init();
+ LOG_INF("%s: CLIP using Metal backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_CANN
+ new_clip->backend = ggml_backend_cann_init(0);
+ LOG_INF("%s: CLIP using CANN backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_VULKAN
+ new_clip->backend = ggml_backend_vk_init(0);
+ LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_SYCL
+ new_clip->backend = ggml_backend_sycl_init(0);
+ LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();