Mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-10 15:57:04 +00:00)
llama: update vendored code to commit 46e3556 (#8308)
@@ -9,7 +9,7 @@ Subject: [PATCH] cuda
2 files changed, 9 insertions(+)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index fdb4b986..9b80fe07 100644
index e2d6c405..1b62c056 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -26,7 +26,7 @@ index fdb4b986..9b80fe07 100644

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c180adc8..000f1777 100644
index 0b06be72..0a6ae325 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context {

@@ -4,14 +4,14 @@ Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] pretokenizer

---
src/llama.cpp | 14 +++-----------
src/llama-model.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index abc1252e..626c3e3f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6400,16 +6400,7 @@ static void llm_load_vocab(
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 405e0528..00b80c52 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1249,16 +1249,7 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
@@ -29,9 +29,9 @@ index abc1252e..626c3e3f 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6514,7 +6505,8 @@ static void llm_load_vocab(
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
@@ -1373,7 +1364,8 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
tokenizer_pre == "megrez") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);

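Taken together, the pretokenizer patch downgrades an unknown tokenizer.ggml.pre value from a fatal error to a warning. A minimal sketch of the resulting control flow; the fallback assignment to LLAMA_VOCAB_PRE_TYPE_DEFAULT is an assumption based on the surrounding hunk and is not shown verbatim above:

    // sketch: unknown pre-tokenizers fall back to the default instead of throwing
    if (tokenizer_pre == "llama3" /* || ...other known types... */) {
        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
    } else {
        LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }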
@@ -4,14 +4,15 @@ Date: Mon, 16 Sep 2024 15:53:14 -0700
Subject: [PATCH] embeddings

---
src/llama.cpp | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
src/llama-context.cpp | 2 +-
src/llama.cpp | 6 ++++--
2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 626c3e3f..9e292c4f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17419,7 +17419,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 38a55fb2..b9c4a5bf 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -20,7 +21,11 @@ index 626c3e3f..9e292c4f 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -17714,7 +17714,6 @@ static int llama_decode_internal(
diff --git a/src/llama.cpp b/src/llama.cpp
index ea78ea48..4eb3f6b9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -10876,7 +10876,6 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
@@ -28,7 +33,7 @@ index 626c3e3f..9e292c4f 100644
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -17722,11 +17721,15 @@ static int llama_decode_internal(
@@ -10884,12 +10883,15 @@ static int llama_decode_internal(
break;
}
}
@@ -37,7 +42,7 @@ index 626c3e3f..9e292c4f 100644
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
+

+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }

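As a usage sketch of what the embeddings patch enables, assuming a context created with embeddings turned on (llama_get_embeddings_seq and llama_context_params are the stock llama.cpp API of this vintage; error handling omitted, names illustrative):

    // sketch: with cparams.embeddings set, decode skips logit extraction
    // for non-causal batches and keeps only the pooled embeddings
    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings = true;
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);

    llama_decode(ctx, batch);
    const float * emb = llama_get_embeddings_seq(ctx, 0); // pooled embedding for sequence 0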
@@ -8,7 +8,7 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index ba28c07c..46998e4c 100644
index 3cd0d2fa..b3c1829f 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -56,6 +56,19 @@

@@ -11,38 +11,29 @@ tensor to store the scalar. the scalar is implemented as a 1-dimensional
tensor with 2 elements derived from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
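To make the mechanism concrete, a minimal sketch of how such a two-element bskcn_tv tensor can be applied at graph-build time; the names inpSAL (the saved skip state) and inpL (the current hidden state) are illustrative, not taken from the patch:

    // sketch: bskcn_tv holds (bskcn_tv, 1 - bskcn_tv); view one element each
    struct ggml_tensor * bskcn_1 = ggml_view_1d(ctx0, layer.bskcn_tv, 1, 0);
    struct ggml_tensor * bskcn_2 = ggml_view_1d(ctx0, layer.bskcn_tv, 1,
            ggml_element_size(layer.bskcn_tv));

    // blend: bskcn_tv * skip_state + (1 - bskcn_tv) * current_state
    inpL = ggml_add(ctx0,
            ggml_mul(ctx0, inpSAL, ggml_repeat(ctx0, bskcn_1, inpSAL)),
            ggml_mul(ctx0, inpL,   ggml_repeat(ctx0, bskcn_2, inpL)));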
---
src/llama.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 253 insertions(+), 14 deletions(-)
src/llama-arch.cpp | 53 +++++++----
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 +
src/llama-model-loader.cpp | 1 +
src/llama-model.cpp | 16 ++++
src/llama-model.h | 3 +
src/llama.cpp | 185 +++++++++++++++++++++++++++++++++++++
8 files changed, 258 insertions(+), 16 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 9e292c4f..26be6254 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -196,6 +196,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
+ LLM_ARCH_SOLAR,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 007d79f8..5b376c5e 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -59,6 +59,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -251,6 +252,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -308,6 +310,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -411,20 +414,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -106,22 +107,23 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

@@ -54,40 +45,36 @@ index 9e292c4f..26be6254 100644
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -607,6 +611,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_BSKCN_TV,
};

static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -1564,6 +1569,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1240,6 +1242,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
},
},
+ {
@@ -111,7 +98,133 @@ index 9e292c4f..26be6254 100644
{
LLM_ARCH_UNKNOWN,
{
@@ -2425,6 +2448,7 @@ enum e_model {
@@ -1372,6 +1392,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 45e458bb..eac7055b 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -63,6 +63,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
+ LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN,
};
@@ -126,6 +127,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -305,6 +307,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_BSKCN_TV,
LLM_TENSOR_CONV1D,
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index c4053469..450738da 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
// corresponds to Mamba's ssm_states size
return ssm_d_state * ssm_d_inner;
}
+
+bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
+ if (il < n_layer) {
+ return n_bskcn_arr[n][il] > 0;
+ }
+
+ GGML_ABORT("fatal error");
+}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index a29f20ec..fd898e27 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -52,6 +52,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

+ std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
+
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -134,6 +136,9 @@ struct llama_hparams {

// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
+
+ // Block skip connection
+ bool n_bskcn(uint32_t n, uint32_t il) const;
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 7743b465..422524a8 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -364,6 +364,7 @@ namespace GGUFMeta {
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);

llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
int trace = 0;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 00b80c52..306c557d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1091,6 +1091,21 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr[i];
+ bskcn.fill(0);
+ auto kv = LLM_KV(model.arch);
+ ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2065,6 +2080,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index ce038932..c1b9c0a1 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -54,6 +54,7 @@ enum llm_type {
MODEL_15B,
MODEL_16B,
MODEL_20B,
@@ -119,78 +232,20 @@ index 9e292c4f..26be6254 100644
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -2475,6 +2499,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -275,6 +276,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

+ std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -2546,6 +2572,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
struct llama_layer_posnet posnet;

if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2658,6 +2685,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
+
+ bool n_bskcn(uint32_t n, uint32_t il = 0) const {
+ if (il < n_layer) {
+ return n_bskcn_arr[n][il] > 0;
+ }
+
+ GGML_ABORT("fatal error");
+ }
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2844,6 +2879,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
+
+ struct ggml_tensor * bskcn_tv;
};

// very similar to llama_batch,
@@ -6247,6 +6284,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ for (int i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr.at(i);
+ bskcn.fill(0);
+ ml.get_key_or_arr(::format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION), LLM_ARCH_NAMES.at(ml.llm_kv.arch), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ }
default: (void)0;
}

@@ -7239,6 +7291,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
};

// checks if the weight tensor can be used with the specified buffer type and device
@@ -9253,6 +9306,35 @@ static bool llm_load_tensors(
struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index 4eb3f6b9..7dec50ae 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2206,6 +2206,35 @@ static bool llm_load_tensors(

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -226,11 +281,10 @@ index 9e292c4f..26be6254 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -16671,6 +16753,158 @@ struct llm_build_context {

@@ -10226,6 +10255,158 @@ struct llm_build_context {
return gf;
}
+

+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
@@ -382,10 +436,11 @@ index 9e292c4f..26be6254 100644
+
+ return gf;
+ }
};
+
struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16942,6 +17176,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -10660,6 +10841,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
@@ -393,14 +448,6 @@ index 9e292c4f..26be6254 100644
+ {
+ result = llm.build_solar();
+ } break;
default:
GGML_ABORT("fatal error");
}
@@ -20137,6 +20375,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2
case LLM_ARCH_WAVTOKENIZER_DEC:
{
result = llm.build_wavtokenizer_dec();

@@ -8,7 +8,7 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 000f1777..8fd7c1a3 100644
index 0a6ae325..bb425ee8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg

@@ -12,10 +12,24 @@ kv cache once per run

remaining is to implement the cross attention mask
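As a usage sketch of the API this patch introduces (llama_set_cross_attention is declared in the hunks below; the batch name is illustrative):

    // sketch: enable cross attention for an image-conditioned decode,
    // then disable it again for plain text batches
    llama_set_cross_attention(ctx, true);
    if (llama_decode(ctx, image_batch) != 0) {
        LOG_ERR("%s : failed to eval\n", __func__);
    }
    llama_set_cross_attention(ctx, false);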
---
examples/llava/llava.cpp | 5 +-
include/llama.h | 5 +
src/llama.cpp | 477 +++++++++++++++++++++++++++++++++++++--
3 files changed, 467 insertions(+), 20 deletions(-)
examples/llava/llava.cpp | 5 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 +
src/llama-arch.cpp | 44 +++++
src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 +
src/llama-context.cpp | 19 ++-
src/llama-context.h | 2 +
src/llama-cparams.h | 1 +
src/llama-hparams.cpp | 8 +-
src/llama-hparams.h | 4 +
src/llama-kv-cache.cpp | 33 ++++
src/llama-model-loader.cpp | 2 +
src/llama-model.cpp | 59 ++-----
src/llama-model.h | 51 ++++++
src/llama-quant.cpp | 4 +-
src/llama.cpp | 307 +++++++++++++++++++++++++++++++++-
17 files changed, 508 insertions(+), 56 deletions(-)

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 16f30c56..0f0f3f62 100644
@@ -47,11 +61,28 @@ index 16f30c56..0f0f3f62 100644
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 7ddd178b..899d16f2 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -171,9 +171,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
-#ifdef GGML_USE_BLAS
- register_backend(ggml_backend_blas_reg());
-#endif
+// #ifdef GGML_USE_BLAS
+// register_backend(ggml_backend_blas_reg());
+// #endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
index c67988a3..0f266283 100644
index a0d5ba5d..9f411960 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -249,6 +249,7 @@ extern "C" {
@@ -250,6 +250,7 @@ extern "C" {

llama_token * token;
float * embd;
@@ -59,7 +90,15 @@ index c67988a3..0f266283 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -423,6 +424,10 @@ extern "C" {
@@ -347,6 +348,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
+ bool cross_attn; // whether to use cross attention

// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@@ -426,6 +428,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);

@@ -70,58 +109,27 @@ index c67988a3..0f266283 100644
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);

diff --git a/src/llama.cpp b/src/llama.cpp
index 26be6254..4778a9ed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {

enum llm_arch {
LLM_ARCH_LLAMA,
+ LLM_ARCH_MLLAMA,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK,
@@ -202,6 +203,7 @@ enum llm_arch {
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 5b376c5e..b35aeb31 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@

static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_MLLAMA, "mllama" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
@@ -311,6 +313,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
{ LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_MLLAMA, "mllama" },
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
@@ -124,6 +125,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -429,6 +432,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -612,6 +616,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
+ LLM_TENSOR_CROSS_ATTN_K_NORM,
+ LLM_TENSOR_CROSS_ATTN_K_PROJ,
+ LLM_TENSOR_CROSS_ATTN_O_PROJ,
+ LLM_TENSOR_CROSS_ATTN_Q_NORM,
+ LLM_TENSOR_CROSS_ATTN_Q_PROJ,
+ LLM_TENSOR_CROSS_ATTN_V_PROJ,
+ LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
+ LLM_TENSOR_CROSS_ATTN_MLP_GATE,
};

static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -641,6 +653,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -220,6 +222,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
@@ -160,79 +168,129 @@ index 26be6254..4778a9ed 100644
+ },
+ },
{
LLM_ARCH_BAICHUAN,
LLM_ARCH_DECI,
{
@@ -2456,6 +2502,7 @@ enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
@@ -2500,6 +2547,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -1393,6 +1429,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index eac7055b..e8235ae0 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -10,6 +10,7 @@

std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
+ std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;
enum llm_arch {
LLM_ARCH_LLAMA,
+ LLM_ARCH_MLLAMA,
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
@@ -128,6 +129,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,

uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -2569,10 +2617,11 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -308,6 +310,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
+ LLM_TENSOR_CROSS_ATTN_K_NORM,
+ LLM_TENSOR_CROSS_ATTN_K_PROJ,
+ LLM_TENSOR_CROSS_ATTN_O_PROJ,
+ LLM_TENSOR_CROSS_ATTN_Q_NORM,
+ LLM_TENSOR_CROSS_ATTN_Q_PROJ,
+ LLM_TENSOR_CROSS_ATTN_V_PROJ,
+ LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
+ LLM_TENSOR_CROSS_ATTN_MLP_GATE,
LLM_TENSOR_CONV1D,
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 01d5ca57..8682b0e6 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
+ /*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
+ /*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

- if (this->n_head_arr != other.n_head_arr) return true;
- if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
- if (this->n_ff_arr != other.n_ff_arr) return true;
- if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
+ if (this->n_head_arr != other.n_head_arr) return true;
+ if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
+ if (this->n_ff_arr != other.n_ff_arr) return true;
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
+ if (this->cross_attn_layers != other.cross_attn_layers) return true;

if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2693,6 +2742,10 @@ struct llama_hparams {

GGML_ABORT("fatal error");
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+ batch.n_embd = embd;
} else {
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
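The new n_embd field lets an embedding batch carry its own width instead of assuming hparams.n_embd. A small sketch, assuming the stock llama_batch_init signature shown above (sizes are illustrative):

    // sketch: an embedding batch now records its per-token width
    struct llama_batch batch = llama_batch_init(/*n_tokens_alloc=*/num_image_tokens,
                                                /*embd=*/cross_attn_state_dim,
                                                /*n_seq_max=*/1);
    // batch.n_embd == cross_attn_state_dim; token-only batches keep n_embd == 0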
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index b9c4a5bf..9d0e7ca3 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -71,10 +71,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
}

if (ubatch.embd) {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_tokens = ubatch.n_tokens;
+ if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+ // zero out inp_embd since it's not used
+ float * inp_embd_data = (float *)lctx.inp_embd->data;
+ for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
+ inp_embd_data[i] = 0.0f;
+ }
+ } else {
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_tokens = ubatch.n_tokens;

- ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
}

if (ubatch.pos && lctx.inp_pos) {
@@ -653,6 +662,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}

+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+ ctx->cparams.cross_attn = cross_attention;
+}
+
+ bool cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+ }
};
void llama_synchronize(struct llama_context * ctx) {
ggml_backend_sched_synchronize(ctx->sched.get());

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2722,6 +2775,9 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
+ // and not set on the context for all batches.
+ bool cross_attn = false;

enum llama_pooling_type pooling_type;

@@ -2881,6 +2937,16 @@ struct llama_layer {
struct ggml_tensor * ffn_down_scale;

struct ggml_tensor * bskcn_tv;
+
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm;
+ struct ggml_tensor * cross_attn_k_proj;
+ struct ggml_tensor * cross_attn_o_proj;
+ struct ggml_tensor * cross_attn_q_norm;
+ struct ggml_tensor * cross_attn_q_proj;
+ struct ggml_tensor * cross_attn_v_proj;
+ struct ggml_tensor * cross_attn_attn_gate;
+ struct ggml_tensor * cross_attn_mlp_gate;
};

// very similar to llama_batch,
@@ -3472,6 +3538,8 @@ struct llama_context {
diff --git a/src/llama-context.h b/src/llama-context.h
index 0d163c47..4980a60e 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -107,6 +107,8 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
@@ -240,11 +298,73 @@ index 26be6254..4778a9ed 100644
+ struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
};

struct llama_lora_weight {
@@ -3610,6 +3678,39 @@ static bool llama_kv_cache_init(
// TODO: make these methods of llama_context
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 252012f3..9681e5a0 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -29,6 +29,7 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
+ bool cross_attn;

enum llama_pooling_type pooling_type;

diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 450738da..42f8a58f 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -2,6 +2,8 @@

#include "ggml.h"

+#include <algorithm>
+
uint32_t llama_hparams::n_head(uint32_t il) const {
if (il < n_layer) {
return n_head_arr[il];
@@ -76,4 +78,8 @@ bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
}

GGML_ABORT("fatal error");
-}
\ No newline at end of file
+}
+
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index fd898e27..f826cd9a 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -53,6 +53,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
+ std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;

uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -139,6 +140,9 @@ struct llama_hparams {

// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
+
+ // cross attention layers
+ bool cross_attention_layers(uint32_t il) const;
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 53379253..cf814dbe 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -72,6 +72,39 @@ bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);

for (int i = 0; i < (int) n_layer; i++) {
for (int i = 0; i < n_layer; i++) {
+ // for cross attention layers
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
@@ -281,36 +401,94 @@ index 26be6254..4778a9ed 100644
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

@@ -5547,12 +5648,14 @@ static void llm_load_hparams(
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 422524a8..b12d6566 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -240,6 +240,8 @@ namespace GGUFMeta {
return true;
}

// zero-out the per-layer hparams
- std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
- std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
- std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
+
template<typename T, size_t N_MAX>
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 306c557d..4f9bbf90 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -146,46 +146,6 @@ std::string llama_model_ftype_name(const llama_model & model) {
return llama_model_ftype_name(model.ftype);
}

-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- ggml_context_ptr ctx { ggml_init(params) };
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
-
- ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
- ggml_tensor * op_tensor = fn(ctx.get());
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- if (op_tensor->src[i] != nullptr) {
- assert(op_tensor->src[i]->buffer == nullptr);
- op_tensor->src[i]->buffer = buf.get();
- }
- }
-
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
- return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (buft_supported(cur_buft, cur_dev, fn)) {
- return cur_buft;
- }
- }
-
- throw std::runtime_error(format("no suitable buffer type found"));
-}
-
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
return select_buft(
*model.dev_layer.at(il).buft_list,
@@ -312,9 +272,11 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);

- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);

// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5601,7 +5704,7 @@ static void llm_load_hparams(
@@ -363,7 +325,7 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {

ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_FALCON) {
- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -5641,6 +5744,16 @@ static void llm_load_hparams(
@@ -405,6 +367,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
}
}
} break;
@@ -324,27 +502,120 @@ index 26be6254..4778a9ed 100644
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_MINICPM:
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7291,7 +7404,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
- {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};
@@ -2062,6 +2034,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {

// checks if the weight tensor can be used with the specified buffer type and device
@@ -7801,6 +7922,53 @@ static bool llm_load_tensors(
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index c1b9c0a1..5b23e2ba 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include "ggml-cpp.h"

#include <vector>
+#include <stdexcept>

// available models
// TODO: this enum does not follow the enum naming convention
@@ -62,6 +63,7 @@ enum llm_type {
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_671B,
@@ -278,6 +280,16 @@ struct llama_layer {

struct ggml_tensor * bskcn_tv = nullptr;

+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
@@ -376,6 +388,45 @@ std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);

+template<typename F>
+bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context_ptr ctx { ggml_init(params) };
+ if (!ctx) {
+ throw std::runtime_error("failed to create ggml context");
+ }
+
+ ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+ ggml_tensor * op_tensor = fn(ctx.get());
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op_tensor->src[i] != nullptr) {
+ op_tensor->src[i]->buffer = buf.get();
+ }
+ }
+
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+ return op_supported;
+}
+
+template<typename F>
+ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+ for (const auto & cur : buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (buft_supported(cur_buft, cur_dev, fn)) {
+ return cur_buft;
+ }
+ }
+
+ throw std::runtime_error("no suitable buffer type found");
+}
+
// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 42974f8f..27def6fd 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -629,7 +629,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}

size_t total_size_org = 0;
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 7dec50ae..bac66c24 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -563,6 +563,52 @@ static bool llm_load_tensors(
|
||||
}
|
||||
}
|
||||
} break;
|
||||
@@ -364,7 +635,6 @@ index 26be6254..4778a9ed 100644
|
||||
+ }
|
||||
+
|
||||
+ for (int i = 0; i < n_layer; ++i) {
|
||||
+
|
||||
+ auto & layer = model.layers[i];
|
||||
+
|
||||
+ if (hparams.cross_attention_layers(i)) {
|
||||
@@ -395,10 +665,10 @@ index 26be6254..4778a9ed 100644
|
||||
+ }
|
||||
+ }
|
||||
+ } break;
|
||||
case LLM_ARCH_MINICPM3:
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
@@ -9511,7 +9679,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@@ -2514,7 +2560,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
|
||||
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
|
||||
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
||||
@@ -407,7 +677,7 @@ index 26be6254..4778a9ed 100644
|
||||
}
|
||||
|
||||
if (params.vocab_only) {
|
||||
@@ -9594,6 +9762,21 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||
@@ -2598,6 +2644,21 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
@@ -429,7 +699,7 @@ index 26be6254..4778a9ed 100644
|
||||
static void llm_build_kv_store(
|
||||
struct ggml_context * ctx,
|
||||
const llama_hparams & hparams,
|
||||
@@ -10561,6 +10744,7 @@ struct llm_build_context {
|
||||
@@ -3593,6 +3654,7 @@ struct llm_build_context {
|
||||
lctx.inp_pos_bucket = nullptr;
|
||||
lctx.inp_embd_enc = nullptr;
|
||||
lctx.inp_KQ_mask_cross = nullptr;
|
||||
@@ -437,11 +707,11 @@ index 26be6254..4778a9ed 100644
|
||||
}
|
||||
|
||||
void free() {
|
||||
@@ -11040,6 +11224,240 @@ struct llm_build_context {
|
||||
@@ -4074,6 +4136,240 @@ struct llm_build_context {
|
||||
return gf;
|
||||
}
|
||||
|
||||
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -675,10 +945,10 @@ index 26be6254..4778a9ed 100644
+ return gf;
+ }
+
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * build_deci() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

@@ -16993,6 +17411,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -10646,6 +10942,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
@@ -686,33 +956,10 @@ index 26be6254..4778a9ed 100644
+ {
+ result = llm.build_mllama();
+ } break;
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_DECI:
{
result = llm.build_baichuan();
@@ -17258,10 +17680,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
}

if (ubatch.embd) {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_tokens = ubatch.n_tokens;
+ if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+ // zero out inp_embd since it's not used
+ float * inp_embd_data = (float *)lctx.inp_embd->data;
+ for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
+ inp_embd_data[i] = 0.0f;
+ }
+ } else {
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_tokens = ubatch.n_tokens;

- ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
}

if (ubatch.pos && lctx.inp_pos) {
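In the patched llama_set_inputs, ubatch.embd does double duty: when a cross-attention state tensor has been allocated, the batch's float payload is that state, so it is copied there and the token-embedding input is zeroed; otherwise it is the usual per-token embedding matrix. A standalone sketch of the routing decision (vector-backed stand-ins, hypothetical, not the real lctx tensors):

    #include <algorithm>
    #include <cstddef>
    #include <cstring>
    #include <vector>

    // Hypothetical stand-ins for the two input tensors involved.
    struct input_tensors {
        std::vector<float> inp_embd;             // token-embedding input
        std::vector<float> inp_cross_attn_state; // cross-attention state (empty if absent)
    };

    // Route the batch payload the way the hunk above does: into the
    // cross-attention state tensor when one exists (zeroing inp_embd, which
    // is unused in that case), otherwise into the token-embedding input.
    void set_embd_input(input_tensors & t, const float * batch_embd, size_t n_floats) {
        if (!t.inp_cross_attn_state.empty()) {
            std::memcpy(t.inp_cross_attn_state.data(), batch_embd,
                        t.inp_cross_attn_state.size() * sizeof(float));
            std::fill(t.inp_embd.begin(), t.inp_embd.end(), 0.0f);
        } else {
            std::memcpy(t.inp_embd.data(), batch_embd, n_floats * sizeof(float));
        }
    }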
@@ -17862,7 +18293,7 @@ static int llama_decode_internal(
result = llm.build_deci();
@@ -10971,7 +11271,7 @@ static int llama_decode_internal(
n_outputs = 1;
}

@@ -721,7 +968,7 @@ index 26be6254..4778a9ed 100644
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);

@@ -18172,7 +18603,7 @@ static int llama_encode_internal(
@@ -11282,7 +11582,7 @@ static int llama_encode_internal(

const int64_t n_embd = hparams.n_embd;

@@ -730,57 +977,11 @@ index 26be6254..4778a9ed 100644

const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);

@@ -19203,7 +19634,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}

size_t total_size_org = 0;
@@ -20360,6 +20793,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {

// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@@ -21790,6 +22224,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}

+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+ ctx->cparams.cross_attn = cross_attention;
+}
+
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens) {
@@ -21797,6 +22235,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
+ /*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21809,6 +22248,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
+ /*n_embd =*/ 0,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21817,6 +22257,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+ batch.n_embd = embd;
} else {
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
@@ -11775,6 +12075,7 @@ struct llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
+ /*.cross_attn =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};

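Taken together, the hunks above thread an n_embd field through llama_batch and add a per-context cross-attention switch. A hedged usage sketch against this fork's patched llama.h (ctx is an already-created llama_context; this is not upstream llama.cpp API):

    #include "llama.h"

    // Sketch only. A batch allocated with a non-zero embd now also records
    // n_embd, and cross-attention is toggled per context around decoding.
    void run_cross_attn_step(struct llama_context * ctx, int32_t n_tokens, int32_t n_embd_state) {
        llama_set_cross_attention(ctx, true); // setter added by this patch

        // embd != 0 allocates batch.embd and, with this patch, sets batch.n_embd
        struct llama_batch batch = llama_batch_init(n_tokens, n_embd_state, /*n_seq_max=*/ 1);

        // ... fill batch.embd with cross-attention state and call llama_decode ...

        llama_batch_free(batch);
        llama_set_cross_attention(ctx, false); // back to plain text decoding
    }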
@@ -15,7 +15,7 @@ Subject: [PATCH] add unpad operator
8 files changed, 220 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index b0c1ac9c..091e6e6b 100644
index c714fc8c..1bc50fca 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
@@ -26,7 +26,7 @@ index b0c1ac9c..091e6e6b 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1718,6 +1719,15 @@ extern "C" {
@@ -1735,6 +1736,15 @@ extern "C" {
int p0,
int p1);

@@ -43,7 +43,7 @@ index b0c1ac9c..091e6e6b 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 67e67a08..bebff207 100644
index b7fefb9d..b307d554 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
@@ -126,7 +126,7 @@ index 67e67a08..bebff207 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 8fd7c1a3..7c351b89 100644
index bb425ee8..1e7c2a22 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -139,7 +139,7 @@ index 8fd7c1a3..7c351b89 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3012,6 +3015,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -3013,6 +3016,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
@@ -211,7 +211,7 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 28f590f9..787fc713 100644
index a85502ee..84e027eb 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -311,6 +311,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
@@ -332,7 +332,7 @@ index 8ba43904..204c93e6 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 51cc8566..0e74e554 100644
index 2bbe5f48..7ffcd907 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -369,7 +369,7 @@ index 51cc8566..0e74e554 100644

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4180,6 +4182,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4214,6 +4216,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}


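GGML_OP_UNPAD, wired through the CPU, CUDA, and Metal backends above, is the inverse of GGML_OP_PAD: it narrows a tensor by keeping only the leading region of each dimension. A standalone sketch of the 2-D semantics (plain C++, not the ggml kernels; ne0/ne1 are the unpadded extents, p0/p1 the padding being removed):

    #include <cstddef>
    #include <vector>

    // Keep the top-left ne0 x ne1 region of a row-major (ne0 + p0) x (ne1 + p1)
    // float buffer; padded columns and rows are simply dropped.
    std::vector<float> unpad_2d(const std::vector<float> & src,
                                size_t ne0, size_t ne1, size_t p0, size_t p1) {
        (void) p1; // rows past ne1 are never read
        const size_t src_ne0 = ne0 + p0; // padded row stride
        std::vector<float> dst(ne0 * ne1);
        for (size_t i1 = 0; i1 < ne1; ++i1) {
            for (size_t i0 = 0; i0 < ne0; ++i0) {
                dst[i1 * ne0 + i0] = src[i1 * src_ne0 + i0];
            }
        }
        return dst;
    }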
@@ -11,10 +11,10 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8c9aaf5a..3e372dc3 100644
index 3fcfcaa3..8f44705a 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -375,7 +375,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
"[\r\n]",
@@ -24,7 +24,7 @@ index 8c9aaf5a..3e372dc3 100644
"\\s+$",
"[一-龥ࠀ-一가-]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 3d459263..51dd81fb 100644
index 7aca6544..6155da80 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@

@@ -10,7 +10,7 @@ Subject: [PATCH] relative include paths
3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index bebff207..d6dd5600 100644
index b307d554..4eb39c52 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10,7 +10,7 @@
@@ -23,7 +23,7 @@ index bebff207..d6dd5600 100644

#if defined(_MSC_VER) || defined(__MINGW32__)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index c390957a..1af5f7eb 100644
index f11399cc..2a8b40ce 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -4,8 +4,7 @@

@@ -19,12 +19,12 @@ multiple batches of processing until everything is complete.
1 file changed, 46 insertions(+), 53 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 4778a9ed..654e32bc 100644
index bac66c24..c95da45d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3025,6 +3025,13 @@ struct llama_kv_cache {
}
};
@@ -3536,6 +3536,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
}

+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
@@ -33,10 +33,10 @@ index 4778a9ed..654e32bc 100644
+ uint32_t len;
+};
+
struct llama_control_vector {
std::vector<struct ggml_tensor *> tensors; // per layer
std::vector<ggml_context_ptr> ctxs;
@@ -10802,35 +10809,23 @@ struct llm_build_context {
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -3712,35 +3719,23 @@ struct llm_build_context {
return gf;
}

@@ -78,7 +78,7 @@ index 4778a9ed..654e32bc 100644

ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -10838,31 +10833,29 @@ struct llm_build_context {
@@ -3748,31 +3743,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
@@ -118,7 +118,7 @@ index 4778a9ed..654e32bc 100644
}

//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -17325,7 +17318,7 @@ struct llm_build_context {
@@ -10856,7 +10849,7 @@ struct llm_build_context {
}
};

@@ -127,7 +127,7 @@ index 4778a9ed..654e32bc 100644
llama_ubatch dummy = {};
dummy.equal_seqs = true;

@@ -17335,7 +17328,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
@@ -10866,7 +10859,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const

llm.init();

@@ -136,7 +136,7 @@ index 4778a9ed..654e32bc 100644

llm.free();

@@ -18351,7 +18344,12 @@ static int llama_decode_internal(
@@ -11329,7 +11322,12 @@ static int llama_decode_internal(
kv_self.head = 0;
}

@@ -150,7 +150,7 @@ index 4778a9ed..654e32bc 100644
if (!slot) {
return 1;
}
@@ -18756,8 +18754,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -11735,8 +11733,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {

//const int64_t t_start = ggml_time_us();

@@ -161,7 +161,7 @@ index 4778a9ed..654e32bc 100644

// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -18821,19 +18819,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -11800,19 +11798,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;

@@ -181,7 +181,7 @@ index 4778a9ed..654e32bc 100644
cont = false;
continue;
}
@@ -18849,8 +18839,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -11828,8 +11818,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
kv_self.head = n_used;

if (!cont) {
@@ -193,7 +193,7 @@ index 4778a9ed..654e32bc 100644
}

nf++;
@@ -18860,22 +18852,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -11839,22 +11831,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
}
}

@@ -218,7 +218,7 @@ index 4778a9ed..654e32bc 100644

#if 0
// CPU defrag
@@ -18950,11 +18936,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -11929,11 +11915,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
#else
// ggml_graph defrag


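The defrag rework above replaces per-slot bookkeeping with explicit move descriptors, each covering a contiguous run of KV-cache slots, so the defrag graph needs far fewer tensors. A toy sketch of that planning step (hypothetical kv_move/plan_defrag names; the real code walks kv_self cells):

    #include <cstdint>
    #include <vector>

    // Mirrors the llama_kv_defrag_move idea: one contiguous block of
    // KV-cache slots to relocate, instead of per-slot moves.
    struct kv_move {
        uint32_t src;
        uint32_t dst;
        uint32_t len;
    };

    // Pack all used slots to the front of the cache, emitting one move per
    // contiguous run of used slots.
    std::vector<kv_move> plan_defrag(const std::vector<bool> & used) {
        std::vector<kv_move> moves;
        uint32_t dst = 0;
        for (size_t i = 0; i < used.size(); ) {
            if (!used[i]) { ++i; continue; }
            const size_t start = i;
            while (i < used.size() && used[i]) ++i;
            const uint32_t len = (uint32_t)(i - start);
            if (start != dst) {
                moves.push_back({(uint32_t) start, dst, len});
            }
            dst += len;
        }
        return moves;
    }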
llama/patches/0015-re-enable-gpu-for-clip.patch (new file)
@@ -0,0 +1,113 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 4 Jan 2025 22:52:48 -0800
Subject: [PATCH] re-enable gpu for clip

---
examples/llava/clip.cpp | 86 ++++++++++++++++++++---------------------
1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index b3c1829f..718052e1 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -8,25 +8,25 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"

-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//#include "ggml-sycl.h"
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//#include "ggml-metal.h"
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//#include "ggml-cann.h"
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//#include "ggml-vulkan.h"
-//#endif
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -1235,30 +1235,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}

-//#ifdef GGML_USE_CUDA
-// new_clip->backend = ggml_backend_cuda_init(0);
-// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_METAL
-// new_clip->backend = ggml_backend_metal_init();
-// LOG_INF("%s: CLIP using Metal backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_CANN
-// new_clip->backend = ggml_backend_cann_init(0);
-// LOG_INF("%s: CLIP using CANN backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-// new_clip->backend = ggml_backend_vk_init(0);
-// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-// new_clip->backend = ggml_backend_sycl_init(0);
-// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-//#endif
+#ifdef GGML_USE_CUDA
+ new_clip->backend = ggml_backend_cuda_init(0);
+ LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_METAL
+ new_clip->backend = ggml_backend_metal_init();
+ LOG_INF("%s: CLIP using Metal backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_CANN
+ new_clip->backend = ggml_backend_cann_init(0);
+ LOG_INF("%s: CLIP using CANN backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_VULKAN
+ new_clip->backend = ggml_backend_vk_init(0);
+ LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_SYCL
+ new_clip->backend = ggml_backend_sycl_init(0);
+ LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+#endif

if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
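The pattern being re-enabled is a compile-time cascade: every GPU backend compiled in gets a chance to initialize new_clip->backend, and ggml_backend_cpu_init() remains the fallback when none succeeds. A condensed sketch of the cascade (assumes the ggml backend headers; only the CUDA branch is shown, and the header that declares ggml_backend_cpu_init varies across ggml versions):

    #include "ggml-backend.h"
    #ifdef GGML_USE_CUDA
    #include "ggml-cuda.h"
    #endif

    // Pick a backend the way the re-enabled code does: try the GPU backend
    // that was compiled in, then fall back to CPU if initialization failed.
    static ggml_backend_t pick_clip_backend(void) {
        ggml_backend_t backend = NULL;
    #ifdef GGML_USE_CUDA
        backend = ggml_backend_cuda_init(0); // device 0, as in the patch
    #endif
        if (!backend) {
            backend = ggml_backend_cpu_init();
        }
        return backend;
    }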