llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)
@@ -10,7 +10,7 @@ Subject: [PATCH] cuda
3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index e2d6c405..a12172dc 100644
+index dba7be33..1ca40b2c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -22,10 +22,10 @@ index e2d6c405..a12172dc 100644

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 0b06be72..be29e979 100644
+index ebb2ccae..b094929b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -424,6 +424,7 @@ struct ggml_backend_cuda_buffer_context {
+@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
@@ -34,10 +34,10 @@ index 0b06be72..be29e979 100644

static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index a85502ee..cd8ef741 100644
+index c550142a..fd9a4e77 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4187,6 +4187,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}

free(ctx);
@@ -4,17 +4,17 @@ Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] pretokenizer

---
-src/llama-model.cpp | 14 +++-----------
+src/llama-vocab.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 405e0528..00b80c52 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1249,16 +1249,7 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index ad9ffe66..a4eee9b8 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1468,16 +1468,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
- if (tokenizer_pre.empty()) {
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
@@ -23,19 +23,19 @@ index 405e0528..00b80c52 100644
- LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
-- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (tokenizer_pre == "default") {
+ if (tokenizer_pre == "default") {
-vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
-@@ -1373,7 +1364,8 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
+@@ -1593,7 +1584,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "megrez") {
-vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
-+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
++ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (type == LLAMA_VOCAB_TYPE_SPM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
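Taken together, the hunks above change how the BPE pre-tokenizer is selected. A condensed sketch of the resulting logic in llama_vocab::impl::load(), paraphrased from the hunks rather than copied verbatim from the file:

    // After the patch: an unknown or missing pre-tokenizer name no longer throws;
    // it logs a warning and falls back to the default pre-tokenizer.
    if (type == LLAMA_VOCAB_TYPE_BPE) {
        add_space_prefix = false;
        clean_spaces = true;
        if (tokenizer_pre == "default") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        } else if (tokenizer_pre == "llama3" /* || ... the other recognized names ... */) {
            // each recognized name selects its matching LLAMA_VOCAB_PRE_TYPE_* value
        } else {
            // upstream behavior: throw std::runtime_error("unknown pre-tokenizer type: ...")
            LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }
    } else if (type == LLAMA_VOCAB_TYPE_SPM) {
        pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }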
@@ -9,10 +9,10 @@ Subject: [PATCH] embeddings
2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 38a55fb2..b9c4a5bf 100644
+index 671d2a81..47e79ed4 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
-@@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
+@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -22,10 +22,10 @@ index 38a55fb2..b9c4a5bf 100644

const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
diff --git a/src/llama.cpp b/src/llama.cpp
-index ea78ea48..4eb3f6b9 100644
+index 607f2786..ac85bfed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -10876,7 +10876,6 @@ static int llama_decode_internal(
+@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
@@ -33,7 +33,7 @@ index ea78ea48..4eb3f6b9 100644
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
-@@ -10884,12 +10883,15 @@ static int llama_decode_internal(
+@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
break;
}
}
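The llama.cpp hunk above locates the pooled-embeddings output by name rather than by graph position. A minimal sketch of the lookup it relies on (the assignment into embd is implied by the surrounding hunk, not shown verbatim here):

    // Walk the graph from the end and find the pooled-embeddings tensor by name.
    struct ggml_tensor * embd = nullptr;
    for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
        if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
            embd = ggml_graph_node(gf, i);
            break;
        }
    }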
@@ -8,10 +8,10 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 3cd0d2fa..b3c1829f 100644
+index 76d4a785..205af1eb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
-@@ -56,6 +56,19 @@
+@@ -58,6 +58,19 @@
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)

@@ -31,7 +31,7 @@ index 3cd0d2fa..b3c1829f 100644
//#define CLIP_DEBUG_FUNCTIONS

// RGB uint8 image
-@@ -1322,8 +1335,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1402,8 +1415,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
@@ -62,7 +62,7 @@ index 3cd0d2fa..b3c1829f 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
-@@ -1363,7 +1397,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1443,7 +1477,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
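Only the hunk boundaries of the clip-unicode patch are visible above; the inserted lines themselves are elided. Assuming the patch keeps its usual purpose (opening model paths that contain non-ASCII characters on Windows), the core of it looks roughly like this sketch; the helper name is illustrative, not taken from the patch:

    #ifdef _WIN32
    #include <windows.h>
    #include <string>
    // Convert a UTF-8 path to UTF-16 so the model file can be opened on Windows.
    static std::wstring utf8_to_utf16(const std::string & s) {
        int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
        std::wstring w(n, 0);
        MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, &w[0], n);
        return w;
    }
    #endif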
@@ -11,21 +11,21 @@ tensor to store the scalar. the scalar is implemented a 1-dimensional
tensor with 2 elements dervied from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
-src/llama-arch.cpp | 53 +++++++----
+src/llama-arch.cpp | 21 +++++
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
-src/llama-hparams.h | 5 +
+src/llama-hparams.h | 5 ++
src/llama-model-loader.cpp | 1 +
-src/llama-model.cpp | 16 ++++
+src/llama-model.cpp | 44 +++++++++++
src/llama-model.h | 3 +
-src/llama.cpp | 185 +++++++++++++++++++++++++++++++++++++
-8 files changed, 258 insertions(+), 16 deletions(-)
+src/llama.cpp | 152 ++++++++++++++++++++++++++++++++++++-
+8 files changed, 236 insertions(+), 1 deletion(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 007d79f8..5b376c5e 100644
+index 97a1e7e5..a1e0ebcc 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
-@@ -59,6 +59,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -33,48 +33,16 @@ index 007d79f8..5b376c5e 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -106,22 +107,23 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
-@@ -1240,6 +1242,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
-{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
+@@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
+ {
@@ -96,9 +64,9 @@ index 007d79f8..5b376c5e 100644
+ },
+ },
{
-LLM_ARCH_UNKNOWN,
+LLM_ARCH_WAVTOKENIZER_DEC,
{
-@@ -1372,6 +1392,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -107,10 +75,10 @@ index 007d79f8..5b376c5e 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 45e458bb..eac7055b 100644
+index 122fdceb..77919578 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
-@@ -63,6 +63,7 @@ enum llm_arch {
+@@ -65,6 +65,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -118,7 +86,7 @@ index 45e458bb..eac7055b 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN,
};
-@@ -126,6 +127,7 @@ enum llm_kv {
+@@ -129,6 +130,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
@@ -126,7 +94,7 @@ index 45e458bb..eac7055b 100644

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
-@@ -305,6 +307,7 @@ enum llm_tensor {
+@@ -311,6 +313,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -135,7 +103,7 @@ index 45e458bb..eac7055b 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index c4053469..450738da 100644
+index ea87b295..f3955de9 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
@@ -152,10 +120,10 @@ index c4053469..450738da 100644
+}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index a29f20ec..fd898e27 100644
+index 1fe45410..1bdcdfd5 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
-@@ -52,6 +52,8 @@ struct llama_hparams {
+@@ -50,6 +50,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

@@ -164,7 +132,7 @@ index a29f20ec..fd898e27 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
-@@ -134,6 +136,9 @@ struct llama_hparams {
+@@ -133,6 +135,9 @@ struct llama_hparams {

// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
@@ -175,23 +143,23 @@ index a29f20ec..fd898e27 100644

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index 7743b465..422524a8 100644
+index 05d58ad9..1252aca1 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
-@@ -364,6 +364,7 @@ namespace GGUFMeta {
+@@ -439,6 +439,7 @@ namespace GGUFMeta {
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);

-llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
-int trace = 0;
+llama_model_loader::llama_model_loader(
+const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 00b80c52..306c557d 100644
+index 36a0a009..ad1315c6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
-@@ -1091,6 +1091,21 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
-default: model.type = e_model::MODEL_UNKNOWN;
+@@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
@@ -200,52 +168,19 @@ index 00b80c52..306c557d 100644
+ for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr[i];
+ bskcn.fill(0);
-+ auto kv = LLM_KV(model.arch);
++ auto kv = LLM_KV(arch);
+ ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
-+ case 64: model.type = e_model::MODEL_22B; break;
-+ default: model.type = e_model::MODEL_UNKNOWN;
++ case 64: type = LLM_TYPE_22B; break;
++ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2065,6 +2080,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index ce038932..c1b9c0a1 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -54,6 +54,7 @@ enum llm_type {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -275,6 +276,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index 4eb3f6b9..7dec50ae 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -2206,6 +2206,35 @@ static bool llm_load_tensors(
+@@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -256,16 +191,16 @@ index 4eb3f6b9..7dec50ae 100644
+ } break;
+ case LLM_ARCH_SOLAR:
+ {
-+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
++ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
-+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
++ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
++ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
-+ auto & layer = model.layers[i];
++ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
@@ -277,16 +212,53 @@ index 4eb3f6b9..7dec50ae 100644
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -10226,6 +10255,158 @@ struct llm_build_context {
return gf;
}
@@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..1afb0024 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -55,6 +55,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -281,6 +282,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7953,9 +7953,155 @@ struct llm_build_context {
cb(img_logits, "img_logits", -1);
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
cb(cur, "result_output", -1);
-
ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
@@ -333,7 +305,7 @@ index 4eb3f6b9..7dec50ae 100644
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+

+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
@@ -422,25 +394,18 @@ index 4eb3f6b9..7dec50ae 100644
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
return gf;
}

-@@ -10660,6 +10841,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
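The commit message at the top of this patch describes the skip-connection scalar: a 2-element tensor holding (bskcn_tv, 1 - bskcn_tv). The hunks above apply it as in this condensed sketch (tensor names follow the patch body shown above):

    // Blend a hidden state saved at an earlier layer (bskcn_2) with the
    // current hidden state (inpSA): out = tv[0]*bskcn_2 + tv[1]*inpSA,
    // where tv = model.layers[il].bskcn_tv holds (bskcn_tv, 1 - bskcn_tv).
    struct ggml_tensor * tv  = model.layers[il].bskcn_tv;
    struct ggml_tensor * out = ggml_add(ctx0,
        ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, tv, 1, 0)),
        ggml_mul(ctx0, inpSA,   ggml_view_1d(ctx0, tv, 1, ggml_element_size(tv))));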
@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index be29e979..aaa79ea4 100644
+index b094929b..36165840 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2159,9 +2159,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2282,9 +2282,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst);
break;
@@ -15,27 +15,27 @@ remaining is to implement the cross attention mask
examples/llava/llava.cpp | 5 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 +
-src/llama-arch.cpp | 44 +++++
+src/llama-arch.cpp | 44 ++++++
src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 +
-src/llama-context.cpp | 19 ++-
+src/llama-context.cpp | 28 ++--
src/llama-context.h | 2 +
src/llama-cparams.h | 1 +
-src/llama-hparams.cpp | 8 +-
-src/llama-hparams.h | 4 +
-src/llama-kv-cache.cpp | 33 ++++
+src/llama-hparams.cpp | 6 +
+src/llama-hparams.h | 5 +
+src/llama-kv-cache.cpp | 13 +-
src/llama-model-loader.cpp | 2 +
-src/llama-model.cpp | 59 ++-----
-src/llama-model.h | 51 ++++++
+src/llama-model.cpp | 65 ++++++++-
+src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
-src/llama.cpp | 307 +++++++++++++++++++++++++++++++++-
-17 files changed, 508 insertions(+), 56 deletions(-)
+src/llama.cpp | 262 +++++++++++++++++++++++++++++++++-
+17 files changed, 452 insertions(+), 22 deletions(-)

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
-index 16f30c56..0f0f3f62 100644
+index 518aad3f..f0e484a1 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
-@@ -429,7 +429,7 @@ struct llava_embd_batch {
+@@ -445,7 +445,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
@@ -44,7 +44,7 @@ index 16f30c56..0f0f3f62 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
-@@ -441,6 +441,7 @@ struct llava_embd_batch {
+@@ -457,6 +457,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -52,7 +52,7 @@ index 16f30c56..0f0f3f62 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
-@@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
+@@ -480,7 +481,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
@@ -62,7 +62,7 @@ index 16f30c56..0f0f3f62 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 7ddd178b..899d16f2 100644
+index 955ed505..95036ef8 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -171,9 +171,9 @@ struct ggml_backend_registry {
@@ -79,10 +79,10 @@ index 7ddd178b..899d16f2 100644
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
-index a0d5ba5d..9f411960 100644
+index 47919602..cc948005 100644
--- a/include/llama.h
+++ b/include/llama.h
-@@ -250,6 +250,7 @@ extern "C" {
+@@ -249,6 +249,7 @@ extern "C" {

llama_token * token;
float * embd;
@@ -90,7 +90,7 @@ index a0d5ba5d..9f411960 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
-@@ -347,6 +348,7 @@ extern "C" {
+@@ -343,6 +344,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
@@ -98,9 +98,9 @@ index a0d5ba5d..9f411960 100644

// Abort callback
// if it returns true, execution of llama_decode() will be aborted
-@@ -426,6 +428,10 @@ extern "C" {
-struct llama_model * model,
-struct llama_context_params params);
+@@ -443,6 +445,10 @@ extern "C" {
+struct llama_context_params params),
+"use llama_init_from_model instead");

+ // TODO (jmorganca): this should most likely be passed in as part of a batch
+ // and not set on the context for all batches.
@@ -110,7 +110,7 @@ index a0d5ba5d..9f411960 100644
LLAMA_API void llama_free(struct llama_context * ctx);

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 5b376c5e..b35aeb31 100644
+index a1e0ebcc..b6f20286 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
@@ -121,15 +121,15 @@ index 5b376c5e..b35aeb31 100644
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
-@@ -124,6 +125,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
-{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
-{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
-+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
+@@ -127,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
++ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
-@@ -220,6 +222,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -225,6 +227,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
@@ -170,7 +170,7 @@ index 5b376c5e..b35aeb31 100644
{
LLM_ARCH_DECI,
{
-@@ -1393,6 +1429,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1450,6 +1486,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -186,7 +186,7 @@ index 5b376c5e..b35aeb31 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
-index eac7055b..e8235ae0 100644
+index 77919578..ec742224 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -10,6 +10,7 @@
@@ -197,7 +197,7 @@ index eac7055b..e8235ae0 100644
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
-@@ -128,6 +129,7 @@ enum llm_kv {
+@@ -131,6 +132,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
@@ -205,7 +205,7 @@ index eac7055b..e8235ae0 100644

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
-@@ -308,6 +310,14 @@ enum llm_tensor {
+@@ -314,6 +316,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
@@ -249,10 +249,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index b9c4a5bf..9d0e7ca3 100644
+index 47e79ed4..7b22fe13 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
-@@ -71,10 +71,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
+@@ -74,10 +74,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
}

if (ubatch.embd) {
@@ -275,7 +275,30 @@ index b9c4a5bf..9d0e7ca3 100644
}

if (ubatch.pos && lctx.inp_pos) {
@@ -653,6 +662,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -470,12 +479,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto & cparams = lctx.cparams;
const auto & hparams = lctx.model.hparams;
- const auto & vocab = lctx.model.vocab;

const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);

const auto n_batch = cparams.n_batch;
- const auto n_vocab = vocab.n_tokens();
+ const auto n_vocab = hparams.n_vocab;
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -542,7 +550,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
void llama_output_reorder(struct llama_context & ctx) {
std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = ctx.model.vocab.n_tokens();
+ const uint32_t n_vocab = ctx.model.hparams.n_vocab;
const uint32_t n_embd = ctx.model.hparams.n_embd;

const int32_t n_outputs = ctx.n_outputs;
@@ -657,6 +665,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}

@@ -286,8 +309,26 @@ index b9c4a5bf..9d0e7ca3 100644
void llama_synchronize(struct llama_context * ctx) {
ggml_backend_sched_synchronize(ctx->sched.get());

@@ -726,7 +738,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
}

- return ctx->logits + j*ctx->model.vocab.n_tokens();
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -886,7 +898,7 @@ struct llama_data_write {
}

void write_logits(const struct llama_context * ctx) {
- const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
+ const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);

write(&logits_size, sizeof(logits_size));

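The recurring substitution in this file, vocab.n_tokens() to hparams.n_vocab, decouples output-buffer sizing from the tokenizer: n_vocab now comes from the GGUF header key (read in llama-model.cpp below), so a model whose embedded token list disagrees with its output dimension, as with mllama's padded embeddings, still gets correctly sized logits buffers. In sketch form:

    const auto n_vocab_before = lctx.model.vocab.n_tokens();   // tokenizer-derived count
    const auto n_vocab_after  = lctx.model.hparams.n_vocab;    // header-declared vocabulary size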
diff --git a/src/llama-context.h b/src/llama-context.h
-index 0d163c47..4980a60e 100644
+index a9268b29..cf12c9d7 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -107,6 +107,8 @@ struct llama_context {
@@ -312,7 +353,7 @@ index 252012f3..9681e5a0 100644
enum llama_pooling_type pooling_type;

diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index 450738da..42f8a58f 100644
+index f3955de9..0b841028 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -2,6 +2,8 @@
@@ -328,18 +369,25 @@ index 450738da..42f8a58f 100644
}

GGML_ABORT("fatal error");
-}
\ No newline at end of file
+}
+
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index fd898e27..f826cd9a 100644
+index 1bdcdfd5..05383046 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
-@@ -53,6 +53,7 @@ struct llama_hparams {
+@@ -41,6 +41,7 @@ struct llama_hparams {
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0;

// for WavTokenizer
struct llama_hparams_posnet posnet;
@@ -51,6 +52,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
@@ -347,65 +395,45 @@ index fd898e27..f826cd9a 100644

uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
-@@ -139,6 +140,9 @@ struct llama_hparams {
+@@ -138,6 +140,9 @@ struct llama_hparams {

// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
+
+ // cross attention layers
+ bool cross_attention_layers(uint32_t il) const;
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
-index 53379253..cf814dbe 100644
+index feffdf0d..b541c5a3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
-@@ -72,6 +72,39 @@ bool llama_kv_cache_init(
-cache.v_l.reserve(n_layer);
+@@ -91,8 +91,17 @@ bool llama_kv_cache_init(
+return false;
+}

for (int i = 0; i < n_layer; i++) {
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ ggml_tensor * k, *v;
+
+ // for cross attention layers
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+ const llama_model::buft_list_t * buft_list;
+ if (offload) {
+ buft_list = model.dev_layer.at(i).buft_list;
+ } else {
+ buft_list = &model.cpu_buft_list;
+ }
+ ggml_backend_buffer_type_t buft = select_buft(*buft_list,
+ [&](ggml_context * ctx) {
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
+ return k;
+ }
+ ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+ return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
+ });
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+ return false;
+ }
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ ggml_format_name(k, "cache_k_l%d", i);
+ ggml_format_name(v, "cache_v_l%d", i);
+ cache.k_l.push_back(k);
+ cache.v_l.push_back(v);
+ continue;
+ k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ } else {
+ k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ }
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
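Condensing the updated llama_kv_cache_init() hunk, the resulting loop shape is as follows (the 6404 sequence length is taken verbatim from the patch; it appears to be the fixed length reserved for the vision cross-attention tokens):

    for (int i = 0; i < n_layer; i++) {
        ggml_tensor * k, * v;
        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
            // cross-attention layers: fixed-shape F32 K/V tensors for the image tokens
            k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
            v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
        } else {
            // all other layers: the usual 1-D (possibly quantized) cache tensors
            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
            v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
        }
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        cache.k_l.push_back(k);
        cache.v_l.push_back(v);
    }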
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index 422524a8..b12d6566 100644
+index 1252aca1..45d08721 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
-@@ -240,6 +240,8 @@ namespace GGUFMeta {
+@@ -315,6 +315,8 @@ namespace GGUFMeta {
return true;
}

@@ -415,80 +443,47 @@ index 422524a8..b12d6566 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 306c557d..4f9bbf90 100644
+index ad1315c6..21819080 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
-@@ -146,46 +146,6 @@ std::string llama_model_ftype_name(const llama_model & model) {
-return llama_model_ftype_name(model.ftype);
-}
+@@ -401,6 +401,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- ggml_context_ptr ctx { ggml_init(params) };
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
-
- ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
- ggml_tensor * op_tensor = fn(ctx.get());
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- if (op_tensor->src[i] != nullptr) {
- assert(op_tensor->src[i]->buffer == nullptr);
- op_tensor->src[i]->buffer = buf.get();
- }
- }
-
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
- return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (buft_supported(cur_buft, cur_dev, fn)) {
- return cur_buft;
- }
- }
-
- throw std::runtime_error(format("no suitable buffer type found"));
-}
-
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
return select_buft(
*model.dev_layer.at(il).buft_list,
@@ -312,9 +272,11 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);

// everything past this point is not vocab-related
if (hparams.vocab_only) {
@@ -412,6 +413,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);

if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -435,9 +437,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);

- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);

// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
-@@ -363,7 +325,7 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
+@@ -486,7 +490,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

-- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
-+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
+- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
++ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
-@@ -405,6 +367,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
+@@ -530,6 +534,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
}
} break;
@@ -497,145 +492,44 @@ index 306c557d..4f9bbf90 100644
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
-+ case 40: model.type = e_model::MODEL_11B; break;
-+ case 100: model.type = e_model::MODEL_90B; break;
-+ default: model.type = e_model::MODEL_UNKNOWN;
++ case 40: type = LLM_TYPE_11B; break;
++ case 100: type = LLM_TYPE_90B; break;
++ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2062,6 +2034,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {

// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index c1b9c0a1..5b23e2ba 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include "ggml-cpp.h"

#include <vector>
+#include <stdexcept>

// available models
// TODO: this enum does not follow the enum naming convention
@@ -62,6 +63,7 @@ enum llm_type {
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_671B,
@@ -278,6 +280,16 @@ struct llama_layer {

struct ggml_tensor * bskcn_tv = nullptr;

+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
@@ -376,6 +388,45 @@ std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);

+template<typename F>
+bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context_ptr ctx { ggml_init(params) };
+ if (!ctx) {
+ throw std::runtime_error("failed to create ggml context");
+ }
+
+ ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+ ggml_tensor * op_tensor = fn(ctx.get());
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op_tensor->src[i] != nullptr) {
+ op_tensor->src[i]->buffer = buf.get();
+ }
+ }
+
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+ return op_supported;
+}
+
+template<typename F>
+ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+ for (const auto & cur : buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (buft_supported(cur_buft, cur_dev, fn)) {
+ return cur_buft;
+ }
+ }
+
+ throw std::runtime_error("no suitable buffer type found");
+}
+
// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 42974f8f..27def6fd 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -629,7 +629,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}

size_t total_size_org = 0;
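Both the old and new revisions of the quantization hunk make the same change: the hard assert on the attn_v count becomes a warning, so architectures where not every layer carries a regular attention value weight (mllama's cross-attention layers, for instance) can still be quantized:

    // was: GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
    if (qs.n_attention_wv != n_attn_layer) {
        LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n",
                __func__, n_attn_layer, qs.n_attention_wv);
    }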
diff --git a/src/llama.cpp b/src/llama.cpp
index 7dec50ae..bac66c24 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -563,6 +563,52 @@ static bool llm_load_tensors(
|
||||
@@ -1398,7 +1412,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_ff = hparams.n_ff();
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
- const int64_t n_vocab = vocab.n_tokens();
|
||||
+ const int64_t n_vocab = hparams.n_vocab;
|
||||
const int64_t n_token_types = vocab.n_token_types();
|
||||
const int64_t n_rot = hparams.n_rot;
|
||||
const int64_t n_expert = hparams.n_expert;
|
||||
@@ -1581,6 +1595,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
} break;
|
||||
+ case LLM_ARCH_MLLAMA:
|
||||
+ {
|
||||
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
|
||||
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+
+ // output
+ {
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+ auto & layer = layers[i];
+
+ if (hparams.cross_attention_layers(i)) {
+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
@@ -667,17 +561,72 @@ index 7dec50ae..bac66c24 100644
+ } break;
case LLM_ARCH_DECI:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2514,7 +2560,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3925,6 +3985,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {

if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
- throw std::runtime_error("vocab size mismatch");
+ LLAMA_LOG_WARN("%s: vocab mismatch %u !- %zu ...\n", __func__, model.hparams.n_vocab, model.vocab.id_to_token.size());
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 1afb0024..7cf57587 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include <string>
#include <unordered_map>
#include <vector>
+#include <stdexcept>

struct llama_model_loader;

@@ -63,6 +64,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -284,6 +286,16 @@ struct llama_layer {

struct ggml_tensor * bskcn_tv = nullptr;

+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;

struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb798265..6eb1da08 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -632,7 +632,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}

if (params.vocab_only) {
@@ -2598,6 +2644,21 @@ static struct ggml_tensor * llm_build_inp_embd(
size_t total_size_org = 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index 6d320ea4..8f7902df 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -154,6 +154,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}

@@ -699,7 +648,7 @@ index 7dec50ae..bac66c24 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -3593,6 +3654,7 @@ struct llm_build_context {
@@ -1157,6 +1172,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
@@ -707,12 +656,12 @@ index 7dec50ae..bac66c24 100644
}

void free() {
@@ -4074,6 +4136,240 @@ struct llm_build_context {
@@ -1639,6 +1655,240 @@ struct llm_build_context {
return gf;
}

+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
@@ -946,9 +895,9 @@ index 7dec50ae..bac66c24 100644
+ }
+
struct ggml_cgraph * build_deci() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

@@ -10646,6 +10942,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -8344,6 +8594,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
@@ -959,16 +908,33 @@ index 7dec50ae..bac66c24 100644
case LLM_ARCH_DECI:
{
result = llm.build_deci();
@@ -10971,7 +11271,7 @@ static int llama_decode_internal(
@@ -8634,7 +8888,7 @@ static int llama_prepare_sbatch(
n_outputs = 1;
}

- lctx.sbatch.from_batch(batch, n_embd,
+ lctx.sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self.recurrent,
/* simple_split */ !lctx.kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);

@@ -11282,7 +11582,7 @@ static int llama_encode_internal(
@@ -8749,7 +9003,6 @@ static int llama_decode_impl(
const llama_batch & batch = batch_allocr.batch;

const auto & model = lctx.model;
- const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;

@@ -8760,7 +9013,7 @@ static int llama_decode_impl(
llama_kv_slot_restorer kv_slot_restorer(kv_self);

const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_vocab = hparams.n_vocab;

uint32_t n_outputs = 0;
uint32_t n_outputs_prev = 0;
@@ -9025,7 +9278,7 @@ static int llama_encode_impl(

const int64_t n_embd = hparams.n_embd;

@@ -977,7 +943,7 @@ index 7dec50ae..bac66c24 100644

const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);

@@ -11775,6 +12075,7 @@ struct llama_context_params llama_context_default_params() {
@@ -9511,6 +9764,7 @@ struct llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,

@@ -15,10 +15,10 @@ Subject: [PATCH] add unpad operator
8 files changed, 220 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index c714fc8c..1bc50fca 100644
index dd0c6a96..8d269a9c 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
@@ -487,6 +487,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
@@ -26,7 +26,7 @@ index c714fc8c..1bc50fca 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1735,6 +1736,15 @@ extern "C" {
@@ -1743,6 +1744,15 @@ extern "C" {
int p0,
int p1);

@@ -43,10 +43,10 @@ index c714fc8c..1bc50fca 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b7fefb9d..b307d554 100644
index 72325349..2f606d82 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
@@ -10844,6 +10844,59 @@ static void ggml_compute_forward_pad_reflect_1d(
}
}

@@ -106,7 +106,7 @@ index b7fefb9d..b307d554 100644
// ggml_compute_forward_arange

static void ggml_compute_forward_arange_f32(
@@ -12690,6 +12743,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
@@ -13137,6 +13190,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
@@ -117,7 +117,7 @@ index b7fefb9d..b307d554 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -13033,6 +13090,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
@@ -13484,6 +13541,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -126,10 +126,10 @@ index b7fefb9d..b307d554 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index aaa79ea4..9286f866 100644
index 36165840..1adf08fa 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2082,6 +2082,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2198,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
@@ -139,8 +139,8 @@ index aaa79ea4..9286f866 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3010,6 +3013,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return ggml_is_contiguous(op->src[0]);
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
@@ -148,7 +148,7 @@ index aaa79ea4..9286f866 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index aba539e8..39fd4b16 100644
index aba539e8..b4b87409 100644
--- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -201,6 +201,7 @@ index aba539e8..39fd4b16 100644
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
\ No newline at end of file
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
index 8fd386b0..e2ededc3 100644
--- a/ggml/src/ggml-cuda/pad.cuh
@@ -211,10 +212,10 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
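
The unpad operator declared above is the inverse of GGML_OP_PAD: it crops a padded tensor back down to a smaller target shape. The CUDA kernel body is elided from this view; purely as an illustration (a plain C++ sketch under that assumption, not the vendored kernel), the 2D case reduces to copying the leading region of the source and dropping the padded tail:

    #include <cassert>
    #include <cstdint>

    // copy the top-left dst_ne0 x dst_ne1 region of src into dst,
    // discarding the trailing (padded) rows and columns
    static void unpad_f32_2d(const float * src, int64_t src_ne0, int64_t src_ne1,
                             float * dst, int64_t dst_ne0, int64_t dst_ne1) {
        assert(dst_ne0 <= src_ne0 && dst_ne1 <= src_ne1);
        for (int64_t i1 = 0; i1 < dst_ne1; ++i1) {
            for (int64_t i0 = 0; i0 < dst_ne0; ++i0) {
                dst[i1 * dst_ne0 + i0] = src[i1 * src_ne0 + i0];
            }
        }
    }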
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index cd8ef741..318addec 100644
index fd9a4e77..e4c093f9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -311,6 +311,7 @@ enum ggml_metal_kernel_type {
@@ -331,6 +331,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
@@ -222,7 +223,7 @@ index cd8ef741..318addec 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -910,6 +911,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
@@ -946,6 +947,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
@@ -230,7 +231,7 @@ index cd8ef741..318addec 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1145,6 +1147,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1254,6 +1256,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -238,7 +239,7 @@ index cd8ef741..318addec 100644
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -3348,6 +3351,36 @@ static void ggml_metal_encode_node(
@@ -3469,6 +3472,36 @@ static void ggml_metal_encode_node(

const int nth = MIN(1024, ne0);

@@ -276,10 +277,10 @@ index cd8ef741..318addec 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8ba43904..204c93e6 100644
index d092a169..f38909d0 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2944,6 +2944,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -2953,6 +2953,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}

@@ -332,10 +333,10 @@ index 8ba43904..204c93e6 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 2bbe5f48..7ffcd907 100644
index 7fc06724..635aa299 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -962,6 +962,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
@@ -343,16 +344,16 @@ index 2bbe5f48..7ffcd907 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -996,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1059,6 +1060,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
@@ -360,16 +361,16 @@ index 2bbe5f48..7ffcd907 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1093,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4214,6 +4216,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4225,6 +4227,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}


@@ -11,10 +11,10 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 3fcfcaa3..8f44705a 100644
index a4eee9b8..1ca827eb 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -375,7 +375,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -295,7 +295,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
"[\r\n]",
@@ -24,7 +24,7 @@ index 3fcfcaa3..8f44705a 100644
"\\s+$",
"[一-龥ࠀ-一가-]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 7aca6544..6155da80 100644
index e63bb4ab..9dd53b9a 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
@@ -39,7 +39,7 @@ index 7aca6544..6155da80 100644
#include "unicode.h"
#include "unicode-data.h"

@@ -201,6 +206,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
@@ -200,6 +205,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}

static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -62,7 +62,7 @@ index 7aca6544..6155da80 100644
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
@@ -214,6 +235,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -213,6 +234,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif

return conv.from_bytes(s);

@@ -8,11 +8,11 @@ Subject: [PATCH] Maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index dadc18c8..2a8dbd22 100644
index 3ebcc3d9..30c28808 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -391,7 +391,7 @@ class SchemaConverter {
private:
@@ -346,7 +346,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
- std::map<std::string, std::string> _rules;
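
The removed member is the whole story of the "Maintain ordering" patch: std::map iterates keys in sorted order, so grammar rules were being emitted alphabetically instead of in definition order. The replacement container is not visible in this hunk; the sketch below uses a vector of pairs purely as an assumed stand-in to show the difference:

    // Illustrative only: std::map reorders keys lexicographically, while a
    // paired vector (an assumption, not quoted from the patch) preserves
    // the order rules were inserted in.
    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        std::map<std::string, std::string> sorted_rules;
        std::vector<std::pair<std::string, std::string>> ordered_rules;

        for (auto name : {"root", "array", "value"}) {
            sorted_rules[name] = "...";
            ordered_rules.emplace_back(name, "...");
        }

        for (const auto & [k, v] : sorted_rules)  std::cout << k << ' '; // array root value
        std::cout << '\n';
        for (const auto & [k, v] : ordered_rules) std::cout << k << ' '; // root array value
        std::cout << '\n';
    }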

@@ -1,22 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 14 Dec 2024 12:54:00 -0800
Subject: [PATCH] fix missing arg in static assert on windows

---
ggml/src/ggml-cuda/concat.cu | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
index 2f42b8a9..5eb9f08d 100644
--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
- static_assert(dim >= 0 && dim <= 3);
+ static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");

const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
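
The one-line fix above is needed because the single-argument form of static_assert only became legal in C++17; when this CUDA file is compiled under a C++14 host toolchain (as older MSVC setups on Windows are), the message argument is mandatory. A minimal standalone illustration:

    // C++11/14: message argument required; C++17 and later: optional.
    // The two-argument form compiles under every standard, so it is the
    // portable choice for code that must also build with older MSVC hosts.
    template <int dim>
    constexpr int checked_dim() {
        static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");
        return dim;
    }

    int main() {
        return checked_dim<2>(); // checked_dim<5>() would fail to compile
    }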
@@ -19,10 +19,10 @@ multiple batches of processing until everything is complete.
1 file changed, 46 insertions(+), 53 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index bac66c24..c95da45d 100644
index 8f7902df..01854fce 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3536,6 +3536,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
@@ -1054,6 +1054,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
}

@@ -36,13 +36,13 @@ index bac66c24..c95da45d 100644
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -3712,35 +3719,23 @@ struct llm_build_context {
@@ -1230,35 +1237,23 @@ struct llm_build_context {
return gf;
}

- struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+ struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
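
Note the signature change: build_defrag now receives one descriptor per contiguous block to relocate, rather than one id per KV cell, so the graph size tracks the number of blocks instead of the cache size. The struct definition sits outside this hunk; a plausible shape (an assumption for illustration, not quoted from the patch) is:

    #include <cstdint>

    // one entry per contiguous run of KV cells to relocate
    struct llama_kv_defrag_move {
        uint32_t src; // first source cell
        uint32_t dst; // first destination cell
        uint32_t len; // number of cells in the run
    };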
@@ -78,7 +78,7 @@ index bac66c24..c95da45d 100644

ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -3748,31 +3743,29 @@ struct llm_build_context {
@@ -1266,31 +1261,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
@@ -118,7 +118,7 @@ index bac66c24..c95da45d 100644
}

//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -10856,7 +10849,7 @@ struct llm_build_context {
@@ -8508,7 +8501,7 @@ struct llm_build_context {
}
};

@@ -127,7 +127,7 @@ index bac66c24..c95da45d 100644
llama_ubatch dummy = {};
dummy.equal_seqs = true;

@@ -10866,7 +10859,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
@@ -8518,7 +8511,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const

llm.init();

@@ -136,21 +136,21 @@ index bac66c24..c95da45d 100644

llm.free();

@@ -11329,7 +11322,12 @@ static int llama_decode_internal(
kv_self.head = 0;
}
@@ -8956,7 +8949,12 @@ static int llama_prepare_ubatch(
kv_self.head = 0;
}

- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
@@ -11735,8 +11733,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
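
Both hunks above apply the same pattern: rather than failing the decode outright when no contiguous KV slot is free, the cache is defragmented once and the lookup retried. Stripped of llama.cpp specifics, the control flow is (a hedged sketch; find_slot and defrag stand in for the real calls):

    #include <optional>

    template <typename Cache, typename Batch>
    bool place_batch(Cache & cache, const Batch & batch) {
        std::optional<int> slot = cache.find_slot(batch);
        if (!slot) {
            cache.defrag();              // compact free space, then try once more
            slot = cache.find_slot(batch);
        }
        return slot.has_value();         // caller returns an error if still full
    }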
@@ -9431,8 +9429,8 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {

//const int64_t t_start = ggml_time_us();

@@ -161,7 +161,7 @@ index bac66c24..c95da45d 100644

// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -11800,19 +11798,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9496,19 +9494,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;

@@ -181,7 +181,7 @@ index bac66c24..c95da45d 100644
cont = false;
continue;
}
@@ -11828,8 +11818,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9524,8 +9514,10 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
kv_self.head = n_used;

if (!cont) {
@@ -193,7 +193,7 @@ index bac66c24..c95da45d 100644
}

nf++;
@@ -11839,22 +11831,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9535,22 +9527,16 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
}
}

@@ -218,7 +218,7 @@ index bac66c24..c95da45d 100644

#if 0
// CPU defrag
@@ -11929,11 +11915,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9625,11 +9611,18 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
#else
// ggml_graph defrag
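
The commit message at the top of this patch ("multiple batches of processing until everything is complete") describes the other half of the change: since each move costs roughly 6*n_layer graph nodes, moves are applied in graph-sized chunks rather than all at once. A schematic version, with assumed helper names rather than the vendored code:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct move { uint32_t src, dst, len; };

    // apply_batch stands in for building and computing one defrag graph
    template <typename ApplyFn>
    void defrag_in_batches(const std::vector<move> & moves,
                           std::size_t max_moves_per_graph, ApplyFn apply_batch) {
        for (std::size_t i = 0; i < moves.size(); i += max_moves_per_graph) {
            const std::size_t end = std::min(moves.size(), i + max_moves_per_graph);
            apply_batch(&moves[i], end - i); // one node-bounded graph per batch
        }
    }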

@@ -8,12 +8,12 @@ Subject: [PATCH] use dynamic backend loading for clip
1 file changed, 27 insertions(+), 47 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index b3c1829f..86b91d5c 100644
index 205af1eb..560021c7 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -8,25 +8,25 @@
#include "ggml-alloc.h"
@@ -9,25 +9,25 @@
#include "ggml-backend.h"
#include "gguf.h"

-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
@@ -56,7 +56,7 @@ index b3c1829f..86b91d5c 100644

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1309,35 +1309,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}

@@ -8,7 +8,7 @@ Subject: [PATCH] sort devices by score
1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 899d16f2..135f7df0 100644
index 95036ef8..98d5e14d 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 84101c32..72b488dd 100644
index 0002ac18..0a8d1092 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -297,6 +297,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()

ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,7 +19,7 @@ index 84101c32..72b488dd 100644
endfunction()

ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -305,6 +306,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
@@ -8,7 +8,7 @@ Subject: [PATCH] try/catch backend load
1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 135f7df0..84b21dd8 100644
index 98d5e14d..1c19129a 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -512,32 +512,33 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
@@ -1,55 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 9 Feb 2025 17:22:15 -0800
Subject: [PATCH] remove sgemm global variables

removes the 'iq4nlt' global variable in sgemm.cpp that causes
a runtime crash when calling dlopen on ggml-cpu libraries as
its initialization depends on AVX instructions the host machine
may not have
---
ggml/src/ggml-cpu/llamafile/sgemm.cpp | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index 8fce576c..3f260ce5 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -279,14 +279,6 @@ template <> inline __m256bh load(const float *p) {
}
#endif

-////////////////////////////////////////////////////////////////////////////////////////////////////
-// CONSTANTS
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
-static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
-#endif
-
////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION

@@ -613,6 +605,14 @@ class tinyBLAS_Q0_AVX {
TC *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+ const int8_t kvalues_iq4nl[16] = {
+ -127, -104, -83, -65,
+ -49, -35, -22, -10,
+ 1, 13, 25, 38,
+ 53, 69, 89, 113
+ };
+
+ iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}

void matmul(int64_t m, int64_t n) {
@@ -1037,6 +1037,7 @@ class tinyBLAS_Q0_AVX {
const int64_t ldc;
const int ith;
const int nth;
+ __m128i iq4nlt;
};
#endif // __AVX__
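
The crash this patch removes is a static-initialization hazard: a namespace-scope __m128i is initialized by _mm_loadu_si128 when the shared object is loaded, that is, inside dlopen, before any runtime CPU check can intervene. Because the AVX build variants of sgemm.cpp are compiled with AVX enabled, that initializer is emitted as AVX-encoded instructions, which fault on hosts without AVX. Moving the load into the constructor defers it until the loader has already picked an AVX-capable variant. In miniature (illustrative only, not the vendored code):

    #include <cstdint>
    #include <immintrin.h>

    // BAD: a file-scope initializer like
    //     static const __m128i table = _mm_loadu_si128((const __m128i *) values);
    // runs at library-load time, inside dlopen, before any CPU-feature check.

    struct uses_table {
        __m128i table; // now a member, loaded only when an object is constructed
        explicit uses_table(const int8_t (&values)[16])
            : table(_mm_loadu_si128((const __m128i *) values)) {}
    };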

@@ -8,7 +8,7 @@ Subject: [PATCH] use std::filesystem::path instead of wstring
1 file changed, 58 insertions(+), 86 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 84b21dd8..e35a6936 100644
index 1c19129a..c854e6bb 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -66,26 +66,6 @@
@@ -8,10 +8,10 @@ Subject: [PATCH] remove amx
1 file changed, 4 deletions(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 72b488dd..50828717 100644
index 0a8d1092..4564df91 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -312,10 +312,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
@@ -19,6 +19,6 @@ index 72b488dd..50828717 100644
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
else ()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
endif()
llama/patches/0018-fix-clip-compiler-error.patch (new file, 36 lines)
@@ -0,0 +1,36 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 25 Feb 2025 19:14:51 -0800
Subject: [PATCH] fix-clip-compiler-error

---
examples/llava/clip.cpp | 2 +-
examples/llava/clip.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 560021c7..54265beb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1788,7 +1788,7 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
}
}

-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img) {
img->nx = nx;
img->ny = ny;
img->buf.resize(3 * nx * ny);
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index ce6f6194..f9f80d7d 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -75,7 +75,7 @@ CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
-CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img);
+CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);

CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);