mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
llama: update to commit e1e8e099 (#10513)
This commit is contained in:
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
|
||||
|
||||
/**
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index a7febef7..31750b6f 100644
|
||||
index 9fb2134f..04ce764e 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
|
||||
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
|
||||
|
||||
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 266d8af4..12886cd3 100644
|
||||
index d92392ed..425524d0 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
}
|
||||
|
||||
free(ctx);
|
||||
@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
|
||||
|
||||
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
index a0667b7d..bd83adc5 100644
|
||||
index 140a775f..e33c4ba0 100644
|
||||
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
delete ctx;
|
||||
@@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644
|
||||
|
||||
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
index 1de34c96..4600f61e 100644
|
||||
index 66b6f2cc..e3e6deae 100644
|
||||
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
|
||||
delete ctx;
|
||||
@@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
@@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
|
||||
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_sycl_host_free(buffer->context);
|
||||
@@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index 39f3cd34..c569a8a5 100644
|
||||
index c0bdb9e1..03d03064 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
||||
delete ctx;
|
||||
@@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
|
||||
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
|
||||
|
||||
@@ -10,7 +10,7 @@ logs instead of throwing an error
|
||||
1 file changed, 3 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 48060517..a35b498c 100644
|
||||
index 50ded286..a9ee9f03 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -31,7 +31,7 @@ index 48060517..a35b498c 100644
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
||||
clean_spaces = false;
|
||||
} else {
|
||||
|
||||
@@ -11,10 +11,10 @@ instead of forcing one or the error
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 983385f8..32f59819 100644
|
||||
index 5a2eef9b..9c1fe93f 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
int64_t n_outputs_all = 0;
|
||||
|
||||
// count outputs
|
||||
@@ -23,7 +23,7 @@ index 983385f8..32f59819 100644
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
n_outputs_all += batch.logits[i] != 0;
|
||||
}
|
||||
@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
||||
//}
|
||||
|
||||
@@ -32,7 +32,7 @@ index 983385f8..32f59819 100644
|
||||
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
|
||||
|
||||
if (t_embd && res->get_embd_pooled()) {
|
||||
@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
|
||||
@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
|
||||
1 file changed, 39 insertions(+)
|
||||
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index 75970615..d57b4bd6 100644
|
||||
index ad3e7df1..b3218c78 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -29,6 +29,19 @@
|
||||
#include <limits>
|
||||
@@ -30,6 +30,19 @@
|
||||
#include <array>
|
||||
#include <numeric>
|
||||
|
||||
+#if defined(_WIN32)
|
||||
+#define WIN32_LEAN_AND_MEAN
|
||||
@@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644
|
||||
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
|
||||
|
||||
//#define CLIP_DEBUG_FUNCTIONS
|
||||
@@ -1430,7 +1443,29 @@ struct clip_model_loader {
|
||||
@@ -1971,7 +1984,29 @@ struct clip_model_loader {
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
@@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
@@ -1457,7 +1492,11 @@ struct clip_model_loader {
|
||||
@@ -1998,7 +2033,11 @@ struct clip_model_loader {
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
|
||||
7 files changed, 248 insertions(+)
|
||||
|
||||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index 62e1480b..f754bc8f 100644
|
||||
index f2bc8ca7..5ab3f572 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
@@ -69,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_GRANITE, "granite" },
|
||||
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
|
||||
{ LLM_ARCH_CHAMELEON, "chameleon" },
|
||||
@@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644
|
||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
@@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
||||
@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
@@ -1502,6 +1504,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
@@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644
|
||||
{
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
{
|
||||
@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
@@ -1680,6 +1700,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
@@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644
|
||||
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index 98ca00a1..439aaeab 100644
|
||||
index 41a023da..525c1b7d 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -72,6 +72,7 @@ enum llm_arch {
|
||||
@@ -73,6 +73,7 @@ enum llm_arch {
|
||||
LLM_ARCH_GRANITE,
|
||||
LLM_ARCH_GRANITE_MOE,
|
||||
LLM_ARCH_CHAMELEON,
|
||||
@@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
LLM_ARCH_PLM,
|
||||
LLM_ARCH_BAILINGMOE,
|
||||
@@ -144,6 +145,7 @@ enum llm_kv {
|
||||
@@ -146,6 +147,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
@@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
||||
@@ -344,6 +346,7 @@ enum llm_tensor {
|
||||
@@ -346,6 +348,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ENC_OUTPUT_NORM,
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
@@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644
|
||||
if (il < n_layer) {
|
||||
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
|
||||
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
|
||||
index 80fcd65d..6e278945 100644
|
||||
index 7ee6a5b7..48dce407 100644
|
||||
--- a/src/llama-hparams.h
|
||||
+++ b/src/llama-hparams.h
|
||||
@@ -55,6 +55,8 @@ struct llama_hparams {
|
||||
@@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
uint32_t n_lora_kv = 0;
|
||||
@@ -153,6 +155,9 @@ struct llama_hparams {
|
||||
@@ -154,6 +156,9 @@ struct llama_hparams {
|
||||
// dimension of the recurrent state embeddings
|
||||
uint32_t n_embd_v_s() const;
|
||||
|
||||
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
|
||||
llama_model_loader::llama_model_loader(
|
||||
const std::string & fname,
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index 6b7bfecf..aba42819 100644
|
||||
index 822e2bb2..572378c9 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
@@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
|
||||
@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
@@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644
|
||||
struct llm_build_wavtokenizer_dec : public llm_graph_context {
|
||||
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
ggml_tensor * cur;
|
||||
@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
{
|
||||
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
|
||||
} break;
|
||||
@@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
|
||||
@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
@@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index fd82d106..5865d5e9 100644
|
||||
index 95eca002..856e6042 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -62,6 +62,7 @@ enum llm_type {
|
||||
@@ -64,6 +64,7 @@ enum llm_type {
|
||||
LLM_TYPE_15B,
|
||||
LLM_TYPE_16B,
|
||||
LLM_TYPE_20B,
|
||||
+ LLM_TYPE_22B,
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_30B,
|
||||
LLM_TYPE_32B,
|
||||
LLM_TYPE_34B,
|
||||
@@ -307,6 +308,8 @@ struct llama_layer {
|
||||
@@ -311,6 +312,8 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_up_scale = nullptr;
|
||||
struct ggml_tensor * ffn_down_scale = nullptr;
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ Subject: [PATCH] add mllama support
|
||||
|
||||
adds support for the llama 3.2 vision architecture
|
||||
---
|
||||
examples/llava/gemma3-cli.cpp | 3 +-
|
||||
examples/llava/llava.cpp | 5 +-
|
||||
examples/llava/mtmd.cpp | 6 +-
|
||||
ggml/src/ggml-backend-reg.cpp | 6 +-
|
||||
@@ -25,34 +24,13 @@ adds support for the llama 3.2 vision architecture
|
||||
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
|
||||
src/llama-model.h | 12 ++
|
||||
src/llama-quant.cpp | 4 +-
|
||||
20 files changed, 475 insertions(+), 22 deletions(-)
|
||||
19 files changed, 473 insertions(+), 21 deletions(-)
|
||||
|
||||
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
|
||||
index 3d566475..654d1358 100644
|
||||
--- a/examples/llava/gemma3-cli.cpp
|
||||
+++ b/examples/llava/gemma3-cli.cpp
|
||||
@@ -106,7 +106,7 @@ struct decode_embd_batch {
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -118,6 +118,7 @@ struct decode_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
+ /*n_embd =*/ n_embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
|
||||
index 03a22cbb..5eb40bcd 100644
|
||||
index c00d16ae..bab027b5 100644
|
||||
--- a/examples/llava/llava.cpp
|
||||
+++ b/examples/llava/llava.cpp
|
||||
@@ -456,7 +456,7 @@ struct llava_embd_batch {
|
||||
@@ -457,7 +457,7 @@ struct llava_embd_batch {
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
@@ -61,7 +39,7 @@ index 03a22cbb..5eb40bcd 100644
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -468,6 +468,7 @@ struct llava_embd_batch {
|
||||
@@ -469,6 +469,7 @@ struct llava_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
@@ -69,7 +47,7 @@ index 03a22cbb..5eb40bcd 100644
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
@@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
n_eval = n_batch;
|
||||
}
|
||||
float * embd = image_embed->embed+i*n_embd;
|
||||
@@ -79,19 +57,19 @@ index 03a22cbb..5eb40bcd 100644
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
|
||||
index 3fd5bebc..f0cec596 100644
|
||||
index 7081fd73..c14ac501 100644
|
||||
--- a/examples/llava/mtmd.cpp
|
||||
+++ b/examples/llava/mtmd.cpp
|
||||
@@ -233,7 +233,7 @@ struct decode_embd_batch {
|
||||
@@ -476,7 +476,7 @@ struct decode_embd_batch {
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
pos .resize(n_tokens * n_pos_per_embd);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -245,6 +245,7 @@ struct decode_embd_batch {
|
||||
@@ -487,6 +487,7 @@ struct decode_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
@@ -99,16 +77,16 @@ index 3fd5bebc..f0cec596 100644
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
|
||||
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
|
||||
@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
int32_t i_batch = 0;
|
||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
|
||||
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
|
||||
+ decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0);
|
||||
int64_t t1 = ggml_time_ms();
|
||||
ret = llama_decode(lctx, batch_img.batch);
|
||||
if (ret != 0) {
|
||||
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
|
||||
|
||||
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
|
||||
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index 405d8e31..82ae1b5b 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -127,10 +105,10 @@ index 405d8e31..82ae1b5b 100644
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
diff --git a/include/llama.h b/include/llama.h
|
||||
index 5657fbf0..f91896e4 100644
|
||||
index 06c56395..f1628e88 100644
|
||||
--- a/include/llama.h
|
||||
+++ b/include/llama.h
|
||||
@@ -255,6 +255,7 @@ extern "C" {
|
||||
@@ -256,6 +256,7 @@ extern "C" {
|
||||
|
||||
llama_token * token;
|
||||
float * embd;
|
||||
@@ -138,7 +116,7 @@ index 5657fbf0..f91896e4 100644
|
||||
llama_pos * pos;
|
||||
int32_t * n_seq_id;
|
||||
llama_seq_id ** seq_id;
|
||||
@@ -357,6 +358,7 @@ extern "C" {
|
||||
@@ -358,6 +359,7 @@ extern "C" {
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
bool no_perf; // whether to measure performance timings
|
||||
@@ -146,7 +124,7 @@ index 5657fbf0..f91896e4 100644
|
||||
|
||||
// Abort callback
|
||||
// if it returns true, execution of llama_decode() will be aborted
|
||||
@@ -458,6 +460,10 @@ extern "C" {
|
||||
@@ -459,6 +461,10 @@ extern "C" {
|
||||
struct llama_context_params params),
|
||||
"use llama_init_from_model instead");
|
||||
|
||||
@@ -158,7 +136,7 @@ index 5657fbf0..f91896e4 100644
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index f754bc8f..0568565f 100644
|
||||
index 5ab3f572..eb7b5325 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -6,6 +6,7 @@
|
||||
@@ -169,7 +147,7 @@ index f754bc8f..0568565f 100644
|
||||
{ LLM_ARCH_LLAMA4, "llama4" },
|
||||
{ LLM_ARCH_DECI, "deci" },
|
||||
{ LLM_ARCH_FALCON, "falcon" },
|
||||
@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
@@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
@@ -177,7 +155,7 @@ index f754bc8f..0568565f 100644
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
||||
@@ -271,6 +273,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
@@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
@@ -218,7 +196,7 @@ index f754bc8f..0568565f 100644
|
||||
{
|
||||
LLM_ARCH_DECI,
|
||||
{
|
||||
@@ -1681,6 +1717,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
@@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
@@ -234,7 +212,7 @@ index f754bc8f..0568565f 100644
|
||||
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index 439aaeab..6a989034 100644
|
||||
index 525c1b7d..bc8a4f0b 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -11,6 +11,7 @@
|
||||
@@ -245,7 +223,7 @@ index 439aaeab..6a989034 100644
|
||||
LLM_ARCH_DECI,
|
||||
LLM_ARCH_FALCON,
|
||||
LLM_ARCH_BAICHUAN,
|
||||
@@ -146,6 +147,7 @@ enum llm_kv {
|
||||
@@ -148,6 +149,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
@@ -253,7 +231,7 @@ index 439aaeab..6a989034 100644
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
||||
@@ -347,6 +349,14 @@ enum llm_tensor {
|
||||
@@ -349,6 +351,14 @@ enum llm_tensor {
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
LLM_TENSOR_BSKCN_TV,
|
||||
@@ -297,10 +275,10 @@ index 01d5ca57..8682b0e6 100644
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
||||
}
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 32f59819..0343ba8a 100644
|
||||
index 9c1fe93f..cd06ad91 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
@@ -309,7 +287,7 @@ index 32f59819..0343ba8a 100644
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) {
|
||||
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
|
||||
cparams.warmup = value;
|
||||
}
|
||||
|
||||
@@ -320,7 +298,7 @@ index 32f59819..0343ba8a 100644
|
||||
void llama_context::set_adapter_lora(
|
||||
llama_adapter_lora * adapter,
|
||||
float scale) {
|
||||
@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
@@ -329,7 +307,7 @@ index 32f59819..0343ba8a 100644
|
||||
|
||||
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
|
||||
|
||||
@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
|
||||
@@ -341,7 +319,7 @@ index 32f59819..0343ba8a 100644
|
||||
|
||||
const int64_t n_tokens_all = batch.n_tokens;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
const bool logits_all = n_outputs_all == n_tokens_all;
|
||||
|
||||
@@ -350,7 +328,7 @@ index 32f59819..0343ba8a 100644
|
||||
/* simple_split */ !kv_self->recurrent,
|
||||
/* logits_all */ logits_all);
|
||||
|
||||
@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto & hparams = model.hparams;
|
||||
@@ -364,7 +342,7 @@ index 32f59819..0343ba8a 100644
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
void llama_context::output_reorder() {
|
||||
auto & out_ids = sbatch.out_ids;
|
||||
if (!out_ids.empty()) {
|
||||
@@ -373,7 +351,7 @@ index 32f59819..0343ba8a 100644
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
||||
@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
||||
{
|
||||
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
|
||||
|
||||
@@ -382,7 +360,7 @@ index 32f59819..0343ba8a 100644
|
||||
|
||||
io.write(&logits_size, sizeof(logits_size));
|
||||
|
||||
@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() {
|
||||
@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() {
|
||||
/*.offload_kqv =*/ true,
|
||||
/*.flash_attn =*/ false,
|
||||
/*.no_perf =*/ true,
|
||||
@@ -390,7 +368,7 @@ index 32f59819..0343ba8a 100644
|
||||
/*.abort_callback =*/ nullptr,
|
||||
/*.abort_callback_data =*/ nullptr,
|
||||
};
|
||||
@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
|
||||
@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
|
||||
ctx->set_warmup(warmup);
|
||||
}
|
||||
|
||||
@@ -402,7 +380,7 @@ index 32f59819..0343ba8a 100644
|
||||
ctx->synchronize();
|
||||
}
|
||||
diff --git a/src/llama-context.h b/src/llama-context.h
|
||||
index 04facb54..baa03276 100644
|
||||
index 5457f077..a50c4afa 100644
|
||||
--- a/src/llama-context.h
|
||||
+++ b/src/llama-context.h
|
||||
@@ -65,6 +65,7 @@ struct llama_context {
|
||||
@@ -426,10 +404,10 @@ index 30e550f0..85ad91b9 100644
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
|
||||
index a85e9728..d740c120 100644
|
||||
index fabb9ca2..b67216a4 100644
|
||||
--- a/src/llama-graph.cpp
|
||||
+++ b/src/llama-graph.cpp
|
||||
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
@@ -442,7 +420,7 @@ index a85e9728..d740c120 100644
|
||||
//
|
||||
// llm_graph_context
|
||||
//
|
||||
@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
||||
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
||||
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
|
||||
}
|
||||
|
||||
@@ -469,7 +447,7 @@ index a85e9728..d740c120 100644
|
||||
llm_graph_input_attn_cross * inp,
|
||||
ggml_cgraph * gf,
|
||||
diff --git a/src/llama-graph.h b/src/llama-graph.h
|
||||
index d192dc14..260a2af2 100644
|
||||
index d0c8d321..0fe18150 100644
|
||||
--- a/src/llama-graph.h
|
||||
+++ b/src/llama-graph.h
|
||||
@@ -86,6 +86,7 @@ public:
|
||||
@@ -480,7 +458,7 @@ index d192dc14..260a2af2 100644
|
||||
};
|
||||
|
||||
class llm_graph_input_pos : public llm_graph_input_i {
|
||||
@@ -285,6 +286,16 @@ public:
|
||||
@@ -283,6 +284,16 @@ public:
|
||||
const llama_cross * cross = nullptr;
|
||||
};
|
||||
|
||||
@@ -497,7 +475,7 @@ index d192dc14..260a2af2 100644
|
||||
//
|
||||
// llm_graph_result
|
||||
//
|
||||
@@ -493,6 +504,7 @@ struct llm_graph_context {
|
||||
@@ -491,6 +502,7 @@ struct llm_graph_context {
|
||||
ggml_tensor * build_inp_cls() const;
|
||||
ggml_tensor * build_inp_s_copy() const;
|
||||
ggml_tensor * build_inp_s_mask() const;
|
||||
@@ -518,7 +496,7 @@ index 8a667960..6a02de03 100644
|
||||
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
|
||||
+}
|
||||
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
|
||||
index 6e278945..c8a34d52 100644
|
||||
index 48dce407..b6fc7e6d 100644
|
||||
--- a/src/llama-hparams.h
|
||||
+++ b/src/llama-hparams.h
|
||||
@@ -2,6 +2,8 @@
|
||||
@@ -546,7 +524,7 @@ index 6e278945..c8a34d52 100644
|
||||
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
@@ -158,6 +162,9 @@ struct llama_hparams {
|
||||
@@ -159,6 +163,9 @@ struct llama_hparams {
|
||||
// Block skip connection
|
||||
bool n_bskcn(uint32_t n, uint32_t il) const;
|
||||
|
||||
@@ -593,10 +571,10 @@ index a012aeae..2e11507d 100644
|
||||
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
|
||||
const int kid = gguf_find_key(meta.get(), key.c_str());
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index aba42819..d051696c 100644
|
||||
index 572378c9..9d099f11 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
// get general kv
|
||||
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
|
||||
@@ -604,7 +582,7 @@ index aba42819..d051696c 100644
|
||||
|
||||
// everything past this point is not vocab-related
|
||||
if (hparams.vocab_only) {
|
||||
@@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
@@ -612,7 +590,7 @@ index aba42819..d051696c 100644
|
||||
|
||||
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
||||
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
|
||||
@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
@@ -624,7 +602,7 @@ index aba42819..d051696c 100644
|
||||
|
||||
// n_head_kv is optional, default to n_head
|
||||
hparams.n_head_kv_arr = hparams.n_head_arr;
|
||||
@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||||
|
||||
@@ -633,7 +611,7 @@ index aba42819..d051696c 100644
|
||||
if (hparams.n_rot != hparams.n_embd_head_k) {
|
||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
||||
}
|
||||
@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
hparams.use_kq_norm = false;
|
||||
}
|
||||
} break;
|
||||
@@ -650,7 +628,7 @@ index aba42819..d051696c 100644
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_ff = hparams.n_ff();
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
@@ -659,7 +637,7 @@ index aba42819..d051696c 100644
|
||||
const int64_t n_token_types = vocab.n_token_types();
|
||||
const int64_t n_rot = hparams.n_rot;
|
||||
const int64_t n_expert = hparams.n_expert;
|
||||
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
} break;
|
||||
@@ -712,7 +690,7 @@ index aba42819..d051696c 100644
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
|
||||
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
@@ -959,7 +937,7 @@ index aba42819..d051696c 100644
|
||||
struct llm_build_deci : public llm_graph_context {
|
||||
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
{
|
||||
llm = std::make_unique<llm_build_llama>(*this, params, gf);
|
||||
} break;
|
||||
@@ -970,7 +948,7 @@ index aba42819..d051696c 100644
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
llm = std::make_unique<llm_build_deci>(*this, params, gf);
|
||||
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||
case LLM_ARCH_LLAMA:
|
||||
case LLM_ARCH_LLAMA4:
|
||||
@@ -979,7 +957,7 @@ index aba42819..d051696c 100644
|
||||
case LLM_ARCH_BAICHUAN:
|
||||
case LLM_ARCH_STARCODER:
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index 5865d5e9..72bab5be 100644
|
||||
index 856e6042..6be91282 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -11,6 +11,7 @@
|
||||
@@ -990,15 +968,15 @@ index 5865d5e9..72bab5be 100644
|
||||
|
||||
struct llama_cparams;
|
||||
struct llama_ubatch;
|
||||
@@ -70,6 +71,7 @@ enum llm_type {
|
||||
@@ -73,6 +74,7 @@ enum llm_type {
|
||||
LLM_TYPE_40B,
|
||||
LLM_TYPE_65B,
|
||||
LLM_TYPE_70B,
|
||||
+ LLM_TYPE_90B,
|
||||
LLM_TYPE_236B,
|
||||
LLM_TYPE_290B,
|
||||
LLM_TYPE_314B,
|
||||
LLM_TYPE_671B,
|
||||
@@ -310,6 +312,16 @@ struct llama_layer {
|
||||
@@ -314,6 +316,16 @@ struct llama_layer {
|
||||
|
||||
struct ggml_tensor * bskcn_tv = nullptr;
|
||||
|
||||
|
||||
@@ -18,10 +18,10 @@ adds the unpad operator to GGML
|
||||
10 files changed, 223 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||
index 8fcc16df..d19fc167 100644
|
||||
index 1b8603e7..53ef31b2 100644
|
||||
--- a/ggml/include/ggml.h
|
||||
+++ b/ggml/include/ggml.h
|
||||
@@ -488,6 +488,7 @@ extern "C" {
|
||||
@@ -489,6 +489,7 @@ extern "C" {
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
GGML_OP_PAD,
|
||||
GGML_OP_PAD_REFLECT_1D,
|
||||
@@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644
|
||||
GGML_OP_ARANGE,
|
||||
GGML_OP_TIMESTEP_EMBEDDING,
|
||||
GGML_OP_ARGSORT,
|
||||
@@ -1757,6 +1758,15 @@ extern "C" {
|
||||
@@ -1777,6 +1778,15 @@ extern "C" {
|
||||
int p0,
|
||||
int p1);
|
||||
|
||||
@@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644
|
||||
// timesteps: [N,]
|
||||
// return: [N, dim]
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index 50400328..432942bf 100644
|
||||
index 64405449..34624cca 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_pad_reflect_1d(params, tensor);
|
||||
} break;
|
||||
@@ -60,7 +60,7 @@ index 50400328..432942bf 100644
|
||||
case GGML_OP_ARANGE:
|
||||
{
|
||||
ggml_compute_forward_arange(params, tensor);
|
||||
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
@@ -69,10 +69,10 @@ index 50400328..432942bf 100644
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_ARGSORT:
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 6050147b..66b8da68 100644
|
||||
index 7413192b..becdae07 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
|
||||
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644
|
||||
|
||||
static void ggml_compute_forward_arange_f32(
|
||||
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
|
||||
index 410a3720..3eca1cf8 100644
|
||||
index dc081b9e..a7125555 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.h
|
||||
+++ b/ggml/src/ggml-cpu/ops.h
|
||||
@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
|
||||
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
|
||||
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
|
||||
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 31750b6f..0fef9522 100644
|
||||
index 04ce764e..491acccb 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_PAD:
|
||||
ggml_cuda_op_pad(ctx, dst);
|
||||
break;
|
||||
@@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644
|
||||
case GGML_OP_ARANGE:
|
||||
ggml_cuda_op_arange(ctx, dst);
|
||||
break;
|
||||
@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_UPSCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
||||
case GGML_OP_PAD:
|
||||
@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
|
||||
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 12886cd3..b2e95a66 100644
|
||||
index 425524d0..112abef6 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||
@@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644
|
||||
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||
@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass
|
||||
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
|
||||
@@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||
@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
@@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node(
|
||||
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
@@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644
|
||||
} break;
|
||||
case GGML_OP_ARANGE:
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index 8d6e99e6..71f0f97f 100644
|
||||
index 9f4147e9..6ceb3cef 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
||||
@@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644
|
||||
device char * dst,
|
||||
constant ggml_metal_kargs_arange & args,
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index 950772c7..2276b631 100644
|
||||
index 7654ae17..3c57aff8 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"UPSCALE",
|
||||
"PAD",
|
||||
"PAD_REFLECT_1D",
|
||||
@@ -365,16 +365,16 @@ index 950772c7..2276b631 100644
|
||||
"ARANGE",
|
||||
"TIMESTEP_EMBEDDING",
|
||||
"ARGSORT",
|
||||
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"OPT_STEP_ADAMW",
|
||||
};
|
||||
|
||||
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
|
||||
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"upscale(x)",
|
||||
"pad(x)",
|
||||
"pad_reflect_1d(x)",
|
||||
@@ -382,16 +382,16 @@ index 950772c7..2276b631 100644
|
||||
"arange(start, stop, step)",
|
||||
"timestep_embedding(timesteps, dim, max_period)",
|
||||
"argsort(x)",
|
||||
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"adamw(x)",
|
||||
};
|
||||
|
||||
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
|
||||
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
||||
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ regex
|
||||
2 files changed, 22 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index a35b498c..032019c9 100644
|
||||
index a9ee9f03..1306864e 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
|
||||
index 90679822..56043678 100644
|
||||
index 5b3059c2..656b3eca 100644
|
||||
--- a/common/json-schema-to-grammar.cpp
|
||||
+++ b/common/json-schema-to-grammar.cpp
|
||||
@@ -346,7 +346,7 @@ private:
|
||||
@@ -349,7 +349,7 @@ private:
|
||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||
std::function<json(const std::string &)> _fetch_json;
|
||||
bool _dotall;
|
||||
|
||||
@@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
|
||||
4 files changed, 51 insertions(+), 106 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 0343ba8a..4b3e6a83 100644
|
||||
index cd06ad91..77177c5e 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
|
||||
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
|
||||
|
||||
llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
@@ -41,7 +41,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
#if 0
|
||||
// CPU defrag
|
||||
//
|
||||
@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
}
|
||||
#else
|
||||
@@ -79,7 +79,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
|
||||
ggml_tensor * view_v_src;
|
||||
ggml_tensor * view_v_dst;
|
||||
@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
if (cparams.flash_attn) {
|
||||
// NOTE: the V cache is not transposed when using flash attention
|
||||
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
@@ -122,7 +122,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
#endif
|
||||
|
||||
return res;
|
||||
@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
void llama_context::kv_self_update() {
|
||||
auto & kv = kv_self;
|
||||
|
||||
@@ -131,7 +131,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
if (kv->has_shift) {
|
||||
if (!kv->get_can_shift()) {
|
||||
GGML_ABORT("The current context does not support K-shift");
|
||||
@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
|
||||
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
graph_compute(gf, false);
|
||||
@@ -140,7 +140,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
}
|
||||
|
||||
{
|
||||
@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
|
||||
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
|
||||
// defragment the KV cache if needed
|
||||
if (kv->do_defrag) {
|
||||
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
|
||||
@@ -202,7 +202,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
}
|
||||
|
||||
enum llama_pooling_type llama_context::pooling_type() const {
|
||||
@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// find KV slot
|
||||
{
|
||||
if (!kv_self->find_slot(ubatch)) {
|
||||
@@ -219,7 +219,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
|
||||
if (!kv_self->recurrent) {
|
||||
diff --git a/src/llama-context.h b/src/llama-context.h
|
||||
index baa03276..a59ff8fd 100644
|
||||
index a50c4afa..30f84bfd 100644
|
||||
--- a/src/llama-context.h
|
||||
+++ b/src/llama-context.h
|
||||
@@ -5,6 +5,7 @@
|
||||
@@ -230,7 +230,7 @@ index baa03276..a59ff8fd 100644
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
@@ -180,7 +181,8 @@ private:
|
||||
@@ -179,7 +180,8 @@ private:
|
||||
|
||||
llm_graph_result_ptr build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index f00700da..91d6a7d5 100644
|
||||
index 43d9fc4f..4c0d3824 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
endforeach()
|
||||
|
||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||
@@ -19,11 +19,11 @@ index f00700da..91d6a7d5 100644
|
||||
endfunction()
|
||||
|
||||
ggml_add_backend(CPU)
|
||||
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
||||
endif()
|
||||
+ add_custom_target(ggml-cpu)
|
||||
ggml_add_cpu_backend_variant(sandybridge AVX)
|
||||
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(x64)
|
||||
ggml_add_cpu_backend_variant(sse42 SSE42)
|
||||
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: jmorganca <jmorganca@gmail.com>
|
||||
Date: Tue, 8 Apr 2025 20:33:01 -0700
|
||||
Date: Thu, 1 May 2025 15:05:08 -0700
|
||||
Subject: [PATCH] remove amx
|
||||
|
||||
disable amx as it reduces performance on some systems
|
||||
@@ -9,16 +9,16 @@ disable amx as it reduces performance on some systems
|
||||
1 file changed, 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index 91d6a7d5..d6b393a2 100644
|
||||
index 4c0d3824..79c26312 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
- if (NOT MSVC)
|
||||
- # MSVC doesn't support AMX
|
||||
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
- endif()
|
||||
elseif (GGML_CPU)
|
||||
ggml_add_cpu_backend_variant_impl("")
|
||||
|
||||
@@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644
|
||||
}
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 032019c9..ba37df35 100644
|
||||
index 1306864e..d6515ff6 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
||||
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index 432942bf..6d4abe4c 100644
|
||||
index 34624cca..59bd3c62 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -15,6 +15,8 @@
|
||||
@@ -20,7 +20,7 @@ index 432942bf..6d4abe4c 100644
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||
@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
ggml_compute_forward(&params, node);
|
||||
|
||||
|
||||
@@ -13,10 +13,10 @@ models not supported in llama.cpp
|
||||
4 files changed, 24 insertions(+)
|
||||
|
||||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index 0568565f..dd01df60 100644
|
||||
index eb7b5325..df42d1a5 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
@@ -74,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
@@ -24,7 +24,7 @@ index 0568565f..dd01df60 100644
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -1586,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
@@ -1606,6 +1607,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
@@ -48,10 +48,10 @@ index 0568565f..dd01df60 100644
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index 6a989034..b6227eeb 100644
|
||||
index bc8a4f0b..bda9d071 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -75,6 +75,7 @@ enum llm_arch {
|
||||
@@ -76,6 +76,7 @@ enum llm_arch {
|
||||
LLM_ARCH_CHAMELEON,
|
||||
LLM_ARCH_SOLAR,
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
@@ -60,10 +60,10 @@ index 6a989034..b6227eeb 100644
|
||||
LLM_ARCH_BAILINGMOE,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index d051696c..c8374159 100644
|
||||
index 9d099f11..ef70486d 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -1437,6 +1437,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -71,7 +71,7 @@ index d051696c..c8374159 100644
|
||||
default: throw std::runtime_error("unsupported model architecture");
|
||||
}
|
||||
|
||||
@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -13751,6 +13752,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
case LLM_ARCH_SOLAR:
|
||||
case LLM_ARCH_BAILINGMOE:
|
||||
|
||||
@@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
|
||||
index d1497985..b1a9dca3 100644
|
||||
index c0a5f934..75731053 100644
|
||||
--- a/src/llama-sampling.cpp
|
||||
+++ b/src/llama-sampling.cpp
|
||||
@@ -1465,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
|
||||
}
|
||||
|
||||
@@ -196,7 +196,7 @@ index d1497985..b1a9dca3 100644
|
||||
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
|
||||
|
||||
@@ -1547,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
/* .vocab = */ vocab,
|
||||
/* .grammar_str = */ grammar_str,
|
||||
/* .grammar_root = */ grammar_root,
|
||||
Reference in New Issue
Block a user