Mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-10 07:46:59 +00:00)

llama: update to commit de4c07f93 (#10655)
@@ -24,7 +24,7 @@ problem.
9 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 273075f4..dd11f304 100644
index b30b4cb3..0ce73a99 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
@@ -43,7 +43,7 @@ index 273075f4..dd11f304 100644
|
||||
}
|
||||
|
||||
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
@@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1871,6 +1871,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
|
||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_aligned_free(buffer->context, buffer->size);
|
||||
@@ -55,7 +55,7 @@ index 273075f4..dd11f304 100644
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
@@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
@@ -1918,7 +1923,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
};
|
||||
|
||||
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
||||
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
|
||||
|
||||
/**
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 9fb2134f..04ce764e 100644
|
||||
index b4b85abc..cb0d8528 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
|
||||
@@ -96,7 +96,7 @@ index 9fb2134f..04ce764e 100644
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||
@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||
@@ -790,6 +791,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
@@ -104,7 +104,7 @@ index 9fb2134f..04ce764e 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
|
||||
@@ -1067,6 +1069,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
|
||||
|
||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
CUDA_CHECK(cudaFreeHost(buffer->context));
|
||||
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
|
||||
|
||||
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index d92392ed..425524d0 100644
|
||||
index 576f9581..1b56f858 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
@@ -5214,6 +5214,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
}
|
||||
|
||||
free(ctx);
|
||||
@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
|
||||
|
||||
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
index 140a775f..e33c4ba0 100644
|
||||
index 4f0abb5a..de1ec184 100644
|
||||
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -483,6 +483,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
delete ctx;
|
||||
@@ -161,10 +161,10 @@ index 140a775f..e33c4ba0 100644
|
||||
|
||||
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
index 66b6f2cc..e3e6deae 100644
|
||||
index 0ea72994..ae3a3c33 100644
|
||||
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
@@ -320,6 +320,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
|
||||
delete ctx;
|
||||
@@ -172,7 +172,7 @@ index 66b6f2cc..e3e6deae 100644
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
@@ -765,6 +766,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
@@ -180,7 +180,7 @@ index 66b6f2cc..e3e6deae 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
@@ -1099,6 +1101,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
|
||||
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_sycl_host_free(buffer->context);
|
||||
@@ -189,10 +189,10 @@ index 66b6f2cc..e3e6deae 100644
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index c0bdb9e1..03d03064 100644
|
||||
index e2b357fd..68768029 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -8962,6 +8962,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
||||
delete ctx;
|
||||
@@ -200,7 +200,7 @@ index c0bdb9e1..03d03064 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
@@ -9105,6 +9106,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
|
||||
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
|
||||
|
||||
@@ -10,10 +10,10 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)
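As context for the hunks that follow: the change summarized above makes llama.cpp fall back to the default pre-tokenizer with a warning when `tokenizer.ggml.pre` carries an unrecognized value, rather than aborting the load. A minimal sketch of that behavior, assuming llama.cpp's existing enum and logging macro (the helper itself is illustrative, not code from this patch):

    // Unrecognized pre-tokenizer names degrade to the default instead of throwing.
    static llama_vocab_pre_type resolve_pre_type(const std::string & tokenizer_pre) {
        if (tokenizer_pre == "llama3" || tokenizer_pre == "llama-bpe") {
            return LLAMA_VOCAB_PRE_TYPE_LLAMA3;
        }
        // ... other known pre-tokenizer names ...
        LLAMA_LOG_WARN("%s: unknown pre-tokenizer type: '%s', using default\n",
                       __func__, tokenizer_pre.c_str());
        return LLAMA_VOCAB_PRE_TYPE_DEFAULT;   // previously: throw std::runtime_error(...)
    }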
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 50ded286..a9ee9f03 100644
|
||||
index 9389ca80..806c1b3d 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -1503,16 +1503,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
if (type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
add_space_prefix = false;
|
||||
clean_spaces = true;
|
||||
@@ -31,8 +31,8 @@ index 50ded286..a9ee9f03 100644
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
||||
@@ -1651,7 +1642,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
|
||||
clean_spaces = false;
|
||||
} else {
|
||||
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
|
||||
@@ -11,10 +11,10 @@ instead of forcing one or the error
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 5a2eef9b..9c1fe93f 100644
|
||||
index 62246c10..dca22d8b 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -901,7 +901,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
int64_t n_outputs_all = 0;
|
||||
|
||||
// count outputs
|
||||
@@ -23,7 +23,7 @@ index 5a2eef9b..9c1fe93f 100644
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
n_outputs_all += batch.logits[i] != 0;
|
||||
}
|
||||
@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -982,7 +982,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
||||
//}
|
||||
|
||||
@@ -32,7 +32,7 @@ index 5a2eef9b..9c1fe93f 100644
|
||||
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
|
||||
|
||||
if (t_embd && res->get_embd_pooled()) {
|
||||
@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
@@ -1151,7 +1151,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
|
||||
@@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode
fixes loading vision models in llama.cpp on windows
filesystems for paths that include wide characters
---
examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)
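Background for the clip-unicode hunks below: on Windows, a narrow (char-based) path handed to std::ifstream is interpreted in the local code page, so UTF-8 model paths containing non-ASCII characters fail to open. The general fix is to convert the path to UTF-16 first; a self-contained sketch of that technique follows (helper names and the MSVC wide-string ifstream constructor are assumptions, not the patch's exact code):

    #if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #include <windows.h>
    #include <fstream>
    #include <string>

    // Convert a UTF-8 string to UTF-16 so the file can be opened by wide path.
    static std::wstring utf8_to_wide(const std::string & utf8) {
        const int n = MultiByteToWideChar(CP_UTF8, 0, utf8.c_str(), -1, nullptr, 0);
        if (n <= 0) {
            return std::wstring();
        }
        std::wstring wide(n, L'\0');            // n includes the terminating null
        MultiByteToWideChar(CP_UTF8, 0, utf8.c_str(), -1, &wide[0], n);
        wide.pop_back();                        // drop the embedded terminator
        return wide;
    }

    static std::ifstream open_by_utf8_path(const std::string & fname) {
        return std::ifstream(utf8_to_wide(fname), std::ios::binary);
    }
    #endif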
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index ad3e7df1..b3218c78 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -30,6 +30,19 @@
|
||||
#include <array>
|
||||
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
|
||||
index 41ba45a7..cdd8ca44 100644
|
||||
--- a/tools/mtmd/clip.cpp
|
||||
+++ b/tools/mtmd/clip.cpp
|
||||
@@ -31,6 +31,19 @@
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
|
||||
+#if defined(_WIN32)
|
||||
+#define WIN32_LEAN_AND_MEAN
|
||||
@@ -32,8 +32,8 @@ index ad3e7df1..b3218c78 100644
|
||||
+
|
||||
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
|
||||
|
||||
//#define CLIP_DEBUG_FUNCTIONS
|
||||
@@ -1971,7 +1984,29 @@ struct clip_model_loader {
|
||||
enum ffn_op_type {
|
||||
@@ -2190,7 +2203,29 @@ struct clip_model_loader {
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
@@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
@@ -1998,7 +2033,11 @@ struct clip_model_loader {
|
||||
@@ -2217,7 +2252,11 @@ struct clip_model_loader {
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644
|
||||
};
|
||||
|
||||
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
|
||||
index ea73a8a7..a012aeae 100644
|
||||
index 4cce5166..7f6617fa 100644
|
||||
--- a/src/llama-model-loader.cpp
|
||||
+++ b/src/llama-model-loader.cpp
|
||||
@@ -439,6 +439,7 @@ namespace GGUFMeta {
|
||||
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
|
||||
llama_model_loader::llama_model_loader(
|
||||
const std::string & fname,
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index 822e2bb2..572378c9 100644
|
||||
index 3a4e72a3..831b68c0 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
@@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
|
||||
@@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
@@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644
|
||||
struct llm_build_wavtokenizer_dec : public llm_graph_context {
|
||||
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
ggml_tensor * cur;
|
||||
@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
@@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
{
|
||||
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
|
||||
} break;
|
||||
@@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
|
||||
@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
@@ -396,10 +396,10 @@ index 822e2bb2..572378c9 100644
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index 95eca002..856e6042 100644
|
||||
index 6bdec263..43746c7d 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -64,6 +64,7 @@ enum llm_type {
|
||||
@@ -65,6 +65,7 @@ enum llm_type {
|
||||
LLM_TYPE_15B,
|
||||
LLM_TYPE_16B,
|
||||
LLM_TYPE_20B,
|
||||
@@ -407,7 +407,7 @@ index 95eca002..856e6042 100644
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_30B,
|
||||
LLM_TYPE_32B,
|
||||
@@ -311,6 +312,8 @@ struct llama_layer {
|
||||
@@ -315,6 +316,8 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_up_scale = nullptr;
|
||||
struct ggml_tensor * ffn_down_scale = nullptr;
|
||||
|
||||
|
||||
@@ -5,88 +5,27 @@ Subject: [PATCH] add mllama support

adds support for the llama 3.2 vision architecture
---
examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 +
src/llama-arch.cpp | 44 +++++
src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 +
src/llama-context.cpp | 25 ++-
src/llama-context.cpp | 23 ++-
src/llama-context.h | 1 +
src/llama-cparams.h | 1 +
src/llama-graph.cpp | 25 +++
src/llama-graph.h | 12 ++
src/llama-hparams.cpp | 4 +
src/llama-hparams.h | 7 +
src/llama-kv-cache.cpp | 12 +-
src/llama-kv-cache.cpp | 14 +-
src/llama-model-loader.cpp | 2 +
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.cpp | 311 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
19 files changed, 473 insertions(+), 21 deletions(-)
tools/mtmd/llava.cpp | 5 +-
tools/mtmd/mtmd-helper.cpp | 7 +-
19 files changed, 475 insertions(+), 22 deletions(-)
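The hunks that follow mostly thread a new n_embd field through llama_batch (and add a cross_attn context flag) so image embeddings whose width differs from the text model's hidden size can pass through decode. A rough caller-side sketch of the extended aggregate, distilled from the llava.cpp/mtmd hunks below; variable names such as embd, n_embd, n_eval and pos_0 are assumptions for illustration:

    // Build an embedding-only batch; the fourth member (n_embd) is the field
    // introduced by this patch, everything else matches the existing layout.
    std::vector<llama_pos>     pos(n_eval);
    std::vector<int32_t>       n_seq_id(n_eval, 1);
    std::vector<llama_seq_id>  seq_id_0 = { 0 };
    std::vector<llama_seq_id*> seq_ids(n_eval + 1, nullptr);
    std::vector<int8_t>        logits(n_eval, 0);
    for (int32_t i = 0; i < n_eval; ++i) {
        pos[i]     = pos_0 + i;
        seq_ids[i] = seq_id_0.data();
    }
    llama_batch batch = {
        /*n_tokens =*/ n_eval,
        /*tokens   =*/ nullptr,
        /*embd     =*/ embd,        // image embedding slice for this chunk
        /*n_embd   =*/ n_embd,      // new: embedding width travels with the batch
        /*pos      =*/ pos.data(),
        /*n_seq_id =*/ n_seq_id.data(),
        /*seq_id   =*/ seq_ids.data(),
        /*logits   =*/ logits.data(),
    };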
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
|
||||
index c00d16ae..bab027b5 100644
|
||||
--- a/examples/llava/llava.cpp
|
||||
+++ b/examples/llava/llava.cpp
|
||||
@@ -457,7 +457,7 @@ struct llava_embd_batch {
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -469,6 +469,7 @@ struct llava_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
+ /*n_embd =*/ n_embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
n_eval = n_batch;
|
||||
}
|
||||
float * embd = image_embed->embed+i*n_embd;
|
||||
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
|
||||
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
|
||||
if (llama_decode(ctx_llama, llava_batch.batch)) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
|
||||
index 7081fd73..c14ac501 100644
|
||||
--- a/examples/llava/mtmd.cpp
|
||||
+++ b/examples/llava/mtmd.cpp
|
||||
@@ -476,7 +476,7 @@ struct decode_embd_batch {
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
pos .resize(n_tokens * n_pos_per_embd);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -487,6 +487,7 @@ struct decode_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
+ /*n_embd =*/ n_embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
int32_t i_batch = 0;
|
||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
|
||||
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
|
||||
|
||||
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
|
||||
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index 405d8e31..82ae1b5b 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -105,10 +44,10 @@ index 405d8e31..82ae1b5b 100644
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
diff --git a/include/llama.h b/include/llama.h
|
||||
index 06c56395..f1628e88 100644
|
||||
index abedebdb..41beef21 100644
|
||||
--- a/include/llama.h
|
||||
+++ b/include/llama.h
|
||||
@@ -256,6 +256,7 @@ extern "C" {
|
||||
@@ -258,6 +258,7 @@ extern "C" {
|
||||
|
||||
llama_token * token;
|
||||
float * embd;
|
||||
@@ -116,15 +55,15 @@ index 06c56395..f1628e88 100644
|
||||
llama_pos * pos;
|
||||
int32_t * n_seq_id;
|
||||
llama_seq_id ** seq_id;
|
||||
@@ -358,6 +359,7 @@ extern "C" {
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
@@ -365,6 +366,7 @@ extern "C" {
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
bool no_perf; // whether to measure performance timings
|
||||
bool op_offload; // whether to offload host tensor operations to device
|
||||
+ bool cross_attn; // whether to use cross attention
|
||||
};
|
||||
|
||||
// Abort callback
|
||||
// if it returns true, execution of llama_decode() will be aborted
|
||||
@@ -459,6 +461,10 @@ extern "C" {
|
||||
// model quantization parameters
|
||||
@@ -464,6 +466,10 @@ extern "C" {
|
||||
struct llama_context_params params),
|
||||
"use llama_init_from_model instead");
|
||||
|
||||
@@ -247,10 +186,10 @@ index 525c1b7d..bc8a4f0b 100644
|
||||
LLM_TENSOR_CONVNEXT_DW,
|
||||
LLM_TENSOR_CONVNEXT_NORM,
|
||||
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
|
||||
index 01d5ca57..8682b0e6 100644
|
||||
index a88b2fe3..241b316e 100644
|
||||
--- a/src/llama-batch.cpp
|
||||
+++ b/src/llama-batch.cpp
|
||||
@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one(
|
||||
@@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one(
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ tokens,
|
||||
/*embd =*/ nullptr,
|
||||
@@ -258,7 +197,7 @@ index 01d5ca57..8682b0e6 100644
|
||||
/*pos =*/ nullptr,
|
||||
/*n_seq_id =*/ nullptr,
|
||||
/*seq_id =*/ nullptr,
|
||||
@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
||||
@@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
||||
/*n_tokens =*/ 0,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ nullptr,
|
||||
@@ -266,7 +205,7 @@ index 01d5ca57..8682b0e6 100644
|
||||
/*pos =*/ nullptr,
|
||||
/*n_seq_id =*/ nullptr,
|
||||
/*seq_id =*/ nullptr,
|
||||
@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
||||
@@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
||||
|
||||
if (embd) {
|
||||
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
|
||||
@@ -275,10 +214,10 @@ index 01d5ca57..8682b0e6 100644
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
||||
}
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 9c1fe93f..cd06ad91 100644
|
||||
index dca22d8b..c22687e4 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
@@ -287,7 +226,7 @@ index 9c1fe93f..cd06ad91 100644
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
|
||||
@@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) {
|
||||
cparams.warmup = value;
|
||||
}
|
||||
|
||||
@@ -298,16 +237,16 @@ index 9c1fe93f..cd06ad91 100644
|
||||
void llama_context::set_adapter_lora(
|
||||
llama_adapter_lora * adapter,
|
||||
float scale) {
|
||||
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
@@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
- sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||
+ sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||
- llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||
+ llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||
|
||||
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
|
||||
|
||||
@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -863,10 +867,9 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
|
||||
@@ -319,16 +258,16 @@ index 9c1fe93f..cd06ad91 100644
|
||||
|
||||
const int64_t n_tokens_all = batch.n_tokens;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1087,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
// note: this is mostly relevant for recurrent models atm
|
||||
if (!sorted_output) {
|
||||
- const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
+ const uint32_t n_vocab = model.hparams.n_vocab;
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
|
||||
const bool logits_all = n_outputs_all == n_tokens_all;
|
||||
|
||||
- sbatch.from_batch(batch, n_embd,
|
||||
+ sbatch.from_batch(batch, batch.n_embd,
|
||||
/* simple_split */ !kv_self->recurrent,
|
||||
/* logits_all */ logits_all);
|
||||
|
||||
@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
@@ -1142,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto & hparams = model.hparams;
|
||||
@@ -342,16 +281,7 @@ index 9c1fe93f..cd06ad91 100644
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
void llama_context::output_reorder() {
|
||||
auto & out_ids = sbatch.out_ids;
|
||||
if (!out_ids.empty()) {
|
||||
- const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
+ const uint32_t n_vocab = model.hparams.n_vocab;
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
||||
@@ -1682,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
||||
{
|
||||
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
|
||||
|
||||
@@ -360,15 +290,15 @@ index 9c1fe93f..cd06ad91 100644
|
||||
|
||||
io.write(&logits_size, sizeof(logits_size));
|
||||
|
||||
@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() {
|
||||
/*.offload_kqv =*/ true,
|
||||
@@ -2091,6 +2093,7 @@ llama_context_params llama_context_default_params() {
|
||||
/*.flash_attn =*/ false,
|
||||
/*.no_perf =*/ true,
|
||||
/*.op_offload =*/ true,
|
||||
+ /*.cross_attn =*/ false,
|
||||
/*.abort_callback =*/ nullptr,
|
||||
/*.abort_callback_data =*/ nullptr,
|
||||
};
|
||||
@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
|
||||
|
||||
return result;
|
||||
@@ -2216,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
|
||||
ctx->set_warmup(warmup);
|
||||
}
|
||||
|
||||
@@ -380,10 +310,10 @@ index 9c1fe93f..cd06ad91 100644
|
||||
ctx->synchronize();
|
||||
}
|
||||
diff --git a/src/llama-context.h b/src/llama-context.h
|
||||
index 5457f077..a50c4afa 100644
|
||||
index c0ceacb1..c4ab242a 100644
|
||||
--- a/src/llama-context.h
|
||||
+++ b/src/llama-context.h
|
||||
@@ -65,6 +65,7 @@ struct llama_context {
|
||||
@@ -71,6 +71,7 @@ struct llama_context {
|
||||
void set_embeddings (bool value);
|
||||
void set_causal_attn(bool value);
|
||||
void set_warmup(bool value);
|
||||
@@ -392,22 +322,22 @@ index 5457f077..a50c4afa 100644
|
||||
void set_adapter_lora(
|
||||
llama_adapter_lora * adapter,
|
||||
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
|
||||
index 30e550f0..85ad91b9 100644
|
||||
index 246fa577..7a6156ce 100644
|
||||
--- a/src/llama-cparams.h
|
||||
+++ b/src/llama-cparams.h
|
||||
@@ -29,6 +29,7 @@ struct llama_cparams {
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
@@ -31,6 +31,7 @@ struct llama_cparams {
|
||||
bool no_perf;
|
||||
+ bool cross_attn;
|
||||
bool warmup;
|
||||
bool op_offload;
|
||||
+ bool cross_attn;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
|
||||
index fabb9ca2..b67216a4 100644
|
||||
index b0e3f635..f14869cf 100644
|
||||
--- a/src/llama-graph.cpp
|
||||
+++ b/src/llama-graph.cpp
|
||||
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||
@@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
@@ -420,7 +350,7 @@ index fabb9ca2..b67216a4 100644
|
||||
//
|
||||
// llm_graph_context
|
||||
//
|
||||
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
||||
@@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
||||
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
|
||||
}
|
||||
|
||||
@@ -447,10 +377,10 @@ index fabb9ca2..b67216a4 100644
|
||||
llm_graph_input_attn_cross * inp,
|
||||
ggml_cgraph * gf,
|
||||
diff --git a/src/llama-graph.h b/src/llama-graph.h
|
||||
index d0c8d321..0fe18150 100644
|
||||
index 832a8c09..5a322785 100644
|
||||
--- a/src/llama-graph.h
|
||||
+++ b/src/llama-graph.h
|
||||
@@ -86,6 +86,7 @@ public:
|
||||
@@ -87,6 +87,7 @@ public:
|
||||
|
||||
ggml_tensor * tokens = nullptr; // I32 [n_batch]
|
||||
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
|
||||
@@ -458,7 +388,7 @@ index d0c8d321..0fe18150 100644
|
||||
};
|
||||
|
||||
class llm_graph_input_pos : public llm_graph_input_i {
|
||||
@@ -283,6 +284,16 @@ public:
|
||||
@@ -284,6 +285,16 @@ public:
|
||||
const llama_cross * cross = nullptr;
|
||||
};
|
||||
|
||||
@@ -475,7 +405,7 @@ index d0c8d321..0fe18150 100644
|
||||
//
|
||||
// llm_graph_result
|
||||
//
|
||||
@@ -491,6 +502,7 @@ struct llm_graph_context {
|
||||
@@ -495,6 +506,7 @@ struct llm_graph_context {
|
||||
ggml_tensor * build_inp_cls() const;
|
||||
ggml_tensor * build_inp_s_copy() const;
|
||||
ggml_tensor * build_inp_s_mask() const;
|
||||
@@ -535,11 +465,11 @@ index 48dce407..b6fc7e6d 100644
|
||||
};
|
||||
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 7c9d46d8..69f8d35a 100644
|
||||
index 3dcad65b..a7b0a7eb 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
|
||||
return false;
|
||||
@@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||
throw std::runtime_error("failed to create ggml context for kv cache");
|
||||
}
|
||||
|
||||
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||
@@ -557,8 +487,17 @@ index 7c9d46d8..69f8d35a 100644
|
||||
ggml_format_name(k, "cache_k_l%d", i);
|
||||
ggml_format_name(v, "cache_v_l%d", i);
|
||||
k_l.push_back(k);
|
||||
@@ -446,7 +454,7 @@ void llama_kv_cache_unified::set_full() {
|
||||
llama_sbatch llama_kv_cache_unified::sbatch_init(
|
||||
const llama_batch & batch,
|
||||
bool logits_all) {
|
||||
- return llama_sbatch(batch, hparams.n_embd, true, logits_all);
|
||||
+ return llama_sbatch(batch, batch.n_embd, true, logits_all);
|
||||
}
|
||||
|
||||
llama_ubatch llama_kv_cache_unified::ubatch_next(
|
||||
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
|
||||
index a012aeae..2e11507d 100644
|
||||
index 7f6617fa..2acfd4a8 100644
|
||||
--- a/src/llama-model-loader.cpp
|
||||
+++ b/src/llama-model-loader.cpp
|
||||
@@ -315,6 +315,8 @@ namespace GGUFMeta {
|
||||
@@ -571,10 +510,10 @@ index a012aeae..2e11507d 100644
|
||||
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
|
||||
const int kid = gguf_find_key(meta.get(), key.c_str());
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index 572378c9..9d099f11 100644
|
||||
index 831b68c0..e8298f56 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
// get general kv
|
||||
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
|
||||
@@ -582,7 +521,7 @@ index 572378c9..9d099f11 100644
|
||||
|
||||
// everything past this point is not vocab-related
|
||||
if (hparams.vocab_only) {
|
||||
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
@@ -590,7 +529,7 @@ index 572378c9..9d099f11 100644
|
||||
|
||||
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
||||
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
|
||||
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
@@ -602,7 +541,7 @@ index 572378c9..9d099f11 100644
|
||||
|
||||
// n_head_kv is optional, default to n_head
|
||||
hparams.n_head_kv_arr = hparams.n_head_arr;
|
||||
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||||
|
||||
@@ -611,7 +550,7 @@ index 572378c9..9d099f11 100644
|
||||
if (hparams.n_rot != hparams.n_embd_head_k) {
|
||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
||||
}
|
||||
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
hparams.use_kq_norm = false;
|
||||
}
|
||||
} break;
|
||||
@@ -628,7 +567,7 @@ index 572378c9..9d099f11 100644
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_ff = hparams.n_ff();
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
@@ -637,7 +576,7 @@ index 572378c9..9d099f11 100644
|
||||
const int64_t n_token_types = vocab.n_token_types();
|
||||
const int64_t n_rot = hparams.n_rot;
|
||||
const int64_t n_expert = hparams.n_expert;
|
||||
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
} break;
|
||||
@@ -690,7 +629,7 @@ index 572378c9..9d099f11 100644
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
|
||||
@@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
@@ -832,7 +771,7 @@ index 572378c9..9d099f11 100644
|
||||
+ // self attention layer
|
||||
+
|
||||
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
+
|
||||
+ // compute Q and K and RoPE them
|
||||
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@@ -937,7 +876,16 @@ index 572378c9..9d099f11 100644
|
||||
struct llm_build_deci : public llm_graph_context {
|
||||
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
@@ -12496,7 +12796,7 @@ struct llm_build_solar : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@@ -13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
{
|
||||
llm = std::make_unique<llm_build_llama>(*this, params, gf);
|
||||
} break;
|
||||
@@ -948,7 +896,7 @@ index 572378c9..9d099f11 100644
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
llm = std::make_unique<llm_build_deci>(*this, params, gf);
|
||||
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||
case LLM_ARCH_LLAMA:
|
||||
case LLM_ARCH_LLAMA4:
|
||||
@@ -957,7 +905,7 @@ index 572378c9..9d099f11 100644
|
||||
case LLM_ARCH_BAICHUAN:
|
||||
case LLM_ARCH_STARCODER:
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index 856e6042..6be91282 100644
|
||||
index 43746c7d..9281e629 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -11,6 +11,7 @@
|
||||
@@ -968,7 +916,7 @@ index 856e6042..6be91282 100644
|
||||
|
||||
struct llama_cparams;
|
||||
struct llama_ubatch;
|
||||
@@ -73,6 +74,7 @@ enum llm_type {
|
||||
@@ -74,6 +75,7 @@ enum llm_type {
|
||||
LLM_TYPE_40B,
|
||||
LLM_TYPE_65B,
|
||||
LLM_TYPE_70B,
|
||||
@@ -976,7 +924,7 @@ index 856e6042..6be91282 100644
|
||||
LLM_TYPE_236B,
|
||||
LLM_TYPE_290B,
|
||||
LLM_TYPE_314B,
|
||||
@@ -314,6 +316,16 @@ struct llama_layer {
|
||||
@@ -318,6 +320,16 @@ struct llama_layer {
|
||||
|
||||
struct ggml_tensor * bskcn_tv = nullptr;
|
||||
|
||||
@@ -994,7 +942,7 @@ index 856e6042..6be91282 100644
|
||||
|
||||
struct llama_layer_convnext convnext;
|
||||
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
|
||||
index 7dc54227..223e1f3f 100644
|
||||
index 820d5128..56531980 100644
|
||||
--- a/src/llama-quant.cpp
|
||||
+++ b/src/llama-quant.cpp
|
||||
@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
@@ -1008,3 +956,72 @@ index 7dc54227..223e1f3f 100644
|
||||
}
|
||||
|
||||
size_t total_size_org = 0;
|
||||
diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp
|
||||
index ebef8b3c..b0eb79bb 100644
|
||||
--- a/tools/mtmd/llava.cpp
|
||||
+++ b/tools/mtmd/llava.cpp
|
||||
@@ -462,7 +462,7 @@ struct llava_embd_batch {
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -474,6 +474,7 @@ struct llava_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
+ /*n_embd =*/ n_embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
@@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
n_eval = n_batch;
|
||||
}
|
||||
float * embd = image_embed->embed+i*n_embd;
|
||||
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
|
||||
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
|
||||
if (llama_decode(ctx_llama, llava_batch.batch)) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
|
||||
index 7a328867..61ebdd43 100644
|
||||
--- a/tools/mtmd/mtmd-helper.cpp
|
||||
+++ b/tools/mtmd/mtmd-helper.cpp
|
||||
@@ -58,7 +58,7 @@ struct decode_embd_batch {
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
pos .resize(n_tokens * n_pos_per_embd);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -69,6 +69,7 @@ struct decode_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
+ /*n_embd =*/ n_embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
@@ -131,6 +132,7 @@ struct decode_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ batch.embd + offset * n_mmproj_embd,
|
||||
+ /*n_embd =*/ batch.n_embd,
|
||||
/*pos =*/ pos_ptr,
|
||||
/*n_seq_id =*/ batch.n_seq_id + offset,
|
||||
/*seq_id =*/ batch.seq_id + offset,
|
||||
@@ -166,7 +168,8 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
|
||||
int32_t i_batch = 0;
|
||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
||||
- decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
|
||||
+ decode_embd_batch batch_embd(encoded_embd, n_embd, n_tokens, n_past, seq_id);
|
||||
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
|
||||
@@ -18,7 +18,7 @@ adds the unpad operator to GGML
10 files changed, 223 insertions(+), 2 deletions(-)
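For orientation before the ggml hunks: the new operator is the inverse of ggml_pad, trimming elements back off the end of each dimension. Its semantics can be expressed with existing ggml calls; the sketch below assumes the added API mirrors ggml_pad's (p0..p3) parameters and is not the patch's implementation:

    // Equivalent of "unpad" using existing ggml primitives: take a shrunken
    // 4-D view of the tensor and materialize it as a contiguous copy.
    struct ggml_tensor * unpad_equiv(struct ggml_context * ctx, struct ggml_tensor * a,
                                     int p0, int p1, int p2, int p3) {
        struct ggml_tensor * view = ggml_view_4d(ctx, a,
                a->ne[0] - p0, a->ne[1] - p1, a->ne[2] - p2, a->ne[3] - p3,
                a->nb[1], a->nb[2], a->nb[3], 0);
        return ggml_cont(ctx, view);
    }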
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||
index 1b8603e7..53ef31b2 100644
|
||||
index e91dedf1..8dc107ba 100644
|
||||
--- a/ggml/include/ggml.h
|
||||
+++ b/ggml/include/ggml.h
|
||||
@@ -489,6 +489,7 @@ extern "C" {
|
||||
@@ -29,7 +29,7 @@ index 1b8603e7..53ef31b2 100644
|
||||
GGML_OP_ARANGE,
|
||||
GGML_OP_TIMESTEP_EMBEDDING,
|
||||
GGML_OP_ARGSORT,
|
||||
@@ -1777,6 +1778,15 @@ extern "C" {
|
||||
@@ -1781,6 +1782,15 @@ extern "C" {
|
||||
int p0,
|
||||
int p1);
|
||||
|
||||
@@ -46,10 +46,10 @@ index 1b8603e7..53ef31b2 100644
|
||||
// timesteps: [N,]
|
||||
// return: [N, dim]
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index 64405449..34624cca 100644
|
||||
index a30e67f2..835e6495 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_pad_reflect_1d(params, tensor);
|
||||
} break;
|
||||
@@ -60,7 +60,7 @@ index 64405449..34624cca 100644
|
||||
case GGML_OP_ARANGE:
|
||||
{
|
||||
ggml_compute_forward_arange(params, tensor);
|
||||
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
@@ -69,10 +69,10 @@ index 64405449..34624cca 100644
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_ARGSORT:
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 7413192b..becdae07 100644
|
||||
index 955fec59..1868a10c 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
|
||||
@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,10 +147,10 @@ index dc081b9e..a7125555 100644
|
||||
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 04ce764e..491acccb 100644
|
||||
index cb0d8528..6fe86674 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_PAD:
|
||||
ggml_cuda_op_pad(ctx, dst);
|
||||
break;
|
||||
@@ -160,7 +160,7 @@ index 04ce764e..491acccb 100644
|
||||
case GGML_OP_ARANGE:
|
||||
ggml_cuda_op_arange(ctx, dst);
|
||||
break;
|
||||
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_UPSCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
||||
case GGML_OP_PAD:
|
||||
@@ -233,10 +233,10 @@ index 8fd386b0..e2ededc3 100644
|
||||
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 425524d0..112abef6 100644
|
||||
index 1b56f858..7641247e 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||
@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
||||
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
|
||||
@@ -244,7 +244,7 @@ index 425524d0..112abef6 100644
|
||||
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
|
||||
@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
|
||||
@@ -252,7 +252,7 @@ index 425524d0..112abef6 100644
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
@@ -260,7 +260,7 @@ index 425524d0..112abef6 100644
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
|
||||
@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
@@ -298,10 +298,10 @@ index 425524d0..112abef6 100644
|
||||
} break;
|
||||
case GGML_OP_ARANGE:
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index 9f4147e9..6ceb3cef 100644
|
||||
index 9cfddf45..080a943b 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
||||
@@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
||||
}
|
||||
}
|
||||
|
||||
@@ -354,7 +354,7 @@ index 9f4147e9..6ceb3cef 100644
|
||||
device char * dst,
|
||||
constant ggml_metal_kargs_arange & args,
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index 7654ae17..3c57aff8 100644
|
||||
index 8a654624..6b034d35 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
@@ -391,7 +391,7 @@ index 7654ae17..3c57aff8 100644
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
||||
@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@@ -12,10 +12,10 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index a9ee9f03..1306864e 100644
|
||||
index 806c1b3d..10f34d33 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
@@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
||||
regex_exprs = {
|
||||
"[\r\n]",
|
||||
|
||||
@@ -15,33 +15,102 @@ but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
src/llama-context.cpp | 105 +++++++++++++----------------------------
src/llama-context.h | 4 +-
src/llama-kv-cache.cpp | 39 +++------------
src/llama-kv-cache.h | 9 +++-
4 files changed, 51 insertions(+), 106 deletions(-)
src/llama-context.h | 1 +
src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
src/llama-kv-cache.h | 12 ++++-
3 files changed, 47 insertions(+), 73 deletions(-)
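The shape of the change described above, distilled from the kv-cache hunks below: the pending defragmentation moves are processed in chunks sized so that each defrag graph stays under the scheduler's node budget, and as many graphs as needed are built and computed (n_layer here stands in for model.hparams.n_layer; the chunk-size formula is the one visible in the diff):

    // Cap the number of KV moves per defrag graph, then loop until all moves
    // are applied instead of issuing one oversized graph.
    const uint32_t n_max_nodes = lctx.graph_max_nodes();
    const uint32_t max_moves   = (n_max_nodes - 2*n_layer) / (6*n_layer);

    for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
        const auto end = std::min(i + max_moves, defrag_info.moves.size());
        std::vector<llama_kv_defrag_move> chunk(defrag_info.moves.begin() + i,
                                                defrag_info.moves.begin() + end);
        // build_graph_defrag(...) and graph_compute(...) run once per chunk
    }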
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index cd06ad91..77177c5e 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
|
||||
diff --git a/src/llama-context.h b/src/llama-context.h
|
||||
index c4ab242a..9970dfc6 100644
|
||||
--- a/src/llama-context.h
|
||||
+++ b/src/llama-context.h
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "llama-cparams.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-adapter.h"
|
||||
+#include "llama-kv-cache.h"
|
||||
|
||||
llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
- ggml_cgraph * gf) const {
|
||||
+ ggml_cgraph * gf,
|
||||
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
|
||||
#include "ggml-cpp.h"
|
||||
#include "ggml-opt.h"
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index a7b0a7eb..1a50c034 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() {
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
- bool need_reserve = false;
|
||||
-
|
||||
auto * sched = lctx.get_sched();
|
||||
|
||||
if (has_shift) {
|
||||
@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
lctx.graph_compute(gf, false);
|
||||
-
|
||||
- need_reserve = true;
|
||||
}
|
||||
|
||||
{
|
||||
@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
|
||||
if (do_defrag) {
|
||||
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
|
||||
+ const uint32_t n_max_nodes = lctx.graph_max_nodes();
|
||||
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
|
||||
+ if (!defrag_prepare(n_max_nodes)) {
|
||||
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
|
||||
+ std::vector<struct llama_kv_defrag_move> chunk;
|
||||
+ auto end = std::min(i + max_moves, defrag_info.moves.size());
|
||||
+ chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end);
|
||||
|
||||
- if (defrag_prepare(lctx.graph_max_nodes())) {
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
auto * gf = lctx.graph_init();
|
||||
|
||||
- auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
|
||||
+ auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);
|
||||
|
||||
ggml_backend_sched_alloc_graph(sched, gf);
|
||||
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
lctx.graph_compute(gf, false);
|
||||
-
|
||||
- need_reserve = true;
|
||||
}
|
||||
|
||||
do_defrag = false;
|
||||
}
|
||||
|
||||
- return need_reserve;
|
||||
+ // we never need to reserve a worst case graph
|
||||
+ return false;
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::defrag_sched(float thold) {
|
||||
@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
||||
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
- ggml_cgraph * gf) const {
|
||||
+ ggml_cgraph * gf,
|
||||
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
|
||||
auto res = std::make_unique<llm_graph_result>();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
- const auto & ids = kv_self->defrag_info.ids;
|
||||
- const auto & ids = defrag_info.ids;
|
||||
-
|
||||
#if 0
|
||||
// CPU defrag
|
||||
//
|
||||
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
@@ -791,32 +795,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
}
|
||||
#else
|
||||
@@ -63,188 +132,63 @@ index cd06ad91..77177c5e 100644
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src));
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src));

ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst));

ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_src = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src));
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst));

view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_dst = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst));
+ move.len, n_embd_v_gqa,
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(v_l[il]->type, move.src));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_src = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
- ggml_row_size(kv_self->v_l[il]->type, i));
+ ggml_row_size(kv_self->v_l[il]->type, move.src));
ggml_row_size(v_l[il]->type, size),
- ggml_row_size(v_l[il]->type, i));
+ ggml_row_size(v_l[il]->type, move.src));

view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_dst = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
- ggml_row_size(kv_self->v_l[il]->type, id));
+ ggml_row_size(kv_self->v_l[il]->type, move.dst));
ggml_row_size(v_l[il]->type, size),
- ggml_row_size(v_l[il]->type, id));
+ ggml_row_size(v_l[il]->type, move.dst));
}

ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
}
-
- i += nm - 1;
}
-
- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif

return res;
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;

- bool need_reserve = false;
-
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);

graph_compute(gf, false);
-
- need_reserve = true;
}

{
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!kv->defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return;
+ }

- if (kv->defrag_prepare(graph_max_nodes())) {
- ggml_backend_sched_reset(sched.get());
+ for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
+ chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);

+ ggml_backend_sched_reset(sched.get());
auto * gf = graph_init();
-
- auto res = build_kv_self_defrag(ctx_compute.get(), gf);
-
+ auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
ggml_backend_sched_alloc_graph(sched.get(), gf);
-
res->set_inputs(nullptr);
-
graph_compute(gf, false);
-
- need_reserve = true;
}

kv->do_defrag = false;
}
-
- // reserve a worst case graph if needed
- if (need_reserve) {
- LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
-
- // build worst-case graph
- uint32_t n_seqs = 1; // TODO: worst-case number of sequences
- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- // simulate full KV cache
- kv_self->n = kv_self->size;
-
- llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
- auto * gf = graph_init();
- graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
-
- // initialize scheduler with the worst-case graph
- ggml_backend_sched_reset(sched.get());
- if (!ggml_backend_sched_reserve(sched.get(), gf)) {
- LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
- }
- }
}

enum llama_pooling_type llama_context::pooling_type() const {
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag();
+ kv_self_update();
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}

if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
+#include "llama-kv-cache.h"

#include "ggml-cpp.h"

@@ -179,7 +180,8 @@ private:

llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const;

// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 69f8d35a..35a750d3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {

assert(n_used <= n_kv);

@@ -263,7 +207,7 @@ index 69f8d35a..35a750d3 100644

// determine which KV cells to move where
//
@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
@@ -275,7 +219,7 @@ index 69f8d35a..35a750d3 100644

for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0];
@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory?
bool cont = false;

@@ -295,7 +239,7 @@ index 69f8d35a..35a750d3 100644
cont = false;
continue;
}
@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used;

if (!cont) {
@@ -307,7 +251,7 @@ index 69f8d35a..35a750d3 100644
}

nf++;
@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
}
}

@@ -325,37 +269,47 @@ index 69f8d35a..35a750d3 100644
return false;
}

- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
- LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
-
- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
- LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
+ // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);

return true;
}
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 56c74035..25cbcb56 100644
index bf3b4b6a..928b9712 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -43,6 +43,13 @@ private:
@@ -82,6 +82,13 @@ struct llama_kv_cache_guard {
private:
llama_kv_cache * kv;
};

+
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+ uint32_t src;
+ uint32_t dst;
+ uint32_t len;
+};
+
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
@@ -131,7 +138,7 @@ public:
// defrag

//
// llama_kv_cache_unified
@@ -207,7 +214,7 @@ private:

// defrag
struct {
- std::vector<uint32_t> ids;
+ std::vector<llama_kv_defrag_move> moves;
} defrag_info;

// return true if cells have been moved
@@ -249,7 +256,8 @@ private:
llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<llama_kv_defrag_move> & moves) const;

void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 43d9fc4f..4c0d3824 100644
index ddea5ad3..45918bf6 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)

@@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 4c0d3824..79c26312 100644
index 45918bf6..0beaed86 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)

@@ -53,15 +53,15 @@ index 381a9c7d..e45b453d 100644
}

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 1306864e..d6515ff6 100644
index 10f34d33..b098bb25 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1471,7 +1471,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);

const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
- size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
- const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 34624cca..59bd3c62 100644
index 835e6495..3902894b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index 34624cca..59bd3c62 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

ggml_compute_forward(&params, node);

@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index c0a5f934..75731053 100644
index 804b11e0..15a10ca8 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
@@ -1,38 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)

The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
pointer
- Build and allocate a new graph where that data is NULL

Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed

This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
ggml/src/ggml-alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;