llama: update to commit e1e8e099 (#10513)

Jeffrey Morgan
2025-05-01 18:24:09 -07:00
committed by GitHub
parent e6d2d04121
commit 8dd12c873d
68 changed files with 3783 additions and 1774 deletions

@@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
4 files changed, 51 insertions(+), 106 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 0343ba8a..4b3e6a83 100644
+index cd06ad91..77177c5e 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
-@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
+@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_context * ctx0,
@@ -41,7 +41,7 @@ index 0343ba8a..4b3e6a83 100644
#if 0
// CPU defrag
//
-@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
@@ -79,7 +79,7 @@ index 0343ba8a..4b3e6a83 100644
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
-@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
@@ -122,7 +122,7 @@ index 0343ba8a..4b3e6a83 100644
#endif
return res;
-@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
@@ -131,7 +131,7 @@ index 0343ba8a..4b3e6a83 100644
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
-@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
+@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
@@ -140,7 +140,7 @@ index 0343ba8a..4b3e6a83 100644
}
{
-@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
+@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
@@ -202,7 +202,7 @@ index 0343ba8a..4b3e6a83 100644
}
enum llama_pooling_type llama_context::pooling_type() const {
-@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
@@ -219,7 +219,7 @@ index 0343ba8a..4b3e6a83 100644
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
-index baa03276..a59ff8fd 100644
+index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
@@ -230,7 +230,7 @@ index baa03276..a59ff8fd 100644
#include "ggml-cpp.h"
-@@ -180,7 +181,8 @@ private:
+@@ -179,7 +180,8 @@ private:
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,