llama: update to commit de4c07f93 (#10655)

This commit is contained in:
Jeffrey Morgan
2025-05-12 12:17:26 -07:00
committed by GitHub
parent ad035ad595
commit 0cefd46f23
113 changed files with 8097 additions and 4383 deletions

View File

@@ -15,33 +15,102 @@ but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
src/llama-context.cpp | 105 +++++++++++++----------------------------
src/llama-context.h | 4 +-
src/llama-kv-cache.cpp | 39 +++------------
src/llama-kv-cache.h | 9 +++-
4 files changed, 51 insertions(+), 106 deletions(-)
src/llama-context.h | 1 +
src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
src/llama-kv-cache.h | 12 ++++-
3 files changed, 47 insertions(+), 73 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index cd06ad91..77177c5e 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
diff --git a/src/llama-context.h b/src/llama-context.h
index c4ab242a..9970dfc6 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
+#include "llama-kv-cache.h"
llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const {
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
#include "ggml-cpp.h"
#include "ggml-opt.h"
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a7b0a7eb..1a50c034 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() {
}
bool llama_kv_cache_unified::update(llama_context & lctx) {
- bool need_reserve = false;
-
auto * sched = lctx.get_sched();
if (has_shift) {
@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
res->set_inputs(nullptr);
lctx.graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
if (do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = lctx.graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return false;
+ }
+
+ for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, defrag_info.moves.size());
+ chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end);
- if (defrag_prepare(lctx.graph_max_nodes())) {
ggml_backend_sched_reset(sched);
auto * gf = lctx.graph_init();
- auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
+ auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);
ggml_backend_sched_alloc_graph(sched, gf);
res->set_inputs(nullptr);
lctx.graph_compute(gf, false);
-
- need_reserve = true;
}
do_defrag = false;
}
- return need_reserve;
+ // we never need to reserve a worst case graph
+ return false;
}
void llama_kv_cache_unified::defrag_sched(float thold) {
@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const {
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
auto res = std::make_unique<llm_graph_result>();
const auto & hparams = model.hparams;
- const auto & ids = kv_self->defrag_info.ids;
- const auto & ids = defrag_info.ids;
-
#if 0
// CPU defrag
//
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -791,32 +795,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
@@ -63,188 +132,63 @@ index cd06ad91..77177c5e 100644
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src));
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_src = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src));
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_dst = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst));
+ move.len, n_embd_v_gqa,
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(v_l[il]->type, move.src));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_src = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
- ggml_row_size(kv_self->v_l[il]->type, i));
+ ggml_row_size(kv_self->v_l[il]->type, move.src));
ggml_row_size(v_l[il]->type, size),
- ggml_row_size(v_l[il]->type, i));
+ ggml_row_size(v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_dst = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
- ggml_row_size(kv_self->v_l[il]->type, id));
+ ggml_row_size(kv_self->v_l[il]->type, move.dst));
ggml_row_size(v_l[il]->type, size),
- ggml_row_size(v_l[il]->type, id));
+ ggml_row_size(v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
}
-
- i += nm - 1;
}
-
- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif
return res;
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
- bool need_reserve = false;
-
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!kv->defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return;
+ }
- if (kv->defrag_prepare(graph_max_nodes())) {
- ggml_backend_sched_reset(sched.get());
+ for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
+ chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
+ ggml_backend_sched_reset(sched.get());
auto * gf = graph_init();
-
- auto res = build_kv_self_defrag(ctx_compute.get(), gf);
-
+ auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
ggml_backend_sched_alloc_graph(sched.get(), gf);
-
res->set_inputs(nullptr);
-
graph_compute(gf, false);
-
- need_reserve = true;
}
kv->do_defrag = false;
}
-
- // reserve a worst case graph if needed
- if (need_reserve) {
- LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
-
- // build worst-case graph
- uint32_t n_seqs = 1; // TODO: worst-case number of sequences
- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- // simulate full KV cache
- kv_self->n = kv_self->size;
-
- llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
- auto * gf = graph_init();
- graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
-
- // initialize scheduler with the worst-case graph
- ggml_backend_sched_reset(sched.get());
- if (!ggml_backend_sched_reserve(sched.get(), gf)) {
- LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
- }
- }
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag();
+ kv_self_update();
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
+#include "llama-kv-cache.h"
#include "ggml-cpp.h"
@@ -179,7 +180,8 @@ private:
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const;
// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 69f8d35a..35a750d3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
assert(n_used <= n_kv);
@@ -263,7 +207,7 @@ index 69f8d35a..35a750d3 100644
// determine which KV cells to move where
//
@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
@@ -275,7 +219,7 @@ index 69f8d35a..35a750d3 100644
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0];
@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory?
bool cont = false;
@@ -295,7 +239,7 @@ index 69f8d35a..35a750d3 100644
cont = false;
continue;
}
@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used;
if (!cont) {
@@ -307,7 +251,7 @@ index 69f8d35a..35a750d3 100644
}
nf++;
@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
}
}
@@ -325,37 +269,47 @@ index 69f8d35a..35a750d3 100644
return false;
}
- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
- LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
-
- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
- LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
+ // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
return true;
}
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 56c74035..25cbcb56 100644
index bf3b4b6a..928b9712 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -43,6 +43,13 @@ private:
@@ -82,6 +82,13 @@ struct llama_kv_cache_guard {
private:
llama_kv_cache * kv;
};
+
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+ uint32_t src;
+ uint32_t dst;
+ uint32_t len;
+};
+
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
@@ -131,7 +138,7 @@ public:
// defrag
//
// llama_kv_cache_unified
@@ -207,7 +214,7 @@ private:
// defrag
struct {
- std::vector<uint32_t> ids;
+ std::vector<llama_kv_defrag_move> moves;
} defrag_info;
// return true if cells have been moved
@@ -249,7 +256,8 @@ private:
llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<llama_kv_defrag_move> & moves) const;
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;