kvcache: create cache ctx per layer

Each cache layer now creates and maintains its own context, instead of
sharing one large context across all layers.
This commit is contained in:
Michael Yang
2025-02-25 12:57:49 -08:00
parent bfce55db3d
commit 764e199d67
4 changed files with 68 additions and 46 deletions

View File

@@ -339,14 +339,15 @@ func (b *Backend) Get(name string) ml.Tensor {
}
func (b *Backend) NewContext() ml.Context {
maxTensors := max(8192, len(b.meta.Tensors().Items())*5)
maxGraphNodes := max(8192, len(b.meta.Tensors().Items())*5)
return &Context{
b: b,
maxTensors: maxTensors,
ctx: C.ggml_init(C.struct_ggml_init_params{
mem_size: C.size_t(maxTensors)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(maxTensors), false),
mem_size: C.size_t(maxGraphNodes)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(maxGraphNodes), false),
no_alloc: true,
}),
backend: C.ggml_backend_sched_get_backend(b.sched, 0),
maxGraphNodes: maxGraphNodes,
}
}
@@ -363,13 +364,14 @@ type Context struct {
ctx *C.struct_ggml_context
graph *C.struct_ggml_cgraph
backend *C.struct_ggml_backend
maxTensors int
maxGraphNodes int
}
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
if c.graph == nil {
c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxTensors), false)
c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
}
for _, tensor := range tensors {
@@ -399,8 +401,8 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
}
}
func (c *Context) MaxTensors() int {
return c.maxTensors
func (c *Context) MaxGraphNodes() int {
return c.maxGraphNodes
}
func shapeToGGML(shape []int) *C.int64_t {
@@ -435,7 +437,7 @@ func newTensor(ctx Context, dtype ml.DType, shape []int) ml.Tensor {
panic("unsupported dtype")
}
b := C.ggml_backend_alloc_buffer(C.ggml_backend_sched_get_backend(ctx.b.sched, 0), C.ggml_nbytes(t))
b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
C.ggml_set_input(t)
return &Tensor{b: ctx.b, t: t}
@@ -469,7 +471,7 @@ func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype u
}
t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), shapeToGGML(shape))
b := C.ggml_backend_alloc_buffer(C.ggml_backend_sched_get_backend(ctx.b.sched, 0), C.ggml_nbytes(t))
b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
C.ggml_set_input(t)
@@ -484,8 +486,8 @@ func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
return fromSlice(c, s, shape, C.GGML_TYPE_I32)
}
func (c *Context) Close() {
if c != nil {
func (c Context) Close() {
if c.ctx != nil {
C.ggml_free(c.ctx)
}
}