kvcache: create cache ctx per layer

Each cache layer now creates and maintains its own context, instead of all layers sharing a single large context.
2025-12-10 15:57:04 +00:00 · 2025-02-25 12:57:49 -08:00
parent bfce55db3d
commit 764e199d67
4 changed files with 68 additions and 46 deletions
--- a/kvcache/encoder.go
+++ b/kvcache/encoder.go
@@ -35,13 +35,17 @@ type EncoderCache struct {
 	encoderPos int32

 	// ** cache data storage **
-
-	cacheCtx     ml.Context
-	keys, values []ml.Tensor
+	backend      ml.Backend
+	ctxs         map[int]ml.Context
+	keys, values map[int]ml.Tensor
 }

 func NewEncoderCache() *EncoderCache {
-	return &EncoderCache{}
+	return &EncoderCache{
+		ctxs:   make(map[int]ml.Context),
+		keys:   make(map[int]ml.Tensor),
+		values: make(map[int]ml.Tensor),
+	}
 }

 func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
@@ -57,7 +61,7 @@ func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32)
 		panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
 	}

-	c.cacheCtx = backend.NewContext()
+	c.backend = backend
 }

 func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
@@ -69,7 +73,9 @@ func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
 }

 func (c *EncoderCache) Close() {
-	c.cacheCtx.Close()
+	for _, ctx := range c.ctxs {
+		ctx.Close()
+	}
 }

 func (c *EncoderCache) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
@@ -80,11 +86,6 @@ func (c *EncoderCache) StartForward(ctx ml.Context, positions []int32, seqs []in
 }

 func (c *EncoderCache) SetLayer(layer int) {
-	if layer >= len(c.keys) {
-		c.keys = append(c.keys, make([]ml.Tensor, layer-len(c.keys)+1)...)
-		c.values = append(c.values, make([]ml.Tensor, layer-len(c.values)+1)...)
-	}
-
 	c.curLayer = layer
 }

@@ -104,9 +105,16 @@ func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
 		value = value.Permute(ctx, 1, 2, 0, 3)
 	}

-	if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
-		c.keys[c.curLayer] = c.cacheCtx.Empty(key.DType(), key.Shape()...)
-		c.values[c.curLayer] = c.cacheCtx.Empty(value.DType(), value.Shape()...)
+	if _, ok := c.ctxs[c.curLayer]; !ok {
+		c.ctxs[c.curLayer] = c.backend.NewContext()
+	}
+
+	if _, ok := c.keys[c.curLayer]; !ok {
+		c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...)
+	}
+
+	if _, ok := c.values[c.curLayer]; !ok {
+		c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...)
 	}

 	ctx.Forward(