mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
kvcache: create cache ctx per layer
Each cache layer creates and maintains its own context instead of using a single large context shared by all layers.
This commit is contained in:
@@ -35,13 +35,17 @@ type EncoderCache struct {
|
||||
encoderPos int32
|
||||
|
||||
// ** cache data storage **
|
||||
|
||||
cacheCtx ml.Context
|
||||
keys, values []ml.Tensor
|
||||
backend ml.Backend
|
||||
ctxs map[int]ml.Context
|
||||
keys, values map[int]ml.Tensor
|
||||
}
|
||||
|
||||
func NewEncoderCache() *EncoderCache {
|
||||
return &EncoderCache{}
|
||||
return &EncoderCache{
|
||||
ctxs: make(map[int]ml.Context),
|
||||
keys: make(map[int]ml.Tensor),
|
||||
values: make(map[int]ml.Tensor),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
|
||||
@@ -57,7 +61,7 @@ func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32)
|
||||
panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
|
||||
}
|
||||
|
||||
c.cacheCtx = backend.NewContext()
|
||||
c.backend = backend
|
||||
}
|
||||
|
||||
func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
|
||||
@@ -69,7 +73,9 @@ func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
|
||||
}
|
||||
|
||||
func (c *EncoderCache) Close() {
|
||||
c.cacheCtx.Close()
|
||||
for _, ctx := range c.ctxs {
|
||||
ctx.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func (c *EncoderCache) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
|
||||
@@ -80,11 +86,6 @@ func (c *EncoderCache) StartForward(ctx ml.Context, positions []int32, seqs []in
|
||||
}
|
||||
|
||||
func (c *EncoderCache) SetLayer(layer int) {
|
||||
if layer >= len(c.keys) {
|
||||
c.keys = append(c.keys, make([]ml.Tensor, layer-len(c.keys)+1)...)
|
||||
c.values = append(c.values, make([]ml.Tensor, layer-len(c.values)+1)...)
|
||||
}
|
||||
|
||||
c.curLayer = layer
|
||||
}
|
||||
|
||||
@@ -104,9 +105,16 @@ func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
|
||||
value = value.Permute(ctx, 1, 2, 0, 3)
|
||||
}
|
||||
|
||||
if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
|
||||
c.keys[c.curLayer] = c.cacheCtx.Empty(key.DType(), key.Shape()...)
|
||||
c.values[c.curLayer] = c.cacheCtx.Empty(value.DType(), value.Shape()...)
|
||||
if _, ok := c.ctxs[c.curLayer]; !ok {
|
||||
c.ctxs[c.curLayer] = c.backend.NewContext()
|
||||
}
|
||||
|
||||
if _, ok := c.keys[c.curLayer]; !ok {
|
||||
c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...)
|
||||
}
|
||||
|
||||
if _, ok := c.values[c.curLayer]; !ok {
|
||||
c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...)
|
||||
}
|
||||
|
||||
ctx.Forward(
|
||||
|
||||
Reference in New Issue
Block a user