mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-12 16:57:04 +00:00
runner.go: Make KV entry accounting more robust
The structure of the accounting for KV cache shifting was carried over from the old runner but it now doesn't feel natural with the new runner. There are a number of invariants that should hold true but are difficult to reason about. There is at least one bug report that would imply that the invariants are not holding. This reduces the number of implicit assumptions and is more forgiving of unexpected situations. It also improves behavior around which input tokens are kept when truncation occurs. Bug #7545
This commit is contained in:
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"reflect"
|
||||
"time"
|
||||
@@ -22,7 +23,11 @@ type InputCache struct {
|
||||
lc *llama.Context
|
||||
}
|
||||
|
||||
func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) *InputCache {
|
||||
func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) (*InputCache, error) {
|
||||
if kvSize/numSlots < 1 {
|
||||
return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
|
||||
}
|
||||
|
||||
slots := make([]InputCacheSlot, numSlots)
|
||||
|
||||
for i := range slots {
|
||||
@@ -37,7 +42,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
|
||||
slots: slots,
|
||||
multiUserCache: multiUserCache,
|
||||
lc: lc,
|
||||
}
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Locking: Operations on InputCacheSlot (including finding one
|
||||
@@ -58,7 +63,7 @@ type InputCacheSlot struct {
|
||||
lastUsed time.Time
|
||||
}
|
||||
|
||||
func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, int, error) {
|
||||
func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
|
||||
var slot *InputCacheSlot
|
||||
var numPast int
|
||||
var err error
|
||||
@@ -75,7 +80,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
|
||||
slot, numPast, err = c.findBestCacheSlot(prompt)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, nil, 0, err
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
if !cachePrompt {
|
||||
@@ -102,7 +107,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
|
||||
prompt = prompt[numPast:]
|
||||
slot.Inputs = slot.Inputs[:numPast]
|
||||
|
||||
return slot, prompt, numPast, nil
|
||||
return slot, prompt, nil
|
||||
}
|
||||
|
||||
func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
|
||||
@@ -194,14 +199,30 @@ func countCommonPrefix(a []input, b []input) int {
|
||||
return count
|
||||
}
|
||||
|
||||
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscard int, numPast int) {
|
||||
// TODO (jessegross): KV cache removal can fail for certain types of models
|
||||
// server.cpp doesn't handle this, though we can be more graceful
|
||||
c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+numDiscard)
|
||||
c.lc.KvCacheSeqAdd(slot.Id, numKeep+numDiscard, numPast, -numDiscard)
|
||||
// Frees up space in the KV cache by deleting the oldest half of history and shifting
|
||||
// the newest half into that space (saving numKeep inputs at the beginning).
|
||||
//
|
||||
// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
|
||||
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) {
|
||||
targetFree := (c.numCtx - numKeep) / 2
|
||||
targetFree = max(targetFree, 1)
|
||||
|
||||
for i := numKeep + numDiscard; i < len(slot.Inputs); i++ {
|
||||
slot.Inputs[i-numDiscard] = slot.Inputs[i]
|
||||
currentFree := c.numCtx - len(slot.Inputs)
|
||||
discard := targetFree - currentFree
|
||||
|
||||
if discard <= 0 {
|
||||
return
|
||||
}
|
||||
slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
|
||||
|
||||
slog.Debug("context limit hit - shifting", "limit", c.numCtx, "input", len(slot.Inputs),
|
||||
"keep", numKeep, "discard", discard)
|
||||
|
||||
// TODO (jessegross): KV cache removal can fail for certain types of models
|
||||
c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard)
|
||||
c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard)
|
||||
|
||||
for i := numKeep + discard; i < len(slot.Inputs); i++ {
|
||||
slot.Inputs[i-discard] = slot.Inputs[i]
|
||||
}
|
||||
slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user