kvcache: Add check for values that fall out of sliding window cache

The sliding window cache trims entries that fall outside the window for the latest token. This works when we are extending the cache, such as when the conversation continues. However, if the new prompt only partially overlaps with the cached conversation (including the BOS tokens), we resume from an earlier point in the conversation, and the tokens needed for that point's window are no longer stored in memory. This change verifies that the new window overlaps with the old one before reusing the cache.

Co-authored-by: Jesse Gross <jesse@ollama.com>
2025-12-10 07:46:59 +00:00 · 2025-03-30 16:05:40 -07:00
parent 493385eb3e
commit b42970063d
7 changed files with 131 additions and 2 deletions
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -118,6 +118,10 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
 	}

 	if c.cache != nil {
+		if numPast > 0 && !c.cache.CanResume(slot.Id, numPast) {
+			numPast = 0
+		}
+
 		err = c.cache.Remove(slot.Id, numPast, math.MaxInt32)
 		if err != nil {
 			// Some models don't support partial erasure
--- a/runner/ollamarunner/cache_test.go
+++ b/runner/ollamarunner/cache_test.go
@@ -451,6 +451,7 @@ func (m *mockCache) Close()
 func (m *mockCache) StartForward(ctx ml.Context, batch input.Batch) error                          { return nil }
 func (m *mockCache) CopyPrefix(srcSeq, dstSeq int, len int32)                                      {}
 func (m *mockCache) SetConfig(ml.CacheConfig)                                                      {}
+func (m *mockCache) CanResume(seq int, pos int32) bool                                             { return true }

 func TestShiftCacheSlot(t *testing.T) {
 	tests := []struct {