runner.go: Retry decoding after defragmentation if needed

Fragmentation of the KV cache can occur due to cache shifting or different sequences being processed. Decode uses a heuristic to decide whether it should defrag. However, this heuristic isn't 100% accurate, so decoding can sometimes fail unexpectedly. For these cases, if decode indicates that there is no KV cache space (llama.ErrKvCacheFull), we should defrag and then retry the decode once.
2025-12-12 00:37:04 +00:00 · 2024-11-19 11:00:41 -08:00
parent 5f68fcab12
commit 7121dfa309
3 changed files with 50 additions and 6 deletions
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -426,8 +426,15 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)

 	err := s.lc.Decode(batch)
 	if err != nil {
-		slog.Error("failed to decode batch", "error", err)
-		return
+		if errors.Is(err, llama.ErrKvCacheFull) {
+			slog.Debug("defragmenting kv cache")
+			s.cache.lc.KvCacheDefrag()
+			err = s.lc.Decode(batch)
+		}
+		if err != nil {
+			slog.Error("failed to decode batch", "error", err)
+			return
+		}
 	}

 	if crossAttention {