mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-12 00:37:04 +00:00
runner.go: Retry decoding after defragmentation if needed
Fragmentation of the KV cache can occur due to cache shifting or different sequences getting processed. Decode uses a heuristic to decide whether it should defragment first. However, this heuristic isn't 100% accurate, so decoding can sometimes fail unexpectedly. In those cases, if Decode reports that there is no KV cache space, we should defragment the cache and then retry the decode once.
This commit is contained in:
@@ -426,8 +426,15 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 
 	err := s.lc.Decode(batch)
 	if err != nil {
-		slog.Error("failed to decode batch", "error", err)
-		return
+		if errors.Is(err, llama.ErrKvCacheFull) {
+			slog.Debug("defragmenting kv cache")
+			s.cache.lc.KvCacheDefrag()
+			err = s.lc.Decode(batch)
+		}
+		if err != nil {
+			slog.Error("failed to decode batch", "error", err)
+			return
+		}
 	}
 
 	if crossAttention {
Reference in New Issue
Block a user