ml: Add support for quantized KV cache

Similar to the llama engine, quantizing the KV cache requires
flash attention to be enabled through the Ollama server.
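In practice this means the server must be started with flash attention enabled before a quantized cache type takes effect; in recent Ollama releases that is controlled through the OLLAMA_FLASH_ATTENTION and OLLAMA_KV_CACHE_TYPE environment variables. A minimal sketch of that gating, using assumed names rather than code from this commit:

package main

import "fmt"

// effectiveKVCacheType is a sketch with assumed names (not this commit's
// code): a quantized cache type only applies when flash attention is on;
// otherwise the server falls back to the unquantized f16 default.
func effectiveKVCacheType(flashAttention bool, requested string) string {
    if !flashAttention {
        return "f16"
    }
    return requested // e.g. "q8_0" or "q4_0"
}

func main() {
    fmt.Println(effectiveKVCacheType(true, "q8_0"))  // q8_0
    fmt.Println(effectiveKVCacheType(false, "q8_0")) // f16
}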
Author:       Jesse Gross
Date:         2025-02-21 20:54:14 -08:00
Committed by: Jesse Gross
Parent:       f52b2615ef
Commit:       4100ed7bdd
3 changed files with 13 additions and 3 deletions

@@ -58,9 +58,9 @@ func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots
 func kvCacheTypeFromStr(s string) ml.DType {
 	switch s {
 	case "q8_0":
-		panic("kv cache quantization not yet implemented")
+		return ml.DTypeQ80
 	case "q4_0":
-		panic("kv cache quantization not yet implemented")
+		return ml.DTypeQ40
 	default:
 		return ml.DTypeF16
 	}
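
For illustration, a self-contained mirror of the patched mapping, with stand-in DType constants since the ml package's internals aren't shown here; it demonstrates the new returns and the f16 fallback for unknown types:

package main

import "fmt"

// DType stands in for ml.DType; the constants mirror the ones used in
// the diff above.
type DType string

const (
    DTypeF16 DType = "f16"
    DTypeQ80 DType = "q8_0"
    DTypeQ40 DType = "q4_0"
)

// kvCacheTypeFromStr mirrors the patched function: known quantized
// types map to their DType, anything else falls back to f16.
func kvCacheTypeFromStr(s string) DType {
    switch s {
    case "q8_0":
        return DTypeQ80
    case "q4_0":
        return DTypeQ40
    default:
        return DTypeF16
    }
}

func main() {
    for _, s := range []string{"q8_0", "q4_0", "f16", "unknown"} {
        fmt.Printf("%q -> %s\n", s, kvCacheTypeFromStr(s))
    }
}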