ml: Add support for quantized KV cache

Similar to the llama engine, quantizing the KV cache requires
flash attention to be enabled through the Ollama server.
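In practice this means the server must be started with flash attention enabled before a quantized cache type takes effect; in recent Ollama releases that is controlled through the OLLAMA_FLASH_ATTENTION and OLLAMA_KV_CACHE_TYPE environment variables. A minimal sketch of that gating, using assumed names rather than code from this commit:

package main

import "fmt"

// effectiveKVCacheType is a sketch with assumed names (not this commit's
// code): a quantized cache type only applies when flash attention is on;
// otherwise the server falls back to the unquantized f16 default.
func effectiveKVCacheType(flashAttention bool, requested string) string {
    if !flashAttention {
        return "f16"
    }
    return requested // e.g. "q8_0" or "q4_0"
}

func main() {
    fmt.Println(effectiveKVCacheType(true, "q8_0"))  // q8_0
    fmt.Println(effectiveKVCacheType(false, "q8_0")) // f16
}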
Author:       Jesse Gross
Date:         2025-02-21 20:54:14 -08:00
Committed by: Jesse Gross
Parent:       f52b2615ef
Commit:       4100ed7bdd
3 changed files with 13 additions and 3 deletions

@@ -58,9 +58,9 @@ func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots
 func kvCacheTypeFromStr(s string) ml.DType {
 	switch s {
 	case "q8_0":
-		panic("kv cache quantization not yet implemented")
+		return ml.DTypeQ80
 	case "q4_0":
-		panic("kv cache quantization not yet implemented")
+		return ml.DTypeQ40
 	default:
 		return ml.DTypeF16
 	}
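
For illustration, a self-contained mirror of the patched mapping, with stand-in DType constants since the ml package's internals aren't shown here; it demonstrates the new returns and the f16 fallback for unknown types:

package main

import "fmt"

// DType stands in for ml.DType; the constants mirror the ones used in
// the diff above.
type DType string

const (
    DTypeF16 DType = "f16"
    DTypeQ80 DType = "q8_0"
    DTypeQ40 DType = "q4_0"
)

// kvCacheTypeFromStr mirrors the patched function: known quantized
// types map to their DType, anything else falls back to f16.
func kvCacheTypeFromStr(s string) DType {
    switch s {
    case "q8_0":
        return DTypeQ80
    case "q4_0":
        return DTypeQ40
    default:
        return DTypeF16
    }
}

func main() {
    for _, s := range []string{"q8_0", "q4_0", "f16", "unknown"} {
        fmt.Printf("%q -> %s\n", s, kvCacheTypeFromStr(s))
    }
}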