mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-10 15:57:04 +00:00)
ml: Add support for quantized KV cache
Similar to the llama engine, quantizing the KV cache requires flash attention to be enabled through the Ollama server.
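As a usage sketch (not something this commit changes), enabling the quantized cache is a server-side configuration step: flash attention must be on and a cache type selected. Assuming the same environment variables the llama engine honors, OLLAMA_FLASH_ATTENTION and OLLAMA_KV_CACHE_TYPE, a server might resolve them roughly as follows; the fallback policy shown is an assumption for illustration, not taken from this diff.

// Hypothetical sketch: resolving KV cache settings from server environment
// variables. OLLAMA_FLASH_ATTENTION and OLLAMA_KV_CACHE_TYPE are the names
// the llama engine uses; whether this engine reads them identically is an
// assumption here.
package main

import (
	"fmt"
	"os"
)

func main() {
	flashAttention := os.Getenv("OLLAMA_FLASH_ATTENTION") == "1"
	kvCacheType := os.Getenv("OLLAMA_KV_CACHE_TYPE") // e.g. "q8_0", "q4_0", "f16"

	// Quantized cache types only make sense with flash attention enabled,
	// so fall back to f16 otherwise (illustrative policy, not from this commit).
	if kvCacheType != "" && kvCacheType != "f16" && !flashAttention {
		fmt.Println("ignoring OLLAMA_KV_CACHE_TYPE: flash attention is disabled")
		kvCacheType = "f16"
	}
	fmt.Printf("flash attention: %v, kv cache type: %q\n", flashAttention, kvCacheType)
}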
@@ -58,9 +58,9 @@ func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots
 func kvCacheTypeFromStr(s string) ml.DType {
 	switch s {
 	case "q8_0":
-		panic("kv cache quantization not yet implemented")
+		return ml.DTypeQ80
 	case "q4_0":
-		panic("kv cache quantization not yet implemented")
+		return ml.DTypeQ40
 	default:
 		return ml.DTypeF16
 	}
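For reference, the mapping this hunk completes can be exercised in isolation. The sketch below inlines a stand-in DType enum so it runs on its own; the real code returns ml.DType values and lives alongside NewInputCache, as the hunk header shows.

// Self-contained sketch of the string-to-cache-type mapping from the diff.
// DTypeF16/Q80/Q40 stand in for the ml.DType constants.
package main

import "fmt"

type DType int

const (
	DTypeF16 DType = iota
	DTypeQ80
	DTypeQ40
)

// kvCacheTypeFromStr mirrors the diff: unrecognized or empty strings fall
// back to f16 instead of returning an error.
func kvCacheTypeFromStr(s string) DType {
	switch s {
	case "q8_0":
		return DTypeQ80
	case "q4_0":
		return DTypeQ40
	default:
		return DTypeF16
	}
}

func main() {
	for _, s := range []string{"q8_0", "q4_0", "f16", ""} {
		fmt.Printf("%q -> %v\n", s, kvCacheTypeFromStr(s))
	}
}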