ml: Add support for quantized KV cache

As with the llama engine, quantizing the KV cache requires
flash attention to be enabled on the Ollama server.
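As an illustration only (not part of this commit), here is a sketch of how a server-side helper might turn a requested cache type into one of the new ml.DType values, falling back to f16 when flash attention is off. The package name, function name, signature, and the "q8_0"/"q4_0" strings are assumptions for this sketch, not the commit's actual code; only the ml.DType constants come from the diff below.

// Hypothetical helper: choose the KV cache dtype from a user-facing cache
// type string. A quantized cache needs flash attention, so fall back to
// f16 when it is disabled.
package server

import "github.com/ollama/ollama/ml"

func kvCacheDType(cacheType string, flashAttention bool) ml.DType {
	if !flashAttention {
		return ml.DTypeF16
	}
	switch cacheType {
	case "q8_0":
		return ml.DTypeQ80
	case "q4_0":
		return ml.DTypeQ40
	default:
		return ml.DTypeF16
	}
}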
Author:    Jesse Gross
Date:      2025-02-21 20:54:14 -08:00
Committer: Jesse Gross
Parent:    f52b2615ef
Commit:    4100ed7bdd
3 changed files with 13 additions and 3 deletions


@@ -535,6 +535,10 @@ func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 		cdtype = C.GGML_TYPE_F32
 	case ml.DTypeF16:
 		cdtype = C.GGML_TYPE_F16
+	case ml.DTypeQ80:
+		cdtype = C.GGML_TYPE_Q8_0
+	case ml.DTypeQ40:
+		cdtype = C.GGML_TYPE_Q4_0
 	case ml.DTypeI32:
 		cdtype = C.GGML_TYPE_I32
 	default:
@@ -680,6 +684,10 @@ func (t *Tensor) DType() ml.DType {
 		return ml.DTypeF32
 	case C.GGML_TYPE_F16:
 		return ml.DTypeF16
+	case C.GGML_TYPE_Q8_0:
+		return ml.DTypeQ80
+	case C.GGML_TYPE_Q4_0:
+		return ml.DTypeQ40
 	case C.GGML_TYPE_I32:
 		return ml.DTypeI32
 	default:
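The two switches above are inverses: a tensor allocated with one of the new quantized dtypes should report the same dtype back through DType(). A rough usage sketch follows, assuming the ml.Context interface exposes an Empty(dtype, shape...) allocator that routes through newTensor (the allocator name and the idea of a dedicated cache constructor are assumptions, and the imports match the sketch above).

// Sketch: allocate a q8_0 KV cache tensor and check that the dtype
// round-trips (DTypeQ80 -> GGML_TYPE_Q8_0 in newTensor, and back in
// Tensor.DType).
func newQuantizedCache(ctx ml.Context, headDim, numSlots int) ml.Tensor {
	t := ctx.Empty(ml.DTypeQ80, headDim, numSlots)
	if t.DType() != ml.DTypeQ80 {
		panic("quantized KV cache dtype did not round-trip")
	}
	return t
}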