ml: Add support for quantized KV cache

As with the llama engine, quantizing the KV cache requires
flash attention to be enabled through the Ollama server.
Author:    Jesse Gross
Date:      2025-02-21 20:54:14 -08:00
Committer: Jesse Gross
Parent:    f52b2615ef
Commit:    4100ed7bdd

3 changed files with 13 additions and 3 deletions
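As a usage note (not part of this commit): on the server side, a quantized KV cache is controlled through environment variables. The sketch below assumes the documented OLLAMA_FLASH_ATTENTION and OLLAMA_KV_CACHE_TYPE settings; the helper name and the fall-back policy are illustrative assumptions based on the commit message, not code from this change.

// Sketch only: a hypothetical helper showing how a server might gate
// KV cache quantization on flash attention, per the commit message.
// OLLAMA_FLASH_ATTENTION and OLLAMA_KV_CACHE_TYPE are documented Ollama
// settings; the fall-back behavior here is an assumption.
package main

import (
	"fmt"
	"os"
)

func kvCacheType() string {
	flashAttention := os.Getenv("OLLAMA_FLASH_ATTENTION") == "1"
	cacheType := os.Getenv("OLLAMA_KV_CACHE_TYPE") // e.g. "f16", "q8_0", "q4_0"
	if cacheType == "" {
		cacheType = "f16"
	}
	if !flashAttention && cacheType != "f16" {
		// Quantized KV cache requires flash attention; fall back to f16.
		return "f16"
	}
	return cacheType
}

func main() {
	fmt.Println("effective KV cache type:", kvCacheType())
}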


@@ -215,7 +215,7 @@ func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
 		return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string {
 			return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
 		})
-	case DTypeF16:
+	case DTypeF16, DTypeQ80, DTypeQ40:
 		f32 := ctx.Empty(DTypeF32, t.Shape()...)
 		f32 = t.Copy(ctx, f32)
 		return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string {
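In the hunk above, the new quantized types reuse the existing DTypeF16 path: quantized tensors cannot be formatted element by element, so Dump copies them into a scratch F32 tensor (Copy converts between dtypes, i.e. dequantizes) and reuses the float32 printer. A minimal sketch of that pattern, assuming the Context/Tensor interfaces visible in the diff; the Floats() accessor is an assumption:

// Sketch, not code from this commit: dequantize any tensor to float32
// for inspection by copying it into a freshly allocated F32 tensor.
// Floats() is assumed to return the tensor's data as []float32.
func toFloat32(ctx Context, t Tensor) []float32 {
	if t.DType() == DTypeF32 {
		return t.Floats()
	}
	f32 := ctx.Empty(DTypeF32, t.Shape()...) // scratch tensor, same shape
	f32 = t.Copy(ctx, f32)                   // Copy converts/dequantizes into f32
	return f32.Floats()
}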
@@ -283,5 +283,7 @@ const (
 	DTypeOther DType = iota
 	DTypeF32
 	DTypeF16
+	DTypeQ80
+	DTypeQ40
 	DTypeI32
 )
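Downstream, something has to select between these constants when the server requests a quantized cache. A hypothetical mapping, where the function name is assumed for illustration (the strings follow the llama.cpp cache-type naming):

// Hypothetical helper, not part of this diff: map a requested KV cache
// type string to the corresponding DType, defaulting to F16.
func kvCacheDType(cacheType string) DType {
	switch cacheType {
	case "q8_0":
		return DTypeQ80
	case "q4_0":
		return DTypeQ40
	default:
		return DTypeF16 // unquantized default
	}
}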