ml: Add support for quantized KV cache

As with the llama engine, quantizing the KV cache requires
flash attention to be enabled through the Ollama server.
Author:    Jesse Gross
Date:      2025-02-21 20:54:14 -08:00
Committer: Jesse Gross
Parent:    f52b2615ef
Commit:    4100ed7bdd

3 changed files with 13 additions and 3 deletions
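As a usage note (not part of this commit): on the server side, a quantized KV cache is controlled through environment variables. The sketch below assumes the documented OLLAMA_FLASH_ATTENTION and OLLAMA_KV_CACHE_TYPE settings; the helper name and the fall-back policy are illustrative assumptions based on the commit message, not code from this change.

// Sketch only: a hypothetical helper showing how a server might gate
// KV cache quantization on flash attention, per the commit message.
// OLLAMA_FLASH_ATTENTION and OLLAMA_KV_CACHE_TYPE are documented Ollama
// settings; the fall-back behavior here is an assumption.
package main

import (
	"fmt"
	"os"
)

func kvCacheType() string {
	flashAttention := os.Getenv("OLLAMA_FLASH_ATTENTION") == "1"
	cacheType := os.Getenv("OLLAMA_KV_CACHE_TYPE") // e.g. "f16", "q8_0", "q4_0"
	if cacheType == "" {
		cacheType = "f16"
	}
	if !flashAttention && cacheType != "f16" {
		// Quantized KV cache requires flash attention; fall back to f16.
		return "f16"
	}
	return cacheType
}

func main() {
	fmt.Println("effective KV cache type:", kvCacheType())
}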


@@ -215,7 +215,7 @@ func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
 		return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string {
 			return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
 		})
-	case DTypeF16:
+	case DTypeF16, DTypeQ80, DTypeQ40:
 		f32 := ctx.Empty(DTypeF32, t.Shape()...)
 		f32 = t.Copy(ctx, f32)
 		return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string {
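In the hunk above, the new quantized types reuse the existing DTypeF16 path: quantized tensors cannot be formatted element by element, so Dump copies them into a scratch F32 tensor (Copy converts between dtypes, i.e. dequantizes) and reuses the float32 printer. A minimal sketch of that pattern, assuming the Context/Tensor interfaces visible in the diff; the Floats() accessor is an assumption:

// Sketch, not code from this commit: dequantize any tensor to float32
// for inspection by copying it into a freshly allocated F32 tensor.
// Floats() is assumed to return the tensor's data as []float32.
func toFloat32(ctx Context, t Tensor) []float32 {
	if t.DType() == DTypeF32 {
		return t.Floats()
	}
	f32 := ctx.Empty(DTypeF32, t.Shape()...) // scratch tensor, same shape
	f32 = t.Copy(ctx, f32)                   // Copy converts/dequantizes into f32
	return f32.Floats()
}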
@@ -283,5 +283,7 @@ const (
 	DTypeOther DType = iota
 	DTypeF32
 	DTypeF16
+	DTypeQ80
+	DTypeQ40
 	DTypeI32
 )
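Downstream, something has to select between these constants when the server requests a quantized cache. A hypothetical mapping, where the function name is assumed for illustration (the strings follow the llama.cpp cache-type naming):

// Hypothetical helper, not part of this diff: map a requested KV cache
// type string to the corresponding DType, defaulting to F16.
func kvCacheDType(cacheType string) DType {
	switch cacheType {
	case "q8_0":
		return DTypeQ80
	case "q4_0":
		return DTypeQ40
	default:
		return DTypeF16 // unquantized default
	}
}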