ml: Add support for quantized KV cache
As with the llama engine, quantizing the KV cache requires flash attention to be enabled on the Ollama server.
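
For context (not part of this diff), a typical way to enable both settings is through the server's environment. This assumes the upstream Ollama variables OLLAMA_FLASH_ATTENTION and OLLAMA_KV_CACHE_TYPE behave the same way in this fork:

    OLLAMA_FLASH_ATTENTION=1 OLLAMA_KV_CACHE_TYPE=q8_0 ollama serve

With flash attention off, the server is expected to fall back to an f16 cache rather than honor the quantized type.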
@@ -215,7 +215,7 @@ func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
 		return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string {
 			return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
 		})
-	case DTypeF16:
+	case DTypeF16, DTypeQ80, DTypeQ40:
 		f32 := ctx.Empty(DTypeF32, t.Shape()...)
 		f32 = t.Copy(ctx, f32)
 		return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string {
@@ -283,5 +283,7 @@ const (
 	DTypeOther DType = iota
 	DTypeF32
 	DTypeF16
+	DTypeQ80
+	DTypeQ40
 	DTypeI32
)
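
In the Dump change above, Q8_0 and Q4_0 tensors are first copied into a fresh F32 tensor because their elements cannot be read back directly as floats. To make the new DTypeQ80 constant concrete, below is a self-contained sketch of the GGML-style q8_0 block format it refers to: 32 signed 8-bit quants sharing one per-block scale (stored as float16 on disk; widened to float32 here for brevity). This is an illustrative reimplementation, not code from this repository.

package main

import (
	"fmt"
	"math"
)

// q8_0 stores values in blocks of 32: one scale d plus 32 signed
// 8-bit quants q, reconstructing each element as x ~= d * q.
const q80BlockSize = 32

type q80Block struct {
	scale  float32            // per-block scale d (fp16 in the real format)
	quants [q80BlockSize]int8 // quantized values q
}

// quantizeQ80 maps 32 floats into a block by scaling the largest
// magnitude onto the int8 range [-127, 127].
func quantizeQ80(x [q80BlockSize]float32) q80Block {
	var amax float32
	for _, v := range x {
		if a := float32(math.Abs(float64(v))); a > amax {
			amax = a
		}
	}
	var b q80Block
	b.scale = amax / 127
	if b.scale == 0 {
		return b // all-zero block
	}
	for i, v := range x {
		b.quants[i] = int8(math.Round(float64(v / b.scale)))
	}
	return b
}

// dequantize recovers approximate float32 values from the block.
func (b q80Block) dequantize() [q80BlockSize]float32 {
	var out [q80BlockSize]float32
	for i, q := range b.quants {
		out[i] = b.scale * float32(q)
	}
	return out
}

func main() {
	var x [q80BlockSize]float32
	for i := range x {
		x[i] = float32(i-16) / 4 // sample values in [-4, 3.75]
	}
	b := quantizeQ80(x)
	d := b.dequantize()
	fmt.Printf("scale=%.4f first=%v\n", b.scale, d[:4])
}

q8_0 roughly halves KV cache memory relative to f16 and is usually close to lossless; q4_0 (DTypeQ40) follows the same block idea with 4-bit quants, cutting memory roughly in half again at some accuracy cost.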