ml: Add support for quantized KV cache

As with the llama engine, quantizing the KV cache requires
flash attention to be enabled on the Ollama server.
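As an illustration only (not part of this commit), here is a sketch of how a server-side helper might turn a requested cache type into one of the new ml.DType values, falling back to f16 when flash attention is off. The package name, function name, signature, and the "q8_0"/"q4_0" strings are assumptions for this sketch, not the commit's actual code; only the ml.DType constants come from the diff below.

// Hypothetical helper: choose the KV cache dtype from a user-facing cache
// type string. A quantized cache needs flash attention, so fall back to
// f16 when it is disabled.
package server

import "github.com/ollama/ollama/ml"

func kvCacheDType(cacheType string, flashAttention bool) ml.DType {
	if !flashAttention {
		return ml.DTypeF16
	}
	switch cacheType {
	case "q8_0":
		return ml.DTypeQ80
	case "q4_0":
		return ml.DTypeQ40
	default:
		return ml.DTypeF16
	}
}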
Author:    Jesse Gross
Date:      2025-02-21 20:54:14 -08:00
Committer: Jesse Gross
Parent:    f52b2615ef
Commit:    4100ed7bdd
3 changed files with 13 additions and 3 deletions


@@ -535,6 +535,10 @@ func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 		cdtype = C.GGML_TYPE_F32
 	case ml.DTypeF16:
 		cdtype = C.GGML_TYPE_F16
+	case ml.DTypeQ80:
+		cdtype = C.GGML_TYPE_Q8_0
+	case ml.DTypeQ40:
+		cdtype = C.GGML_TYPE_Q4_0
 	case ml.DTypeI32:
 		cdtype = C.GGML_TYPE_I32
 	default:
@@ -680,6 +684,10 @@ func (t *Tensor) DType() ml.DType {
 		return ml.DTypeF32
 	case C.GGML_TYPE_F16:
 		return ml.DTypeF16
+	case C.GGML_TYPE_Q8_0:
+		return ml.DTypeQ80
+	case C.GGML_TYPE_Q4_0:
+		return ml.DTypeQ40
 	case C.GGML_TYPE_I32:
 		return ml.DTypeI32
 	default:
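The two switches above are inverses: a tensor allocated with one of the new quantized dtypes should report the same dtype back through DType(). A rough usage sketch follows, assuming the ml.Context interface exposes an Empty(dtype, shape...) allocator that routes through newTensor (the allocator name and the idea of a dedicated cache constructor are assumptions, and the imports match the sketch above).

// Sketch: allocate a q8_0 KV cache tensor and check that the dtype
// round-trips (DTypeQ80 -> GGML_TYPE_Q8_0 in newTensor, and back in
// Tensor.DType).
func newQuantizedCache(ctx ml.Context, headDim, numSlots int) ml.Tensor {
	t := ctx.Empty(ml.DTypeQ80, headDim, numSlots)
	if t.DType() != ml.DTypeQ80 {
		panic("quantized KV cache dtype did not round-trip")
	}
	return t
}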