llm: introduce k/v context quantization (vRAM improvements) (#6279)

Author: Sam
Date: 2024-12-04 10:57:19 +11:00
Committed by: GitHub
Parent: 2b82c5a8a1
Commit: 1bdab9fdb1

10 changed files with 147 additions and 21 deletions


@@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
     }, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) {
+func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
     embedding := llm.KV().EmbeddingLength()
     heads := llm.KV().HeadCount()
     headsKV := llm.KV().HeadCountKV()
@@ -372,7 +372,8 @@ func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffloa
     layers := llm.Tensors().Layers()
 
-    kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV
+    bytesPerElement := kvCacheBytesPerElement(kvCacheType)
+    kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
 
     switch llm.KV().Architecture() {
     case "llama":
@@ -527,3 +528,34 @@ func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffloa
     return
 }
+
+// SupportsKVCacheType checks if the requested cache type is supported
+func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
+    validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
+    return slices.Contains(validKVCacheTypes, cacheType)
+}
+
+// SupportsFlashAttention checks if the model supports flash attention
+func (ggml GGML) SupportsFlashAttention() bool {
+    _, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
+    if isEmbedding {
+        return false
+    }
+
+    // Check head counts match and are non-zero
+    headCountK := ggml.KV().EmbeddingHeadCountK()
+    headCountV := ggml.KV().EmbeddingHeadCountV()
+    return headCountK != 0 && headCountV != 0 && headCountK == headCountV
+}
+
+// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
+func kvCacheBytesPerElement(cacheType string) float64 {
+    switch cacheType {
+    case "q8_0":
+        return 1 // 1/2 of fp16
+    case "q4_0":
+        return 0.5 // 1/4 of fp16
+    default:
+        return 2 // f16 (default)
+    }
+}
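As a rough, worked illustration of the new sizing formula, here is a self-contained Go sketch. The model dimensions below are hypothetical (picked to resemble an 8B-class llama model with grouped-query attention) and are not taken from this commit; only the formula and the bytes-per-element values mirror GraphSize and kvCacheBytesPerElement above.

package main

import "fmt"

func main() {
    // Hypothetical model dimensions; only the formula and the per-type
    // byte sizes follow the code in this commit.
    const (
        contextLen      = 8192 // opts.NumCtx
        blockCount      = 32   // llm.KV().BlockCount()
        headsKV         = 8    // llm.KV().HeadCountKV()
        embeddingHeadsK = 128  // llm.KV().EmbeddingHeadCountK()
        embeddingHeadsV = 128  // llm.KV().EmbeddingHeadCountV()
    )

    bytesPerElement := map[string]float64{"f16": 2, "q8_0": 1, "q4_0": 0.5}
    for _, cacheType := range []string{"f16", "q8_0", "q4_0"} {
        kv := uint64(float64(contextLen*blockCount*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement[cacheType])
        fmt.Printf("%-4s KV cache: %d MiB\n", cacheType, kv/(1<<20))
    }
}

With these assumed dimensions the estimate works out to roughly 1024 MiB at f16, 512 MiB at q8_0, and 256 MiB at q4_0 for an 8192-token context, which is where the vRAM saving in the commit title comes from.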


@@ -123,7 +123,23 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
         slog.Warn("model missing blk.0 layer size")
     }
 
-    kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+    fa := envconfig.FlashAttention() &&
+        discover.GetGPUInfo().FlashAttentionSupported() &&
+        ggml.SupportsFlashAttention()
+
+    var kvct string
+    if fa {
+        requested := envconfig.KvCacheType()
+        if requested != "" && ggml.SupportsKVCacheType(requested) {
+            kvct = requested
+        }
+    }
+
+    kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
+
+    // KV is proportional to the number of layers
+    layerSize += kv / ggml.KV().BlockCount()
+
     if graphPartialOffload == 0 {
         graphPartialOffload = ggml.KV().GQA() * kv / 6
     }
@@ -131,9 +147,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
         graphFullOffload = graphPartialOffload
     }
 
-    // KV is proportional to the number of layers
-    layerSize += kv / ggml.KV().BlockCount()
-
     // on metal there's no partial offload overhead
     if gpus[0].Library == "metal" {
         graphPartialOffload = graphFullOffload
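Taken together, the estimator only passes a quantized cache type to GraphSize when flash attention is actually usable (environment flag, GPU support, and model support all true) and the requested type passes SupportsKVCacheType; anything else falls through as an empty string, which kvCacheBytesPerElement sizes at the 2-bytes-per-element f16 default. A compilable restatement of that selection follows; the package and function name are illustrative, not from the commit.

package llmsketch

// effectiveKVCacheType restates the selection performed in EstimateGPULayers
// above; it is a hypothetical helper, not code from the commit.
func effectiveKVCacheType(flashAttentionUsable bool, requested string, supportsType func(string) bool) string {
    if flashAttentionUsable && requested != "" && supportsType(requested) {
        return requested // e.g. "q8_0" or "q4_0"
    }
    return "" // sized as f16 (2 bytes per element) by kvCacheBytesPerElement
}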


@@ -15,6 +15,7 @@ import (
 
 func TestEstimateGPULayers(t *testing.T) {
     t.Setenv("OLLAMA_DEBUG", "1")
+    t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16
 
     modelName := "dummy"
     f, err := os.CreateTemp(t.TempDir(), modelName)


@@ -214,15 +214,36 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
         params = append(params, "--threads", strconv.Itoa(defaultThreads))
     }
 
-    flashAttnEnabled := envconfig.FlashAttention()
+    fa := envconfig.FlashAttention()
+    if fa && !gpus.FlashAttentionSupported() {
+        slog.Warn("flash attention enabled but not supported by gpu")
+        fa = false
+    }
 
-    for _, g := range gpus {
-        // only cuda (compute capability 7+) and metal support flash attention
-        if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
-            flashAttnEnabled = false
+    if fa && !ggml.SupportsFlashAttention() {
+        slog.Warn("flash attention enabled but not supported by model")
+        fa = false
+    }
+
+    kvct := envconfig.KvCacheType()
+
+    if fa {
+        slog.Info("enabling flash attention")
+        params = append(params, "--flash-attn")
+
+        // Flash Attention also supports kv cache quantization
+        // Enable if the requested kv cache type is supported by the model
if kvct != "" && ggml.SupportsKVCacheType(kvct) {
params = append(params, "--kv-cache-type", kvct)
} else {
slog.Warn("kv cache type not supported by model", "type", kvct)
}
} else if kvct != "" && kvct != "f16" {
slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
}
// mmap has issues with partial offloading on metal
// mmap has issues with partial offloading on metal
for _, g := range gpus {
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
@@ -231,10 +252,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
}
}
if flashAttnEnabled {
params = append(params, "--flash-attn")
}
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
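For reference, the net effect of the server-side branch above on the runner arguments can be summarized with a small hypothetical helper (not code from the commit): --flash-attn is added whenever flash attention survives the GPU and model checks, and --kv-cache-type is added on top of that only when the requested type is supported; if flash attention is unavailable, a quantized request is dropped with a warning and the runner keeps its f16 cache.

package llmsketch

// quantizationParams is an illustrative summary of the flags appended by the
// block added to NewLlamaServer above; it is not part of the commit.
func quantizationParams(fa bool, kvct string, supportsType func(string) bool) []string {
    var params []string
    if fa {
        params = append(params, "--flash-attn")
        if kvct != "" && supportsType(kvct) {
            params = append(params, "--kv-cache-type", kvct)
        }
        // An unsupported or empty request is only logged; no flag is added.
    }
    // With fa == false, a quantized request is logged and ignored.
    return params
}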