llm: introduce k/v context quantization (vRAM improvements) (#6279)

Author: Sam
Date: 2024-12-04 10:57:19 +11:00
Committed by: GitHub
Parent: 2b82c5a8a1
Commit: 1bdab9fdb1
10 changed files with 147 additions and 21 deletions
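
For a rough sense of the vRAM improvement the title refers to, here is a back-of-the-envelope sketch (not part of this commit; model dimensions are hypothetical). GGML's q8_0 packs 32 elements into 34 bytes (32 one-byte quants plus an fp16 scale) and q4_0 packs 32 elements into 18 bytes, versus 2 bytes per f16 element, so a q8_0 K/V cache needs roughly half the memory of the f16 default:

package main

import "fmt"

func main() {
	// Hypothetical llama-style dimensions, for illustration only.
	const (
		layers  = 32   // transformer blocks
		kvHeads = 8    // grouped-query KV heads
		headDim = 128  // dimension per head
		ctx     = 8192 // context length
	)
	// Bytes per element for common GGML cache types.
	types := []struct {
		name string
		bpe  float64
	}{
		{"f16", 2.0},
		{"q8_0", 34.0 / 32},
		{"q4_0", 18.0 / 32},
	}
	// K and V each store layers*ctx*kvHeads*headDim elements.
	elems := float64(2 * layers * ctx * kvHeads * headDim)
	for _, t := range types {
		fmt.Printf("%-5s KV cache ≈ %.2f GiB\n", t.name, elems*t.bpe/(1<<30))
	}
}

With these numbers the f16 cache is about 1.0 GiB, q8_0 about 0.53 GiB, and q4_0 about 0.28 GiB.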


@@ -123,7 +123,23 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		slog.Warn("model missing blk.0 layer size")
 	}
 
-	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	fa := envconfig.FlashAttention() &&
+		discover.GetGPUInfo().FlashAttentionSupported() &&
+		ggml.SupportsFlashAttention()
+
+	var kvct string
+	if fa {
+		requested := envconfig.KvCacheType()
+		if requested != "" && ggml.SupportsKVCacheType(requested) {
+			kvct = requested
+		}
+	}
+
+	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
+
+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
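
The gating added above is the heart of the change: a requested cache type only takes effect when flash attention is usable end to end (enabled via envconfig, supported by the GPU, and supported by the model). A minimal standalone sketch of that decision; function and parameter names here are illustrative stand-ins, not the commit's API:

package main

import "fmt"

// kvCacheType mirrors the gating in the diff: honor the requested K/V
// cache type only when flash attention is usable and the model supports
// that type; otherwise return "" so the runner keeps its default (f16).
func kvCacheType(flashAttention bool, requested string, supportsType func(string) bool) string {
	if flashAttention && requested != "" && supportsType(requested) {
		return requested
	}
	return ""
}

func main() {
	supports := func(t string) bool { return t == "f16" || t == "q8_0" || t == "q4_0" }
	fmt.Println(kvCacheType(true, "q8_0", supports))  // "q8_0": quantized cache in effect
	fmt.Println(kvCacheType(false, "q8_0", supports)) // "": no flash attention, keep default
}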
@@ -131,9 +147,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		graphFullOffload = graphPartialOffload
 	}
 
-	// KV is proportional to the number of layers
-	layerSize += kv / ggml.KV().BlockCount()
-
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
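
This second hunk is the other half of a pure move: the per-layer KV accounting deleted here now runs immediately after GraphSize in the first hunk, before the graph-overhead adjustments, with the arithmetic unchanged. A hedged illustration of what that line computes, with hypothetical names and numbers:

package main

import "fmt"

// layerBudget mirrors the moved accounting: every block carries an equal
// slice of the K/V cache, so a layer's offload cost is its weight bytes
// plus kvTotal/blockCount. All values are in bytes.
func layerBudget(weightBytes, kvTotalBytes, blockCount uint64) uint64 {
	return weightBytes + kvTotalBytes/blockCount
}

func main() {
	// e.g. 450 MiB of weights per layer, a 1 GiB cache, 32 blocks:
	fmt.Println(layerBudget(450<<20, 1<<30, 32)>>20, "MiB per layer") // 482 MiB
}

Folding the cache share into layerSize this early means every later decision about how many layers fit on a GPU already accounts for the (possibly quantized) K/V cache.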