Mirror of https://github.com/dogkeeper886/ollama37.git
update memory calculations
count each layer independently when deciding gpu offloading
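In rough terms, the change accounts for GPU memory layer by layer: starting from the fixed overhead (minimum GPU memory, projectors, graph estimate), it walks the repeating layers and offloads the next layer only while that layer's weights plus its share of the KV cache still fit in the remaining VRAM; the output layer is offloaded only after every repeating layer fits, and the whole estimate falls back to CPU when nothing fits. The sketch below illustrates that per-layer accounting with hypothetical numbers; it is not code from the commit, and the names countOffloadLayers, layerSizes, kvPerLayer, and numGPU are illustrative stand-ins.

// Sketch only: simplified per-layer offload decision with made-up inputs.
package main

import "fmt"

// countOffloadLayers offloads repeating layers one at a time while each
// layer's weights plus its share of the KV cache still fit in the remaining
// GPU memory, honoring an explicit layer cap when numGPU >= 0.
func countOffloadLayers(availableMemory, usedMemory, kvPerLayer int64, layerSizes []int64, numGPU int) int {
    layers := 0
    for _, layerSize := range layerSizes {
        layerMemory := layerSize + kvPerLayer
        if availableMemory > usedMemory+layerMemory && (numGPU < 0 || layers < numGPU) {
            usedMemory += layerMemory
            layers++
        }
    }
    return layers
}

func main() {
    // Hypothetical model: 32 repeating layers of 200 MiB each plus 16 MiB of
    // KV cache per layer, 4 GiB of VRAM with 1 GiB already accounted for,
    // and no user-specified layer limit (numGPU = -1).
    layerSizes := make([]int64, 32)
    for i := range layerSizes {
        layerSizes[i] = 200 << 20
    }
    fmt.Println(countOffloadLayers(4<<30, 1<<30, 16<<20, layerSizes, -1)) // 14 of 32 layers fit
}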
@@ -39,7 +39,7 @@ import (
 
 type dynExtServer struct {
     s C.struct_dynamic_llama_server
-    options api.Options
+    options *api.Options
 }
 
 // Note: current implementation does not support concurrent instantiations
@@ -64,7 +64,7 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
     return fmt.Errorf(C.GoString(resp.msg))
 }
 
-func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
     if !mutex.TryLock() {
         slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
         mutex.Lock()
llm/ggml.go
@@ -5,6 +5,7 @@ import (
     "errors"
     "fmt"
     "io"
+    "strings"
 )
 
 type GGML struct {
@@ -12,6 +13,16 @@ type GGML struct {
     model
 }
 
+func (ggml *GGML) LayerSize(prefix string) (n int64) {
+    for _, t := range ggml.Tensors() {
+        if strings.HasPrefix(t.Name, prefix) {
+            n += int64(t.size())
+        }
+    }
+
+    return
+}
+
 const (
     fileTypeF32 uint32 = iota
     fileTypeF16
llm/llm.go
@@ -5,10 +5,11 @@ import (
     "fmt"
     "log/slog"
     "os"
     "runtime"
     "slices"
+    "strings"
 
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/format"
     "github.com/ollama/ollama/gpu"
 )
@@ -24,7 +25,7 @@ var cpuOnlyFamilies = []string{
     "mamba",
 }
 
-func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
     if _, err := os.Stat(model); err != nil {
         return nil, err
     }
@@ -35,7 +36,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
     }
     defer f.Close()
 
-    ggml, size, err := DecodeGGML(f)
+    ggml, _, err := DecodeGGML(f)
     if err != nil {
         return nil, err
     }
@@ -49,92 +50,101 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
         opts.NumCtx = 4
     }
 
-    vram, _ := gpu.CheckVRAM()
+    availableMemory, _ := gpu.CheckVRAM()
+    info := gpu.GetGPUInfo()
 
-    // fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-    kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
+    usedMemory := info.MinimumMemory
+    for _, projector := range projectors {
+        usedMemory += projectorMemoryRequirements(projector)
+
+        // multimodal models require at least 2048 context
+        opts.NumCtx = max(opts.NumCtx, 2048)
+    }
+
+    // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
+    kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
 
     // this amount is the overhead + tensors in memory
     // TODO: get this from the llama.cpp's graph calculations instead of
     // estimating it's 1/6 * kv_cache_size * num_gqa
     graph := int64(ggml.KV().GQA()) * kv / 6
+    usedMemory += graph
 
-    if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
-        opts.NumGPU = 0
+    if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+        info.Library = "cpu"
     }
 
-    info := gpu.GetGPUInfo()
-    switch runtime.GOOS {
-    case "darwin":
-        if opts.NumGPU == 0 {
-            break
-        }
+    requiredMemory := usedMemory
 
-        if size+kv+graph > vram {
-            slog.Info("not enough vram available, setting num_gpu=0")
-            opts.NumGPU = 0
-            break
-        }
+    var layers int
+    for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+        layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+        requiredMemory += layerMemory
 
-        // TODO: implement layer splitting on macOS
-        opts.NumGPU = 999
-    default:
-        if info.Library == "cpu" {
-            slog.Info("GPU not available, falling back to CPU")
-            opts.NumGPU = 0
-            break
-        }
+        if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
+            usedMemory += layerMemory
+            layers++
+        }
+    }
 
-        // don't use GPU at all if no layers are loaded
-        if opts.NumGPU == 0 {
-            info.Library = "cpu"
-            info.Variant = gpu.GetCPUVariant()
-            break
-        }
-
-        // user-defined GPU count
-        if opts.NumGPU != -1 {
-            break
-        }
-
-        // the "main" GPU needs the most memory and determines the limit
-        // of how many layers can be loaded. It needs to fit:
-        // 1. the full compute graph allocation for all devices (graph)
-        // 2. the proportional kv cache for all devices (kv * % layers)
-        // 3. the proportional model (size * % layers / # devices)
-        // This estimates the number of layers
-        maxlayers := int64(ggml.KV().BlockCount()) + 1
-        devices := int64(info.DeviceCount)
-        avg := vram / devices
-        layers := maxlayers * (avg - graph) / (kv + size/devices)
-        if layers > maxlayers {
-            layers = maxlayers
-        }
-
-        // 1 + 2 must fit on the main gpu
-        min := graph + kv*layers/maxlayers
-        if layers <= 0 || min > avg {
-            slog.Info("not enough vram available, falling back to CPU only")
-            info.Library = "cpu"
-            info.Variant = gpu.GetCPUVariant()
-            opts.NumGPU = 0
-            break
-        }
-
-        opts.NumGPU = int(layers)
-    }
-
-    opts.RopeFrequencyBase = 0.0
-    opts.RopeFrequencyScale = 0.0
+    memOutputLayer := ggml.LayerSize("output.")
+    requiredMemory += memOutputLayer
+
+    // only offload output layer if all repeating layers are offloaded
+    if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
+        usedMemory += memOutputLayer
+        layers++
+    }
+
+    slog.Info(
+        "offload to gpu",
+        "layers", layers,
+        "required", format.HumanBytes2(requiredMemory),
+        "used", format.HumanBytes2(usedMemory),
+        "available", format.HumanBytes2(availableMemory),
+        "kv", format.HumanBytes2(kv),
+        "graph", format.HumanBytes2(graph),
+    )
+
+    if opts.NumGPU < 0 && info.Library != "cpu" {
+        opts.NumGPU = layers
+    }
 
     return newLlmServer(info, model, adapters, projectors, opts)
 }
 
+func projectorMemoryRequirements(filename string) int64 {
+    file, err := os.Open(filename)
+    if err != nil {
+        return 0
+    }
+    defer file.Close()
+
+    ggml, _, err := DecodeGGML(file)
+    if err != nil {
+        return 0
+    }
+
+    prefixes := make(map[string]struct{})
+    for _, layer := range ggml.Tensors() {
+        parts := strings.Split(layer.Name, ".")
+        prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+    }
+
+    var ask int64
+    for prefix := range prefixes {
+        ask += ggml.LayerSize(prefix)
+    }
+
+    return ask
+}
+
 // Give any native cgo implementations an opportunity to initialize
 func Init() error {
     return nativeInit()
 }
 
-func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
     dynLibs := getDynLibs(gpuInfo)
 
     // Check to see if the user has requested a specific library instead of auto-detecting