review comments and coverage
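Summary: this commit addresses code-review feedback on the multi-GPU layer distribution change and adds test coverage. In the memory estimate it collapses the per-layer bookkeeping (layerBuffer plus a layerSizes slice) into a single uniform layerSize, folds the KV-cache share into that size, and warns when a model has no blk.0 tensor. It also replaces the string CPU variant with a typed gpu.CPUCapability, fixes the EstimagedVRAMByGPU -> EstimatedVRAMByGPU typo, and reworks the estimate test around t.Run subtests with a named scenario struct.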
@@ -1,7 +1,6 @@
 package llm
 
 import (
-	"fmt"
 	"log/slog"
 	"strconv"
 	"strings"
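Note: the single deletion in this hunk appears to be the fmt import; its only use inside EstimateGPULayers, the fmt.Sprintf("blk.%d", i) lookup, goes away in a later hunk.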
@@ -69,13 +68,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
-	var includeOutput bool
-
-	// One extra layer as a pad for each GPU
-	var layerBuffer uint64
 
-	// The sizes of the main layers
-	var layerSizes []uint64
+	// The sizes of a layer
+	var layerSize uint64
 
 	// The sum of all the layer sizes (just for logging)
 	var memoryWeights uint64
@@ -102,12 +97,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		layerBuffer = blk0.size()
+		layerSize = blk0.size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
 	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
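The expression relies on Go's left-to-right evaluation of * and /: n_embd / n_head gives the per-head dimension before multiplying by n_head_kv. To get a feel for the magnitudes, a self-contained sketch of the same fp16 KV arithmetic, using hypothetical 8B-class hyperparameters (illustrative numbers, not taken from this diff):

package main

import "fmt"

func main() {
	// Hypothetical hyperparameters, for illustration only:
	// 8192-token context, 32 layers, 4096-dim embeddings,
	// 32 attention heads, 8 KV heads (grouped-query attention).
	var (
		numCtx     uint64 = 8192
		blockCount uint64 = 32
		nEmbd      uint64 = 4096
		nHead      uint64 = 32
		nHeadKV    uint64 = 8
	)
	// Same arithmetic as the diff: (1 k + 1 v) * sizeof(float16)
	// * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * numCtx * blockCount * nEmbd / nHead * nHeadKV
	fmt.Printf("total KV cache: %d bytes (%.1f GiB)\n", kv, float64(kv)/(1<<30))
	fmt.Printf("per layer:      %d bytes (%.1f MiB)\n", kv/blockCount, float64(kv/blockCount)/(1<<20))
}

With these numbers the cache comes to 1.0 GiB in total, or 32 MiB per layer; the diff spreads that cost evenly by adding kv / BlockCount to the uniform layerSize instead of tracking it per layer.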
@@ -119,6 +119,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
+	} else if len(gpus) > 1 {
+		// multigpu should always use the partial graph size
+		graphFullOffload = graphPartialOffload
 	}
 
 	if layer, ok := layers["output_norm"]; ok {
@@ -130,16 +133,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryLayerOutput += layer.size()
 	}
 
-	if gpus[0].Library == "metal" && opts.UseMMap {
-		includeOutput = true
-	} else if gpus[0].Library != "metal" || !opts.UseMMap {
-		includeOutput = true
-	}
-
+	// Output layer handled at the end if we have space
 	gpuZeroOverhead := projectorSize
-	if includeOutput {
-		gpuZeroOverhead += memoryLayerOutput
-	}
 
 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
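Worth noting: the deleted branches assigned includeOutput = true in both arms, and (metal && mmap) together with (!metal || !mmap) covers every combination, so the flag was unconditionally true. A later hunk inlines that same always-true condition at the output-placement site rather than simplifying it to a bare memoryLayerOutput check.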
@@ -156,12 +151,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			continue
 		}
 		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
-		gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full
+		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
 	}
 
 	var gpuZeroID int
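The admission test charges each GPU for gzo + max(graph sizes) + MinimumMemory + 2*layerSize: one layerSize is the pad reserved on the allocation line that follows, and the second keeps room for at least one real model layer, which is what the comment promises.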
@@ -170,23 +165,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		gpuAllocations[gpuZeroID] += gpuZeroOverhead
 	}
 
-	layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
-	for i := range int(ggml.KV().BlockCount()) {
-		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
-			memoryLayer := blk.size()
-
-			// KV is proportional to the number of layers
-			memoryLayer += kv / ggml.KV().BlockCount()
-			layerSizes[i] = memoryLayer
-			memoryWeights += memoryLayer
-		}
-	}
-
 	// For all the layers, find where they can fit on the GPU(s)
-	for i := range layerSizes {
-		if layerSizes[i] == 0 {
-			continue
-		}
+	for i := range int(ggml.KV().BlockCount()) {
+		memoryWeights += layerSize
+
 		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
 			// Stop allocating on GPU(s) once we hit the user's target NumGPU
 			continue
@@ -196,8 +178,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSizes[i] {
-				gpuAllocations[g.i] += layerSizes[i]
+			if g.g.FreeMemory > used+layerSize {
+				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
 				break
@@ -205,17 +187,18 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
-
 	}
 	if layerCount >= int(ggml.KV().BlockCount()) {
 		fullyLoaded = true
 	} else {
 		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
-			overflow += layerSizes[i]
+			overflow += layerSize
 		}
 	}
-	// Find where the output fits
-	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+
+	// Determine if we need to consider output then find where it fits
+	if ((gpus[0].Library == "metal" && opts.UseMMap) || (gpus[0].Library != "metal" || !opts.UseMMap)) &&
+		memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
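The placement loop in the last two hunks is a greedy round-robin: layer i is offered to candidate i%j among the j GPUs that still have space, and a candidate that cannot fit the layer is sliced out of gpusWithSpace. A minimal standalone sketch of that policy, with simplified types and the graph overhead omitted (an illustration under those assumptions, not the diff's actual code):

package main

import "fmt"

// placeLayers mirrors the policy above: layer i is offered to
// candidate i%j; candidates that cannot fit the layer are removed
// until one accepts or none remain.
func placeLayers(free []uint64, layerSize uint64, nLayers int) []int {
	counts := make([]int, len(free))
	cands := make([]int, len(free)) // indices of GPUs that still have space
	for i := range cands {
		cands[i] = i
	}
	for i := 0; i < nLayers; i++ {
		for j := len(cands); j > 0; j-- {
			g := cands[i%j]
			if free[g] >= layerSize {
				free[g] -= layerSize
				counts[g]++
				break
			}
			// Drop the candidate that couldn't fit this layer.
			cands = append(cands[:i%j], cands[i%j+1:]...)
		}
	}
	return counts
}

func main() {
	// Two hypothetical GPUs with room for four and two layers.
	fmt.Println(placeLayers([]uint64{400, 200}, 100, 8)) // prints [4 2]
}

Running it with two hypothetical GPUs holding room for four and two layers yields [4 2], mirroring the per-GPU counts that estimate.TensorSplit is asserted against in the test below.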
@@ -226,6 +209,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				break
 			}
 		}
+
 		if layerCount < int(ggml.KV().BlockCount())+1 {
 			fullyLoaded = false
 			overflow += memoryLayerOutput
@@ -253,7 +237,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	var memoryRequiredPartial, memoryRequiredTotal uint64
 	for i := range gpuAllocations {
 		memoryRequiredPartial += gpuAllocations[i]
-
 	}
 	memoryRequiredTotal = memoryRequiredPartial + overflow
 
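The hunks that follow move to the estimate's unit test (TestEstimateGPULayers) and to the runner-selection code and LlamaServer interface that consume the estimate.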
@@ -18,7 +18,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	envconfig.Debug = true
 	modelName := "dummy"
 	f, err := os.CreateTemp(t.TempDir(), modelName)
-	assert.Nil(t, err)
+	require.NoError(t, err)
 	defer f.Close()
 	gguf := NewGGUFV3(binary.LittleEndian)
 	inputLayerCount := 5
@@ -30,7 +30,7 @@ func TestEstimateGPULayers(t *testing.T) {
 		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 	}
-	assert.Equal(t, inputLayerCount+1, len(tensors))
+	assert.Len(t, tensors, inputLayerCount+1)
 	err = gguf.Encode(f, KV{
 		"general.architecture": "llama",
 		"general.name": "name",
@@ -56,9 +56,11 @@ func TestEstimateGPULayers(t *testing.T) {
 	}
 	projectors := []string{}
 	opts := api.DefaultOptions()
-	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
-	assert.Equal(t, 0, estimate.Layers)
-	assert.Equal(t, uint64(0), estimate.Graph)
+	t.Run("cpu", func(t *testing.T) {
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		assert.Equal(t, 0, estimate.Layers)
+		assert.Equal(t, uint64(0), estimate.Graph)
+	})
 
 	// derived from the dummy ggml file above
 	graphPartialOffload := uint64(202377216)
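Moving the CPU-only assertions into t.Run gives the case its own subtest name, so a failure is attributed to a scenario instead of a bare assertion line, and the case can be rerun alone:

    go test -run 'TestEstimateGPULayers/cpu' ./...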
@@ -80,7 +82,10 @@ func TestEstimateGPULayers(t *testing.T) {
 		},
 	}
 	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
-	for i, s := range [][]uint64{
+	for i, s := range []struct {
+		layer0, layer1   uint64
+		expect0, expect1 uint64
+	}{
 		{1, 1, 1, 1},
 		{2, 1, 2, 1},
 		{2, 2, 2, 2},
@@ -90,27 +95,33 @@ func TestEstimateGPULayers(t *testing.T) {
 		{6, 6, 3, 3},
 		{0, 3, 0, 3},
 	} {
-		gpus[0].FreeMemory = 0
-		gpus[1].FreeMemory = 0
-		gpus[0].FreeMemory += projectorSize + memoryLayerOutput
-		gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
-		gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
-		gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
-		gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
-		assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
-		assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
-		var layerSums uint64
-		for _, b := range estimate.GPUSizes {
-			layerSums += b
-		}
-		if estimate.Layers < inputLayerCount+1 {
-			assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
-			assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
-		} else {
-			assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
-			assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
-		}
+		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
+			gpus[0].FreeMemory = 0
+			gpus[1].FreeMemory = 0
+			gpus[0].FreeMemory += projectorSize
+			if s.layer0 > 0 {
+				gpus[0].FreeMemory += memoryLayerOutput
+			} else {
+				gpus[1].FreeMemory += memoryLayerOutput
+			}
+			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
+			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
+			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
+			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
+			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
+			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
+			var layerSums uint64
+			for _, b := range estimate.GPUSizes {
+				layerSums += b
+			}
+			if estimate.Layers < inputLayerCount+1 {
+				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+			} else {
+				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+			}
+		})
 	}
 
 }
@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := availableServers()
 	requested := info.Library
-	if info.Variant != "" {
-		requested += "_" + info.Variant
+	if info.Variant != gpu.CPUCapabilityNone {
+		requested += "_" + info.Variant.String()
 	}
 
 	servers := []string{}
@@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string {
 
 	// Load up the best CPU variant if not primary requested
 	if info.Library != "cpu" {
-		variant := gpu.GetCPUVariant()
+		variant := gpu.GetCPUCapability()
 		// If no variant, then we fall back to default
 		// If we have a variant, try that if we find an exact match
 		// Attempting to run the wrong CPU instructions will panic the
 		// process
-		if variant != "" {
+		if variant != gpu.CPUCapabilityNone {
 			for cmp := range availableServers {
-				if cmp == "cpu_"+variant {
+				if cmp == "cpu_"+variant.String() {
 					servers = append(servers, cmp)
 					break
 				}
@@ -146,11 +146,11 @@ func serverForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		return "metal"
 	}
-	variant := gpu.GetCPUVariant()
+	variant := gpu.GetCPUCapability()
 	availableServers := availableServers()
-	if variant != "" {
+	if variant != gpu.CPUCapabilityNone {
 		for cmp := range availableServers {
-			if cmp == "cpu_"+variant {
+			if cmp == "cpu_"+variant.String() {
 				return cmp
 			}
 		}
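These two hunks swap the stringly-typed gpu.GetCPUVariant() for a typed gpu.GetCPUCapability(). The diff does not show the type's definition; a plausible minimal sketch of what it assumes (names beyond CPUCapabilityNone are hypothetical):

package gpu

// CPUCapability represents the highest CPU instruction-set level detected.
type CPUCapability uint32

const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX  // hypothetical level for illustration
	CPUCapabilityAVX2 // hypothetical level for illustration
)

func (c CPUCapability) String() string {
	switch c {
	case CPUCapabilityAVX:
		return "avx"
	case CPUCapabilityAVX2:
		return "avx2"
	default:
		return "" // "no variant"; callers compare against CPUCapabilityNone instead
	}
}

A typed enum makes the "no variant" case an explicit sentinel (CPUCapabilityNone) instead of an empty string, while String() keeps the cpu_<variant> server-name convention intact.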
@@ -39,7 +39,7 @@ type LlamaServer interface {
 	Close() error
 	EstimatedVRAM() uint64 // Total VRAM across all GPUs
 	EstimatedTotal() uint64
-	EstimagedVRAMByGPU(gpuID string) uint64
+	EstimatedVRAMByGPU(gpuID string) uint64
 }

 // llmServer is an instance of the llama.cpp server
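Since Go interfaces are satisfied structurally, the rename has to land in both places at once: the LlamaServer method here and the *llmServer implementation in the next hunk. Fixing only one side would leave *llmServer no longer implementing LlamaServer, which fails at compile time wherever the concrete type is assigned to the interface.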
@@ -1016,7 +1016,7 @@ func (s *llmServer) EstimatedTotal() uint64 {
 	return s.estimate.TotalSize
 }

-func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 {
+func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
 	for i, gpu := range s.gpus {
 		if gpu.ID == gpuID {
 			return s.estimate.GPUSizes[i]