Fix gemma3:12b to load on single Tesla K80 GPU

Problem: gemma3:12b (10.2 GiB actual) was splitting across 2 GPUs
despite fitting on a single Tesla K80 (11.2 GiB available).

Root Cause: Graph memory estimates for CC 3.7 were 15-20% too high
(estimated 1.3 GiB, actual 1.1 GiB), causing the single-GPU fit check
to fail by a ~200 MiB margin.

Solution: Apply empirical 85% correction factor to graph estimates
for Tesla K80 (CC 3.7) based on measured actual usage.
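
For illustration, a minimal standalone Go sketch of the fit math using the
figures quoted in this commit (the helper name and printed output are
illustrative only, not the actual estimator code):

package main

import "fmt"

// scaleGraphCC37 applies the empirical 85% correction to a graph estimate.
func scaleGraphCC37(gib float64) float64 {
	return gib * 85 / 100
}

func main() {
	const freeGiB = 11.2 // usable VRAM on one Tesla K80 GPU

	// Graph estimate: 1.3 GiB scales to ~1.1 GiB, matching measured usage.
	fmt.Printf("graph: %.1f GiB -> %.1f GiB\n", 1.3, scaleGraphCC37(1.3))

	// Total estimate: 11.9 GiB fails the single-GPU check, 11.0 GiB passes.
	for _, totalGiB := range []float64{11.9, 11.0} {
		fmt.Printf("total %.1f GiB fits in %.1f GiB: %v\n", totalGiB, freeGiB, totalGiB < freeGiB)
	}
}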

Results:
- Memory estimate: 11.9 GiB → 11.0 GiB (-900 MiB)
- GPU split: 1,48 layers → single GPU (no split)
- GPU 0: 10,015 MiB (was 617 MiB)
- GPU 1: 7 MiB (was 9,866 MiB)
- Inference: 94% GPU utilization, no cross-GPU overhead

Testing: gemma3:12b loads on a single GPU and produces correct inference output

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Shang Chieh Tseng
Date: 2025-10-30 00:15:59 +08:00
Parent: d04ea50ced
Commit: 6d87524e22
4 changed files with 483 additions and 2 deletions


@@ -170,6 +170,17 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
graphFullOffload = graphPartialOffload
}
// ollama37: Apply empirical correction factor for Tesla K80 (CC 3.7)
// Measured: graph estimates are consistently 15-20% higher than actual usage
// Example: gemma3:12b estimated 1.3 GiB, actual 1.1 GiB (85% of estimate)
if gpus[0].Library == "cuda" && gpus[0].Compute == "3.7" {
graphPartialOffload = (graphPartialOffload * 85) / 100
graphFullOffload = (graphFullOffload * 85) / 100
slog.Debug("applied CC 3.7 graph correction",
"partial", format.HumanBytes2(graphPartialOffload),
"full", format.HumanBytes2(graphFullOffload))
}
// Output layer handled at the end if we have space
if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.Size()
@@ -238,9 +249,20 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Primary GPU or single GPU: use full graph
gpuGraphAllocations[i] = max(graphPartialOffload, graphFullOffload)
}
slog.Debug("graph allocation per GPU",
"gpu", i,
"graph_alloc", format.HumanBytes2(gpuGraphAllocations[i]),
"is_multi_gpu", len(gpus) > 1,
"is_secondary", len(gpus) > 1 && i < len(gpus)-1)
}
// For all the layers, find where they can fit on the GPU(s)
slog.Debug("starting layer placement",
"total_layers", f.KV().BlockCount(),
"num_gpus", len(gpus),
"gpus_with_space", len(gpusWithSpace),
"overhead", format.HumanBytes2(overhead))
for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
@@ -257,21 +279,38 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// distribute the layers across the GPU(s) that have space
// ollama37: Prefer loading on last GPU first (single-GPU preference for Tesla K80)
placed := false
for j := len(gpusWithSpace); j > 0; j-- {
// Try GPUs in reverse order (highest index first) instead of round-robin
g := gpusWithSpace[j-1]
used := gpuAllocations[g.i] + gpuGraphAllocations[g.i] // ollama37: use per-GPU graph allocation
required := overhead + used + layerSize
if i == int(f.KV().BlockCount())-1 || i == int(f.KV().BlockCount())-2 || i == 0 {
// Debug log for first 2 and last layer
slog.Debug("layer placement attempt",
"layer", i,
"gpu", g.i,
"gpu_free", format.HumanBytes2(g.g.FreeMemory),
"overhead", format.HumanBytes2(overhead),
"used", format.HumanBytes2(used),
"layer_size", format.HumanBytes2(layerSize),
"required", format.HumanBytes2(required),
"fits", g.g.FreeMemory > required)
}
if g.g.FreeMemory > overhead+used+layerSize {
gpuAllocations[g.i] += layerSize
layerCounts[g.i]++
layerCount++
placed = true
break
} else {
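// This GPU cannot fit the layer at its current usage; drop it from the candidate list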
gpusWithSpace = append(gpusWithSpace[:j-1], gpusWithSpace[j:]...)
}
}
if !placed {
overflow += layerSize
}
}
@@ -281,16 +320,32 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
// Determine if we need to consider output then find where it fits
memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
slog.Debug("output layer placement",
"memory_last_layer", format.HumanBytes2(memoryLastLayer),
"layer_count_before", layerCount,
"block_count", f.KV().BlockCount(),
"gpus_with_space", len(gpusWithSpace))
if memoryLastLayer > 0 {
outputPlaced := false
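// opts.NumGPU < 0 means auto-placement; otherwise respect the user-requested layer limit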
if opts.NumGPU < 0 || layerCount < opts.NumGPU {
// ollama37: Prefer last GPU first (single-GPU preference for Tesla K80)
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[j-1] // Try GPUs in reverse order
// ollama37: Use actual per-GPU graph allocation (not conservative estimate)
// This allows tighter packing on single GPU
used := gpuAllocations[g.i] + gpuGraphAllocations[g.i]
if g.g.FreeMemory > overhead+used+memoryLastLayer {
gpuAllocations[g.i] += memoryLastLayer
layerCounts[g.i]++
layerCount++
outputPlaced = true
slog.Debug("output layer placed",
"gpu", g.i,
"layer_count_after", layerCount,
"fully_loaded", layerCount >= int(f.KV().BlockCount())+1)
break
}
}
@@ -299,6 +354,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if layerCount < int(f.KV().BlockCount())+1 {
fullyLoaded = false
overflow += memoryLastLayer
slog.Debug("output layer overflow",
"layer_count", layerCount,
"required", int(f.KV().BlockCount())+1,
"output_placed", outputPlaced)
}
}