Revert Phase 1 memory optimization to fix multi-GPU stability

Problem: Phase 1 optimization (190 MiB for secondary GPUs) caused OOM
errors on large multi-GPU models due to insufficient runtime buffer:
- gemma3:27b: Estimated 10.9 GiB, used 10.8 GiB → only 400 MiB free
- Failed when allocating 6 MiB for KV cache during graph reservation
- Root cause: 190 MiB didn't account for runtime allocations
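
For reference, the Phase 1 logic being reverted reserved only a flat budget
on secondary GPUs (excerpted and lightly condensed from the removed code in
llm/memory.go shown in the diff below):

```go
// Reverted Phase 1 sketch: secondary GPUs were charged a fixed 190 MiB for
// graph memory, and only the last GPU reserved the full graph estimate.
// The flat budget left no slack for KV cache and execution buffers.
gpuGraphAllocations := make(map[int]uint64)
for i := range gpus {
	if len(gpus) > 1 && i < len(gpus)-1 {
		gpuGraphAllocations[i] = 190 * 1024 * 1024 // secondary GPU: fixed budget
	} else {
		gpuGraphAllocations[i] = max(graphPartialOffload, graphFullOffload) // primary or single GPU
	}
}
```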

Investigation: Studied upstream Ollama code (upstream/main:llm/memory.go)
and confirmed that the official behavior allocates the FULL graph to ALL
GPUs holding layers, rather than a reduced allocation for secondary GPUs.

Solution: Reverted llm/memory.go to upstream behavior:
- Removed gpuGraphAllocations map and per-GPU logic
- Restored original round-robin layer distribution (layerCount%j)
- All GPUs with layers now get full graph allocation
- Matches official Ollama for maximum stability
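
The restored upstream accounting (excerpted from the diff below) charges each
candidate GPU the full graph estimate and places layers round-robin:

```go
// Upstream layer placement restored by this revert: round-robin over the GPUs
// that still have space, charging each candidate the full graph estimate
// before deciding whether the next layer fits.
for j := len(gpusWithSpace); j > 0; j-- {
	g := gpusWithSpace[layerCount%j]
	used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
	if g.g.FreeMemory > overhead+used+layerSize {
		gpuAllocations[g.i] += layerSize
		layerCounts[g.i]++
		layerCount++
		break
	} else {
		// This GPU cannot fit the layer; drop it from the candidate list.
		gpusWithSpace = append(gpusWithSpace[:layerCount%j], gpusWithSpace[layerCount%j+1:]...)
	}
}
```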

Results with revert:
- gemma3:27b: Works correctly with 31/31 layer split
- Memory allocation: [10.0 GiB, 9.8 GiB] with proper headroom
- nvidia-smi: GPU0 8.7 GiB, GPU1 8.7 GiB (even distribution)
- Graph allocation: Both GPUs get 300 MiB (actual, not estimate)

Trade-offs:
- gemma3:12b will use 2 GPUs instead of trying single-GPU (stable)
- Large models (27b+) work reliably with proper buffer
- Matches upstream behavior (easier to maintain)
- Conservative estimates prevent OOM errors

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Shang Chieh Tseng
Date: 2025-10-30 19:10:23 +08:00
Parent: d002de9af4
Commit: fabe2c5cb7
2 changed files with 24 additions and 82 deletions


@@ -110,17 +110,25 @@ These files contain specific line numbers, code blocks, and commands to execute
 ### Memory Estimation Optimization for Single-GPU Preference
-**Status**: **COMPLETED** - Fully implemented and tested (2025-10-30)
+**Status**: ⚠️ **OPTIMIZATIONS REVERTED** - Returned to upstream behavior for stability (2025-10-30)
-**Goal**: Eliminate unnecessary multi-GPU splits by fixing graph memory overestimation for Tesla K80.
+**Original Goal**: Eliminate unnecessary multi-GPU splits by fixing graph memory overestimation for Tesla K80.
-### Phase 1: Per-GPU Graph Allocation (2025-10-29)
+**Outcome**: Both Phase 1 and Phase 2 optimizations were too aggressive and caused OOM errors on multi-GPU models. Reverted to match upstream Ollama for maximum stability.
-**Problem**: Multi-GPU systems allocated full graph memory (1.3 GiB) to EACH GPU, causing 2.6 GiB total overestimation.
+### Phase 1: Per-GPU Graph Allocation (2025-10-29) - REVERTED
-**Solution**: Secondary GPUs use 190 MiB, primary GPU uses full 1.3 GiB (based on empirical measurements).
+**Status**: ⚠️ **REVERTED** - Caused insufficient headroom for multi-GPU models (2025-10-30)
-**Results**: gemma3:12b split improved from 25,24 → 1,48 layers, but still not single-GPU.
+**Original Goal**: Reduce graph allocation on secondary GPUs from full 1.3 GiB to 190 MiB.
+**Original Results**: gemma3:12b split improved from 25,24 → 1,48 layers.
+**Problem Discovered**: The 190 MiB optimization left insufficient buffer for runtime allocations (KV cache, execution buffers), causing OOM errors on larger multi-GPU models:
+- gemma3:27b: Failed with only 400 MiB headroom on GPU1
+- Memory estimate: 10.9 GiB, actual usage: 10.8 GiB → 0.4 GiB free → OOM on 6 MiB allocation
+**Resolution**: Reverted to upstream Ollama behavior - allocate full graph to ALL GPUs with layers. This matches official Ollama (confirmed via code review of `upstream/main:llm/memory.go`).
 ### Phase 2: CC 3.7 Graph Correction Factor (2025-10-30) - DISABLED

llm/memory.go

@@ -238,31 +238,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		overflow += gpuZeroOverhead
 	}
-	// ollama37: Create per-GPU graph allocations for Tesla K80 multi-GPU optimization
-	// Secondary GPUs use measured 190 MiB, primary GPU uses full graph
-	gpuGraphAllocations := make(map[int]uint64)
-	for i := range gpus {
-		if len(gpus) > 1 && i < len(gpus)-1 {
-			// Secondary GPU: use empirically measured value (181 MiB, rounded to 190 MiB)
-			gpuGraphAllocations[i] = 190 * 1024 * 1024
-		} else {
-			// Primary GPU or single GPU: use full graph
-			gpuGraphAllocations[i] = max(graphPartialOffload, graphFullOffload)
-		}
-		slog.Debug("graph allocation per GPU",
-			"gpu", i,
-			"graph_alloc", format.HumanBytes2(gpuGraphAllocations[i]),
-			"is_multi_gpu", len(gpus) > 1,
-			"is_secondary", len(gpus) > 1 && i < len(gpus)-1)
-	}
 	// For all the layers, find where they can fit on the GPU(s)
-	slog.Debug("starting layer placement",
-		"total_layers", f.KV().BlockCount(),
-		"num_gpus", len(gpus),
-		"gpus_with_space", len(gpusWithSpace),
-		"overhead", format.HumanBytes2(overhead))
 	for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
 		// Some models have inconsistent layer sizes
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
@@ -278,41 +254,18 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		}
 		// distribute the layers across the GPU(s) that have space
-		// ollama37: Prefer loading on last GPU first (single-GPU preference for Tesla K80)
-		placed := false
 		for j := len(gpusWithSpace); j > 0; j-- {
-			// Try GPUs in reverse order (highest index first) instead of round-robin
-			g := gpusWithSpace[j-1]
-			used := gpuAllocations[g.i] + gpuGraphAllocations[g.i] // ollama37: use per-GPU graph allocation
-			required := overhead + used + layerSize
-			if i == int(f.KV().BlockCount())-1 || i == int(f.KV().BlockCount())-2 || i == 0 {
-				// Debug log for first 2 and last layer
-				slog.Debug("layer placement attempt",
-					"layer", i,
-					"gpu", g.i,
-					"gpu_free", format.HumanBytes2(g.g.FreeMemory),
-					"overhead", format.HumanBytes2(overhead),
-					"used", format.HumanBytes2(used),
-					"layer_size", format.HumanBytes2(layerSize),
-					"required", format.HumanBytes2(required),
-					"fits", g.g.FreeMemory > required)
-			}
+			g := gpusWithSpace[layerCount%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
 			if g.g.FreeMemory > overhead+used+layerSize {
 				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
-				placed = true
 				break
 			} else {
-				gpusWithSpace = append(gpusWithSpace[:j-1], gpusWithSpace[j:]...)
+				gpusWithSpace = append(gpusWithSpace[:layerCount%j], gpusWithSpace[layerCount%j+1:]...)
 			}
 		}
-		if !placed {
-			overflow += layerSize
-		}
 	}
 	if layerCount >= int(f.KV().BlockCount()) {
 		fullyLoaded = true
@@ -320,32 +273,15 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	// Determine if we need to consider output then find where it fits
 	memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
-	slog.Debug("output layer placement",
-		"memory_last_layer", format.HumanBytes2(memoryLastLayer),
-		"layer_count_before", layerCount,
-		"block_count", f.KV().BlockCount(),
-		"gpus_with_space", len(gpusWithSpace))
 	if memoryLastLayer > 0 {
-		outputPlaced := false
 		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
-			// ollama37: Prefer last GPU first (single-GPU preference for Tesla K80)
 			for j := len(gpusWithSpace); j > 0; j-- {
-				g := gpusWithSpace[j-1] // Try GPUs in reverse order
-				// ollama37: Use actual graph allocation (not conservative estimate)
-				// This allows tighter packing on single GPU
-				used := gpuAllocations[g.i] + gpuGraphAllocations[g.i]
+				g := gpusWithSpace[layerCount%j]
+				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
 				if g.g.FreeMemory > overhead+used+memoryLastLayer {
 					gpuAllocations[g.i] += memoryLastLayer
 					layerCounts[g.i]++
 					layerCount++
-					outputPlaced = true
-					slog.Debug("output layer placed",
-						"gpu", g.i,
-						"layer_count_after", layerCount,
-						"fully_loaded", layerCount >= int(f.KV().BlockCount())+1)
 					break
 				}
 			}
@@ -354,21 +290,19 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		if layerCount < int(f.KV().BlockCount())+1 {
 			fullyLoaded = false
 			overflow += memoryLastLayer
-			slog.Debug("output layer overflow",
-				"layer_count", layerCount,
-				"required", int(f.KV().BlockCount())+1,
-				"output_placed", outputPlaced)
 		}
 	}
 	// Add the applicable (full or partial) graph allocations
-	// ollama37: Use per-GPU graph allocations calculated earlier
-	// Secondary GPUs use measured 190 MiB, primary GPU uses full graph
 	for i := range gpus {
 		if layerCounts[i] <= 0 {
 			continue
 		}
-		gpuAllocations[i] += gpuGraphAllocations[i]
+		if fullyLoaded {
+			gpuAllocations[i] += graphFullOffload
+		} else {
+			gpuAllocations[i] += graphPartialOffload
+		}
 	}
 	if fullyLoaded {
 		graphOffload = graphFullOffload