Mirror of https://github.com/dogkeeper886/ollama37.git, synced 2025-12-09 23:37:06 +00:00
Implement single GPU preference for multi-GPU model loading
Use minimum required GPUs instead of all available GPUs when loading models. The scheduler now tries progressively more GPUs (1, 2, 3, ...) until the model fits, then uses only that minimum subset. Benefits: - 20GB model on 4x Tesla K80s: uses 2 GPUs instead of 4 - 27GB model: uses 3 GPUs, leaves 1 GPU completely free - 10GB model: uses 1 GPU, leaves others free for other models - Reduces VRAM fragmentation across multi-GPU setups 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -880,13 +880,18 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
|
||||
// - if multiple Libraries, see if any single GPU in any Library will fit
|
||||
// - try subsets of GPUs instead of just falling back to 1 or all in a family
|
||||
|
||||
// Now try all the GPUs
|
||||
// Try progressively more GPUs until model fits (use minimum required)
|
||||
for _, p := range numParallelToTry {
|
||||
req.opts.NumCtx = req.origNumCtx * p
|
||||
if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
|
||||
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
|
||||
*numParallel = p
|
||||
return sgl
|
||||
|
||||
// Try from 1 GPU up to all available GPUs
|
||||
for gpuCount := 1; gpuCount <= len(sgl); gpuCount++ {
|
||||
subsetGPUs := sgl[:gpuCount]
|
||||
if ok, estimatedVRAM = llm.PredictServerFit(subsetGPUs, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
|
||||
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "gpus_used", gpuCount, "gpus_available", len(sgl), "required", format.HumanBytes2(estimatedVRAM))
|
||||
*numParallel = p
|
||||
return subsetGPUs
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user