Mirror of https://github.com/dogkeeper886/ollama37.git, synced 2025-12-09 23:37:06 +00:00
Implement single GPU preference for multi-GPU model loading
Use minimum required GPUs instead of all available GPUs when loading models. The scheduler now tries progressively more GPUs (1, 2, 3, ...) until the model fits, then uses only that minimum subset. Benefits: - 20GB model on 4x Tesla K80s: uses 2 GPUs instead of 4 - 27GB model: uses 3 GPUs, leaves 1 GPU completely free - 10GB model: uses 1 GPU, leaves others free for other models - Reduces VRAM fragmentation across multi-GPU setups 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -880,13 +880,18 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
|
||||
// - if multiple Libraries, see if any single GPU in any Library will fit
|
||||
// - try subsets of GPUs instead of just falling back to 1 or all in a family
|
||||
|
||||
// Now try all the GPUs
|
||||
// Try progressively more GPUs until model fits (use minimum required)
|
||||
for _, p := range numParallelToTry {
|
||||
req.opts.NumCtx = req.origNumCtx * p
|
||||
if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
|
||||
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
|
||||
*numParallel = p
|
||||
return sgl
|
||||
|
||||
// Try from 1 GPU up to all available GPUs
|
||||
for gpuCount := 1; gpuCount <= len(sgl); gpuCount++ {
|
||||
subsetGPUs := sgl[:gpuCount]
|
||||
if ok, estimatedVRAM = llm.PredictServerFit(subsetGPUs, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
|
||||
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "gpus_used", gpuCount, "gpus_available", len(sgl), "required", format.HumanBytes2(estimatedVRAM))
|
||||
*numParallel = p
|
||||
return subsetGPUs
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user