darwin: no partial offloading if required memory is greater than system memory

2025-12-11 16:26:59 +00:00 · 2024-04-16 11:22:38 -07:00
parent f335722275
commit 41a272de9f
4 changed files with 17 additions and 9 deletions
--- a/llm/server.go
+++ b/llm/server.go
@@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option

 	memoryLayerOutput := layers["output"].size()
 	memoryRequiredTotal += memoryLayerOutput
-	if memoryAvailable > memoryRequiredTotal {
+
+	if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
+		// disable partial offloading when model is greater than total system memory
+		opts.NumGPU = 0
+	} else if memoryAvailable > memoryRequiredTotal {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
 	}