darwin: no partial offloading if required memory is greater than total system memory

This commit is contained in:
Michael Yang
2024-04-16 11:22:38 -07:00
parent f335722275
commit 41a272de9f
4 changed files with 17 additions and 9 deletions

View File

@@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
memoryLayerOutput := layers["output"].size()
memoryRequiredTotal += memoryLayerOutput
if memoryAvailable > memoryRequiredTotal {
if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
// disable partial offloading when model is greater than total system memory
opts.NumGPU = 0
} else if memoryAvailable > memoryRequiredTotal {
layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal
}