scale graph memory estimates by GPU device count

2025-12-11 00:07:07 +00:00 · 2024-04-16 14:44:13 -07:00
parent 7c9792a6e0
commit 26df674785
2 changed files with 4 additions and 1 deletions
--- a/llm/server.go
+++ b/llm/server.go
@@ -79,6 +79,9 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		graphFullOffload = graphPartialOffload
 	}

+	graphFullOffload *= uint64(info.DeviceCount)
+	graphPartialOffload *= uint64(info.DeviceCount)
+
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
 	memoryRequiredTotal := memoryMinimum + graphFullOffload