Offload layers to GPU based on new model size estimates (#1850)

* select layers based on estimated model memory usage * always account for scratch vram * don't load +1 layers * better estimation for graph alloc * Update gpu/gpu_darwin.go Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> * Update llm/llm.go Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> * Update llm/llm.go * add overhead for cuda memory * Update llm/llm.go Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> * fix build error on linux * address comments --------- Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2025-12-11 16:26:59 +00:00 · 2024-01-08 16:42:00 -05:00
parent 7e8f7c8358
commit 08f1e18965
10 changed files with 161 additions and 154 deletions
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -8,7 +8,6 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
-	"sync"
 	"time"

 	"github.com/jmorganca/ollama/api"
@@ -43,69 +42,11 @@ number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
 ws ::= ([ \t\n] ws)?
 `

-type llamaModel struct {
-	hyperparameters llamaHyperparameters
-}
-
-func (llm *llamaModel) ModelFamily() string {
-	return "llama"
-}
-
-func llamaModelType(numLayer uint32) string {
-	switch numLayer {
-	case 26:
-		return "3B"
-	case 32:
-		return "7B"
-	case 40:
-		return "13B"
-	case 48:
-		return "34B"
-	case 60:
-		return "30B"
-	case 80:
-		return "65B"
-	default:
-		return "unknown"
-	}
-}
-
-func (llm *llamaModel) ModelType() string {
-	return llamaModelType(llm.hyperparameters.NumLayer)
-}
-
-func (llm *llamaModel) FileType() string {
-	return fileType(llm.hyperparameters.FileType)
-}
-
-func (llm *llamaModel) NumLayers() int64 {
-	return int64(llm.hyperparameters.NumLayer)
-}
-
-type llamaHyperparameters struct {
-	// NumVocab is the size of the model's vocabulary.
-	NumVocab uint32
-
-	// NumEmbd is the size of the model's embedding layer.
-	NumEmbd uint32
-	NumMult uint32
-	NumHead uint32
-
-	// NumLayer is the number of layers in the model.
-	NumLayer uint32
-	NumRot   uint32
-
-	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
-	FileType uint32
-}
-
 type Running struct {
 	Port          int
 	Cmd           *exec.Cmd
 	Cancel        context.CancelFunc
-	exitOnce      sync.Once
-	exitCh        chan error // channel to receive the exit status of the subprocess
-	*StatusWriter            // captures error messages from the llama runner process
+	*StatusWriter // captures error messages from the llama runner process
 }

 type ImageData struct {