ggml: Seperate tensor load from backend creation

Currently, when the backend is created, the tensors are loaded at the same time, which is a slow operation. This separates them to be two steps: - Create backend, including enumerating tensors and memory allocation - Loading tensor data This allows more flexibility in managing model loading.
2025-12-10 07:46:59 +00:00 · 2025-04-17 13:42:40 -07:00
parent d755577473
commit 94ab428e3f
13 changed files with 131 additions and 115 deletions
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -845,7 +845,7 @@ func (s *Server) loadModel(
 	multiUserCache bool,
 ) {
 	var err error
-	s.model, err = model.New(ctx, mpath, params)
+	s.model, err = model.New(mpath, params)
 	if err != nil {
 		panic(err)
 	}
@@ -874,6 +874,14 @@ func (s *Server) loadModel(
 		panic(err)
 	}

+	err = s.model.Backend().Load(ctx,
+		func(progress float32) {
+			s.progress = progress
+		})
+	if err != nil {
+		panic(err)
+	}
+
 	s.status = llm.ServerStatusReady
 	s.ready.Done()
 }
@@ -928,9 +936,6 @@ func Execute(args []string) error {
 	}

 	params := ml.BackendParams{
-		Progress: func(progress float32) {
-			server.progress = progress
-		},
 		NumThreads:     *threads,
 		NumGPULayers:   *numGPULayers,
 		MainGPU:        *mainGPU,