ggml: Separate tensor load from backend creation
Currently, when the backend is created, the tensors are loaded at the same time, which is a slow operation. This separates the work into two steps:

- Create the backend, including enumerating tensors and allocating memory
- Load the tensor data

This allows more flexibility in managing model loading.
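To make the split concrete, here is a minimal, self-contained Go sketch of the pattern. It is not the ollama37 code: the Backend type, NewBackend, and the tensor names are invented for illustration; only the two-step shape itself, with a context-aware Load reporting progress through a callback, comes from the change below.

package main

import (
	"context"
	"fmt"
)

// Backend is a toy stand-in for a real ggml backend: creating it only
// enumerates tensors and reserves memory; Load streams the actual data.
type Backend struct {
	tensors []string // tensor names discovered at creation time
	loaded  int
}

// NewBackend is the cheap step: enumerate tensors and allocate,
// without reading any tensor data yet.
func NewBackend(tensors []string) *Backend {
	return &Backend{tensors: tensors}
}

// Load is the slow step, split out so the caller decides when it runs,
// observes progress through a callback, and can cancel via ctx.
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
	for i := range b.tensors {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		// ... read tensor data from disk here ...
		b.loaded = i + 1
		progress(float32(b.loaded) / float32(len(b.tensors)))
	}
	return nil
}

func main() {
	b := NewBackend([]string{"tok_embd", "blk.0.attn_q", "output"})
	if err := b.Load(context.Background(), func(p float32) {
		fmt.Printf("loading: %3.0f%%\n", p*100)
	}); err != nil {
		panic(err)
	}
}

Keeping creation cheap is presumably also why the diff moves progress reporting out of ml.BackendParams and into Load: there is nothing slow left to report on during backend creation.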
@@ -845,7 +845,7 @@ func (s *Server) loadModel(
 	multiUserCache bool,
 ) {
 	var err error
-	s.model, err = model.New(ctx, mpath, params)
+	s.model, err = model.New(mpath, params)
 	if err != nil {
 		panic(err)
 	}
@@ -874,6 +874,14 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
+	err = s.model.Backend().Load(ctx,
+		func(progress float32) {
+			s.progress = progress
+		})
+	if err != nil {
+		panic(err)
+	}
+
 	s.status = llm.ServerStatusReady
 	s.ready.Done()
 }
@@ -928,9 +936,6 @@ func Execute(args []string) error {
 	}
 
 	params := ml.BackendParams{
-		Progress: func(progress float32) {
-			server.progress = progress
-		},
 		NumThreads:   *threads,
 		NumGPULayers: *numGPULayers,
 		MainGPU:      *mainGPU,
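A side effect of the new signature, inferred from Load(ctx, ...) in the diff rather than exercised by this commit (the runner still panics on any load error): a caller can abandon a slow tensor load partway through. Continuing the toy sketch above:

// Toy usage, reusing the Backend type sketched earlier: cancel the
// load as soon as the first tensor lands.
ctx, cancel := context.WithCancel(context.Background())
b := NewBackend([]string{"tok_embd", "blk.0.attn_q", "output"})
err := b.Load(ctx, func(p float32) {
	cancel() // give up after the first progress report
})
fmt.Println(err) // context.Canceled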