ollamarunner: Re-enable worst case graph preallocation.

Worst case graph preallocation was disabled by a27462b
"ollamarunner: Temporarily disable worst case graph preallocation"
since it caused crashes with large batches when not using the GPU.

This backports upstream llama.cpp commit f057808
"ggml: Don't assert fail when tensor data changes (#13222)", which
fixes the underlying bug and allows reverting the previous workaround.
Author:    Jesse Gross
Committer: Jesse Gross
Date:      2025-05-02 11:24:19 -07:00
Parent:    57fb759f3c
Commit:    c2f5d6662b

3 changed files with 46 additions and 7 deletions

@@ -715,9 +715,7 @@ func (m *multiLPath) String() string {
 	return strings.Join(*m, ", ")
 }
 
-// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
-// to the GPU
-/*func (s *Server) reserveWorstCaseGraph() error {
+func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()
 
@@ -760,7 +758,7 @@ func (m *multiLPath) String() string {
 	}
 
 	return nil
-}*/
+}
 
 func (s *Server) loadModel(
 	ctx context.Context,
@@ -797,10 +795,10 @@ func (s *Server) loadModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
 
-	/*err = s.reserveWorstCaseGraph()
+	err = s.reserveWorstCaseGraph()
 	if err != nil {
 		panic(err)
-	}*/
+	}
 
 	s.status = llm.ServerStatusReady
 	s.ready.Done()
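
For context, a rough sketch of what worst case graph preallocation buys: at load time the runner sizes its buffers for the largest batch it will ever schedule, so later forward passes never have to grow allocations mid-run. The types, sizes, and method names below are illustrative stand-ins, not Ollama's real backend API; only reserveWorstCaseGraph and Backend().NewContext() appear in the actual change above.

// Minimal, self-contained sketch of the preallocation pattern.
// fakeBackend, reserve, forward, and bytesPerToken are hypothetical.
package main

import "fmt"

// bytesPerToken is an assumed per-token scratch cost, for illustration only.
const bytesPerToken = 4096

// fakeBackend stands in for the inference backend: it tracks how much
// scratch memory has been reserved for forward passes.
type fakeBackend struct {
	reservedBytes int
}

// reserve sizes the scratch buffer for a batch of n tokens, growing it if needed.
func (b *fakeBackend) reserve(n int) {
	if need := n * bytesPerToken; need > b.reservedBytes {
		b.reservedBytes = need
	}
}

// forward pretends to run a forward pass; it fails if the batch would need
// more scratch than was reserved, i.e. if buffers would have to grow mid-run.
func (b *fakeBackend) forward(n int) error {
	if n*bytesPerToken > b.reservedBytes {
		return fmt.Errorf("batch of %d tokens exceeds reserved scratch", n)
	}
	return nil
}

func main() {
	const maxBatch = 512 // worst-case batch the server will ever schedule

	b := &fakeBackend{}

	// Reserve for the worst case once, at model load time, analogous to
	// calling reserveWorstCaseGraph before the server reports ready.
	b.reserve(maxBatch)

	// Every later batch, small or maximal, then fits in the preallocated buffers.
	for _, n := range []int{32, maxBatch} {
		if err := b.forward(n); err != nil {
			panic(err)
		}
		fmt.Printf("forward pass with %d tokens: ok\n", n)
	}
}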