ollamarunner: Re-enable worst case graph preallocation.

Worst case graph preallocation was disabled by a27462b
"ollamarunner: Temporarily disable worst case graph preallocation"
since it caused crashes with large batches when not using the GPU.

This backports upstream llama.cpp commit f057808
"ggml: Don't assert fail when tensor data changes (#13222)", which
fixes the underlying bug and allows reverting the previous workaround.
Author:    Jesse Gross
Committer: Jesse Gross
Date:      2025-05-02 11:24:19 -07:00
Parent:    57fb759f3c
Commit:    c2f5d6662b

3 changed files with 46 additions and 7 deletions

@@ -715,9 +715,7 @@ func (m *multiLPath) String() string {
 	return strings.Join(*m, ", ")
 }
 
-// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
-// to the GPU
-/*func (s *Server) reserveWorstCaseGraph() error {
+func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()
 
@@ -760,7 +758,7 @@ func (m *multiLPath) String() string {
 	}
 
 	return nil
-}*/
+}
 
 func (s *Server) loadModel(
 	ctx context.Context,
@@ -797,10 +795,10 @@ func (s *Server) loadModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
 
-	/*err = s.reserveWorstCaseGraph()
+	err = s.reserveWorstCaseGraph()
 	if err != nil {
 		panic(err)
-	}*/
+	}
 
 	s.status = llm.ServerStatusReady
 	s.ready.Done()
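
For context, a rough sketch of what worst case graph preallocation buys: at load time the runner sizes its buffers for the largest batch it will ever schedule, so later forward passes never have to grow allocations mid-run. The types, sizes, and method names below are illustrative stand-ins, not Ollama's real backend API; only reserveWorstCaseGraph and Backend().NewContext() appear in the actual change above.

// Minimal, self-contained sketch of the preallocation pattern.
// fakeBackend, reserve, forward, and bytesPerToken are hypothetical.
package main

import "fmt"

// bytesPerToken is an assumed per-token scratch cost, for illustration only.
const bytesPerToken = 4096

// fakeBackend stands in for the inference backend: it tracks how much
// scratch memory has been reserved for forward passes.
type fakeBackend struct {
	reservedBytes int
}

// reserve sizes the scratch buffer for a batch of n tokens, growing it if needed.
func (b *fakeBackend) reserve(n int) {
	if need := n * bytesPerToken; need > b.reservedBytes {
		b.reservedBytes = need
	}
}

// forward pretends to run a forward pass; it fails if the batch would need
// more scratch than was reserved, i.e. if buffers would have to grow mid-run.
func (b *fakeBackend) forward(n int) error {
	if n*bytesPerToken > b.reservedBytes {
		return fmt.Errorf("batch of %d tokens exceeds reserved scratch", n)
	}
	return nil
}

func main() {
	const maxBatch = 512 // worst-case batch the server will ever schedule

	b := &fakeBackend{}

	// Reserve for the worst case once, at model load time, analogous to
	// calling reserveWorstCaseGraph before the server reports ready.
	b.reserve(maxBatch)

	// Every later batch, small or maximal, then fits in the preallocated buffers.
	for _, n := range []int{32, maxBatch} {
		if err := b.forward(n); err != nil {
			panic(err)
		}
		fmt.Printf("forward pass with %d tokens: ok\n", n)
	}
}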