ollamarunner: Memory usage reporting

This provides granular information about the backend memory allocations
required by the runner:
 - Per backend
 - Per layer
 - Weights, cache and graph
 - Allocation status

This can be used for debugging and validating memory estimates.
This commit is contained in:
Jesse Gross
2025-04-17 11:00:25 -07:00
committed by Jesse Gross
parent 6db8a3771c
commit 73d6a82cce
5 changed files with 224 additions and 78 deletions

View File

@@ -826,16 +826,12 @@ func (s *Server) reserveWorstCaseGraph() error {
return err
}
err = ctx.Forward(t).Reserve()
if err != nil {
return err
}
ctx.Forward(t).Reserve()
return nil
}
func (s *Server) loadModel(
ctx context.Context,
func (s *Server) initModel(
mpath string,
params ml.BackendParams,
lpath multiLPath,
@@ -843,21 +839,21 @@ func (s *Server) loadModel(
kvCacheType string,
kvSize int,
multiUserCache bool,
) {
) error {
var err error
s.model, err = model.New(mpath, params)
if err != nil {
panic(err)
return err
}
// TODO(jessegross): LoRA loading
if lpath.String() != "" {
panic("loras are not yet implemented")
return errors.New("loras are not yet implemented")
}
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
if err != nil {
panic(err)
return err
}
if !s.cache.enabled && parallel > 1 {
@@ -869,11 +865,25 @@ func (s *Server) loadModel(
s.seqs = make([]*Sequence, s.parallel)
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
err = s.reserveWorstCaseGraph()
return s.reserveWorstCaseGraph()
}
func (s *Server) load(
ctx context.Context,
mpath string,
params ml.BackendParams,
lpath multiLPath,
parallel int,
kvCacheType string,
kvSize int,
multiUserCache bool) {
err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
if err != nil {
panic(err)
}
slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
err = s.model.Backend().Load(ctx,
func(progress float32) {
s.progress = progress
@@ -921,9 +931,14 @@ func Execute(args []string) error {
status: llm.ServerStatusLoadingModel,
}
server.cond = sync.NewCond(&server.mu)
server.ready.Add(1)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// TODO(jessegross): Parameters that need to be implemented:
// no-mmap
// mlock
var tensorSplitFloats []float32
if *tensorSplit != "" {
@@ -943,14 +958,7 @@ func Execute(args []string) error {
FlashAttention: *flashAttention,
}
server.ready.Add(1)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go server.loadModel(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
server.cond = sync.NewCond(&server.mu)
go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
go server.run(ctx)
addr := "127.0.0.1:" + strconv.Itoa(*port)