Mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-11 08:17:03 +00:00)
ollamarunner: Memory usage reporting
This provides granular information about the backend memory allocations required by the runner:
- Per backend
- Per layer
- Weights, cache and graph
- Allocation status

This can be used for debugging and validating memory estimates.
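As a rough illustration of the shape such a report implies, the minimal sketch below groups allocations per backend, per layer, and per category (weights, cache, graph), each with an allocation status. The type and field names here are assumptions made for the example, not the actual ollama37 API.

package main

import "fmt"

// Memory records one allocation request and whether it succeeded.
type Memory struct {
	Size      uint64 // bytes requested
	Allocated bool   // allocation status
}

// DeviceMemory breaks one backend's usage down by layer and category.
type DeviceMemory struct {
	Name    string   // backend name, e.g. "CPU" or "CUDA0" (hypothetical)
	Weights []Memory // per-layer weight tensors
	Cache   []Memory // per-layer KV cache
	Graph   Memory   // compute graph scratch space
}

func main() {
	report := []DeviceMemory{{
		Name:    "CUDA0",
		Weights: []Memory{{Size: 512 << 20, Allocated: true}, {Size: 512 << 20, Allocated: true}},
		Cache:   []Memory{{Size: 64 << 20, Allocated: true}, {Size: 64 << 20, Allocated: true}},
		Graph:   Memory{Size: 128 << 20, Allocated: true},
	}}

	// Summarize per backend: layer count, total bytes, and graph allocation status.
	for _, dev := range report {
		var total uint64
		for i := range dev.Weights {
			total += dev.Weights[i].Size + dev.Cache[i].Size
		}
		total += dev.Graph.Size
		fmt.Printf("%s: %d layers, %d MiB total, graph allocated=%v\n",
			dev.Name, len(dev.Weights), total>>20, dev.Graph.Allocated)
	}
}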
@@ -826,16 +826,12 @@ func (s *Server) reserveWorstCaseGraph() error {
 		return err
 	}
 
-	err = ctx.Forward(t).Reserve()
-	if err != nil {
-		return err
-	}
+	ctx.Forward(t).Reserve()
 
 	return nil
 }
 
-func (s *Server) loadModel(
-	ctx context.Context,
+func (s *Server) initModel(
 	mpath string,
 	params ml.BackendParams,
 	lpath multiLPath,
@@ -843,21 +839,21 @@ func (s *Server) loadModel(
 	kvCacheType string,
 	kvSize int,
 	multiUserCache bool,
-) {
+) error {
 	var err error
 	s.model, err = model.New(mpath, params)
 	if err != nil {
-		panic(err)
+		return err
 	}
 
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
-		panic("loras are not yet implemented")
+		return errors.New("loras are not yet implemented")
 	}
 
 	s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
 	if err != nil {
-		panic(err)
+		return err
 	}
 
 	if !s.cache.enabled && parallel > 1 {
@@ -869,11 +865,25 @@ func (s *Server) loadModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
 
-	err = s.reserveWorstCaseGraph()
+	return s.reserveWorstCaseGraph()
+}
+
+func (s *Server) load(
+	ctx context.Context,
+	mpath string,
+	params ml.BackendParams,
+	lpath multiLPath,
+	parallel int,
+	kvCacheType string,
+	kvSize int,
+	multiUserCache bool) {
+	err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
 	if err != nil {
 		panic(err)
 	}
 
+	slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
+
 	err = s.model.Backend().Load(ctx,
 		func(progress float32) {
 			s.progress = progress
@@ -921,9 +931,14 @@ func Execute(args []string) error {
 		status: llm.ServerStatusLoadingModel,
 	}
 
+	server.cond = sync.NewCond(&server.mu)
+	server.ready.Add(1)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
 	// TODO(jessegross): Parameters that need to be implemented:
 	// no-mmap
 	// mlock
 
 	var tensorSplitFloats []float32
 	if *tensorSplit != "" {
@@ -943,14 +958,7 @@ func Execute(args []string) error {
 		FlashAttention: *flashAttention,
 	}
 
-	server.ready.Add(1)
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
-	go server.loadModel(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
-
-	server.cond = sync.NewCond(&server.mu)
-
+	go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
 	go server.run(ctx)
 
 	addr := "127.0.0.1:" + strconv.Itoa(*port)