mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 07:46:59 +00:00
ollamarunner: Memory usage reporting
This provides granular information about the backend memory allocations required by the runner: - Per backend - Per layer - Weights, cache and graph - Allocation status This can be used for debugging and validating memory estimates.
This commit is contained in:
@@ -15,6 +15,10 @@ import (
|
||||
|
||||
type Backend interface {
|
||||
Load(ctx context.Context, progress func(float32)) error
|
||||
|
||||
// BackendMemory returns the memory allocations that were made for this model
|
||||
BackendMemory() BackendMemory
|
||||
|
||||
Config() fs.Config
|
||||
Get(name string) Tensor
|
||||
NewContext() Context
|
||||
@@ -68,6 +72,84 @@ type BackendParams struct {
|
||||
FlashAttention bool
|
||||
}
|
||||
|
||||
// ErrNoMem is returned when panicing due to insufficient memory. It includes
|
||||
// the attempted memory allocation.
|
||||
type ErrNoMem struct {
|
||||
BackendMemory
|
||||
}
|
||||
|
||||
func (e ErrNoMem) Error() string {
|
||||
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
|
||||
}
|
||||
|
||||
type AllocationStatus int
|
||||
|
||||
const (
|
||||
// Unallocated memory - have not yet attempted to allocate
|
||||
Unallocated AllocationStatus = iota
|
||||
|
||||
// Failed memory - tried to allocate the memory and did not succeed
|
||||
Failed
|
||||
|
||||
// Allocated memory = tried and succeeded to allocate memory
|
||||
Allocated
|
||||
)
|
||||
|
||||
// Memory is the size of an allocation and whether it was successful.
|
||||
type Memory struct {
|
||||
Size uint64
|
||||
Status AllocationStatus
|
||||
}
|
||||
|
||||
func (m Memory) String() string {
|
||||
s := fmt.Sprint(m.Size)
|
||||
|
||||
switch m.Status {
|
||||
case Unallocated:
|
||||
s += "U"
|
||||
case Failed:
|
||||
s += "F"
|
||||
case Allocated:
|
||||
s += "A"
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// DeviceMemory provides a breakdown of the memory needed
|
||||
// per device, such as a CPU or GPU.
|
||||
type DeviceMemory struct {
|
||||
// Name is the name of the device as labeled by the backend. It
|
||||
// may not be persistent across instances of the runner.
|
||||
Name string
|
||||
|
||||
// Weights is the per-layer memory needed for the model weights.
|
||||
Weights []Memory
|
||||
|
||||
// Cache is the per-layer memory needed for the KV cache.
|
||||
Cache []Memory
|
||||
|
||||
// Graph is the size of the compute graph. It is not per-layer.
|
||||
Graph Memory
|
||||
}
|
||||
|
||||
// BackendMemory provides the amount of memory required to load the model
|
||||
// per device based on the BackendParams. In some cases, not all required
|
||||
// allocations will be known at this point. However, the size of the most recent
|
||||
// allocation is guaranteed to be provided so that if it failed, the caller can
|
||||
// accommodate that to make forward progress.
|
||||
type BackendMemory struct {
|
||||
// InputsWeights are always located on the CPU and cannot be moved
|
||||
InputWeights Memory
|
||||
|
||||
// CPU model components are located in system memory. This does not
|
||||
// include unified memory allocated through the GPU.
|
||||
CPU DeviceMemory
|
||||
|
||||
// GPU model components are located on one or more GPUs.
|
||||
GPUs []DeviceMemory
|
||||
}
|
||||
|
||||
var backends = make(map[string]func(string, BackendParams) (Backend, error))
|
||||
|
||||
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
|
||||
@@ -102,7 +184,7 @@ type Context interface {
|
||||
// graph, simply preallocates memory. Typically called with a
|
||||
// worst case graph to ensure all resources are available for
|
||||
// for future inference.
|
||||
Reserve() error
|
||||
Reserve()
|
||||
|
||||
MaxGraphNodes() int
|
||||
Close()
|
||||
|
||||
Reference in New Issue
Block a user