Sync with upstream ollama/ollama and restore Tesla K80 (compute 3.7) support

This commit is a complete rework after pulling the latest changes from the
official ollama/ollama repository and re-applying the Tesla K80 compatibility patches.

## Key Changes

### CUDA Compute Capability 3.7 Support (Tesla K80)
- Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt
- Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset
- Using 37-virtual (PTX with JIT compilation) for maximum compatibility
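
For reference, a minimal sketch of the architecture setting (the other
architecture entries shown here are placeholders, not the exact list in this
repo's CMakePresets.json):

```cmake
# Illustrative only: add Kepler (compute 3.7) alongside the existing targets.
# "37-virtual" embeds PTX so the driver JIT-compiles for sm_37 at load time.
set(CMAKE_CUDA_ARCHITECTURES "37-virtual;50;61;70;75;80"
    CACHE STRING "CUDA architectures, including Tesla K80 (compute 3.7)")
```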

### Legacy Toolchain Compatibility
- **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80)
- **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7)
- **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h)
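
As a rough sketch, the host toolchain can be pinned in CMake so nvcc accepts the
compiler (paths are illustrative and depend on the local install layout):

```cmake
# Illustrative toolchain pinning; adjust paths to the local installation.
# CUDA 11.4's host_config.h rejects GCC newer than 10, so pin GCC 10.x explicitly.
set(CMAKE_C_COMPILER    /usr/bin/gcc-10)
set(CMAKE_CXX_COMPILER  /usr/bin/g++-10)
set(CMAKE_CUDA_HOST_COMPILER /usr/bin/g++-10)
set(CMAKE_CUDA_COMPILER /usr/local/cuda-11.4/bin/nvcc)
```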

### CPU Architecture Trade-offs
Because the toolchain is capped at GCC 10.5 (required by CUDA 11.4), some newer
CPU optimizations are sacrificed:
- Alderlake CPU variant enabled WITHOUT AVX_VNNI, which requires GCC 11+ (see the sketch below)
- Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA
- Estimated performance impact: roughly 3-7% slower on newer CPUs (an acceptable trade-off for K80 compatibility)
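
Illustrating the trade-off: ggml declares its x86 CPU backend variants in CMake,
and the change amounts to dropping the AVX_VNNI feature flag from the Alderlake
variant. The helper name below follows upstream ggml and is a sketch, not the
exact line in this repo:

```cmake
# Upstream ggml declares the variant roughly as:
#   ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
# Dropping AVX_VNNI keeps the variant buildable with GCC 10.5:
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA)
```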

### Build System Updates
- Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7
- Added the -Wno-deprecated-gpu-targets flag to suppress nvcc's deprecation warnings for compute 3.7 (see the sketch below)
- Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI
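
One common way to scope the warning suppression to CUDA sources only (shown as a
sketch; the repo may pass the flag differently):

```cmake
# Illustrative: sm_37 is deprecated in CUDA 11.x, so nvcc warns on every CUDA file.
# The generator expression limits the flag to CUDA translation units.
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-Wno-deprecated-gpu-targets>")
```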

### Upstream Sync
Merged latest llama.cpp changes including:
- Enhanced KV cache management with ISWA and hybrid memory support
- Improved multi-modal support (mtmd framework)
- New model architectures (Gemma3, Llama4, Qwen3, etc.)
- GPU backend improvements for CUDA, Metal, and ROCm
- Updated quantization support and GGUF format handling

### Documentation
- Updated CLAUDE.md with comprehensive build instructions
- Documented toolchain constraints and CPU architecture trade-offs
- Removed outdated CI/CD workflows (tesla-k80-*.yml)
- Cleaned up temporary development artifacts

## Rationale

This fork maintains Tesla K80 GPU support (compute 3.7), which official Ollama
dropped because of the legacy driver/CUDA requirements. Those requirements form
a rigid chain of constraints:
- K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI

We accept the loss of cutting-edge CPU optimizations to enable running modern
LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Shang Chieh Tseng
Date:   2025-11-05 14:03:05 +08:00
parent fabe2c5cb7
commit ef14fb5b26
817 changed files with 241634 additions and 70888 deletions


@@ -5,12 +5,9 @@ import (
"errors"
"fmt"
"log/slog"
"os"
"reflect"
"runtime"
"slices"
"sort"
"strconv"
"strings"
"sync"
"time"
@@ -21,6 +18,8 @@ import (
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/types/model"
)
@@ -28,7 +27,6 @@ type LlmRequest struct {
ctx context.Context //nolint:containedctx
model *Model
opts api.Options
origNumCtx int // Track the initial ctx request
sessionDuration *api.Duration
successCh chan *runnerRef
errCh chan error
@@ -41,14 +39,21 @@ type Scheduler struct {
expiredCh chan *runnerRef
unloadedCh chan any
loaded map[string]*runnerRef
// loadedMu protects loaded and activeLoading
loadedMu sync.Mutex
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int)
newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func() discover.GpuInfoList
getCpuFn func() discover.GpuInfoList
reschedDelay time.Duration
// activeLoading is the model that we are currently working on loading,
// including by evicting one or more other models. We can only load
// one model at a time but new requests to models that already loaded can
// happen in parallel
activeLoading llm.LlamaServer
loaded map[string]*runnerRef
loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
getSystemInfoFn func() ml.SystemInfo
waitForRecovery time.Duration
}
// Default automatic value for number of models we allow per GPU
@@ -56,47 +61,56 @@ type Scheduler struct {
// on a large GPU can cause stalling
var defaultModelsPerGPU = 3
// Default automatic value for parallel setting
var defaultParallel = 1
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
func InitScheduler(ctx context.Context) *Scheduler {
maxQueue := envconfig.MaxQueue()
sched := &Scheduler{
pendingReqCh: make(chan *LlmRequest, maxQueue),
finishedReqCh: make(chan *LlmRequest, maxQueue),
expiredCh: make(chan *runnerRef, maxQueue),
unloadedCh: make(chan any, maxQueue),
loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer,
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
pendingReqCh: make(chan *LlmRequest, maxQueue),
finishedReqCh: make(chan *LlmRequest, maxQueue),
expiredCh: make(chan *runnerRef, maxQueue),
unloadedCh: make(chan any, maxQueue),
loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer,
getGpuFn: discover.GPUDevices,
getSystemInfoFn: discover.GetSystemInfo,
waitForRecovery: 5 * time.Second,
}
sched.loadFn = sched.load
return sched
}
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
func (s *Scheduler) GetRunner(c context.Context, m *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
if m.CheckCapabilities(model.CapabilityVision) == nil {
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
req := &LlmRequest{
ctx: c,
model: model,
model: m,
opts: opts,
sessionDuration: sessionDuration,
successCh: make(chan *runnerRef),
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
}
select {
case s.pendingReqCh <- req:
default:
req.errCh <- ErrMaxQueue
s.loadedMu.Lock()
runner := s.loaded[req.model.ModelPath]
s.loadedMu.Unlock()
if runner != nil && !runner.needsReload(c, req) {
req.useLoadedRunner(runner, s.finishedReqCh)
} else {
select {
case s.pendingReqCh <- req:
default:
req.errCh <- ErrMaxQueue
}
}
return req.successCh, req.errCh
}
@@ -114,6 +128,8 @@ func (s *Scheduler) Run(ctx context.Context) {
}
func (s *Scheduler) processPending(ctx context.Context) {
maxRunners := envconfig.MaxRunners()
for {
select {
case <-ctx.Done():
@@ -122,28 +138,23 @@ func (s *Scheduler) processPending(ctx context.Context) {
case pending := <-s.pendingReqCh:
// Block other requests until we get this pending request running
pending.schedAttempts++
if pending.origNumCtx == 0 {
pending.origNumCtx = pending.opts.NumCtx
}
if pending.ctx.Err() != nil {
slog.Debug("pending request cancelled or timed out, skipping scheduling")
continue
}
numParallel := int(envconfig.NumParallel())
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
numParallel = 1
slog.Warn("mllama does not currently support parallel requests")
}
for {
var runnerToExpire *runnerRef
s.loadedMu.Lock()
runner := s.loaded[pending.model.ModelPath]
loadedCount := len(s.loaded)
runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
for _, r := range s.loaded {
runnersSnapshot = append(runnersSnapshot, r)
}
s.loadedMu.Unlock()
if runner != nil {
if runner.needsReload(ctx, pending) {
slog.Debug("reloading", "runner", runner)
@@ -153,39 +164,29 @@ func (s *Scheduler) processPending(ctx context.Context) {
pending.useLoadedRunner(runner, s.finishedReqCh)
break
}
} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
} else if maxRunners > 0 && loadedCount >= int(maxRunners) {
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
runnerToExpire = s.findRunnerToUnload()
} else {
// Either no models are loaded or below envconfig.MaxRunners
// Get a refreshed GPU list
var gpus discover.GpuInfoList
var gpus []ml.DeviceInfo
if pending.opts.NumGPU == 0 {
gpus = s.getCpuFn()
gpus = []ml.DeviceInfo{}
} else {
gpus = s.getGpuFn()
gpus = s.getGpuFn(ctx, runnersSnapshot)
}
if envconfig.MaxRunners() <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
// if any GPU has unreliable free memory reporting, 1x the number of GPUs
allReliable := true
for _, gpu := range gpus {
if gpu.UnreliableFreeMemory {
allReliable = false
break
}
}
if allReliable {
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners(), "gpu_count", len(gpus))
systemInfo := s.getSystemInfoFn()
if maxRunners <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use for the next load attempt
if pending.opts.NumGPU == 0 {
// Need to get actual GPU list to set the correct default max models
g := s.getGpuFn(ctx, runnersSnapshot)
maxRunners = uint(defaultModelsPerGPU * max(len(g), 1))
} else {
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
maxRunners = uint(defaultModelsPerGPU * max(len(gpus), 1))
}
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "gpu_count", len(gpus))
}
// Load model for fitting
@@ -195,198 +196,54 @@ func (s *Scheduler) processPending(ctx context.Context) {
break
}
// Embedding models should always be loaded with parallel=1
if pending.model.CheckCapabilities(model.CapabilityCompletion) != nil {
numParallel = 1
}
// Update free memory from currently loaded models
s.updateFreeSpace(gpus)
// Evaluate if the model will fit in the available system memory, or if we should unload a model first
if len(gpus) == 1 && gpus[0].Library == "cpu" {
// simplifying assumption of defaultParallel when in CPU mode
if numParallel <= 0 {
numParallel = defaultParallel
}
pending.opts.NumCtx = pending.origNumCtx * numParallel
if loadedCount == 0 {
slog.Debug("cpu mode with first model, loading")
s.loadFn(pending, ggml, gpus, numParallel)
break
}
runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
if runnerToExpire == nil {
slog.Debug("cpu mode with available system memory or first model, loading")
s.loadFn(pending, ggml, gpus, numParallel)
break
}
// else we need to expire a runner
} else if loadedCount == 0 {
if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath)
g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
if g != nil {
gpus = g
} else {
// Only allow partial loads when this is the first model
gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
}
s.loadFn(pending, ggml, gpus, numParallel)
s.loadFn(pending, ggml, systemInfo, gpus, false)
break
}
if runnerToExpire == nil {
// More than one loaded model, so we have to see if the
// new one fits
//
// We want to avoid loading on any GPUs that have other
// models still loading on them to avoid potential races
// with VRAM consumption ramping up during load
availGpus := s.filterGPUsWithoutLoadingModels(gpus)
// More than one loaded model, so we have to see if the
// new one fits
// Update free memory from currently loaded models
s.updateFreeSpace(availGpus)
// Check if this model requires multiple GPUs (Tesla K80 fix)
// If so, we need to ensure ALL required GPUs are clear of other models
fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
if fitGpus != nil {
// Check if this is a multi-GPU model request
if len(fitGpus) > 1 {
slog.Debug("multi-GPU model detected, checking for conflicts",
"target_model", pending.model.ModelPath,
"gpu_count", len(fitGpus))
// Check if any of the target GPUs have loaded models
hasConflict := false
s.loadedMu.Lock()
for _, loadedRunner := range s.loaded {
if loadedRunner.loading {
slog.Debug("skipping loading model", "model", loadedRunner.modelPath)
continue // Skip models that are still loading
}
slog.Debug("checking loaded model for conflicts",
"loaded_model", loadedRunner.modelPath,
"loaded_gpus", len(loadedRunner.gpus))
// Check if any loaded model is using any of our target GPUs
for _, targetGpu := range fitGpus {
for _, loadedGpu := range loadedRunner.gpus {
if targetGpu.ID == loadedGpu.ID {
slog.Warn("multi-GPU model conflicts with loaded model",
"target_model", pending.model.ModelPath,
"loaded_model", loadedRunner.modelPath,
"conflicting_gpu", targetGpu.ID)
hasConflict = true
break
}
}
if hasConflict {
break
}
}
if hasConflict {
break
}
}
s.loadedMu.Unlock()
if hasConflict {
// Check if conflicting models are still active (have refCount > 0)
conflictingRunner := s.findConflictingRunnerToUnload(fitGpus)
if conflictingRunner != nil {
conflictingRunner.refMu.Lock()
isActive := conflictingRunner.refCount > 0
conflictingRunner.refMu.Unlock()
if isActive {
// Conflicting model is still processing, delay this request
slog.Warn("conflicting model is still active, delaying multi-GPU request",
"conflicting_model", conflictingRunner.modelPath,
"target_model", pending.model.ModelPath)
go func() {
time.Sleep(s.reschedDelay)
s.pendingReqCh <- pending
}()
break
} else {
// Conflicting model is idle, can unload it
slog.Warn("found idle conflicting runner to unload",
"runner", conflictingRunner.modelPath,
"refCount", conflictingRunner.refCount)
runnerToExpire = conflictingRunner
slog.Warn("setting runnerToExpire to trigger unload", "runner", runnerToExpire.modelPath)
// Don't break here - let the normal flow handle the unload
}
} else {
slog.Error("failed to find conflicting runner despite detecting conflict!")
}
} else {
slog.Debug("no conflicts detected for multi-GPU model")
}
}
if runnerToExpire == nil {
slog.Debug("new model fits with existing models, loading")
s.loadFn(pending, ggml, fitGpus, numParallel)
break
}
}
// We couldn't find a set of GPUs to fully load the new
// model. If no other models are loading (both GPU lists
// are the same) then we need to unload another model to
// make room
if runnerToExpire == nil && len(availGpus) < len(gpus) {
// There are other requests pending, and this one
// needs more time, so put it on the back of the
// queue so that we might satisfy other pending
// requests that aren't blocked
go func() {
// Process in a go routine to avoid deadlocking
// the scheduler if our queue is full
slog.Debug("delaying scheduling while other models finish loading", "attempts", pending.schedAttempts, "model", pending.model.ModelPath)
time.Sleep(s.reschedDelay)
s.pendingReqCh <- pending
}()
break
}
if runnerToExpire == nil {
runnerToExpire = s.findRunnerToUnload()
}
needEvict := s.loadFn(pending, ggml, systemInfo, gpus, true)
if !needEvict {
slog.Debug("new model fits with existing models, loading")
break
}
runnerToExpire = s.findRunnerToUnload()
}
slog.Warn("exited model selection, checking runnerToExpire", "runnerToExpire", runnerToExpire != nil)
if runnerToExpire == nil {
// Shouildn't happen
slog.Error("runner to expire was nil!")
// While we were performing load calculations, the loaded runner(s) unloaded in parallel
// so findRunnerToUnload returned no runners. We'll try again and the loadedCount should be zero
slog.Debug("runner to expire was nil, retrying")
continue
}
// Trigger an expiration to unload once it's done
slog.Warn("attempting to unload runner", "runner", runnerToExpire.modelPath)
runnerToExpire.refMu.Lock()
slog.Warn("resetting model to expire immediately to make room", "runner", runnerToExpire.modelPath, "refCount", runnerToExpire.refCount)
slog.Debug("resetting model to expire immediately to make room", "runner", runnerToExpire, "refCount", runnerToExpire.refCount)
if runnerToExpire.expireTimer != nil {
runnerToExpire.expireTimer.Stop()
runnerToExpire.expireTimer = nil
}
runnerToExpire.sessionDuration = 0
if runnerToExpire.refCount <= 0 {
slog.Warn("sending idle runner to expired channel", "runner", runnerToExpire.modelPath)
s.expiredCh <- runnerToExpire
} else {
slog.Warn("runner still has references, waiting for refCount to reach 0", "runner", runnerToExpire.modelPath, "refCount", runnerToExpire.refCount)
}
runnerToExpire.refMu.Unlock()
// Wait for the unload to happen
// Note: at this point we're queueing up all incoming requests, even if they were for
// a different model that's loaded and not scheduled to be removed.
slog.Debug("waiting for pending requests to complete and unload to occur", "runner", runnerToExpire)
select {
case <-ctx.Done():
slog.Debug("shutting down scheduler pending loop")
return
case <-s.unloadedCh:
slog.Warn("unload completed, retrying model load", "runner", runnerToExpire)
slog.Debug("unload completed", "runner", runnerToExpire)
continue
}
}
@@ -483,7 +340,11 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
runner.refMu.Unlock()
} else {
slog.Debug("starting background wait for VRAM recovery", "runner", runner)
finished := runner.waitForVRAMRecovery()
runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
for _, r := range s.loaded {
runnersSnapshot = append(runnersSnapshot, r)
}
finished := s.waitForVRAMRecovery(runner, runnersSnapshot)
runner.unload()
delete(s.loaded, runner.modelPath)
s.loadedMu.Unlock()
@@ -519,35 +380,100 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
}()
}
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
if numParallel < 1 {
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
numParallel := max(int(envconfig.NumParallel()), 1)
// Embedding models should always be loaded with parallel=1
if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {
numParallel = 1
}
// `mllama`, `qwen3vl`, and `qwen3vlmoe` are snowflakes and uses an encoder cache which cannot be used with num_parallel > 1
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe"}, req.model.Config.ModelFamily) && numParallel != 1 {
numParallel = 1
slog.Warn("model architecture does not currently support parallel requests", "architecture", req.model.Config.ModelFamily)
}
sessionDuration := envconfig.KeepAlive()
if req.sessionDuration != nil {
sessionDuration = req.sessionDuration.Duration
}
llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
// check for model compatibility
if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
s.loadedMu.Lock()
llama := s.activeLoading
if llama == nil {
var err error
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
// check for model compatibility
if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
}
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
req.errCh <- err
s.loadedMu.Unlock()
return false
}
s.activeLoading = llama
} else {
if s.activeLoading.ModelPath() != req.model.ModelPath {
panic(fmt.Errorf("attempting to load different model after eviction (original %v new %v)", s.activeLoading.ModelPath(), req.model.ModelPath))
}
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
req.errCh <- err
return
}
s.loadedMu.Unlock()
gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull)
if err != nil {
if errors.Is(err, llm.ErrLoadRequiredFull) {
if !requireFull {
// No other models loaded, yet we still don't fit, so report an error
slog.Info("model is too large for system memory", "requireFull", requireFull)
s.activeLoading.Close()
s.activeLoading = nil
req.errCh <- err
}
return true
}
slog.Info("Load failed", "model", req.model.ModelPath, "error", err)
s.activeLoading.Close()
s.activeLoading = nil
req.errCh <- err
return false
}
// Determine if we have discrete GPUs which we should monitor VRAM usage on during shutdown
discreteGPUs := false
iGPUScan:
for _, devid := range gpuIDs {
for _, dev := range gpus {
if dev.DeviceID == devid {
if !dev.Integrated {
discreteGPUs = true
break iGPUScan
}
}
}
}
runner := &runnerRef{
model: req.model,
modelPath: req.model.ModelPath,
llama: llama,
Options: &req.opts,
sessionDuration: sessionDuration,
gpus: gpus,
estimatedVRAM: llama.EstimatedVRAM(),
estimatedTotal: llama.EstimatedTotal(),
gpus: gpuIDs,
discreteGPUs: discreteGPUs,
vramSize: llama.VRAMSize(),
totalSize: llama.TotalSize(),
loading: true,
pid: llama.Pid(),
}
@@ -562,6 +488,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
oldRunner.unload()
oldRunner.refMu.Unlock()
}
s.activeLoading = nil
s.loaded[req.model.ModelPath] = runner
slog.Info("loaded runners", "count", len(s.loaded))
s.loadedMu.Unlock()
@@ -588,14 +515,15 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
}()
req.successCh <- runner
}()
return false
}
func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
type predKey struct {
Library string
ID string
func (s *Scheduler) updateFreeSpace(allGpus []ml.DeviceInfo) {
if len(allGpus) == 0 {
return
}
predMap := map[predKey]uint64{} // Sum up the total predicted usage per GPU for all runners
predMap := map[ml.DeviceID]uint64{} // Sum up the total predicted usage per GPU for all runners
s.loadedMu.Lock()
runners := make([]*runnerRef, 0, len(s.loaded))
for _, r := range s.loaded {
@@ -606,7 +534,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
r.refMu.Lock()
if r.llama != nil {
for _, gpu := range allGpus {
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
predMap[gpu.DeviceID] += r.llama.VRAMByGPU(gpu.DeviceID)
}
} else {
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -616,7 +544,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
// Now that we've summed up all the GPU usage predictions across all the loaded runners, update the gpu list
for i := range allGpus {
if p, ok := predMap[predKey{allGpus[i].Library, allGpus[i].ID}]; ok {
if p, ok := predMap[allGpus[i].DeviceID]; ok {
slog.Debug("gpu reported", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "available", format.HumanBytes2(allGpus[i].FreeMemory))
if p > allGpus[i].TotalMemory {
// Shouldn't happen
@@ -633,41 +561,18 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
}
}
// While models are loading the VRAM consumption numbers will be indeterminate, so we have
// to avoid scheduling another model on the same GPU(s) that haven't stabilized.
// This routine returns the set of GPUs that do not have an active loading model.
// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
ret := append(discover.GpuInfoList{}, allGpus...)
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
for _, runner := range s.loaded {
if runner.loading {
slog.Debug("overlapping loads detected", "gpus", runner.gpus, "model", runner.modelPath)
for _, busyGPU := range runner.gpus {
for i := range ret {
if ret[i].ID == busyGPU.ID {
ret = append(ret[:i], ret[i+1:]...)
break
}
}
}
}
}
return ret
}
// TODO consolidate sched_types.go
type runnerRef struct {
refMu sync.Mutex
refCount uint // prevent unloading if > 0
llama llm.LlamaServer
pid int
loading bool // True only during initial load, then false forever
gpus discover.GpuInfoList // Recorded at time of provisioning
estimatedVRAM uint64
estimatedTotal uint64
llama llm.LlamaServer
pid int
loading bool // True only during initial load, then false forever
gpus []ml.DeviceID // Recorded at time of provisioning
discreteGPUs bool // True if all devices are discrete GPUs - used to skip VRAM recovery check for iGPUs
vramSize uint64
totalSize uint64
sessionDuration time.Duration
expireTimer *time.Timer
@@ -689,7 +594,6 @@ func (runner *runnerRef) unload() {
runner.llama.Close()
}
runner.model = nil
runner.llama = nil
runner.Options = nil
runner.gpus = nil
}
@@ -716,9 +620,6 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
optsNew.NumGPU = -1
}
// Normalize the NumCtx for parallelism
optsExisting.NumCtx = optsExisting.NumCtx / runner.numParallel
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
@@ -739,14 +640,12 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
// a before and after GPU memory allocation. The returned channel
// will be notified when we're done waiting, or have timed out and should
// proceed anyway
func (runner *runnerRef) waitForVRAMRecovery() chan any {
func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []ml.FilteredRunnerDiscovery) chan any {
finished := make(chan any, 1)
// CPU or Metal don't need checking, so no waiting required
// windows can page VRAM, only cuda currently can report accurate used vram usage
if len(runner.gpus) == 0 ||
(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
// CPU, Metal and iGPUs don't need checking, so no waiting required
if len(runner.gpus) == 0 || !runner.discreteGPUs ||
(len(runner.gpus) == 1 && runner.gpus[0].Library == "Metal") {
finished <- struct{}{}
slog.Debug("no need to wait for VRAM recovery", "runner", runner)
return finished
@@ -754,33 +653,45 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any {
start := time.Now()
// Establish a baseline before we unload
gpusBefore := discover.GetGPUInfo()
gpusBefore := s.getGpuFn(context.Background(), runners)
var totalMemoryBefore, freeMemoryBefore uint64
for _, gpu := range gpusBefore {
totalMemoryBefore += gpu.TotalMemory
freeMemoryBefore += gpu.FreeMemory
}
totalMemoryNow := totalMemoryBefore
freeMemoryNow := freeMemoryBefore
go func() {
expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s
// typical convergence is 0.5-1.5s - If it takes too long to discover and converge, let the scheduler estimate VRAM usage
ctx, cancel := context.WithTimeout(context.Background(), s.waitForRecovery)
defer cancel()
ticker := time.NewTicker(250 * time.Millisecond)
defer ticker.Stop()
for {
<-ticker.C
if time.Now().After(expiresAt) {
slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds(), "runner", runner)
finished <- struct{}{}
}
// Query GPUs, look for free to go back up
gpusNow := discover.GetGPUInfo()
var totalMemoryNow, freeMemoryNow uint64
for _, gpu := range gpusNow {
totalMemoryNow += gpu.TotalMemory
freeMemoryNow += gpu.FreeMemory
}
// If we're within ~80% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "runner", runner)
select {
case <-ticker.C:
// Query GPUs, look for free to go back up
gpusNow := s.getGpuFn(ctx, runners)
totalMemoryNow = 0
freeMemoryNow = 0
for _, gpu := range gpusNow {
totalMemoryNow += gpu.TotalMemory
freeMemoryNow += gpu.FreeMemory
}
if freeMemoryNow > freeMemoryBefore {
logutil.Trace("gpu VRAM convergence", "percent", int(float32(freeMemoryNow-freeMemoryBefore)/float32(runner.vramSize)*100))
} else {
logutil.Trace("gpu VRAM convergence", "percent", 0)
}
// If we're within ~75% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.75 {
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "free_before", format.HumanBytes2(freeMemoryBefore), "free_now", format.HumanBytes2(freeMemoryNow), "runner", runner)
finished <- struct{}{}
return
}
case <-ctx.Done():
slog.Debug("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds(), "free_before", format.HumanBytes2(freeMemoryBefore), "free_now", format.HumanBytes2(freeMemoryNow), "runner", runner)
finished <- struct{}{}
return
}
@@ -799,13 +710,12 @@ func (runner *runnerRef) LogValue() slog.Value {
}
if len(runner.gpus) > 0 {
attrs = append(attrs,
slog.String("inference", runner.gpus[0].Library),
slog.Int("devices", len(runner.gpus)),
slog.Any("inference", runner.gpus),
)
}
attrs = append(attrs,
slog.String("size", format.HumanBytes2(runner.estimatedTotal)),
slog.String("vram", format.HumanBytes2(runner.estimatedVRAM)),
slog.String("size", format.HumanBytes2(runner.totalSize)),
slog.String("vram", format.HumanBytes2(runner.vramSize)),
slog.Int("parallel", runner.numParallel),
slog.Int("pid", runner.pid),
slog.String("model", runner.modelPath),
@@ -816,6 +726,32 @@ func (runner *runnerRef) LogValue() slog.Value {
return slog.GroupValue(attrs...)
}
// Implements discover.RunnerDiscovery
func (runner *runnerRef) GetPort() int {
if runner.llama != nil {
return runner.llama.GetPort()
}
return -1
}
func (runner *runnerRef) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
if runner.llama != nil {
return runner.llama.GetDeviceInfos(ctx)
}
return nil
}
func (runner *runnerRef) GetActiveDeviceIDs() []ml.DeviceID {
return runner.gpus
}
func (runner *runnerRef) HasExited() bool {
if runner.llama != nil {
return runner.llama.HasExited()
}
return true
}
type ByDurationAndName []*runnerRef
func (a ByDurationAndName) Len() int { return len(a) }
@@ -835,113 +771,7 @@ func (a ByDurationAndName) Less(i, j int) bool {
// type BySize []*runnerRef
// func (a BySize) Len() int { return len(a) }
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
// If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
// opts.NumCtx accordingly
func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
var estimatedVRAM uint64
var numParallelToTry []int
if *numParallel <= 0 {
// If no specific parallel setting was provided, try larger then smaller, always end with 1
numParallelToTry = append(numParallelToTry, defaultParallel, 1)
} else {
numParallelToTry = []int{*numParallel}
}
for _, gl := range gpus.ByLibrary() {
var ok bool
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
// First attempt to fit the model into a single GPU
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if !envconfig.SchedSpread() {
for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
*numParallel = p
return []discover.GpuInfo{g}
}
}
}
}
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUs
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
*numParallel = p
return sgl
}
}
}
return nil
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
if *numParallel <= 0 {
*numParallel = 1
req.opts.NumCtx = req.origNumCtx
}
byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, *numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}
// findConflictingRunnerToUnload finds a specific runner that conflicts with target GPUs
func (s *Scheduler) findConflictingRunnerToUnload(targetGpus discover.GpuInfoList) *runnerRef {
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
// Find the first loaded model that uses any of our target GPUs
for _, loadedRunner := range s.loaded {
if loadedRunner.loading {
continue // Skip models that are still loading
}
// Check if this loaded model is using any of our target GPUs
for _, targetGpu := range targetGpus {
for _, loadedGpu := range loadedRunner.gpus {
if targetGpu.ID == loadedGpu.ID {
slog.Debug("found conflicting runner using GPU",
"runner", loadedRunner.modelPath,
"gpu", targetGpu.ID)
return loadedRunner
}
}
}
}
slog.Debug("no conflicting runner found for target GPUs")
return nil
}
// func (a BySize) Less(i, j int) bool { return a[i].vramSize < a[j].vramSize }
// findRunnerToUnload finds a runner to unload to make room for a new model
func (s *Scheduler) findRunnerToUnload() *runnerRef {
@@ -978,6 +808,13 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
func (s *Scheduler) unloadAllRunners() {
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
if s.activeLoading != nil {
slog.Debug("shutting down currently loading runner")
s.activeLoading.Close()
s.activeLoading = nil
}
for model, runner := range s.loaded {
if runner.llama != nil {
slog.Debug("shutting down runner", "model", model)
@@ -1004,18 +841,3 @@ func (s *Scheduler) expireRunner(model *Model) {
runner.refMu.Unlock()
}
}
// If other runners are loaded, make sure the pending request will fit in system memory
// If not, pick a runner to unload, else return nil and the request can be loaded
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
slog.Debug("evaluating if CPU model load will fit in available system memory")
estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts, req.opts.NumCtx/req.origNumCtx)
if estimate.TotalSize <= gpus[0].FreeMemory {
slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
return nil
}
// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
return s.findRunnerToUnload()
}