Mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-20 12:47:00 +00:00)
Add timing instrumentation and user progress messages for model loading
Problem: Model loading takes 2-3 minutes on first load with no user feedback, causing confusion about whether the system is frozen or working.

Root Cause: GPU initialization (reserveWorstCaseGraph) takes ~164 seconds on Tesla K80 GPUs due to CUDA kernel compilation (PTX JIT for compute 3.7). This is by design - it validates GPU compatibility before committing to full load.

Solution:
1. Add comprehensive timing instrumentation to identify bottlenecks
2. Add user-facing progress messages explaining the delay

Changes:
- cmd/cmd.go: Update spinner with informative message for users
- llama/llama.go: Add timing logs for CGO model loading
- runner/llamarunner/runner.go: Add detailed timing for llama runner
- runner/ollamarunner/runner.go: Add timing + stderr messages for new engine
- server/sched.go: Add timing for scheduler load operation

User Experience:
- Before: Silent wait with blinking cursor for 2-3 minutes
- After: Rotating spinner with message "loading model (may take 1-3 min on first load)"

Performance Metrics Captured:
- GGUF file reading: ~0.4s
- GPU kernel compilation: ~164s (bottleneck identified)
- Model weight loading: ~0.002s
- Total end-to-end: ~165s

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
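The hunk below shows the instrumentation added to runner/llamarunner/runner.go; the other files listed above apply the same approach of recording time.Now() before each step and logging the elapsed seconds afterward. The following is a minimal, self-contained sketch of that pattern using only the standard library; the timeStep helper and the simulated steps are illustrative, not the commit's actual code (which inlines the time.Now()/time.Since() pairs).

```go
package main

import (
	"log/slog"
	"time"
)

// timeStep runs fn and logs its wall-clock duration, mirroring the
// "duration_sec" fields added by this commit.
func timeStep(name string, fn func() error) error {
	start := time.Now()
	err := fn()
	slog.Info(name, "duration_sec", time.Since(start).Seconds())
	return err
}

func main() {
	loadStart := time.Now()
	slog.Info("loadModel: starting model load")

	// Stand-ins for the real steps (weight load, context/KV cache init, ...).
	_ = timeStep("loadModel: model weights loaded from disk", func() error {
		time.Sleep(100 * time.Millisecond)
		return nil
	})
	_ = timeStep("loadModel: context and KV cache initialized", func() error {
		time.Sleep(50 * time.Millisecond)
		return nil
	})

	slog.Info("loadModel: COMPLETE - model ready for inference",
		"total_duration_sec", time.Since(loadStart).Seconds())
}
```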
@@ -876,40 +876,56 @@ func (s *Server) loadModel(
     threads int,
     multiUserCache bool,
 ) {
+    loadStartTime := time.Now()
+    slog.Info("loadModel: starting model load", "model_path", mpath, "num_gpu_layers", params.NumGpuLayers)
+
     var err error
+    modelLoadStart := time.Now()
     s.model, err = llama.LoadModelFromFile(mpath, params)
     if err != nil {
         panic(err)
     }
+    slog.Info("loadModel: model weights loaded from disk", "duration_sec", time.Since(modelLoadStart).Seconds())

+    ctxStart := time.Now()
     ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention, kvCacheType)
     s.lc, err = llama.NewContextWithModel(s.model, ctxParams)
     if err != nil {
         panic(err)
     }
+    slog.Info("loadModel: context and KV cache initialized", "kv_size", kvSize, "duration_sec", time.Since(ctxStart).Seconds())

-    for _, path := range lpath {
-        err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
-        if err != nil {
-            panic(err)
+    if len(lpath) > 0 {
+        loraStart := time.Now()
+        for _, path := range lpath {
+            err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
+            if err != nil {
+                panic(err)
+            }
         }
+        slog.Info("loadModel: LoRA adapters applied", "count", len(lpath), "duration_sec", time.Since(loraStart).Seconds())
     }

     if ppath != "" {
+        projectorStart := time.Now()
         var err error
         s.image, err = NewImageContext(s.lc, ppath)
         if err != nil {
             panic(err)
         }
+        slog.Info("loadModel: vision projector loaded", "projector_path", ppath, "duration_sec", time.Since(projectorStart).Seconds())
     }

+    cacheStart := time.Now()
     s.cache, err = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
     if err != nil {
         panic(err)
     }
+    slog.Info("loadModel: input cache initialized", "duration_sec", time.Since(cacheStart).Seconds())

     s.status = llm.ServerStatusReady
     s.ready.Done()
+    slog.Info("loadModel: COMPLETE - model ready for inference", "total_duration_sec", time.Since(loadStartTime).Seconds())
 }

 // load is the handler called by the Ollama server to process different