Add timing instrumentation and user progress messages for model loading
Problem: Model loading takes 2-3 minutes on first load with no user feedback, causing confusion about whether the system is frozen or working.

Root Cause: GPU initialization (reserveWorstCaseGraph) takes ~164 seconds on Tesla K80 GPUs due to CUDA kernel compilation (PTX JIT for compute 3.7). This is by design: it validates GPU compatibility before committing to a full load.

Solution:
1. Add comprehensive timing instrumentation to identify bottlenecks
2. Add user-facing progress messages explaining the delay

Changes:
- cmd/cmd.go: update the spinner with an informative message for users
- llama/llama.go: add timing logs for CGO model loading
- runner/llamarunner/runner.go: add detailed timing for the llama runner
- runner/ollamarunner/runner.go: add timing and stderr messages for the new engine
- server/sched.go: add timing for the scheduler load operation

User Experience:
- Before: a silent wait with a blinking cursor for 2-3 minutes
- After: a rotating spinner with the message "loading model (may take 1-3 min on first load)"

Performance Metrics Captured:
- GGUF file reading: ~0.4s
- GPU kernel compilation: ~164s (the identified bottleneck)
- Model weight loading: ~0.002s
- Total end-to-end: ~165s

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
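The cmd/cmd.go spinner change listed above does not appear in the diff below, which covers only llama/llama.go. As an illustration of the described user experience, here is a minimal, self-contained sketch of a rotating spinner that keeps repainting an informative message while a slow load runs; it uses only the Go standard library, and startSpinner is a hypothetical helper, not Ollama's actual progress package:

```go
package main

import (
	"fmt"
	"time"
)

// startSpinner repaints a rotating spinner plus message until stop is closed.
// Hypothetical helper for illustration; Ollama's CLI uses its own progress package.
func startSpinner(message string, stop <-chan struct{}) <-chan struct{} {
	done := make(chan struct{})
	go func() {
		defer close(done)
		frames := []rune{'|', '/', '-', '\\'}
		ticker := time.NewTicker(100 * time.Millisecond)
		defer ticker.Stop()
		for i := 0; ; i++ {
			select {
			case <-stop:
				fmt.Printf("\r%s... done\n", message)
				return
			case <-ticker.C:
				fmt.Printf("\r%c %s", frames[i%len(frames)], message)
			}
		}
	}()
	return done
}

func main() {
	stop := make(chan struct{})
	done := startSpinner("loading model (may take 1-3 min on first load)", stop)
	time.Sleep(2 * time.Second) // stand-in for the long CGO model-load call
	close(stop)
	<-done
}
```

The point is only that the long-running load happens on one goroutine while another keeps the terminal visibly alive, which is exactly the confusion the commit message describes fixing.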
llama/llama.go:

```diff
@@ -51,6 +51,7 @@ import (
 	"slices"
 	"strings"
 	"sync"
+	"time"
 	"unsafe"
 
 	_ "github.com/ollama/ollama/llama/llama.cpp/common"
@@ -329,6 +330,9 @@ func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
 // - *Model: Loaded model ready for inference
 // - error: If file not found, incompatible format, or out of memory
 func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
+	loadStart := time.Now()
+	slog.Info("LoadModelFromFile: starting", "path", modelPath, "num_gpu_layers", params.NumGpuLayers, "use_mmap", params.UseMmap)
+
 	// Initialize C parameters structure
 	cparams := C.llama_model_default_params()
 	cparams.n_gpu_layers = C.int(params.NumGpuLayers) // Layers to offload to GPU
@@ -367,11 +371,16 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	// 2. Allocates CPU/GPU memory for tensors
 	// 3. Loads/mmaps weights into memory
 	// 4. For Tesla K80: compiles CUDA kernels via PTX JIT (compute 3.7)
+	slog.Info("LoadModelFromFile: calling llama_model_load_from_file (CGO -> C++)")
+	cgoStart := time.Now()
 	m := Model{c: C.llama_model_load_from_file(C.CString(modelPath), cparams)}
+	slog.Info("LoadModelFromFile: llama_model_load_from_file returned", "duration_sec", time.Since(cgoStart).Seconds())
 
 	if m.c == nil {
 		return nil, fmt.Errorf("unable to load model: %s", modelPath)
 	}
 
+	slog.Info("LoadModelFromFile: COMPLETE", "total_duration_sec", time.Since(loadStart).Seconds())
 	return &m, nil
 }
```
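A closing note on the instrumentation style: the diff brackets each stage with explicit time.Now()/time.Since pairs and separate slog.Info calls. A sketch of a slightly more compact standard-library idiom (not the code in this commit; logDuration and loadModel are hypothetical names) uses defer so the total duration is logged even on an early return such as the m.c == nil failure path:

```go
package main

import (
	"log/slog"
	"time"
)

// logDuration logs how long a stage took. Use it as:
//
//	defer logDuration("LoadModelFromFile", time.Now())
//
// defer evaluates its arguments immediately, so time.Now() captures the
// start time here, while the log line itself runs at function exit.
func logDuration(stage string, start time.Time) {
	slog.Info(stage+": COMPLETE", "total_duration_sec", time.Since(start).Seconds())
}

// loadModel is a hypothetical stand-in for LoadModelFromFile.
func loadModel(path string) error {
	defer logDuration("LoadModelFromFile", time.Now())
	slog.Info("LoadModelFromFile: starting", "path", path)
	time.Sleep(50 * time.Millisecond) // stand-in for the ~164s CGO call
	return nil
}

func main() {
	_ = loadModel("model.gguf")
}
```

One statement then covers both the success path and the error return, at the cost of losing the per-stage "calling"/"returned" breakdown that the commit's explicit timers provide.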