From 7d9b59c5208e4a44dd39915f2616efbcd52bbcb4 Mon Sep 17 00:00:00 2001 From: Shang Chieh Tseng Date: Tue, 11 Nov 2025 23:28:00 +0800 Subject: [PATCH] Improve GPU detection and add detailed model loading logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Fix binary path resolution using symlink (docker/runtime/Dockerfile) - Build binary to source directory (./ollama) - Create symlink from /usr/local/bin/ollama to /usr/local/src/ollama37/ollama - Allows ml/path.go to resolve libraries via filepath.EvalSymlinks() - Fixes "total vram=0 B" issue without requiring -w flag 2. Add comprehensive logging for model loading phases (llm/server.go) - Log runner subprocess startup and readiness - Log each memory allocation phase (FIT, ALLOC, COMMIT) - Log layer allocation adjustments during convergence - Log when model weights are being loaded (slowest phase) - Log progress during waitUntilRunnerLaunched (at debug level, every 1s) - Improves visibility during 1-2 minute first-time model loads 3. Fix flash attention compute capability check (ml/device.go) - Changed DriverMajor to ComputeMajor for correct capability detection - Flash attention requires compute capability >= 7.0, not driver version These changes improve user experience during model loading by providing clear feedback at each stage, especially during the slow COMMIT phase where GGUF weights are loaded and CUDA kernels compile. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- docker/runtime/Dockerfile | 8 +++++++- llm/server.go | 14 ++++++++++++++ ml/device.go | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/docker/runtime/Dockerfile b/docker/runtime/Dockerfile index aed30509..9f95776c 100644 --- a/docker/runtime/Dockerfile +++ b/docker/runtime/Dockerfile @@ -31,7 +31,13 @@ RUN bash -c 'LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:/usr/lib64:$LD_LIBR # Build Go binary # VCS info is embedded automatically since we cloned from git -RUN go build -o /usr/local/bin/ollama . +# Build to source directory so binary can find libraries via relative path +RUN go build -o ./ollama . + +# Create symlink to standard binary location +# The code in ml/path.go uses filepath.EvalSymlinks() which resolves this symlink +# to /usr/local/src/ollama37/ollama, allowing it to find libraries at build/lib/ollama +RUN ln -s /usr/local/src/ollama37/ollama /usr/local/bin/ollama # Setup library paths for runtime # The binary expects libraries in these exact paths: diff --git a/llm/server.go b/llm/server.go index a854963c..c2a4e257 100644 --- a/llm/server.go +++ b/llm/server.go @@ -714,12 +714,15 @@ func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus return nil, err } + slog.Info("waiting for runner subprocess to start responding...") if err := s.waitUntilRunnerLaunched(ctx); err != nil { return nil, err } + slog.Info("runner subprocess ready") nextOperation: for operation := LoadOperationFit; operation < LoadOperationCommit; operation++ { + slog.Info("starting memory allocation phase", "operation", operation.String(), "layers", gpuLayers) nextLoad: for { s.loadRequest.GPULayers = gpuLayers @@ -747,6 +750,7 @@ nextOperation: // this layout before and it doesn't have more layers than the last one, we can keep // trying to see if we can do better. 
if _, ok := pastAllocations[newGPULayers.Hash()]; !ok && newGPULayers.Sum() <= gpuLayers.Sum() { + slog.Info("adjusting layer allocation for better fit", "old_layers", gpuLayers.Sum(), "new_layers", newGPULayers.Sum()) gpuLayers = newGPULayers continue nextLoad } @@ -832,10 +836,12 @@ nextOperation: } s.loadRequest.GPULayers = gpuLayers + slog.Info("loading model weights into memory", "operation", "commit", "layers", gpuLayers) resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit) if err != nil { return nil, err } + slog.Info("model weights loaded successfully") success = resp.Success s.mem = &resp.Memory @@ -1105,12 +1111,20 @@ func greedyFit(layers []uint64, gpus []ml.DeviceInfo, capacity float32, requeste // waitUntilRunnerLaunched sleeps until the runner subprocess is alive enough // to respond to status requests func (s *llmServer) waitUntilRunnerLaunched(ctx context.Context) error { + start := time.Now() + lastLog := start for { _, err := s.getServerStatus(ctx) if err == nil { break } + // Log progress every second so user knows we're waiting + if time.Since(lastLog) > time.Second { + slog.Debug("still waiting for runner subprocess to respond", "elapsed", time.Since(start).Round(time.Millisecond)) + lastLog = time.Now() + } + t := time.NewTimer(10 * time.Millisecond) select { case <-t.C: diff --git a/ml/device.go b/ml/device.go index 1fbe365e..b4f81520 100644 --- a/ml/device.go +++ b/ml/device.go @@ -431,7 +431,7 @@ func FlashAttentionSupported(l []DeviceInfo) bool { for _, gpu := range l { supportsFA := gpu.Library == "cpu" || gpu.Name == "Metal" || gpu.Library == "Metal" || - (gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || + (gpu.Library == "CUDA" && gpu.ComputeMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || gpu.Library == "ROCm" if !supportsFA {