diff --git a/docker/runtime/Dockerfile b/docker/runtime/Dockerfile
index aed30509..9f95776c 100644
--- a/docker/runtime/Dockerfile
+++ b/docker/runtime/Dockerfile
@@ -31,7 +31,13 @@ RUN bash -c 'LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:/usr/lib64:$LD_LIBR
 
 # Build Go binary
 # VCS info is embedded automatically since we cloned from git
-RUN go build -o /usr/local/bin/ollama .
+# Build to source directory so binary can find libraries via relative path
+RUN go build -o ./ollama .
+
+# Create symlink to standard binary location
+# The code in ml/path.go uses filepath.EvalSymlinks() which resolves this symlink
+# to /usr/local/src/ollama37/ollama, allowing it to find libraries at build/lib/ollama
+RUN ln -s /usr/local/src/ollama37/ollama /usr/local/bin/ollama
 
 # Setup library paths for runtime
 # The binary expects libraries in these exact paths:
diff --git a/llm/server.go b/llm/server.go
index a854963c..c2a4e257 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -714,12 +714,15 @@ func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus
 		return nil, err
 	}
 
+	slog.Info("waiting for runner subprocess to start responding...")
 	if err := s.waitUntilRunnerLaunched(ctx); err != nil {
 		return nil, err
 	}
+	slog.Info("runner subprocess ready")
 
 nextOperation:
 	for operation := LoadOperationFit; operation < LoadOperationCommit; operation++ {
+		slog.Info("starting memory allocation phase", "operation", operation.String(), "layers", gpuLayers)
 	nextLoad:
 		for {
 			s.loadRequest.GPULayers = gpuLayers
@@ -747,6 +750,7 @@ nextOperation:
 			// this layout before and it doesn't have more layers than the last one, we can keep
 			// trying to see if we can do better.
 			if _, ok := pastAllocations[newGPULayers.Hash()]; !ok && newGPULayers.Sum() <= gpuLayers.Sum() {
+				slog.Info("adjusting layer allocation for better fit", "old_layers", gpuLayers.Sum(), "new_layers", newGPULayers.Sum())
 				gpuLayers = newGPULayers
 				continue nextLoad
 			}
@@ -832,10 +836,12 @@ nextOperation:
 	}
 
 	s.loadRequest.GPULayers = gpuLayers
+	slog.Info("loading model weights into memory", "operation", "commit", "layers", gpuLayers)
 	resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
 	if err != nil {
 		return nil, err
 	}
+	slog.Info("model weights loaded successfully")
 	success = resp.Success
 	s.mem = &resp.Memory
 
@@ -1105,12 +1111,20 @@ func greedyFit(layers []uint64, gpus []ml.DeviceInfo, capacity float32, requeste
 
 // waitUntilRunnerLaunched sleeps until the runner subprocess is alive enough
 // to respond to status requests
 func (s *llmServer) waitUntilRunnerLaunched(ctx context.Context) error {
+	start := time.Now()
+	lastLog := start
 	for {
 		_, err := s.getServerStatus(ctx)
 		if err == nil {
 			break
 		}
+
+		// Log progress every second so user knows we're waiting
+		if time.Since(lastLog) > time.Second {
+			slog.Debug("still waiting for runner subprocess to respond", "elapsed", time.Since(start).Round(time.Millisecond))
+			lastLog = time.Now()
+		}
+
 		t := time.NewTimer(10 * time.Millisecond)
 		select {
 		case <-t.C:
diff --git a/ml/device.go b/ml/device.go
index 1fbe365e..b4f81520 100644
--- a/ml/device.go
+++ b/ml/device.go
@@ -431,7 +431,7 @@ func FlashAttentionSupported(l []DeviceInfo) bool {
 	for _, gpu := range l {
 		supportsFA := gpu.Library == "cpu" ||
 			gpu.Name == "Metal" || gpu.Library == "Metal" ||
-			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) ||
+			(gpu.Library == "CUDA" && gpu.ComputeMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) ||
 			gpu.Library == "ROCm"
 
 		if !supportsFA {
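
Note on the symlink mechanism in the Dockerfile change above: the binary is built
into the source tree and a symlink is installed at /usr/local/bin/ollama, so
resolving that link leads back to the tree containing build/lib/ollama. The sketch
below shows the general pattern, assuming the lookup in ml/path.go works roughly
like this; the helper name libDir and the exact build/lib/ollama join are
illustrative, not the actual ml/path.go code.

// Minimal sketch (not the actual ml/path.go code): resolve the running
// binary's real location through any symlinks, then derive the bundled
// library directory relative to it.
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func libDir() (string, error) {
	exe, err := os.Executable()
	if err != nil {
		return "", err
	}
	// Follow the symlink chain, e.g.
	// /usr/local/bin/ollama -> /usr/local/src/ollama37/ollama.
	resolved, err := filepath.EvalSymlinks(exe)
	if err != nil {
		return "", err
	}
	// Libraries are expected relative to the resolved location.
	return filepath.Join(filepath.Dir(resolved), "build", "lib", "ollama"), nil
}

func main() {
	dir, err := libDir()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println("library search path:", dir)
}

Resolving the symlink first is what makes the indirection work: without
filepath.EvalSymlinks, paths would be computed relative to /usr/local/bin and the
bundled libraries would not be found.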