diff --git a/docker/runtime/Dockerfile b/docker/runtime/Dockerfile
index aed30509..9f95776c 100644
--- a/docker/runtime/Dockerfile
+++ b/docker/runtime/Dockerfile
@@ -31,7 +31,13 @@ RUN bash -c 'LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:/usr/lib64:$LD_LIBR
 
 # Build Go binary
 # VCS info is embedded automatically since we cloned from git
-RUN go build -o /usr/local/bin/ollama .
+# Build to source directory so binary can find libraries via relative path
+RUN go build -o ./ollama .
+
+# Create symlink to standard binary location
+# The code in ml/path.go uses filepath.EvalSymlinks() which resolves this symlink
+# to /usr/local/src/ollama37/ollama, allowing it to find libraries at build/lib/ollama
+RUN ln -s /usr/local/src/ollama37/ollama /usr/local/bin/ollama
 
 # Setup library paths for runtime
 # The binary expects libraries in these exact paths:
diff --git a/llm/server.go b/llm/server.go
index a854963c..c2a4e257 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -714,12 +714,15 @@ func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus
 		return nil, err
 	}
 
+	slog.Info("waiting for runner subprocess to start responding...")
 	if err := s.waitUntilRunnerLaunched(ctx); err != nil {
 		return nil, err
 	}
+	slog.Info("runner subprocess ready")
 
 nextOperation:
 	for operation := LoadOperationFit; operation < LoadOperationCommit; operation++ {
+		slog.Info("starting memory allocation phase", "operation", operation.String(), "layers", gpuLayers)
 	nextLoad:
 		for {
 			s.loadRequest.GPULayers = gpuLayers
@@ -747,6 +750,7 @@ nextOperation:
 			// this layout before and it doesn't have more layers than the last one, we can keep
 			// trying to see if we can do better.
 			if _, ok := pastAllocations[newGPULayers.Hash()]; !ok && newGPULayers.Sum() <= gpuLayers.Sum() {
+				slog.Info("adjusting layer allocation for better fit", "old_layers", gpuLayers.Sum(), "new_layers", newGPULayers.Sum())
 				gpuLayers = newGPULayers
 				continue nextLoad
 			}
@@ -832,10 +836,12 @@ nextOperation:
 	}
 
 	s.loadRequest.GPULayers = gpuLayers
+	slog.Info("loading model weights into memory", "operation", "commit", "layers", gpuLayers)
 	resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
 	if err != nil {
 		return nil, err
 	}
+	slog.Info("model weights loaded successfully")
 	success = resp.Success
 	s.mem = &resp.Memory
 
@@ -1105,12 +1111,20 @@ func greedyFit(layers []uint64, gpus []ml.DeviceInfo, capacity float32, requeste
 
 // waitUntilRunnerLaunched sleeps until the runner subprocess is alive enough
 // to respond to status requests
 func (s *llmServer) waitUntilRunnerLaunched(ctx context.Context) error {
+	start := time.Now()
+	lastLog := start
 	for {
 		_, err := s.getServerStatus(ctx)
 		if err == nil {
 			break
 		}
+
+		// Log progress every second so user knows we're waiting
+		if time.Since(lastLog) > time.Second {
+			slog.Debug("still waiting for runner subprocess to respond", "elapsed", time.Since(start).Round(time.Millisecond))
+			lastLog = time.Now()
+		}
+
 		t := time.NewTimer(10 * time.Millisecond)
 		select {
 		case <-t.C:
diff --git a/ml/device.go b/ml/device.go
index 1fbe365e..b4f81520 100644
--- a/ml/device.go
+++ b/ml/device.go
@@ -431,7 +431,7 @@ func FlashAttentionSupported(l []DeviceInfo) bool {
 	for _, gpu := range l {
 		supportsFA := gpu.Library == "cpu" ||
 			gpu.Name == "Metal" || gpu.Library == "Metal" ||
-			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) ||
+			(gpu.Library == "CUDA" && gpu.ComputeMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) ||
 			gpu.Library == "ROCm"
 
 		if !supportsFA {
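
Note on the symlink mechanism in the Dockerfile change above: the binary is built
into the source tree and a symlink is installed at /usr/local/bin/ollama, so
resolving that link leads back to the tree containing build/lib/ollama. The sketch
below shows the general pattern, assuming the lookup in ml/path.go works roughly
like this; the helper name libDir and the exact build/lib/ollama join are
illustrative, not the actual ml/path.go code.

// Minimal sketch (not the actual ml/path.go code): resolve the running
// binary's real location through any symlinks, then derive the bundled
// library directory relative to it.
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func libDir() (string, error) {
	exe, err := os.Executable()
	if err != nil {
		return "", err
	}
	// Follow the symlink chain, e.g.
	// /usr/local/bin/ollama -> /usr/local/src/ollama37/ollama.
	resolved, err := filepath.EvalSymlinks(exe)
	if err != nil {
		return "", err
	}
	// Libraries are expected relative to the resolved location.
	return filepath.Join(filepath.Dir(resolved), "build", "lib", "ollama"), nil
}

func main() {
	dir, err := libDir()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println("library search path:", dir)
}

Resolving the symlink first is what makes the indirection work: without
filepath.EvalSymlinks, paths would be computed relative to /usr/local/bin and the
bundled libraries would not be found.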