Mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-10 07:46:59 +00:00)
1. Fix binary path resolution using symlink (docker/runtime/Dockerfile)
   - Build binary to source directory (./ollama)
   - Create symlink from /usr/local/bin/ollama to /usr/local/src/ollama37/ollama
   - Allows ml/path.go to resolve libraries via filepath.EvalSymlinks()
   - Fixes "total vram=0 B" issue without requiring -w flag

2. Add comprehensive logging for model loading phases (llm/server.go)
   - Log runner subprocess startup and readiness
   - Log each memory allocation phase (FIT, ALLOC, COMMIT)
   - Log layer allocation adjustments during convergence
   - Log when model weights are being loaded (slowest phase)
   - Log progress during waitUntilRunnerLaunched (every 1s)
   - Improves visibility during 1-2 minute first-time model loads

3. Fix flash attention compute capability check (ml/device.go)
   - Changed DriverMajor to ComputeMajor for correct capability detection
   - Flash attention requires compute capability >= 7.0, not driver version

These changes improve user experience during model loading by providing clear feedback at each stage, especially during the slow COMMIT phase where GGUF weights are loaded and CUDA kernels compile.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
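To make the three changes above concrete, here is a small, self-contained Go sketch of each mechanism. It is an illustration under assumptions, not the actual ollama37 code: the helper names (libDir, waitWithProgress, supportsFlashAttention), the gpuInfo struct, and the log wording are invented for this example; only the DriverMajor/ComputeMajor field names, the filepath.EvalSymlinks() call, and the paths come from the commit description.

```go
// Minimal sketches of the three changes described above. Helper names, the
// gpuInfo struct, and log wording are illustrative assumptions; the real code
// lives in ml/path.go, llm/server.go, and ml/device.go.
package main

import (
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"time"
)

// 1. Path resolution: follow the /usr/local/bin/ollama symlink back to the
// real binary in the source tree, then derive the GGML library directory
// (/usr/local/src/ollama37/build/lib/ollama) relative to it.
func libDir() (string, error) {
	exe, err := os.Executable()
	if err != nil {
		return "", err
	}
	resolved, err := filepath.EvalSymlinks(exe) // follows the symlink created in the Dockerfile
	if err != nil {
		return "", err
	}
	return filepath.Join(filepath.Dir(resolved), "build", "lib", "ollama"), nil
}

// 2. Progress logging: report every second while waiting for the runner
// subprocess, so a 1-2 minute first-time model load is not silent.
func waitWithProgress(done <-chan struct{}) {
	start := time.Now()
	tick := time.NewTicker(time.Second)
	defer tick.Stop()
	for {
		select {
		case <-done:
			slog.Info("runner launched", "elapsed", time.Since(start).Round(time.Second))
			return
		case <-tick.C:
			slog.Info("waiting for runner", "elapsed", time.Since(start).Round(time.Second))
		}
	}
}

// 3. Flash attention gate: the check must use the GPU's compute capability
// (>= 7.0), not the driver version. A Tesla K80 is compute 3.7, so it stays off.
type gpuInfo struct {
	DriverMajor  int
	ComputeMajor int
	ComputeMinor int
}

func supportsFlashAttention(g gpuInfo) bool {
	return g.ComputeMajor >= 7 // the buggy version compared DriverMajor instead
}

func main() {
	if dir, err := libDir(); err == nil {
		fmt.Println("library dir:", dir)
	}

	done := make(chan struct{})
	go func() { time.Sleep(3 * time.Second); close(done) }()
	waitWithProgress(done)

	// With the old DriverMajor check, a K80 on a CUDA 11.4 driver (DriverMajor 11)
	// would wrongly pass the >= 7 test; ComputeMajor 3 correctly fails it.
	k80 := gpuInfo{DriverMajor: 11, ComputeMajor: 3, ComputeMinor: 7}
	fmt.Println("flash attention on K80:", supportsFlashAttention(k80))
}
```

The symlink that step 1 depends on is created near the end of the runtime Dockerfile below, after the Go binary has been built into the source tree.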
# Ollama37 Runtime Image
# Single-stage build: compiles and packages the binary in one image
# The runtime needs access to the build directory for GGML CUDA libraries
# This ensures the compiled binary can find all required runtime libraries at:
# /usr/local/src/ollama37/build/lib/ollama

# Base image: ollama37-builder contains GCC 10, CUDA 11.4, and build tools
FROM ollama37-builder

# Clone ollama37 source code from GitHub
RUN cd /usr/local/src \
    && git clone https://github.com/dogkeeper886/ollama37.git

# Set working directory for build
WORKDIR /usr/local/src/ollama37

# Configure build with CMake
# Use "CUDA 11" preset for Tesla K80 compute capability 3.7 support
# Set LD_LIBRARY_PATH during build so CMake can locate GCC 10 runtime libraries
# and properly link against them (required for C++ standard library and atomics)
RUN bash -c 'LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:/usr/lib64:$LD_LIBRARY_PATH \
    CC=/usr/local/bin/gcc CXX=/usr/local/bin/g++ \
    cmake --preset "CUDA 11"'

# Build C/C++/CUDA libraries with CMake
# Compile all GGML CUDA kernels and Ollama native libraries
# Use all available CPU cores (-j) for parallel compilation to speed up build
RUN bash -c 'LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:/usr/lib64:$LD_LIBRARY_PATH \
    CC=/usr/local/bin/gcc CXX=/usr/local/bin/g++ \
    cmake --build build -j$(nproc)'

# Build Go binary
# VCS info is embedded automatically since we cloned from git
# Build to source directory so binary can find libraries via relative path
RUN go build -o ./ollama .

# Create symlink to standard binary location
# The code in ml/path.go uses filepath.EvalSymlinks(), which resolves this symlink
# to /usr/local/src/ollama37/ollama, allowing it to find libraries at build/lib/ollama
RUN ln -s /usr/local/src/ollama37/ollama /usr/local/bin/ollama

# Set up library paths for runtime
# The binary expects libraries in these exact paths:
# /usr/local/src/ollama37/build/lib/ollama - Ollama CUDA/GGML libraries
# /usr/local/lib64 - GCC 10 runtime libraries (libstdc++, libgcc_s)
# /usr/local/cuda-11.4/lib64 - CUDA 11.4 runtime libraries
# /usr/lib64 - System libraries
ENV LD_LIBRARY_PATH=/usr/local/src/ollama37/build/lib/ollama:/usr/local/lib64:/usr/local/cuda-11.4/lib64:/usr/lib64

# Configure Ollama server to listen on all interfaces
ENV OLLAMA_HOST=0.0.0.0:11434

# Expose Ollama API port
EXPOSE 11434

# Create persistent volume for model storage
# Models downloaded by Ollama will be stored here
RUN mkdir -p /root/.ollama
VOLUME ["/root/.ollama"]

# Configure health check to verify Ollama is running
# Uses the 'ollama list' command to check whether the service is responsive
# This validates both API availability and model registry access
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD /usr/local/bin/ollama list || exit 1

# Set entrypoint and default command
# Container runs 'ollama serve' by default to start the API server
ENTRYPOINT ["/usr/local/bin/ollama"]
CMD ["serve"]
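Once the image is built and a container is running with the API port published on the host, the same liveness property that the HEALTHCHECK verifies can be checked externally. The sketch below is a hypothetical verification snippet, assuming port 11434 is mapped to localhost; GET /api/tags is the standard Ollama endpoint that lists locally available models.

```go
// Sketch: verify a running ollama37 container from the host, assuming port
// 11434 is published (e.g. docker run -p 11434:11434 <image>).
package main

import (
	"fmt"
	"io"
	"net/http"
	"time"
)

func main() {
	client := &http.Client{Timeout: 10 * time.Second}
	// GET /api/tags lists locally available models; a 200 response means the
	// server is up and the model registry is readable, roughly what the
	// HEALTHCHECK's "ollama list" verifies from inside the container.
	resp, err := client.Get("http://localhost:11434/api/tags")
	if err != nil {
		fmt.Println("ollama not reachable:", err)
		return
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status)
	fmt.Println(string(body))
}
```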