Fix Tesla K80 CUBLAS compatibility with two-tier fallback strategy
This commit implements comprehensive Tesla K80 (Kepler, compute 3.7) compatibility for batched matrix multiplication operations.

**Problem:** Modern CUBLAS functions fail on Tesla K80 with CUBLAS_STATUS_ARCH_MISMATCH:

1. CUBLAS_GEMM_DEFAULT_TENSOR_OP requires Tensor Cores (Volta+ only)
2. cublasGemmStridedBatchedEx/cublasGemmBatchedEx have architectural requirements beyond algorithm selection

**Solution - Two-Tier Fallback:**

Tier 1: Algorithm Selection
- Volta+ (cc >= 7.0): CUBLAS_GEMM_DEFAULT_TENSOR_OP
- Pre-Volta (cc < 7.0): CUBLAS_GEMM_DEFAULT

Tier 2: Function Selection
- Volta+ or non-FP32: Use *Ex variants (flexible precision)
- Kepler/Maxwell/Pascal with FP32: Use legacy type-specific functions (cublasSgemmStridedBatched, cublasSgemmBatched)

**Changes:**

CUDA Implementation:
- ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
  * ggml_cuda_op_mul_mat_cublas: Algorithm selection for non-batched ops
  * ggml_cuda_mul_mat_batched_cublas_impl: Two-tier fallback for batched ops
  * Added GGML_CUDA_DEBUG environment variable for conditional debug logging
  * Comprehensive function documentation explaining fallback strategy

Documentation:
- CLAUDE.md
  * Added Tesla K80 CUBLAS Compatibility section
  * Documented GGML_CUDA_DEBUG environment variable
  * Enhanced "Running Ollama" section with log capture examples
  * Updated Files Modified list

Code Comments:
- Added detailed comments throughout CUDA code explaining:
  * Why TENSOR_OP fails on pre-Volta GPUs
  * Why *Ex functions require architectural support
  * Compute capability checks and fallback logic
  * Debug logging usage

**Testing:**

All models verified working on Tesla K80:
- ✅ gemma3:4b
- ✅ gpt-oss
- ✅ deepseek-r1

Debug flag tested in both enabled and disabled states.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
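The two-tier selection described above reduces to two cheap host-side checks before each batched matmul. The standalone C++ sketch below is not the code from ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu; the helper names `pick_gemm_algo` and `use_ex_variants` and the `main` driver are illustrative assumptions, but the cuBLAS enums and the compute-capability query are the real APIs involved.

```cpp
// Minimal sketch of the two-tier fallback from the commit message.
// Not the actual ggml-cuda.cu implementation; helper names are made up.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>

// Tier 1: algorithm selection. CUBLAS_GEMM_DEFAULT_TENSOR_OP needs Tensor
// Cores (Volta+, cc >= 7.0); on Kepler parts such as the Tesla K80 (cc 3.7)
// it fails with CUBLAS_STATUS_ARCH_MISMATCH, so use CUBLAS_GEMM_DEFAULT.
static cublasGemmAlgo_t pick_gemm_algo(int cc_major) {
    return cc_major >= 7 ? CUBLAS_GEMM_DEFAULT_TENSOR_OP : CUBLAS_GEMM_DEFAULT;
}

// Tier 2: function selection. The flexible *Ex batched entry points have
// architectural requirements beyond the algo argument, so pre-Volta GPUs
// doing FP32 math fall back to the legacy type-specific functions
// (cublasSgemmStridedBatched / cublasSgemmBatched).
static bool use_ex_variants(int cc_major, bool is_fp32) {
    return cc_major >= 7 || !is_fp32;
}

int main() {
    cudaDeviceProp prop{};
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        return 1;
    }
    const bool fp32 = true; // assume an FP32 batched matmul for illustration
    printf("cc %d.%d: algo=%s, function=%s\n", prop.major, prop.minor,
           pick_gemm_algo(prop.major) == CUBLAS_GEMM_DEFAULT_TENSOR_OP
               ? "CUBLAS_GEMM_DEFAULT_TENSOR_OP"
               : "CUBLAS_GEMM_DEFAULT",
           use_ex_variants(prop.major, fp32)
               ? "cublasGemmStridedBatchedEx"
               : "cublasSgemmStridedBatched");
    return 0;
}
```

On a Tesla K80 this selects CUBLAS_GEMM_DEFAULT and the legacy cublasSgemmStridedBatched path; on a Volta or newer card it selects the TENSOR_OP algorithm and the *Ex entry point.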
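The commit also mentions a GGML_CUDA_DEBUG environment variable that gates the new debug logging. The exact check used in ggml-cuda.cu is not shown in this diff; the snippet below is only one plausible shape for such a guard (a cached getenv plus a logging macro), kept in the same C++ register as the sketch above.

```cpp
#include <cstdio>
#include <cstdlib>

// Cached check of GGML_CUDA_DEBUG; here any non-empty value enables the
// extra logging. This is an assumption -- the commit only states that the
// variable controls conditional debug output.
static bool cuda_debug_enabled() {
    static const bool enabled = [] {
        const char * v = std::getenv("GGML_CUDA_DEBUG");
        return v != nullptr && v[0] != '\0';
    }();
    return enabled;
}

#define CUDA_DEBUG_LOG(...)               \
    do {                                  \
        if (cuda_debug_enabled()) {       \
            fprintf(stderr, __VA_ARGS__); \
        }                                 \
    } while (0)

// Hypothetical call site at a fallback decision:
// CUDA_DEBUG_LOG("falling back to cublasSgemmStridedBatched (cc %d.%d)\n",
//                prop.major, prop.minor);
```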
@@ -380,12 +380,31 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
	}()
}

// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
// load creates a new model based on req and loads it into memory
//
// This is THE critical function that:
// 1. Spawns the runner subprocess (ollama runner --model <path>)
// 2. Loads GGUF weights into memory via llama.cpp
// 3. Distributes model layers across CPU/GPU(s) based on available VRAM
// 4. Allocates KV cache on GPU (for Tesla K80: compute 3.7)
// 5. Initializes inference context with threading and batch parameters
//
// Parameters:
// - req: LlmRequest containing model path, options, capabilities
// - f: GGML metadata (parsed from GGUF file headers)
// - systemInfo: CPU/RAM information
// - gpus: List of available GPUs (e.g., Tesla K80)
// - requireFull: If true, model must fit entirely on GPU(s)
//
// Returns:
// - bool: true if scheduler needs to evict other models to make room
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
	// NumParallel controls how many requests can be processed simultaneously
	// Each parallel slot requires additional KV cache memory
	numParallel := max(int(envconfig.NumParallel()), 1)

	// Embedding models should always be loaded with parallel=1
	// (they don't benefit from parallel processing like generation does)
	if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {
		numParallel = 1
	}
@@ -405,8 +424,27 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
	s.loadedMu.Lock()
	llama := s.activeLoading

	// Create new llama server instance if not already loading
	if llama == nil {
		var err error
		// *** SPAWN RUNNER SUBPROCESS ***
		// s.newServerFn points to NewLlamaServer() in llm/server.go:148
		//
		// This function:
		// 1. Calculates memory requirements from GGML metadata
		// 2. Determines GPU layer distribution (num_gpu_layers)
		// 3. Spawns subprocess: exec.Command("ollama", "runner", "--model", modelPath, "--port", port)
		// 4. Subprocess starts HTTP server listening on local port
		// 5. Returns LlamaServer interface for IPC communication
		//
		// Parameters:
		// - systemInfo: CPU/RAM stats
		// - gpus: Available GPUs (Tesla K80 with 12GB VRAM, compute 3.7)
		// - ModelPath: Path to GGUF file (e.g., ~/.ollama/models/blobs/sha256-abc123...)
		// - AdapterPaths: LoRA adapters (if any)
		// - ProjectorPaths: Vision projectors for multimodal (if any)
		// - opts: User options (num_gpu, num_thread, num_ctx, etc.)
		// - numParallel: How many parallel inference slots to allocate
		llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
		if err != nil {
			// some older models are not compatible with newer versions of llama.cpp
@@ -423,6 +461,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo

		s.activeLoading = llama
	} else {
		// Reusing existing server (e.g., after eviction attempt failed)
		if s.activeLoading.ModelPath() != req.model.ModelPath {
			panic(fmt.Errorf("attempting to load different model after eviction (original %v new %v)", s.activeLoading.ModelPath(), req.model.ModelPath))
		}
@@ -430,16 +469,28 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo

	s.loadedMu.Unlock()

	// *** LOAD MODEL WEIGHTS INTO MEMORY ***
	// llama.Load() triggers the runner subprocess to:
	// 1. Call llama_model_load_from_file() via CGO (llama/llama.go)
	// 2. Read GGUF file and mmap() weights into memory
	// 3. Distribute layers across CPU/GPU based on num_gpu_layers
	// 4. Allocate KV cache on GPU (if using GPU)
	// 5. Compile CUDA kernels for Tesla K80 (compute 3.7, via PTX JIT)
	//
	// Returns:
	// - gpuIDs: List of GPU device IDs where model layers were loaded
	// - err: Error if model doesn't fit or loading fails
	gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull)
	if err != nil {
		if errors.Is(err, llm.ErrLoadRequiredFull) {
			if !requireFull {
				// No other models loaded, yet we still don't fit, so report an error
				// Model doesn't fit fully on GPU, need to evict other models
				slog.Info("model is too large for system memory", "requireFull", requireFull)
				s.activeLoading.Close()
				s.activeLoading = nil
				req.errCh <- err
			}
			// Signal scheduler to evict models and retry
			return true
		}