Fix Tesla K80 CUBLAS compatibility with two-tier fallback strategy
This commit implements comprehensive Tesla K80 (Kepler, compute 3.7) compatibility for batched matrix multiplication operations.

**Problem:** Modern CUBLAS functions fail on Tesla K80 with CUBLAS_STATUS_ARCH_MISMATCH:

1. CUBLAS_GEMM_DEFAULT_TENSOR_OP requires Tensor Cores (Volta+ only)
2. cublasGemmStridedBatchedEx/cublasGemmBatchedEx have architectural requirements beyond algorithm selection

**Solution - Two-Tier Fallback:**

Tier 1: Algorithm Selection
- Volta+ (cc >= 7.0): CUBLAS_GEMM_DEFAULT_TENSOR_OP
- Pre-Volta (cc < 7.0): CUBLAS_GEMM_DEFAULT

Tier 2: Function Selection
- Volta+ or non-FP32: Use *Ex variants (flexible precision)
- Kepler/Maxwell/Pascal with FP32: Use legacy type-specific functions (cublasSgemmStridedBatched, cublasSgemmBatched)

**Changes:**

CUDA Implementation:
- ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
  * ggml_cuda_op_mul_mat_cublas: Algorithm selection for non-batched ops
  * ggml_cuda_mul_mat_batched_cublas_impl: Two-tier fallback for batched ops
  * Added GGML_CUDA_DEBUG environment variable for conditional debug logging
  * Comprehensive function documentation explaining fallback strategy

Documentation:
- CLAUDE.md
  * Added Tesla K80 CUBLAS Compatibility section
  * Documented GGML_CUDA_DEBUG environment variable
  * Enhanced "Running Ollama" section with log capture examples
  * Updated Files Modified list

Code Comments:
- Added detailed comments throughout CUDA code explaining:
  * Why TENSOR_OP fails on pre-Volta GPUs
  * Why *Ex functions require architectural support
  * Compute capability checks and fallback logic
  * Debug logging usage

**Testing:**

All models verified working on Tesla K80:
- ✅ gemma3:4b
- ✅ gpt-oss
- ✅ deepseek-r1

Debug flag tested in both enabled and disabled states.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
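The two-tier selection described above reduces to two cheap host-side checks before each batched matmul. The standalone C++ sketch below is not the code from ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu; the helper names `pick_gemm_algo` and `use_ex_variants` and the `main` driver are illustrative assumptions, but the cuBLAS enums and the compute-capability query are the real APIs involved.

```cpp
// Minimal sketch of the two-tier fallback from the commit message.
// Not the actual ggml-cuda.cu implementation; helper names are made up.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>

// Tier 1: algorithm selection. CUBLAS_GEMM_DEFAULT_TENSOR_OP needs Tensor
// Cores (Volta+, cc >= 7.0); on Kepler parts such as the Tesla K80 (cc 3.7)
// it fails with CUBLAS_STATUS_ARCH_MISMATCH, so use CUBLAS_GEMM_DEFAULT.
static cublasGemmAlgo_t pick_gemm_algo(int cc_major) {
    return cc_major >= 7 ? CUBLAS_GEMM_DEFAULT_TENSOR_OP : CUBLAS_GEMM_DEFAULT;
}

// Tier 2: function selection. The flexible *Ex batched entry points have
// architectural requirements beyond the algo argument, so pre-Volta GPUs
// doing FP32 math fall back to the legacy type-specific functions
// (cublasSgemmStridedBatched / cublasSgemmBatched).
static bool use_ex_variants(int cc_major, bool is_fp32) {
    return cc_major >= 7 || !is_fp32;
}

int main() {
    cudaDeviceProp prop{};
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        return 1;
    }
    const bool fp32 = true; // assume an FP32 batched matmul for illustration
    printf("cc %d.%d: algo=%s, function=%s\n", prop.major, prop.minor,
           pick_gemm_algo(prop.major) == CUBLAS_GEMM_DEFAULT_TENSOR_OP
               ? "CUBLAS_GEMM_DEFAULT_TENSOR_OP"
               : "CUBLAS_GEMM_DEFAULT",
           use_ex_variants(prop.major, fp32)
               ? "cublasGemmStridedBatchedEx"
               : "cublasSgemmStridedBatched");
    return 0;
}
```

On a Tesla K80 this selects CUBLAS_GEMM_DEFAULT and the legacy cublasSgemmStridedBatched path; on a Volta or newer card it selects the TENSOR_OP algorithm and the *Ex entry point.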
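The commit also mentions a GGML_CUDA_DEBUG environment variable that gates the new debug logging. The exact check used in ggml-cuda.cu is not shown in this diff; the snippet below is only one plausible shape for such a guard (a cached getenv plus a logging macro), kept in the same C++ register as the sketch above.

```cpp
#include <cstdio>
#include <cstdlib>

// Cached check of GGML_CUDA_DEBUG; here any non-empty value enables the
// extra logging. This is an assumption -- the commit only states that the
// variable controls conditional debug output.
static bool cuda_debug_enabled() {
    static const bool enabled = [] {
        const char * v = std::getenv("GGML_CUDA_DEBUG");
        return v != nullptr && v[0] != '\0';
    }();
    return enabled;
}

#define CUDA_DEBUG_LOG(...)               \
    do {                                  \
        if (cuda_debug_enabled()) {       \
            fprintf(stderr, __VA_ARGS__); \
        }                                 \
    } while (0)

// Hypothetical call site at a fallback decision:
// CUDA_DEBUG_LOG("falling back to cublasSgemmStridedBatched (cc %d.%d)\n",
//                prop.major, prop.minor);
```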
@@ -380,12 +380,31 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
	}()
}

// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
// load creates a new model based on req and loads it into memory
//
// This is THE critical function that:
// 1. Spawns the runner subprocess (ollama runner --model <path>)
// 2. Loads GGUF weights into memory via llama.cpp
// 3. Distributes model layers across CPU/GPU(s) based on available VRAM
// 4. Allocates KV cache on GPU (for Tesla K80: compute 3.7)
// 5. Initializes inference context with threading and batch parameters
//
// Parameters:
// - req: LlmRequest containing model path, options, capabilities
// - f: GGML metadata (parsed from GGUF file headers)
// - systemInfo: CPU/RAM information
// - gpus: List of available GPUs (e.g., Tesla K80)
// - requireFull: If true, model must fit entirely on GPU(s)
//
// Returns:
// - bool: true if scheduler needs to evict other models to make room
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
	// NumParallel controls how many requests can be processed simultaneously
	// Each parallel slot requires additional KV cache memory
	numParallel := max(int(envconfig.NumParallel()), 1)

	// Embedding models should always be loaded with parallel=1
	// (they don't benefit from parallel processing like generation does)
	if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {
		numParallel = 1
	}
@@ -405,8 +424,27 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
	s.loadedMu.Lock()
	llama := s.activeLoading

	// Create new llama server instance if not already loading
	if llama == nil {
		var err error
		// *** SPAWN RUNNER SUBPROCESS ***
		// s.newServerFn points to NewLlamaServer() in llm/server.go:148
		//
		// This function:
		// 1. Calculates memory requirements from GGML metadata
		// 2. Determines GPU layer distribution (num_gpu_layers)
		// 3. Spawns subprocess: exec.Command("ollama", "runner", "--model", modelPath, "--port", port)
		// 4. Subprocess starts HTTP server listening on local port
		// 5. Returns LlamaServer interface for IPC communication
		//
		// Parameters:
		// - systemInfo: CPU/RAM stats
		// - gpus: Available GPUs (Tesla K80 with 12GB VRAM, compute 3.7)
		// - ModelPath: Path to GGUF file (e.g., ~/.ollama/models/blobs/sha256-abc123...)
		// - AdapterPaths: LoRA adapters (if any)
		// - ProjectorPaths: Vision projectors for multimodal (if any)
		// - opts: User options (num_gpu, num_thread, num_ctx, etc.)
		// - numParallel: How many parallel inference slots to allocate
		llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
		if err != nil {
			// some older models are not compatible with newer versions of llama.cpp
@@ -423,6 +461,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo

		s.activeLoading = llama
	} else {
		// Reusing existing server (e.g., after eviction attempt failed)
		if s.activeLoading.ModelPath() != req.model.ModelPath {
			panic(fmt.Errorf("attempting to load different model after eviction (original %v new %v)", s.activeLoading.ModelPath(), req.model.ModelPath))
		}
@@ -430,16 +469,28 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo

	s.loadedMu.Unlock()

	// *** LOAD MODEL WEIGHTS INTO MEMORY ***
	// llama.Load() triggers the runner subprocess to:
	// 1. Call llama_model_load_from_file() via CGO (llama/llama.go)
	// 2. Read GGUF file and mmap() weights into memory
	// 3. Distribute layers across CPU/GPU based on num_gpu_layers
	// 4. Allocate KV cache on GPU (if using GPU)
	// 5. Compile CUDA kernels for Tesla K80 (compute 3.7, via PTX JIT)
	//
	// Returns:
	// - gpuIDs: List of GPU device IDs where model layers were loaded
	// - err: Error if model doesn't fit or loading fails
	gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull)
	if err != nil {
		if errors.Is(err, llm.ErrLoadRequiredFull) {
			if !requireFull {
				// No other models loaded, yet we still don't fit, so report an error
				// Model doesn't fit fully on GPU, need to evict other models
				slog.Info("model is too large for system memory", "requireFull", requireFull)
				s.activeLoading.Close()
				s.activeLoading = nil
				req.errCh <- err
			}
			// Signal scheduler to evict models and retry
			return true
		}