Fix Tesla K80 CUBLAS compatibility with two-tier fallback strategy

This commit implements Tesla K80 (Kepler, compute capability 3.7)
compatibility for CUBLAS matrix multiplication, covering both batched
and non-batched operations.

**Problem:**
Modern CUBLAS functions fail on Tesla K80 with CUBLAS_STATUS_ARCH_MISMATCH:
1. CUBLAS_GEMM_DEFAULT_TENSOR_OP requires Tensor Cores (Volta+ only)
2. cublasGemmStridedBatchedEx/cublasGemmBatchedEx impose architecture
   requirements of their own, so changing the algorithm alone is not enough

**Solution - Two-Tier Fallback:**

Tier 1: Algorithm Selection
- Volta+ (cc >= 7.0): CUBLAS_GEMM_DEFAULT_TENSOR_OP
- Pre-Volta (cc < 7.0): CUBLAS_GEMM_DEFAULT

Tier 2: Function Selection
- Volta+ or non-FP32: Use *Ex variants (flexible precision)
- Kepler/Maxwell/Pascal with FP32: Use legacy type-specific functions
  (cublasSgemmStridedBatched, cublasSgemmBatched)
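
As a rough illustration of the two tiers combined, here is a minimal sketch in
CUDA host code for the FP32 strided-batched case. It is not the actual
ggml_cuda_mul_mat_batched_cublas_impl code: the function name, transpose flags,
and error handling are simplified assumptions.

```cpp
// Minimal sketch of the two-tier selection for an FP32 strided-batched GEMM.
// Illustrative only; the real ggml-cuda.cu implementation differs in detail.
#include <cublas_v2.h>
#include <cuda_runtime.h>

static cublasStatus_t batched_sgemm(cublasHandle_t handle, int device,
                                    int m, int n, int k,
                                    const float *alpha,
                                    const float *A, int lda, long long strideA,
                                    const float *B, int ldb, long long strideB,
                                    const float *beta,
                                    float *C, int ldc, long long strideC,
                                    int batch) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    const bool volta_plus = prop.major >= 7; // Tensor Cores exist from Volta (cc 7.0) on

    if (!volta_plus) {
        // Tier 2: Kepler/Maxwell/Pascal with FP32 -> legacy type-specific entry
        // point, which runs on Kepler-class GPUs such as the K80.
        return cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                         m, n, k, alpha,
                                         A, lda, strideA,
                                         B, ldb, strideB,
                                         beta, C, ldc, strideC, batch);
    }

    // Tier 1: Volta and newer use the flexible *Ex entry point with TENSOR_OP.
    return cublasGemmStridedBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                      m, n, k, alpha,
                                      A, CUDA_R_32F, lda, strideA,
                                      B, CUDA_R_32F, ldb, strideB,
                                      beta,
                                      C, CUDA_R_32F, ldc, strideC,
                                      batch, CUBLAS_COMPUTE_32F, // CUDA 11 compute type
                                      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
```

The same reasoning applies to the non-strided pair
(cublasGemmBatchedEx vs. cublasSgemmBatched).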

**Changes:**

CUDA Implementation:
- ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
  * ggml_cuda_op_mul_mat_cublas: Algorithm selection for non-batched ops
  * ggml_cuda_mul_mat_batched_cublas_impl: Two-tier fallback for batched ops
  * Added GGML_CUDA_DEBUG environment variable for conditional debug logging
    (see the sketch after this list)
  * Comprehensive function documentation explaining fallback strategy
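
A sketch of how the environment-gated debug logging could be structured. The
macro and helper names here are hypothetical; only the GGML_CUDA_DEBUG variable
name comes from this commit.

```cpp
// Hypothetical gate: log only when GGML_CUDA_DEBUG is set in the environment.
// The real logging path in ggml-cuda.cu may use different names and formatting.
#include <cstdio>
#include <cstdlib>

static bool cuda_debug_enabled() {
    // Cached on first use; this sketch checks only for presence, not a value.
    static const bool enabled = std::getenv("GGML_CUDA_DEBUG") != nullptr;
    return enabled;
}

#define CUDA_DEBUG_LOG(...)               \
    do {                                  \
        if (cuda_debug_enabled()) {       \
            fprintf(stderr, __VA_ARGS__); \
        }                                 \
    } while (0)

// Example call site:
// CUDA_DEBUG_LOG("batched mul_mat: cc %d.%d -> legacy cublasSgemmStridedBatched\n",
//                prop.major, prop.minor);
```

Caching the lookup in a helper keeps the logging essentially free when the
variable is unset.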

Documentation:
- CLAUDE.md
  * Added Tesla K80 CUBLAS Compatibility section
  * Documented GGML_CUDA_DEBUG environment variable
  * Enhanced "Running Ollama" section with log capture examples
  * Updated Files Modified list

Code Comments:
- Added detailed comments throughout CUDA code explaining:
  * Why TENSOR_OP fails on pre-Volta GPUs
  * Why *Ex functions require architectural support
  * Compute capability checks and fallback logic
  * Debug logging usage

**Testing:**
All tested models verified working on Tesla K80:
- gemma3:4b
- gpt-oss
- deepseek-r1

Debug flag tested in both enabled and disabled states.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Shang Chieh Tseng
Date: 2025-11-05 23:52:45 +08:00
Parent: ef14fb5b26
Commit: d948926581
8 changed files with 616 additions and 153 deletions


@@ -1,3 +1,18 @@
// Package llama provides Go bindings to llama.cpp via CGO
//
// This is the bridge between Go code and C/C++/CUDA inference engine.
// All actual model inference happens through these CGO calls.
//
// Key components:
// - LoadModelFromFile(): Loads GGUF model file into memory
// - Context.Decode(): Runs inference (GPU/CPU matrix operations)
// - SamplingContext: Selects next token from logits
// - Batch: Groups tokens for efficient parallel processing
//
// For Tesla K80 (compute 3.7):
// - CUDA kernels compiled with PTX (JIT at runtime)
// - Model layers distributed across CPU/GPU based on num_gpu_layers
// - KV cache allocated on GPU VRAM (12GB available)
package llama
/*
@@ -12,13 +27,12 @@ package llama
#cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include
#include <stdlib.h>
#include "ggml.h"
#include "llama.h"
#include "mtmd.h"
#include "ggml.h" // GGML tensor library (CPU/GPU operations)
#include "llama.h" // llama.cpp model loading and inference
#include "mtmd.h" // Multi-turn multi-document support
#include "mtmd-helper.h"
#include "gguf.h"
#include "sampling_ext.h"
#include "gguf.h" // GGUF file format parsing
#include "sampling_ext.h" // Token sampling (temperature, top_p, etc.)
extern bool llamaProgressCallback(float progress, void *user_data);
extern void llamaLog(int level, char* text, void* user_data);
@@ -58,9 +72,22 @@ func llamaLog(level C.int, text *C.char, _ unsafe.Pointer) {
}
}
// BackendInit initializes the llama.cpp backend
//
// This must be called once before loading any models.
// It initializes:
// - CUDA backend (if GPUs available)
// - CPU backend
// - Memory allocators
// - Threading infrastructure
//
// For Tesla K80 (compute 3.7):
// - Detects CUDA device
// - Verifies compute capability support
// - Initializes cuBLAS for matrix operations
func BackendInit() {
-ggml.OnceLoad()
-C.llama_backend_init()
+ggml.OnceLoad() // Load GGML shared library
+C.llama_backend_init() // Initialize llama.cpp backend
}
func EnumerateGPUs() []ml.DeviceID {
@@ -145,15 +172,50 @@ func kvCacheTypeFromStr(s string) C.enum_ggml_type {
}
}
// Context represents an active inference context
//
// This wraps llama.cpp's llama_context which holds:
// - Model pointer
// - KV cache (key-value cache for attention)
// - Thread pool for CPU operations
// - RNG state for sampling
//
// Each Context can handle multiple parallel sequences (controlled by numParallel)
type Context struct {
-c *C.struct_llama_context
-numThreads int
+c *C.struct_llama_context // C pointer to llama_context
+numThreads int // Number of CPU threads for inference
}
var ErrKvCacheFull = errors.New("could not find a kv cache slot")
// Decode runs one inference step on a batch of tokens
//
// *** THIS IS WHERE ACTUAL INFERENCE HAPPENS ***
//
// For each token in the batch, this:
// 1. Retrieves token embeddings from model
// 2. Runs through transformer layers:
// - Attention (uses KV cache)
// - Feed-forward network
// - Layer normalization
// 3. Stores KV states in cache for future tokens
// 4. Produces output logits (probabilities for next token)
//
// GPU execution (Tesla K80, compute 3.7):
// - Matrix multiplications via cuBLAS
// - Attention via CUDA kernels
// - LayerNorm/RoPE/Softmax via CUDA kernels
// - Data transferred between CPU/GPU as needed
//
// Returns:
// - nil: success
// - ErrKvCacheFull: no space in KV cache (increase num_ctx or reduce batch size)
// - error: fatal error during inference
func (c *Context) Decode(batch *Batch) error {
// Positive return values does not mean a fatal error, but rather a warning.
// Call C function: int llama_decode(llama_context*, llama_batch)
// This executes the actual neural network forward pass
//
// Return codes:
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// < 0 - error
@@ -234,13 +296,48 @@ func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
return true
}
// LoadModelFromFile loads a GGUF model file into memory
//
// *** THIS IS THE CORE MODEL LOADING FUNCTION ***
//
// This reads the GGUF file and loads model weights into memory.
// The process:
// 1. Parse GGUF file headers (metadata, architecture, tensors)
// 2. Memory-map (mmap) or read model weights
// 3. Distribute layers across devices based on NumGpuLayers:
// - First NumGpuLayers transformer layers → GPU
// - Remaining layers → CPU
// - Embeddings and output layer handling varies
// 4. Allocate device buffers for tensors
//
// For Tesla K80 (compute 3.7) with 12GB VRAM:
// - Example: gemma3-2b with Q4_0 quantization
// - Full model ~1.5GB, can fit entirely on GPU (NumGpuLayers=99)
// - Example: llama3-8b with Q4_0 quantization
// - Full model ~4.5GB, can fit entirely on GPU
// - Example: llama3-70b with Q4_0 quantization
// - Full model ~40GB, needs CPU offload (NumGpuLayers=20-30)
//
// Parameters:
// - modelPath: Path to GGUF file (e.g., ~/.ollama/models/blobs/sha256-abc123...)
// - params.NumGpuLayers: How many transformer layers to put on GPU
// - params.MainGpu: Which GPU to use (if multiple)
// - params.TensorSplit: How to split layers across multiple GPUs
// - params.UseMmap: Whether to use memory mapping (faster load, less RAM)
//
// Returns:
// - *Model: Loaded model ready for inference
// - error: If file not found, incompatible format, or out of memory
func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
// Initialize C parameters structure
cparams := C.llama_model_default_params()
-cparams.n_gpu_layers = C.int(params.NumGpuLayers)
-cparams.main_gpu = C.int32_t(params.MainGpu)
-cparams.use_mmap = C.bool(params.UseMmap)
-cparams.vocab_only = C.bool(params.VocabOnly)
+cparams.n_gpu_layers = C.int(params.NumGpuLayers) // Layers to offload to GPU
+cparams.main_gpu = C.int32_t(params.MainGpu) // Primary GPU device ID
+cparams.use_mmap = C.bool(params.UseMmap) // Memory-map file (faster)
+cparams.vocab_only = C.bool(params.VocabOnly) // Load vocabulary only
// Multi-GPU tensor split (for systems with multiple GPUs)
// Defines proportion of model to put on each GPU
if len(params.TensorSplit) > 0 {
tensorSplitData := &params.TensorSplit[0]
@@ -251,6 +348,7 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
cparams.tensor_split = (*C.float)(unsafe.Pointer(tensorSplitData))
}
// Progress callback (reports loading progress percentage)
if params.Progress != nil {
handle := cgo.NewHandle(params.Progress)
defer handle.Delete()
@@ -263,6 +361,12 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
cparams.progress_callback_user_data = unsafe.Pointer(&handle)
}
// *** CALL C FUNCTION TO LOAD MODEL ***
// This:
// 1. Opens and parses GGUF file
// 2. Allocates CPU/GPU memory for tensors
// 3. Loads/mmaps weights into memory
// 4. For Tesla K80: compiles CUDA kernels via PTX JIT (compute 3.7)
m := Model{c: C.llama_model_load_from_file(C.CString(modelPath), cparams)}
if m.c == nil {
return nil, fmt.Errorf("unable to load model: %s", modelPath)