Fix Tesla K80 CUBLAS compatibility with two-tier fallback strategy

This commit implements comprehensive Tesla K80 (Kepler, compute capability 3.7)
compatibility for CUBLAS matrix multiplication, covering both batched and
non-batched operations.

**Problem:**
Modern CUBLAS functions fail on Tesla K80 with CUBLAS_STATUS_ARCH_MISMATCH:
1. CUBLAS_GEMM_DEFAULT_TENSOR_OP requires Tensor Cores (Volta+ only)
2. cublasGemmStridedBatchedEx/cublasGemmBatchedEx have architectural
   requirements beyond algorithm selection

**Solution - Two-Tier Fallback:**

Tier 1: Algorithm Selection
- Volta+ (cc >= 7.0): CUBLAS_GEMM_DEFAULT_TENSOR_OP
- Pre-Volta (cc < 7.0): CUBLAS_GEMM_DEFAULT

Tier 2: Function Selection
- Volta+ or non-FP32: Use *Ex variants (flexible precision)
- Kepler/Maxwell/Pascal with FP32: Use legacy type-specific functions
  (cublasSgemmStridedBatched, cublasSgemmBatched)
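
A minimal sketch of the two tiers above, for the FP32 strided-batched case
(illustrative only; the wrapper name `gemm_strided_batched_f32`, the
no-transpose/FP32-only signature, and the explicit device argument are
assumptions, not the actual ggml-cuda.cu code):

```cpp
// Illustrative sketch of the two-tier selection, not the actual ggml-cuda.cu code.
#include <cublas_v2.h>
#include <cuda_runtime.h>

static cublasStatus_t gemm_strided_batched_f32(
        cublasHandle_t handle, int device, int m, int n, int k,
        const float *alpha,
        const float *A, int lda, long long strideA,
        const float *B, int ldb, long long strideB,
        const float *beta,
        float *C, int ldc, long long strideC,
        int batch_count) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);       // error checking omitted
    const int cc = prop.major * 10 + prop.minor;  // Tesla K80 (Kepler) -> 37

    // Tier 1: algorithm selection. TENSOR_OP needs Tensor Cores (Volta+, cc >= 70).
    // The same choice applies to the non-batched cublasGemmEx path.
    const cublasGemmAlgo_t algo =
        cc >= 70 ? CUBLAS_GEMM_DEFAULT_TENSOR_OP : CUBLAS_GEMM_DEFAULT;

    if (cc >= 70) {
        // Tier 2a: Volta+ uses the flexible *Ex entry point.
        return cublasGemmStridedBatchedEx(handle,
            CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
            alpha, A, CUDA_R_32F, lda, strideA,
                   B, CUDA_R_32F, ldb, strideB,
            beta,  C, CUDA_R_32F, ldc, strideC,
            batch_count, CUBLAS_COMPUTE_32F, algo);
    }

    // Tier 2b: Kepler/Maxwell/Pascal with FP32 falls back to the legacy
    // type-specific call, avoiding CUBLAS_STATUS_ARCH_MISMATCH. The legacy
    // call takes no algorithm argument.
    return cublasSgemmStridedBatched(handle,
        CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
        alpha, A, lda, strideA,
               B, ldb, strideB,
        beta,  C, ldc, strideC,
        batch_count);
}
```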

**Changes:**

CUDA Implementation:
- ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
  * ggml_cuda_op_mul_mat_cublas: Algorithm selection for non-batched ops
  * ggml_cuda_mul_mat_batched_cublas_impl: Two-tier fallback for batched ops
  * Added GGML_CUDA_DEBUG environment variable for conditional debug logging
  * Comprehensive function documentation explaining fallback strategy

Documentation:
- CLAUDE.md
  * Added Tesla K80 CUBLAS Compatibility section
  * Documented GGML_CUDA_DEBUG environment variable
  * Enhanced "Running Ollama" section with log capture examples
  * Updated Files Modified list

Code Comments:
- Added detailed comments throughout CUDA code explaining:
  * Why TENSOR_OP fails on pre-Volta GPUs
  * Why *Ex functions require architectural support
  * Compute capability checks and fallback logic
  * Debug logging usage
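
As a hypothetical illustration of the kind of env-gated logging GGML_CUDA_DEBUG
enables (the helper `ggml_cuda_debug_enabled` and macro `GGML_CUDA_DEBUG_LOG`
are assumed names and may not match the actual ggml-cuda.cu implementation):

```cpp
// Hypothetical sketch of env-gated debug logging; names are assumptions.
#include <cstdio>
#include <cstdlib>

static bool ggml_cuda_debug_enabled(void) {
    // Read GGML_CUDA_DEBUG once; any non-empty value other than "0" enables logging.
    static const bool enabled = [] {
        const char *v = std::getenv("GGML_CUDA_DEBUG");
        return v != nullptr && v[0] != '\0' && v[0] != '0';
    }();
    return enabled;
}

#define GGML_CUDA_DEBUG_LOG(...)                  \
    do {                                          \
        if (ggml_cuda_debug_enabled()) {          \
            std::fprintf(stderr, __VA_ARGS__);    \
        }                                         \
    } while (0)

// Example use inside the batched matmul path:
//   GGML_CUDA_DEBUG_LOG("cc=%d, using %s cuBLAS path\n", cc, cc >= 70 ? "Ex" : "legacy");
```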

**Testing:**
The following models were verified working on Tesla K80:
- gemma3:4b
- gpt-oss
- deepseek-r1

Debug flag tested in both enabled and disabled states.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Shang Chieh Tseng
Date:   2025-11-05 23:52:45 +08:00
Parent: ef14fb5b26
Commit: d948926581
8 changed files with 616 additions and 153 deletions


@@ -171,8 +171,29 @@ func signinURL() (string, error) {
return fmt.Sprintf(signinURLStr, url.PathEscape(h), encKey), nil
}
// GenerateHandler is the HTTP handler for POST /api/generate endpoint
// This is the main server-side entry point for model inference requests
//
// Flow:
// 1. Parse and validate the GenerateRequest JSON body
// 2. Load model metadata (config, template, system prompt)
// 3. Schedule/acquire a runner instance from the scheduler
// 4. Apply chat template to format the prompt
// 5. Call runner's Completion() method for actual inference
// 6. Stream responses back to client as newline-delimited JSON objects
//
// Request structure (api.GenerateRequest):
// - Model: string (e.g., "gemma3", "llama3")
// - Prompt: string (user input)
// - Images: []string (base64 for multimodal models)
// - System: string (system prompt override)
// - Template: string (template override)
// - Options: map[string]any (temperature, top_p, num_gpu, etc.)
// - KeepAlive: Duration (how long to keep model in memory)
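//
// Example request body (illustrative):
//
//   {"model": "gemma3", "prompt": "Why is the sky blue?",
//    "options": {"temperature": 0.7}, "keep_alive": "5m"}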
func (s *Server) GenerateHandler(c *gin.Context) {
checkpointStart := time.Now()
// Parse JSON request body into GenerateRequest struct
var req api.GenerateRequest
if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -182,6 +203,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
// Parse and validate model name (format: [registry/][namespace/]model[:tag])
name := model.ParseName(req.Model)
if !name.IsValid() {
// Ideally this is "invalid model name" but we're keeping with
@@ -190,6 +212,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
// Resolve the actual model name (handles aliases and version resolution)
// We cannot currently consolidate this into GetModel because doing so would
// induce infinite recursion given the current code structure.
name, err := getExistingName(name)
@@ -198,6 +221,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
// Load model metadata from disk (server/images.go:320)
// This reads the GGUF file headers and Modelfile to extract:
// - Model config (architecture, quantization, context size)
// - Chat template (how to format messages for this model)
// - System prompt (default instructions)
// - Model capabilities (vision, tools, thinking)
// - Model options (temperature defaults, etc.)
m, err := GetModel(name.String())
if err != nil {
switch {
@@ -357,6 +387,23 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}
}
// Schedule a runner instance from the scheduler (server/sched.go:84)
// This is THE critical step that loads the model into memory
//
// The scheduler will:
// 1. Check if model is already loaded in memory (cache hit)
// 2. If not loaded:
// a. Analyze GGML file to determine layer distribution (CPU vs GPU)
// b. Spawn a runner subprocess: "ollama runner --model <path> --port <port>"
// c. Load GGUF weights into memory via llama.cpp
// d. Allocate KV cache on GPU (if using GPU)
// e. Initialize inference context
// 3. Return (LlamaServer, Model, Options) tuple
//
// Parameters:
// - caps: required capabilities (completion, vision, thinking, etc.)
// - req.Options: user-provided options (num_gpu, temperature, etc.)
// - req.KeepAlive: how long to keep model loaded after request completes
r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), caps, req.Options, req.KeepAlive)
if errors.Is(err, errCapabilityCompletion) {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support generate", req.Model)})
@@ -368,7 +415,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {
checkpointLoaded := time.Now()
// load the model
// If prompt is empty, this is just a model load request (warmup)
// Return immediately without running inference
if req.Prompt == "" {
c.JSON(http.StatusOK, api.GenerateResponse{
Model: req.Model,
@@ -384,13 +432,20 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
// Prepare image data for multimodal models (if any)
images := make([]llm.ImageData, len(req.Images))
for i := range req.Images {
images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
}
// Apply chat template to format the prompt
// Chat templates convert structured messages into model-specific format
// Example for Gemma3:
// Input: [{"role": "user", "content": "Hello"}]
// Output: "<bos><start_of_turn>user\nHello<end_of_turn>\n<start_of_turn>model\n"
prompt := req.Prompt
if !req.Raw {
// Get template from model config (or use override from request)
tmpl := m.Template
if req.Template != "" {
tmpl, err = template.Parse(req.Template)
@@ -402,20 +457,25 @@ func (s *Server) GenerateHandler(c *gin.Context) {
var values template.Values
if req.Suffix != "" {
// Fill-in-the-middle mode (for code completion)
values.Prompt = prompt
values.Suffix = req.Suffix
} else {
// Normal chat mode: build message list
var msgs []api.Message
// Add system prompt (instructions for the model)
if req.System != "" {
msgs = append(msgs, api.Message{Role: "system", Content: req.System})
} else if m.System != "" {
msgs = append(msgs, api.Message{Role: "system", Content: m.System})
}
// Add conversation history (for multi-turn chats)
if req.Context == nil {
msgs = append(msgs, m.Messages...)
}
// Add current user message with any images
userMsg := api.Message{Role: "user", Content: req.Prompt}
for _, i := range images {
userMsg.Images = append(userMsg.Images, i.Data)
@@ -495,11 +555,31 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}
}
// Create channel for streaming responses from inference engine
ch := make(chan any)
go func() {
// TODO (jmorganca): avoid building the response twice both here and below
var sb strings.Builder
defer close(ch)
// *** THIS IS THE CORE INFERENCE CALL ***
// r.Completion() bridges Go → runner subprocess → C/C++ → CUDA
//
// Flow:
// 1. This sends HTTP POST to runner subprocess at http://127.0.0.1:<port>/completion
// 2. Runner subprocess (llamarunner/runner.go) receives request
// 3. Runner tokenizes prompt and creates inference batch
// 4. Runner calls context.Decode() repeatedly (llama/llama.go CGO binding)
// 5. context.Decode() calls C.llama_decode() from llama.cpp
// 6. llama_decode() executes CUDA kernels on GPU (Tesla K80 compute 3.7)
// 7. Each generated token is sampled and streamed back via callback
//
// CompletionRequest fields:
// - Prompt: formatted text (after template application)
// - Images: base64 image data (for vision models)
// - Options: temperature, top_p, top_k, num_gpu, etc.
// - Shift: whether to shift context window when full
// - Truncate: whether to truncate prompt if too long
if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
Prompt: prompt,
Images: images,
@@ -508,6 +588,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
Shift: req.Shift == nil || *req.Shift,
Truncate: req.Truncate == nil || *req.Truncate,
}, func(cr llm.CompletionResponse) {
// Callback function called for each generated token (streaming)
res := api.GenerateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),