Mirror of https://github.com/dogkeeper886/ollama37.git
Fix Tesla K80 CUBLAS compatibility with two-tier fallback strategy
This commit implements comprehensive Tesla K80 (Kepler, compute 3.7) compatibility for batched matrix multiplication operations.

**Problem:** Modern CUBLAS functions fail on Tesla K80 with CUBLAS_STATUS_ARCH_MISMATCH:
1. CUBLAS_GEMM_DEFAULT_TENSOR_OP requires Tensor Cores (Volta+ only)
2. cublasGemmStridedBatchedEx/cublasGemmBatchedEx have architectural requirements beyond algorithm selection

**Solution - Two-Tier Fallback:**

Tier 1: Algorithm Selection
- Volta+ (cc >= 7.0): CUBLAS_GEMM_DEFAULT_TENSOR_OP
- Pre-Volta (cc < 7.0): CUBLAS_GEMM_DEFAULT

Tier 2: Function Selection
- Volta+ or non-FP32: Use *Ex variants (flexible precision)
- Kepler/Maxwell/Pascal with FP32: Use legacy type-specific functions (cublasSgemmStridedBatched, cublasSgemmBatched)

**Changes:**

CUDA Implementation:
- ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
  * ggml_cuda_op_mul_mat_cublas: Algorithm selection for non-batched ops
  * ggml_cuda_mul_mat_batched_cublas_impl: Two-tier fallback for batched ops
  * Added GGML_CUDA_DEBUG environment variable for conditional debug logging
  * Comprehensive function documentation explaining fallback strategy

Documentation:
- CLAUDE.md
  * Added Tesla K80 CUBLAS Compatibility section
  * Documented GGML_CUDA_DEBUG environment variable
  * Enhanced "Running Ollama" section with log capture examples
  * Updated Files Modified list

Code Comments:
- Added detailed comments throughout the CUDA code explaining:
  * Why TENSOR_OP fails on pre-Volta GPUs
  * Why *Ex functions require architectural support
  * Compute capability checks and fallback logic
  * Debug logging usage

**Testing:**

All models verified working on Tesla K80:
- ✅ gemma3:4b
- ✅ gpt-oss
- ✅ deepseek-r1

Debug flag tested in both enabled and disabled states.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
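To make the two-tier selection concrete, the decision table above can be read as the following minimal Go sketch. This is illustrative only: the real change lives in the C++/CUDA code in ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu, and the helper name and integer compute-capability encoding below are assumptions, while the CUBLAS identifiers are the ones named in the commit message.

```go
package main

import "fmt"

// selectCublasPath is a hypothetical helper mirroring the two-tier fallback
// described above. cc is the compute capability times 10 (e.g. 37 for
// Tesla K80, 70 for Volta); fp32 reports whether the operands are plain FP32.
func selectCublasPath(cc int, fp32 bool) (algo, fn string) {
    // Tier 1: algorithm selection.
    // CUBLAS_GEMM_DEFAULT_TENSOR_OP needs Tensor Cores, which exist only on
    // Volta (cc >= 7.0) and newer, so older GPUs use the default algorithm.
    if cc >= 70 {
        algo = "CUBLAS_GEMM_DEFAULT_TENSOR_OP"
    } else {
        algo = "CUBLAS_GEMM_DEFAULT"
    }

    // Tier 2: function selection.
    // The *Ex entry points have architectural requirements beyond the
    // algorithm flag, so Kepler/Maxwell/Pascal doing FP32 fall back to the
    // legacy type-specific batched GEMM functions.
    if cc >= 70 || !fp32 {
        fn = "cublasGemmStridedBatchedEx"
    } else {
        fn = "cublasSgemmStridedBatched"
    }
    return algo, fn
}

func main() {
    algo, fn := selectCublasPath(37, true) // Tesla K80, FP32
    fmt.Println(algo, fn)                  // CUBLAS_GEMM_DEFAULT + legacy Sgemm path
}
```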
@@ -171,8 +171,29 @@ func signinURL() (string, error) {
    return fmt.Sprintf(signinURLStr, url.PathEscape(h), encKey), nil
}

// GenerateHandler is the HTTP handler for POST /api/generate endpoint
// This is the main server-side entry point for model inference requests
//
// Flow:
// 1. Parse and validate the GenerateRequest JSON body
// 2. Load model metadata (config, template, system prompt)
// 3. Schedule/acquire a runner instance from the scheduler
// 4. Apply chat template to format the prompt
// 5. Call runner's Completion() method for actual inference
// 6. Stream responses back to client as Server-Sent Events (SSE)
//
// Request structure (api.GenerateRequest):
// - Model: string (e.g., "gemma3", "llama3")
// - Prompt: string (user input)
// - Images: []string (base64 for multimodal models)
// - System: string (system prompt override)
// - Template: string (template override)
// - Options: map[string]any (temperature, top_p, num_gpu, etc.)
// - KeepAlive: Duration (how long to keep model in memory)
func (s *Server) GenerateHandler(c *gin.Context) {
    checkpointStart := time.Now()

    // Parse JSON request body into GenerateRequest struct
    var req api.GenerateRequest
    if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
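For reference against the request fields documented in the comment block above, here is a minimal, hypothetical Go client sketch. The lower-case JSON tags, the default port 11434, and the option names are assumptions based on the standard Ollama API and are not part of this diff.

```go
// Minimal sketch of the body this handler parses into api.GenerateRequest.
package main

import (
    "bytes"
    "encoding/json"
    "net/http"
)

func main() {
    body, _ := json.Marshal(map[string]any{
        "model":  "gemma3:4b",
        "prompt": "Why is the sky blue?",
        "system": "You are a concise assistant.",
        "options": map[string]any{
            "temperature": 0.7,
            "num_gpu":     99, // request full GPU offload (e.g. onto the Tesla K80)
        },
        "keep_alive": "10m",
    })

    resp, err := http.Post("http://127.0.0.1:11434/api/generate", "application/json", bytes.NewReader(body))
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    // The handler streams incremental GenerateResponse objects; see the
    // stream-reading sketch at the end of the diff.
}
```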
@@ -182,6 +203,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
        return
    }

    // Parse and validate model name (format: [registry/][namespace/]model[:tag])
    name := model.ParseName(req.Model)
    if !name.IsValid() {
        // Ideally this is "invalid model name" but we're keeping with
@@ -190,6 +212,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
        return
    }

    // Resolve the actual model name (handles aliases and version resolution)
    // We cannot currently consolidate this into GetModel because all we'll
    // induce infinite recursion given the current code structure.
    name, err := getExistingName(name)
@@ -198,6 +221,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
        return
    }

    // Load model metadata from disk (server/images.go:320)
    // This reads the GGUF file headers and Modelfile to extract:
    // - Model config (architecture, quantization, context size)
    // - Chat template (how to format messages for this model)
    // - System prompt (default instructions)
    // - Model capabilities (vision, tools, thinking)
    // - Model options (temperature defaults, etc.)
    m, err := GetModel(name.String())
    if err != nil {
        switch {
@@ -357,6 +387,23 @@ func (s *Server) GenerateHandler(c *gin.Context) {
        }
    }

    // Schedule a runner instance from the scheduler (server/sched.go:84)
    // This is THE critical step that loads the model into memory
    //
    // The scheduler will:
    // 1. Check if model is already loaded in memory (cache hit)
    // 2. If not loaded:
    //    a. Analyze GGML file to determine layer distribution (CPU vs GPU)
    //    b. Spawn a runner subprocess: "ollama runner --model <path> --port <port>"
    //    c. Load GGUF weights into memory via llama.cpp
    //    d. Allocate KV cache on GPU (if using GPU)
    //    e. Initialize inference context
    // 3. Return (LlamaServer, Model, Options) tuple
    //
    // Parameters:
    // - caps: required capabilities (completion, vision, thinking, etc.)
    // - req.Options: user-provided options (num_gpu, temperature, etc.)
    // - req.KeepAlive: how long to keep model loaded after request completes
    r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), caps, req.Options, req.KeepAlive)
    if errors.Is(err, errCapabilityCompletion) {
        c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support generate", req.Model)})
@@ -368,7 +415,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {

    checkpointLoaded := time.Now()

    // load the model
    // If prompt is empty, this is just a model load request (warmup)
    // Return immediately without running inference
    if req.Prompt == "" {
        c.JSON(http.StatusOK, api.GenerateResponse{
            Model: req.Model,
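Because an empty prompt turns the request into a pure load/warm-up, a client can pre-load a model onto the GPU before serving traffic. A hypothetical helper, reusing the "bytes", "encoding/json", and "net/http" imports from the earlier sketch (JSON tags again assumed):

```go
// warmUp asks the server to load a model without running inference: an empty
// prompt makes GenerateHandler return right after the scheduler loads the model.
func warmUp(model string) error {
    body, _ := json.Marshal(map[string]any{
        "model":      model,
        "keep_alive": "30m", // keep the weights resident after loading
    })
    resp, err := http.Post("http://127.0.0.1:11434/api/generate", "application/json", bytes.NewReader(body))
    if err != nil {
        return err
    }
    return resp.Body.Close()
}
```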
@@ -384,13 +432,20 @@ func (s *Server) GenerateHandler(c *gin.Context) {
        return
    }

    // Prepare image data for multimodal models (if any)
    images := make([]llm.ImageData, len(req.Images))
    for i := range req.Images {
        images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
    }

    // Apply chat template to format the prompt
    // Chat templates convert structured messages into model-specific format
    // Example for Gemma3:
    // Input: [{"role": "user", "content": "Hello"}]
    // Output: "<bos><start_of_turn>user\nHello<end_of_turn>\n<start_of_turn>model\n"
    prompt := req.Prompt
    if !req.Raw {
        // Get template from model config (or use override from request)
        tmpl := m.Template
        if req.Template != "" {
            tmpl, err = template.Parse(req.Template)
@@ -402,20 +457,25 @@ func (s *Server) GenerateHandler(c *gin.Context) {

        var values template.Values
        if req.Suffix != "" {
            // Fill-in-the-middle mode (for code completion)
            values.Prompt = prompt
            values.Suffix = req.Suffix
        } else {
            // Normal chat mode: build message list
            var msgs []api.Message
            // Add system prompt (instructions for the model)
            if req.System != "" {
                msgs = append(msgs, api.Message{Role: "system", Content: req.System})
            } else if m.System != "" {
                msgs = append(msgs, api.Message{Role: "system", Content: m.System})
            }

            // Add conversation history (for multi-turn chats)
            if req.Context == nil {
                msgs = append(msgs, m.Messages...)
            }

            // Add current user message with any images
            userMsg := api.Message{Role: "user", Content: req.Prompt}
            for _, i := range images {
                userMsg.Images = append(userMsg.Images, i.Data)
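The Suffix branch above is the fill-in-the-middle path used by code-completion clients. A hypothetical request for that mode, assuming a "suffix" JSON tag and a model whose template supports suffix completion (same imports as the earlier sketches):

```go
// fillInMiddle sends a code-completion style request: the template receives
// values.Prompt and values.Suffix instead of a chat message list.
func fillInMiddle() error {
    body, _ := json.Marshal(map[string]any{
        "model":  "codellama:7b-code", // assumed example of a suffix-capable model
        "prompt": "def fib(n):\n    ",
        "suffix": "\n    return result",
    })
    resp, err := http.Post("http://127.0.0.1:11434/api/generate", "application/json", bytes.NewReader(body))
    if err != nil {
        return err
    }
    return resp.Body.Close()
}
```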
@@ -495,11 +555,31 @@ func (s *Server) GenerateHandler(c *gin.Context) {
        }
    }

    // Create channel for streaming responses from inference engine
    ch := make(chan any)
    go func() {
        // TODO (jmorganca): avoid building the response twice both here and below
        var sb strings.Builder
        defer close(ch)

        // *** THIS IS THE CORE INFERENCE CALL ***
        // r.Completion() bridges Go → runner subprocess → C/C++ → CUDA
        //
        // Flow:
        // 1. This sends HTTP POST to runner subprocess at http://127.0.0.1:<port>/completion
        // 2. Runner subprocess (llamarunner/runner.go) receives request
        // 3. Runner tokenizes prompt and creates inference batch
        // 4. Runner calls context.Decode() repeatedly (llama/llama.go CGO binding)
        // 5. context.Decode() calls C.llama_decode() from llama.cpp
        // 6. llama_decode() executes CUDA kernels on GPU (Tesla K80 compute 3.7)
        // 7. Each generated token is sampled and streamed back via callback
        //
        // CompletionRequest fields:
        // - Prompt: formatted text (after template application)
        // - Images: base64 image data (for vision models)
        // - Options: temperature, top_p, top_k, num_gpu, etc.
        // - Shift: whether to shift context window when full
        // - Truncate: whether to truncate prompt if too long
        if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
            Prompt: prompt,
            Images: images,
@@ -508,6 +588,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
            Shift: req.Shift == nil || *req.Shift,
            Truncate: req.Truncate == nil || *req.Truncate,
        }, func(cr llm.CompletionResponse) {
            // Callback function called for each generated token (streaming)
            res := api.GenerateResponse{
                Model: req.Model,
                CreatedAt: time.Now().UTC(),
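Since the handler streams one response object per generated token over the channel created above, a client consumes the body incrementally. A hypothetical reader, assuming each chunk arrives as a JSON object on its own line with "response" and "done" fields (imports: "bufio", "encoding/json", "io", "strings"):

```go
// readStream accumulates the streamed partial responses emitted by
// GenerateHandler until a chunk reports done.
func readStream(body io.Reader) (string, error) {
    var sb strings.Builder
    scanner := bufio.NewScanner(body)
    for scanner.Scan() {
        var chunk struct {
            Response string `json:"response"`
            Done     bool   `json:"done"`
        }
        if err := json.Unmarshal(scanner.Bytes(), &chunk); err != nil {
            return sb.String(), err
        }
        sb.WriteString(chunk.Response)
        if chunk.Done {
            break
        }
    }
    return sb.String(), scanner.Err()
}
```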