Fix Tesla K80 CUBLAS compatibility with two-tier fallback strategy

This commit implements comprehensive Tesla K80 (Kepler, compute 3.7)
compatibility for batched matrix multiplication operations.

**Problem:**
Modern CUBLAS functions fail on Tesla K80 with CUBLAS_STATUS_ARCH_MISMATCH:
1. CUBLAS_GEMM_DEFAULT_TENSOR_OP requires Tensor Cores (Volta+ only)
2. cublasGemmStridedBatchedEx/cublasGemmBatchedEx have architectural
   requirements beyond algorithm selection

**Solution - Two-Tier Fallback:**

Tier 1: Algorithm Selection
- Volta+ (cc >= 7.0): CUBLAS_GEMM_DEFAULT_TENSOR_OP
- Pre-Volta (cc < 7.0): CUBLAS_GEMM_DEFAULT

Tier 2: Function Selection
- Volta+ or non-FP32: Use *Ex variants (flexible precision)
- Kepler/Maxwell/Pascal with FP32: Use legacy type-specific functions
  (cublasSgemmStridedBatched, cublasSgemmBatched)
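
As a rough illustration of the two tiers above, the FP32 strided-batched path could look like the following. This is an illustrative C++ sketch, not the exact ggml-cuda.cu implementation: the `cc_major` parameter, the transpose arguments, and the CUDA 11-style `CUBLAS_COMPUTE_32F` compute type are assumptions made for the example.

```cpp
// Illustrative sketch of the two-tier fallback for an FP32 strided-batched GEMM.
// cc_major is the device's compute capability major version (7 == Volta).
#include <cublas_v2.h>

static cublasStatus_t gemm_batched_with_fallback(
        cublasHandle_t handle, int cc_major,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const float *alpha,
        const float *A, int lda, long long strideA,
        const float *B, int ldb, long long strideB,
        const float *beta,
        float *C, int ldc, long long strideC,
        int batch_count) {
    // Tier 1: algorithm selection. CUBLAS_GEMM_DEFAULT_TENSOR_OP needs Tensor
    // Cores, which only exist on Volta (cc >= 7.0) and newer.
    const cublasGemmAlgo_t algo = cc_major >= 7 ? CUBLAS_GEMM_DEFAULT_TENSOR_OP
                                                : CUBLAS_GEMM_DEFAULT;

    // Tier 2: function selection. Volta+ uses the flexible *Ex entry point;
    // Kepler/Maxwell/Pascal with FP32 can still hit CUBLAS_STATUS_ARCH_MISMATCH
    // there, so fall back to the legacy type-specific call, which compute 3.7
    // (Tesla K80) supports.
    if (cc_major >= 7) {
        return cublasGemmStridedBatchedEx(handle, transa, transb, m, n, k,
                alpha, A, CUDA_R_32F, lda, strideA,
                       B, CUDA_R_32F, ldb, strideB,
                beta,  C, CUDA_R_32F, ldc, strideC,
                batch_count, CUBLAS_COMPUTE_32F, algo);
    }
    return cublasSgemmStridedBatched(handle, transa, transb, m, n, k,
            alpha, A, lda, strideA,
                   B, ldb, strideB,
            beta,  C, ldc, strideC,
            batch_count);
}
```

Per Tier 2, non-FP32 inputs stay on the *Ex variants even on pre-Volta devices, since each legacy call is tied to a single data type.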

**Changes:**

CUDA Implementation:
- ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
  * ggml_cuda_op_mul_mat_cublas: Algorithm selection for non-batched ops
  * ggml_cuda_mul_mat_batched_cublas_impl: Two-tier fallback for batched ops
  * Added GGML_CUDA_DEBUG environment variable for conditional debug logging (see the sketch after this list)
  * Comprehensive function documentation explaining fallback strategy
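
The GGML_CUDA_DEBUG gate is a plain opt-in environment-variable check; a hypothetical sketch of the pattern (the helper and macro names here are invented for illustration and may not match ggml-cuda.cu):

```cpp
// Hypothetical sketch of an environment-variable-gated debug log
// (names are illustrative, not the actual ggml-cuda.cu symbols).
#include <cstdio>
#include <cstdlib>

static bool ggml_cuda_debug_enabled() {
    // Read GGML_CUDA_DEBUG once; any non-empty value turns logging on.
    static const bool enabled = [] {
        const char *v = std::getenv("GGML_CUDA_DEBUG");
        return v != nullptr && v[0] != '\0';
    }();
    return enabled;
}

#define CUDA_DEBUG_LOG(...)                      \
    do {                                         \
        if (ggml_cuda_debug_enabled()) {         \
            std::fprintf(stderr, __VA_ARGS__);   \
        }                                        \
    } while (0)

// Example: log which fallback path was chosen for a batched matmul.
// CUDA_DEBUG_LOG("mul_mat_batched: cc=%d.%d -> %s\n", major, minor,
//                major >= 7 ? "GemmStridedBatchedEx" : "SgemmStridedBatched");
```

Under the sketch's assumption that any non-empty value enables it, something like `GGML_CUDA_DEBUG=1 ollama serve` turns the extra logging on, while leaving it unset reduces each log call to a cached boolean check.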

Documentation:
- CLAUDE.md
  * Added Tesla K80 CUBLAS Compatibility section
  * Documented GGML_CUDA_DEBUG environment variable
  * Enhanced "Running Ollama" section with log capture examples
  * Updated Files Modified list

Code Comments:
- Added detailed comments throughout CUDA code explaining:
  * Why TENSOR_OP fails on pre-Volta GPUs
  * Why *Ex functions require architectural support
  * Compute capability checks and fallback logic
  * Debug logging usage

**Testing:**
All models verified working on Tesla K80:
- gemma3:4b
- gpt-oss
- deepseek-r1

Debug flag tested in both enabled and disabled states.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Shang Chieh Tseng committed 2025-11-05 23:52:45 +08:00
commit d948926581 (parent ef14fb5b26)
8 changed files with 616 additions and 153 deletions

```diff
@@ -322,11 +322,18 @@ func StopHandler(cmd *cobra.Command, args []string) error {
     return nil
 }
+// RunHandler is the entry point for "ollama run <model>" command
+// This function orchestrates the entire model execution flow:
+// 1. Parse command-line arguments and options (format, keepalive, think mode, etc.)
+// 2. Determine if running in interactive or non-interactive mode
+// 3. Query model info from server (or pull if not found)
+// 4. Route to either generateInteractive() or generate() based on mode
 func RunHandler(cmd *cobra.Command, args []string) error {
+    // Default to interactive mode unless prompt is provided or output is piped
     interactive := true
     opts := runOptions{
-        Model: args[0],
+        Model: args[0], // Model name (e.g., "gemma3")
         WordWrap: os.Getenv("TERM") == "xterm-256color",
         Options: map[string]any{},
         ShowConnect: true,
@@ -379,7 +386,8 @@ func RunHandler(cmd *cobra.Command, args []string) error {
     }
     prompts := args[1:]
-    // prepend stdin to the prompt if provided
+    // Check if stdin contains input (e.g., piped data: echo "hello" | ollama run gemma3)
+    // If so, prepend it to the prompt and switch to non-interactive mode
     if !term.IsTerminal(int(os.Stdin.Fd())) {
         in, err := io.ReadAll(os.Stdin)
         if err != nil {
@@ -392,10 +400,12 @@ func RunHandler(cmd *cobra.Command, args []string) error {
         interactive = false
     }
     opts.Prompt = strings.Join(prompts, " ")
+    // If prompt provided as argument (e.g., ollama run gemma3 "tell me a joke")
+    // then use non-interactive mode (single-shot generation)
     if len(prompts) > 0 {
         interactive = false
     }
-    // Be quiet if we're redirecting to a pipe or file
+    // If stdout is redirected to a pipe or file, use non-interactive mode
     if !term.IsTerminal(int(os.Stdout.Fd())) {
         interactive = false
     }
@@ -406,22 +416,30 @@ func RunHandler(cmd *cobra.Command, args []string) error {
     }
     opts.WordWrap = !nowrap
-    // Fill out the rest of the options based on information about the
-    // model.
+    // Create HTTP client to communicate with Ollama server
+    // The server must be running (started via "ollama serve")
     client, err := api.ClientFromEnvironment()
     if err != nil {
         return err
     }
+    // Query model metadata from server (HTTP GET /api/show)
+    // This retrieves:
+    // - Model capabilities (vision, tools, thinking)
+    // - Model parameters (context size, architecture)
+    // - Chat template format
+    // If model not found locally, automatically pull from registry
     name := args[0]
     info, err := func() (*api.ShowResponse, error) {
         showReq := &api.ShowRequest{Name: name}
         info, err := client.Show(cmd.Context(), showReq)
         var se api.StatusError
         if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
+            // Model not found locally, pull it from registry
             if err := PullHandler(cmd, []string{name}); err != nil {
                 return nil, err
             }
+            // Retry after successful pull
             return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
         }
         return info, err
@@ -435,6 +453,8 @@ func RunHandler(cmd *cobra.Command, args []string) error {
         return err
     }
+    // Detect if model supports multimodal input (images + text)
+    // Used for models like LLaVA, Bakllava, or vision-capable models
     opts.MultiModal = slices.Contains(info.Capabilities, model.CapabilityVision)
     // TODO: remove the projector info and vision info checks below,
@@ -453,6 +473,8 @@ func RunHandler(cmd *cobra.Command, args []string) error {
     opts.ParentModel = info.Details.ParentModel
     if interactive {
+        // In interactive mode, load the model into memory first
+        // This sends a load request to the server, which triggers the scheduler
         if err := loadOrUnloadModel(cmd, &opts); err != nil {
             var sErr api.AuthorizationError
             if errors.As(err, &sErr) && sErr.StatusCode == http.StatusUnauthorized {
@@ -466,6 +488,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
             return err
         }
+        // Display any previous conversation history (for multi-turn chats)
         for _, msg := range info.Messages {
             switch msg.Role {
             case "user":
@@ -478,8 +501,12 @@ func RunHandler(cmd *cobra.Command, args []string) error {
             }
         }
+        // Enter interactive REPL mode (Read-Eval-Print Loop)
+        // User can enter multiple prompts in sequence
         return generateInteractive(cmd, opts)
     }
+    // Non-interactive mode: single generation then exit
+    // Used for: ollama run gemma3 "prompt here"
     return generate(cmd, opts)
 }
```