Mirror of https://github.com/dogkeeper886/ollama37.git
Fix Tesla K80 CUBLAS compatibility with two-tier fallback strategy
This commit implements comprehensive Tesla K80 (Kepler, compute 3.7) compatibility for batched matrix multiplication operations.

**Problem:** Modern CUBLAS functions fail on Tesla K80 with CUBLAS_STATUS_ARCH_MISMATCH:

1. CUBLAS_GEMM_DEFAULT_TENSOR_OP requires Tensor Cores (Volta+ only)
2. cublasGemmStridedBatchedEx/cublasGemmBatchedEx have architectural requirements beyond algorithm selection

**Solution - Two-Tier Fallback:**

Tier 1: Algorithm Selection
- Volta+ (cc >= 7.0): CUBLAS_GEMM_DEFAULT_TENSOR_OP
- Pre-Volta (cc < 7.0): CUBLAS_GEMM_DEFAULT

Tier 2: Function Selection
- Volta+ or non-FP32: Use *Ex variants (flexible precision)
- Kepler/Maxwell/Pascal with FP32: Use legacy type-specific functions (cublasSgemmStridedBatched, cublasSgemmBatched)

**Changes:**

CUDA Implementation:
- ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
  * ggml_cuda_op_mul_mat_cublas: Algorithm selection for non-batched ops
  * ggml_cuda_mul_mat_batched_cublas_impl: Two-tier fallback for batched ops
  * Added GGML_CUDA_DEBUG environment variable for conditional debug logging
  * Comprehensive function documentation explaining fallback strategy

Documentation:
- CLAUDE.md
  * Added Tesla K80 CUBLAS Compatibility section
  * Documented GGML_CUDA_DEBUG environment variable
  * Enhanced "Running Ollama" section with log capture examples
  * Updated Files Modified list

Code Comments:
- Added detailed comments throughout CUDA code explaining:
  * Why TENSOR_OP fails on pre-Volta GPUs
  * Why *Ex functions require architectural support
  * Compute capability checks and fallback logic
  * Debug logging usage

**Testing:** All models verified working on Tesla K80:
- ✅ gemma3:4b
- ✅ gpt-oss
- ✅ deepseek-r1

Debug flag tested in both enabled and disabled states.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
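The two-tier selection described above reduces to two small decisions in host-side CUDA C++. The sketch below is an illustrative reconstruction, not the actual ggml-cuda.cu code: the helper names `pick_gemm_algo` and `gemm_strided_batched_f32` are invented for the example, and the integer compute-capability encoding (370 for K80, 700 for Volta) is assumed.

```cpp
// Illustrative sketch only: not the actual ggml-cuda.cu implementation.
#include <cublas_v2.h>

// Tier 1: algorithm selection. Tensor-op algorithms need Tensor Cores,
// which first appeared with Volta (compute capability 7.0).
static cublasGemmAlgo_t pick_gemm_algo(int cc) {
    return cc >= 700 ? CUBLAS_GEMM_DEFAULT_TENSOR_OP : CUBLAS_GEMM_DEFAULT;
}

// Tier 2: function selection for FP32 strided-batched GEMM. On pre-Volta
// GPUs the *Ex entry points can fail with CUBLAS_STATUS_ARCH_MISMATCH
// regardless of the algorithm chosen, so fall back to the legacy
// type-specific cublasSgemmStridedBatched.
static cublasStatus_t gemm_strided_batched_f32(
        cublasHandle_t handle, int cc,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const float *alpha,
        const float *A, int lda, long long strideA,
        const float *B, int ldb, long long strideB,
        const float *beta,
        float *C, int ldc, long long strideC,
        int batch_count) {
    if (cc >= 700) {
        // Volta+: flexible-precision Ex variant with tensor-op algorithm.
        return cublasGemmStridedBatchedEx(handle, transa, transb, m, n, k,
                alpha, A, CUDA_R_32F, lda, strideA,
                       B, CUDA_R_32F, ldb, strideB,
                beta,  C, CUDA_R_32F, ldc, strideC,
                batch_count, CUBLAS_COMPUTE_32F, pick_gemm_algo(cc));
    }
    // Kepler/Maxwell/Pascal with FP32: legacy function, no Ex requirements.
    return cublasSgemmStridedBatched(handle, transa, transb, m, n, k,
            alpha, A, lda, strideA, B, ldb, strideB,
            beta,  C, ldc, strideC, batch_count);
}
```

On a K80 (cc 370), every FP32 batched GEMM therefore takes the legacy path, which is what the commit reports as the fix for the ARCH_MISMATCH failures.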
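The commit also gates its new debug output behind a GGML_CUDA_DEBUG environment variable. A minimal sketch of that env-gated logging pattern, again with invented names (`cuda_debug_enabled`, `CUDA_DEBUG_LOG`) rather than the actual implementation:

```cpp
// Illustrative sketch only: the real GGML_CUDA_DEBUG handling in
// ggml-cuda.cu may parse the variable differently.
#include <cstdio>
#include <cstdlib>

static bool cuda_debug_enabled() {
    // Read the environment once and cache the answer; treating any
    // non-empty value other than "0" as enabled is an assumption.
    static const bool enabled = [] {
        const char *v = std::getenv("GGML_CUDA_DEBUG");
        return v != nullptr && v[0] != '\0' && v[0] != '0';
    }();
    return enabled;
}

#define CUDA_DEBUG_LOG(...)               \
    do {                                  \
        if (cuda_debug_enabled()) {       \
            fprintf(stderr, __VA_ARGS__); \
        }                                 \
    } while (0)
```

Running the server with the variable set (e.g. `GGML_CUDA_DEBUG=1`) would then surface the fallback decisions in the logs, while leaving it unset keeps the output quiet, matching the two states the commit says were tested.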
cmd/cmd.go (37 changed lines: 32 additions, 5 deletions)
```diff
@@ -322,11 +322,18 @@ func StopHandler(cmd *cobra.Command, args []string) error {
 	return nil
 }
 
+// RunHandler is the entry point for "ollama run <model>" command
+// This function orchestrates the entire model execution flow:
+// 1. Parse command-line arguments and options (format, keepalive, think mode, etc.)
+// 2. Determine if running in interactive or non-interactive mode
+// 3. Query model info from server (or pull if not found)
+// 4. Route to either generateInteractive() or generate() based on mode
 func RunHandler(cmd *cobra.Command, args []string) error {
+	// Default to interactive mode unless prompt is provided or output is piped
 	interactive := true
 
 	opts := runOptions{
-		Model:       args[0],
+		Model:       args[0], // Model name (e.g., "gemma3")
 		WordWrap:    os.Getenv("TERM") == "xterm-256color",
 		Options:     map[string]any{},
 		ShowConnect: true,
@@ -379,7 +386,8 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}
 
 	prompts := args[1:]
-	// prepend stdin to the prompt if provided
+	// Check if stdin contains input (e.g., piped data: echo "hello" | ollama run gemma3)
+	// If so, prepend it to the prompt and switch to non-interactive mode
 	if !term.IsTerminal(int(os.Stdin.Fd())) {
 		in, err := io.ReadAll(os.Stdin)
 		if err != nil {
@@ -392,10 +400,12 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		interactive = false
 	}
 	opts.Prompt = strings.Join(prompts, " ")
+	// If prompt provided as argument (e.g., ollama run gemma3 "tell me a joke")
+	// then use non-interactive mode (single-shot generation)
 	if len(prompts) > 0 {
 		interactive = false
 	}
-	// Be quiet if we're redirecting to a pipe or file
+	// If stdout is redirected to a pipe or file, use non-interactive mode
 	if !term.IsTerminal(int(os.Stdout.Fd())) {
 		interactive = false
 	}
@@ -406,22 +416,30 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}
 	opts.WordWrap = !nowrap
 
-	// Fill out the rest of the options based on information about the
-	// model.
+	// Create HTTP client to communicate with Ollama server
+	// The server must be running (started via "ollama serve")
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
 	}
 
+	// Query model metadata from server (HTTP GET /api/show)
+	// This retrieves:
+	// - Model capabilities (vision, tools, thinking)
+	// - Model parameters (context size, architecture)
+	// - Chat template format
+	// If model not found locally, automatically pull from registry
 	name := args[0]
 	info, err := func() (*api.ShowResponse, error) {
 		showReq := &api.ShowRequest{Name: name}
 		info, err := client.Show(cmd.Context(), showReq)
 		var se api.StatusError
 		if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
+			// Model not found locally, pull it from registry
 			if err := PullHandler(cmd, []string{name}); err != nil {
 				return nil, err
 			}
+			// Retry after successful pull
 			return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
 		}
 		return info, err
@@ -435,6 +453,8 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
+	// Detect if model supports multimodal input (images + text)
+	// Used for models like LLaVA, Bakllava, or vision-capable models
 	opts.MultiModal = slices.Contains(info.Capabilities, model.CapabilityVision)
 
 	// TODO: remove the projector info and vision info checks below,
@@ -453,6 +473,8 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	opts.ParentModel = info.Details.ParentModel
 
 	if interactive {
+		// In interactive mode, load the model into memory first
+		// This sends a load request to the server, which triggers the scheduler
 		if err := loadOrUnloadModel(cmd, &opts); err != nil {
 			var sErr api.AuthorizationError
 			if errors.As(err, &sErr) && sErr.StatusCode == http.StatusUnauthorized {
@@ -466,6 +488,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 			return err
 		}
 
+		// Display any previous conversation history (for multi-turn chats)
 		for _, msg := range info.Messages {
 			switch msg.Role {
 			case "user":
@@ -478,8 +501,12 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 			}
 		}
 
+		// Enter interactive REPL mode (Read-Eval-Print Loop)
+		// User can enter multiple prompts in sequence
 		return generateInteractive(cmd, opts)
 	}
+	// Non-interactive mode: single generation then exit
+	// Used for: ollama run gemma3 "prompt here"
 	return generate(cmd, opts)
 }
```