Mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-09 23:37:06 +00:00)
This commit represents a complete rework after pulling the latest changes from the official ollama/ollama repository and re-applying Tesla K80 compatibility patches.

## Key Changes

### CUDA Compute Capability 3.7 Support (Tesla K80)
- Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt (sketched below)
- Updated CMakePresets.json to include compute 3.7 in the "CUDA 11" preset
- Using 37-virtual (PTX with JIT compilation) for maximum compatibility

### Legacy Toolchain Compatibility
- **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80)
- **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7)
- **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h)

### CPU Architecture Trade-offs
Due to the GCC 10.5 limitation, newer CPU optimizations were sacrificed:
- Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+)
- Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA
- Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility)

### Build System Updates
- Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7
- Added the -Wno-deprecated-gpu-targets flag to suppress warnings
- Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI

### Upstream Sync
Merged latest llama.cpp changes, including:
- Enhanced KV cache management with ISWA and hybrid memory support
- Improved multi-modal support (mtmd framework)
- New model architectures (Gemma3, Llama4, Qwen3, etc.)
- GPU backend improvements for CUDA, Metal, and ROCm
- Updated quantization support and GGUF format handling

### Documentation
- Updated CLAUDE.md with comprehensive build instructions
- Documented toolchain constraints and CPU architecture trade-offs
- Removed outdated CI/CD workflows (tesla-k80-*.yml)
- Cleaned up temporary development artifacts

## Rationale

This fork maintains Tesla K80 GPU support (compute 3.7), which was dropped in official Ollama because of its legacy driver/CUDA requirements. The toolchain constraint forms a rigid dependency chain:
- K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI

We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
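As a rough sketch of the build-system change described above (not the literal diff from this commit; the non-3.7 entries in the architecture list are placeholders), the compute 3.7 wiring in a ggml-style CUDA CMakeLists.txt could look like:

```cmake
# Sketch: add Tesla K80 (compute 3.7) to the CUDA architecture list.
# "37-virtual" emits PTX only, which the 470-series driver JIT-compiles
# for sm_37 at load time; the "-virtual" suffix requires CMake 3.20+.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.0)
  set(CMAKE_CUDA_ARCHITECTURES "37-virtual;50;61;70;75;80")
endif()

# Kepler targets are deprecated in CUDA 11.x; suppress the resulting nvcc warning.
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-Wno-deprecated-gpu-targets>")
```

Gating on a CUDA 11.x compiler keeps the list compatible with newer toolkits, since CUDA 12 removed compute 3.7 support entirely.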
76 lines · 2.4 KiB · Go
//go:build integration && library

package integration

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
)

// First run of this scenario on a target system will take a long time to download
// ~1.5TB of models. Set a sufficiently large -timeout for your network speed
func TestLibraryModelsChat(t *testing.T) {
	softTimeout, hardTimeout := getTimeouts(t)
	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()
	// Optionally restrict the run to models of a single architecture
	targetArch := os.Getenv("OLLAMA_TEST_ARCHITECTURE")

	chatModels := libraryChatModels
	started := time.Now() // reference point for the soft-timeout check below
	for _, model := range chatModels {
		t.Run(model, func(t *testing.T) {
			if time.Now().Sub(started) > softTimeout {
				t.Skip("skipping remaining tests to avoid excessive runtime")
			}
			if err := PullIfMissing(ctx, client, model); err != nil {
				t.Fatalf("pull failed %s", err)
			}
			if targetArch != "" {
				resp, err := client.Show(ctx, &api.ShowRequest{Name: model})
				if err != nil {
					t.Fatalf("unable to show model: %s", err)
				}
				arch := resp.ModelInfo["general.architecture"].(string)
				if arch != targetArch {
					t.Skip(fmt.Sprintf("Skipping %s architecture %s != %s", model, arch, targetArch))
				}
			}
			req := api.ChatRequest{
				Model: model,
				Messages: []api.Message{
					{
						Role:    "user",
						Content: blueSkyPrompt,
					},
				},
				KeepAlive: &api.Duration{Duration: 10 * time.Second},
				Options: map[string]interface{}{
					"temperature": 0.1,
					"seed":        123,
				},
			}
			anyResp := blueSkyExpected
			// Special cases: some models answer in a style the generic
			// blue-sky expectations don't cover, so swap in their own substrings.
			if model == "duckdb-nsql" {
				anyResp = []string{"select", "from"}
			} else if model == "granite3-guardian" || model == "shieldgemma" || model == "llama-guard3" || model == "bespoke-minicheck" {
				anyResp = []string{"yes", "no", "safe", "unsafe"}
			} else if model == "openthinker" {
				anyResp = []string{"plugin", "im_sep", "components", "function call"}
			} else if model == "starcoder" || model == "starcoder2" || model == "magicoder" || model == "deepseek-coder" {
				req.Messages[0].Content = "def fibonacci():"
				anyResp = []string{"f(n)", "sequence", "n-1", "main()", "__main__", "while"}
			}
			DoChat(ctx, t, client, req, anyResp, 120*time.Second, 30*time.Second)
		})
	}
}
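Note: per the `//go:build` line, this file only compiles when both the `integration` and `library` build tags are set; a run from the repository root would look something like `go test -tags=integration,library -run TestLibraryModelsChat -timeout 24h ./integration/...`. The timeout value here is only an illustrative guess, to be sized for the ~1.5TB download noted in the file comment, and `OLLAMA_TEST_ARCHITECTURE` can be set to limit the run to models of one architecture.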