mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-09 23:37:06 +00:00
This commit represents a complete rework after pulling the latest changes from official ollama/ollama repository and re-applying Tesla K80 compatibility patches. ## Key Changes ### CUDA Compute Capability 3.7 Support (Tesla K80) - Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt - Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset - Using 37-virtual (PTX with JIT compilation) for maximum compatibility ### Legacy Toolchain Compatibility - **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80) - **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7) - **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h) ### CPU Architecture Trade-offs Due to GCC 10.5 limitation, sacrificed newer CPU optimizations: - Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+) - Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA - Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility) ### Build System Updates - Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7 - Added -Wno-deprecated-gpu-targets flag to suppress warnings - Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI ### Upstream Sync Merged latest llama.cpp changes including: - Enhanced KV cache management with ISWA and hybrid memory support - Improved multi-modal support (mtmd framework) - New model architectures (Gemma3, Llama4, Qwen3, etc.) - GPU backend improvements for CUDA, Metal, and ROCm - Updated quantization support and GGUF format handling ### Documentation - Updated CLAUDE.md with comprehensive build instructions - Documented toolchain constraints and CPU architecture trade-offs - Removed outdated CI/CD workflows (tesla-k80-*.yml) - Cleaned up temporary development artifacts ## Rationale This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in official Ollama due to legacy driver/CUDA requirements. The toolchain constraint creates a deadlock: - K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
127 lines
3.5 KiB
Go
127 lines
3.5 KiB
Go
//go:build integration
|
|
|
|
package integration
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"log/slog"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/ollama/ollama/api"
|
|
)
|
|
|
|
func TestMaxQueue(t *testing.T) {
|
|
t.Skip("this test needs to be re-evaluated to use a proper embedding model")
|
|
|
|
if os.Getenv("OLLAMA_TEST_EXISTING") != "" {
|
|
t.Skip("Max Queue test requires spawning a local server so we can adjust the queue size")
|
|
return
|
|
}
|
|
|
|
// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU
|
|
// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
|
|
threadCount := 16
|
|
t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
|
|
|
|
req := api.GenerateRequest{
|
|
Model: smol,
|
|
Prompt: "write a long historical fiction story about christopher columbus. use at least 10 facts from his actual journey",
|
|
Options: map[string]any{
|
|
"seed": 42,
|
|
"temperature": 0.0,
|
|
},
|
|
}
|
|
resp := []string{"explore", "discover", "ocean"}
|
|
|
|
// CPU mode takes much longer at the limit with a large queue setting
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
|
defer cancel()
|
|
client, _, cleanup := InitServerConnection(ctx, t)
|
|
defer cleanup()
|
|
|
|
if err := PullIfMissing(ctx, client, req.Model); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Context for the worker threads so we can shut them down
|
|
// embedCtx, embedCancel := context.WithCancel(ctx)
|
|
embedCtx := ctx
|
|
|
|
var genwg sync.WaitGroup
|
|
genwg.Add(1)
|
|
go func() {
|
|
defer genwg.Done()
|
|
slog.Info("Starting generate request")
|
|
DoGenerate(ctx, t, client, req, resp, 45*time.Second, 5*time.Second)
|
|
slog.Info("generate completed")
|
|
}()
|
|
|
|
// Give the generate a chance to get started before we start hammering on embed requests
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
threadCount += 10 // Add a few extra to ensure we push the queue past its limit
|
|
busyCount := 0
|
|
resetByPeerCount := 0
|
|
canceledCount := 0
|
|
successCount := 0
|
|
counterMu := sync.Mutex{}
|
|
var embedwg sync.WaitGroup
|
|
for i := 0; i < threadCount; i++ {
|
|
embedwg.Add(1)
|
|
go func(i int) {
|
|
defer embedwg.Done()
|
|
slog.Info("embed started", "id", i)
|
|
embedReq := api.EmbeddingRequest{
|
|
Model: req.Model,
|
|
Prompt: req.Prompt,
|
|
Options: req.Options,
|
|
}
|
|
// Fresh client for every request
|
|
client, _ = GetTestEndpoint()
|
|
|
|
resp, genErr := client.Embeddings(embedCtx, &embedReq)
|
|
counterMu.Lock()
|
|
defer counterMu.Unlock()
|
|
switch {
|
|
case genErr == nil:
|
|
successCount++
|
|
if len(resp.Embedding) < 5 { // somewhat arbitrary, but sufficient to be reasonable
|
|
t.Fatalf("embeddings shorter than expected: %d", len(resp.Embedding))
|
|
}
|
|
case errors.Is(genErr, context.Canceled):
|
|
canceledCount++
|
|
case strings.Contains(genErr.Error(), "busy"):
|
|
busyCount++
|
|
case strings.Contains(genErr.Error(), "connection reset by peer"):
|
|
resetByPeerCount++
|
|
default:
|
|
if genErr != nil {
|
|
t.Fatalf("%d request failed", i)
|
|
}
|
|
}
|
|
|
|
slog.Info("embed finished", "id", i)
|
|
}(i)
|
|
}
|
|
genwg.Wait()
|
|
slog.Info("generate done, waiting for embeds")
|
|
embedwg.Wait()
|
|
|
|
slog.Info("embeds completed", "success", successCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
|
|
if resetByPeerCount != 0 {
|
|
t.Fatalf("Connections reset by peer, have you updated your fd and socket limits? %d", resetByPeerCount)
|
|
}
|
|
if busyCount == 0 {
|
|
t.Fatalf("no requests hit busy error but some should have")
|
|
}
|
|
if canceledCount > 0 {
|
|
t.Fatalf("no requests should have been canceled due to timeout %d", canceledCount)
|
|
}
|
|
}
|