Sync with upstream ollama/ollama and restore Tesla K80 (compute 3.7) support

This commit represents a complete rework after pulling the latest changes from official ollama/ollama repository and re-applying Tesla K80 compatibility patches. ## Key Changes ### CUDA Compute Capability 3.7 Support (Tesla K80) - Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt - Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset - Using 37-virtual (PTX with JIT compilation) for maximum compatibility ### Legacy Toolchain Compatibility - **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80) - **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7) - **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h) ### CPU Architecture Trade-offs Due to GCC 10.5 limitation, sacrificed newer CPU optimizations: - Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+) - Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA - Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility) ### Build System Updates - Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7 - Added -Wno-deprecated-gpu-targets flag to suppress warnings - Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI ### Upstream Sync Merged latest llama.cpp changes including: - Enhanced KV cache management with ISWA and hybrid memory support - Improved multi-modal support (mtmd framework) - New model architectures (Gemma3, Llama4, Qwen3, etc.) - GPU backend improvements for CUDA, Metal, and ROCm - Updated quantization support and GGUF format handling ### Documentation - Updated CLAUDE.md with comprehensive build instructions - Documented toolchain constraints and CPU architecture trade-offs - Removed outdated CI/CD workflows (tesla-k80-*.yml) - Cleaned up temporary development artifacts ## Rationale This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in official Ollama due to legacy driver/CUDA requirements. The toolchain constraint creates a deadlock: - K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-18 03:37:09 +00:00 · 2025-11-05 14:03:05 +08:00
parent fabe2c5cb7
commit ef14fb5b26
817 changed files with 241634 additions and 70888 deletions
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -4,257 +4,185 @@ package integration

 import (
 	"context"
+	"fmt"
 	"log/slog"
+	"math"
+	"math/rand"
 	"os"
 	"strconv"
 	"sync"
 	"testing"
 	"time"

-	"github.com/stretchr/testify/require"
-
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 )

-func TestMultiModelConcurrency(t *testing.T) {
-	var (
-		req = [2]api.GenerateRequest{
-			{
-				Model:     "llama3.2:1b",
-				Prompt:    "why is the ocean blue?",
-				Stream:    &stream,
-				KeepAlive: &api.Duration{Duration: 10 * time.Second},
-				Options: map[string]any{
-					"seed":        42,
-					"temperature": 0.0,
-				},
-			}, {
-				Model:     "tinydolphin",
-				Prompt:    "what is the origin of the us thanksgiving holiday?",
-				Stream:    &stream,
-				KeepAlive: &api.Duration{Duration: 10 * time.Second},
-				Options: map[string]any{
-					"seed":        42,
-					"temperature": 0.0,
-				},
-			},
-		}
-		resp = [2][]string{
-			{"sunlight"},
-			{"england", "english", "massachusetts", "pilgrims", "british", "festival"},
-		}
-	)
-	var wg sync.WaitGroup
-	wg.Add(len(req))
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
-	defer cancel()
+// Send multiple requests in parallel (concurrently) to a single model and ensure responses are expected
+func TestConcurrentChat(t *testing.T) {
+	// Assumes all requests have the same model
+	req, resp := ChatRequests()
+	numParallel := int(envconfig.NumParallel() + 1)
+	iterLimit := 3

-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-
-	for i := 0; i < len(req); i++ {
-		require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
-	}
-
-	for i := 0; i < len(req); i++ {
-		go func(i int) {
-			defer wg.Done()
-			// Note: CPU based inference can crawl so don't give up too quickly
-			DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
-		}(i)
-	}
-	wg.Wait()
-}
-
-func TestIntegrationConcurrentPredict(t *testing.T) {
-	req, resp := GenerateRequests()
-	reqLimit := len(req)
-	iterLimit := 5
-
-	if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
-		maxVram, err := strconv.ParseUint(s, 10, 64)
-		require.NoError(t, err)
-		// Don't hammer on small VRAM cards...
-		if maxVram < 4*format.GibiByte {
-			reqLimit = min(reqLimit, 2)
-			iterLimit = 2
-		}
-	}
-
-	ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
+	softTimeout, hardTimeout := getTimeouts(t)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()

 	// Get the server running (if applicable) warm the model up with a single initial request
-	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
+	slog.Info("loading", "model", req[0].Model)
+	err := client.Generate(ctx,
+		&api.GenerateRequest{Model: req[0].Model, KeepAlive: &api.Duration{Duration: 10 * time.Second}},
+		func(response api.GenerateResponse) error { return nil },
+	)
+	if err != nil {
+		t.Fatalf("failed to load model %s: %s", req[0].Model, err)
+	}

 	var wg sync.WaitGroup
-	wg.Add(reqLimit)
-	for i := 0; i < reqLimit; i++ {
+	r := rand.New(rand.NewSource(0))
+	wg.Add(numParallel)
+	for i := range numParallel {
 		go func(i int) {
 			defer wg.Done()
 			for j := 0; j < iterLimit; j++ {
-				slog.Info("Starting", "req", i, "iter", j)
+				if time.Now().Sub(started) > softTimeout {
+					slog.Info("exceeded soft timeout, winding down test")
+					return
+				}
+				k := r.Int() % len(req)
+				slog.Info("Starting", "thread", i, "iter", j)
 				// On slower GPUs it can take a while to process the concurrent requests
 				// so we allow a much longer initial timeout
-				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
+				DoChat(ctx, t, client, req[k], resp[k], 120*time.Second, 20*time.Second)
 			}
 		}(i)
 	}
 	wg.Wait()
 }

-// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
+// Stress the scheduler and attempt to load more models than will fit to cause thrashing
+// This test will always load at least 2 models even on CPU based systems
 func TestMultiModelStress(t *testing.T) {
-	s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
+	s := os.Getenv("OLLAMA_MAX_VRAM")
 	if s == "" {
-		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
+		s = "0"
 	}

 	maxVram, err := strconv.ParseUint(s, 10, 64)
 	if err != nil {
 		t.Fatal(err)
 	}
-	if maxVram < 2*format.GibiByte {
-		t.Skip("VRAM less than 2G, skipping model stress tests")
+
+	// All models compatible with ollama-engine
+	smallModels := []string{
+		"llama3.2:1b",
+		"qwen3:0.6b",
+		"gemma2:2b",
+		"deepseek-r1:1.5b", // qwen2 arch
+		"gemma3:270m",
+	}
+	mediumModels := []string{
+		"llama3.2:3b",    // ~3.4G
+		"qwen3:8b",       // ~6.6G
+		"gpt-oss:20b",    // ~15G
+		"deepseek-r1:7b", // ~5.6G
+		"gemma3:4b",      // ~5.8G
+		"gemma2:9b",      // ~8.1G
 	}

-	type model struct {
-		name string
-		size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
-	}
-
-	smallModels := []model{
-		{
-			name: "llama3.2:1b",
-			size: 2876 * format.MebiByte,
-		},
-		{
-			name: "phi",
-			size: 2616 * format.MebiByte,
-		},
-		{
-			name: "gemma:2b",
-			size: 2364 * format.MebiByte,
-		},
-		{
-			name: "stable-code:3b",
-			size: 2608 * format.MebiByte,
-		},
-		{
-			name: "starcoder2:3b",
-			size: 2166 * format.MebiByte,
-		},
-	}
-	mediumModels := []model{
-		{
-			name: "llama2",
-			size: 5118 * format.MebiByte,
-		},
-		{
-			name: "mistral",
-			size: 4620 * format.MebiByte,
-		},
-		{
-			name: "orca-mini:7b",
-			size: 5118 * format.MebiByte,
-		},
-		{
-			name: "dolphin-mistral",
-			size: 4620 * format.MebiByte,
-		},
-		{
-			name: "gemma:7b",
-			size: 5000 * format.MebiByte,
-		},
-		{
-			name: "codellama:7b",
-			size: 5118 * format.MebiByte,
-		},
-	}
-
-	// These seem to be too slow to be useful...
-	// largeModels := []model{
-	// 	{
-	// 		name: "llama2:13b",
-	// 		size: 7400 * format.MebiByte,
-	// 	},
-	// 	{
-	// 		name: "codellama:13b",
-	// 		size: 7400 * format.MebiByte,
-	// 	},
-	// 	{
-	// 		name: "orca-mini:13b",
-	// 		size: 7400 * format.MebiByte,
-	// 	},
-	// 	{
-	// 		name: "gemma:7b",
-	// 		size: 5000 * format.MebiByte,
-	// 	},
-	// 	{
-	// 		name: "starcoder2:15b",
-	// 		size: 9100 * format.MebiByte,
-	// 	},
-	// }
-
-	var chosenModels []model
+	var chosenModels []string
 	switch {
 	case maxVram < 10000*format.MebiByte:
 		slog.Info("selecting small models")
 		chosenModels = smallModels
-	// case maxVram < 30000*format.MebiByte:
 	default:
 		slog.Info("selecting medium models")
 		chosenModels = mediumModels
-		// default:
-		// 	slog.Info("selecting large models")
-		// 	chosenModels = largeModels
 	}

-	req, resp := GenerateRequests()
-
-	for i := range req {
-		if i > len(chosenModels) {
-			break
-		}
-		req[i].Model = chosenModels[i].name
-	}
-
-	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
+	softTimeout, hardTimeout := getTimeouts(t)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
+	initialTimeout := 120 * time.Second
+	streamTimeout := 20 * time.Second

 	// Make sure all the models are pulled before we get started
-	for _, r := range req {
-		require.NoError(t, PullIfMissing(ctx, client, r.Model))
+	for _, model := range chosenModels {
+		if err := PullIfMissing(ctx, client, model); err != nil {
+			t.Fatal(err)
+		}
 	}

-	var wg sync.WaitGroup
-	consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
-	for i := 0; i < len(req); i++ {
-		// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
-		if i > 1 && consumed > maxVram {
-			slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
-			break
+	// Determine how many models we can load in parallel before we exceed VRAM
+	// The intent is to go 1 over what can fit so we force the scheduler to thrash
+	targetLoadCount := 0
+	slog.Info("Loading models to find how many can fit in VRAM before overflowing")
+chooseModels:
+	for i, model := range chosenModels {
+		req := &api.GenerateRequest{Model: model}
+		slog.Info("loading", "model", model)
+		err = client.Generate(ctx, req, func(response api.GenerateResponse) error { return nil })
+		if err != nil {
+			t.Fatalf("failed to load model %s: %s", model, err)
 		}
-		consumed += chosenModels[i].size
-		slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
+		targetLoadCount++
+		if i > 0 {
+			models, err := client.ListRunning(ctx)
+			if err != nil {
+				t.Fatalf("failed to list running models: %s", err)
+			}
+			if len(models.Models) < targetLoadCount {
+				loaded := []string{}
+				for _, m := range models.Models {
+					loaded = append(loaded, m.Name)
+				}
+				slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount])
+				break
+			}
+			// Effectively limit model count to 2 on CPU only systems to avoid thrashing and timeouts
+			for _, m := range models.Models {
+				if m.SizeVRAM == 0 {
+					slog.Info("model running on CPU", "name", m.Name, "target", targetLoadCount, "chosen", chosenModels[:targetLoadCount])
+					initialTimeout = 240 * time.Second
+					streamTimeout = 30 * time.Second
+					break chooseModels
+				}
+			}
+		}
+	}
+	if targetLoadCount == len(chosenModels) {
+		// TODO consider retrying the medium models
+		slog.Warn("all models being used without exceeding VRAM, set OLLAMA_MAX_VRAM so test can pick larger models")
+	}

+	r := rand.New(rand.NewSource(0))
+	var wg sync.WaitGroup
+	for i := range targetLoadCount {
 		wg.Add(1)
 		go func(i int) {
 			defer wg.Done()
+			reqs, resps := ChatRequests()
 			for j := 0; j < 3; j++ {
-				slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
-				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
+				if time.Now().Sub(started) > softTimeout {
+					slog.Info("exceeded soft timeout, winding down test")
+					return
+				}
+				k := r.Int() % len(reqs)
+				reqs[k].Model = chosenModels[i]
+				slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Messages[0].Content)
+				DoChat(ctx, t, client, reqs[k], resps[k], initialTimeout, streamTimeout)
 			}
 		}(i)
 	}
 	go func() {
 		for {
-			time.Sleep(2 * time.Second)
+			time.Sleep(10 * time.Second)
 			select {
 			case <-ctx.Done():
 				return
@@ -265,7 +193,21 @@ func TestMultiModelStress(t *testing.T) {
 					continue
 				}
 				for _, m := range models.Models {
-					slog.Info("loaded model snapshot", "model", m)
+					var procStr string
+					switch {
+					case m.SizeVRAM == 0:
+						procStr = "100% CPU"
+					case m.SizeVRAM == m.Size:
+						procStr = "100% GPU"
+					case m.SizeVRAM > m.Size || m.Size == 0:
+						procStr = "Unknown"
+					default:
+						sizeCPU := m.Size - m.SizeVRAM
+						cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
+						procStr = fmt.Sprintf("%d%%/%d%%", int(cpuPercent), int(100-cpuPercent))
+					}
+
+					slog.Info("loaded model snapshot", "model", m.Name, "CPU/GPU", procStr, "expires", format.HumanTime(m.ExpiresAt, "Never"))
 				}
 			}
 		}