Sync with upstream ollama/ollama and restore Tesla K80 (compute 3.7) support

This commit represents a complete rework after pulling the latest changes from official ollama/ollama repository and re-applying Tesla K80 compatibility patches. ## Key Changes ### CUDA Compute Capability 3.7 Support (Tesla K80) - Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt - Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset - Using 37-virtual (PTX with JIT compilation) for maximum compatibility ### Legacy Toolchain Compatibility - **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80) - **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7) - **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h) ### CPU Architecture Trade-offs Due to GCC 10.5 limitation, sacrificed newer CPU optimizations: - Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+) - Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA - Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility) ### Build System Updates - Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7 - Added -Wno-deprecated-gpu-targets flag to suppress warnings - Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI ### Upstream Sync Merged latest llama.cpp changes including: - Enhanced KV cache management with ISWA and hybrid memory support - Improved multi-modal support (mtmd framework) - New model architectures (Gemma3, Llama4, Qwen3, etc.) - GPU backend improvements for CUDA, Metal, and ROCm - Updated quantization support and GGUF format handling ### Documentation - Updated CLAUDE.md with comprehensive build instructions - Documented toolchain constraints and CPU architecture trade-offs - Removed outdated CI/CD workflows (tesla-k80-*.yml) - Cleaned up temporary development artifacts ## Rationale This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in official Ollama due to legacy driver/CUDA requirements. The toolchain constraint creates a deadlock: - K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-18 03:37:09 +00:00 · 2025-11-05 14:03:05 +08:00
parent fabe2c5cb7
commit ef14fb5b26
817 changed files with 241634 additions and 70888 deletions
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -40,11 +40,6 @@ type Causal struct {

 	// ** current forward pass **

-	// curReserve indicates that this forward pass is only for
-	// memory reservation and we should not update our metadata
-	// based on it.
-	curReserve bool
-
 	// the active layer for Get and Put
 	curLayer int

@@ -160,7 +155,15 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
 	if c.swaMemorySize == 0 {
 		c.swaMemorySize = c.swaWindowSize
 	}
-	if int(c.swaMemorySize) > capacity {
+	// We will allocate space in the cache for the stop token, which won't be part of a follow on
+	// sequence, so allocate an extra token of storage to ensure that we can jump back without
+	// causing a cache break. As an optimization, only do this when we have parallel sequences
+	// because the extra token will live in the batch buffer and won't get overwritten if we
+	// only have a single sequence.
+	if c.swaMemorySize != math.MaxInt32 && maxSequences > 1 {
+		c.swaMemorySize = max(c.swaMemorySize, c.swaWindowSize+1)
+	}
+	if int(c.swaMemorySize) >= capacity {
 		c.swaMemorySize = math.MaxInt32
 	}

@@ -198,13 +201,12 @@ func (c *Causal) Close() {
 }

 func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
-	c.curReserve = reserve
 	c.curBatchSize = len(batch.Positions)
 	c.curSequences = batch.Sequences
 	c.curPositions = batch.Positions
 	c.opts.Except = nil

-	if !c.curReserve {
+	if !reserve {
 		c.updateSlidingWindow()

 		var err error
@@ -214,7 +216,6 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 			c.curLoc, err = c.findStartLoc()
 		}
 		if err != nil {
-			slog.Warn("unable to find a kv cache slot", "cache", c)
 			return err
 		}

@@ -288,23 +289,44 @@ func (c *Causal) updateSlidingWindow() {
 		return
 	}

+	type lowestPosition struct {
+		pos      int32
+		curBatch bool
+	}
+
 	// create a map of unique sequences to the lowest position in that sequence
-	lowestPos := make(map[int]int32)
+	lowestPos := make(map[int]lowestPosition)
 	for i := range c.curPositions {
 		seq := c.curSequences[i]

-		pos, ok := lowestPos[seq]
+		lowest, ok := lowestPos[seq]
 		if !ok {
-			pos = c.curPositions[i]
-		} else if c.curPositions[i] < pos {
-			pos = c.curPositions[i]
+			lowest = lowestPosition{pos: c.curPositions[i], curBatch: true}
+		} else if c.curPositions[i] < lowest.pos {
+			lowest.pos = c.curPositions[i]
 		}

-		lowestPos[seq] = pos
+		lowestPos[seq] = lowest
+	}
+
+	// for any sequences are not part of this batch, clean up any tokens
+	// that are no longer needed after the processing of the previous
+	// batch
+	for seq, seqRange := range c.cellRanges {
+		if _, ok := lowestPos[seq]; !ok {
+			var last int32
+			for i := seqRange.min; i <= seqRange.max; i++ {
+				if slices.Contains(c.cells[i].sequences, seq) {
+					last = max(last, c.cells[i].pos)
+				}
+			}
+
+			lowestPos[seq] = lowestPosition{pos: last + 1, curBatch: false}
+		}
 	}

 	// delete any entries that are beyond the window of the oldest position in the sequence
-	for seq, pos := range lowestPos {
+	for seq, lowest := range lowestPos {
 		oldRange, ok := c.cellRanges[seq]
 		if !ok {
 			continue
@@ -314,13 +336,13 @@ func (c *Causal) updateSlidingWindow() {

 		for i := oldRange.min; i <= oldRange.max; i++ {
 			if slices.Contains(c.cells[i].sequences, seq) {
-				if c.cells[i].pos < pos-c.swaMemorySize {
+				if c.cells[i].pos < lowest.pos-c.swaMemorySize {
 					c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
 				} else {
 					newRange.min = min(newRange.min, i)
 					newRange.max = max(newRange.max, i)
 				}
-				if c.cells[i].pos >= pos-c.swaWindowSize {
+				if lowest.curBatch && c.cells[i].pos >= lowest.pos-c.swaWindowSize {
 					c.curCellRange.min = min(c.curCellRange.min, i)
 					c.curCellRange.max = max(c.curCellRange.max, i)
 				}
@@ -351,10 +373,6 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {

 	length := c.curCellRange.max - c.curCellRange.min + 1

-	if c.curReserve {
-		return ctx.Input().Empty(c.config.MaskDType, length, batchSize)
-	}
-
 	mask := make([]float32, batchSize*length)

 	for i := range c.curBatchSize {
@@ -375,12 +393,10 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 		mask[i] = float32(math.Inf(-1))
 	}

-	maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
+	maskTensor := ctx.Input().FromFloats(mask, length, batchSize)

 	if c.config.MaskDType != ml.DTypeF32 {
-		out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
-		ctx.Forward(maskTensor.Copy(ctx, out))
-		maskTensor = out
+		maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
 	}

 	return maskTensor
@@ -659,9 +675,11 @@ func (c *Causal) CanResume(seq int, pos int32) bool {

 	// for sliding window, check that the window of the new sequence is contained in
 	// the window of what we are storing
+	var first int32 = math.MaxInt32
 	var last int32 = -1
 	for i := seqRange.min; i <= seqRange.max; i++ {
 		if slices.Contains(c.cells[i].sequences, seq) {
+			first = min(first, c.cells[i].pos)
 			last = max(last, c.cells[i].pos)
 		}
 	}
@@ -670,10 +688,8 @@ func (c *Causal) CanResume(seq int, pos int32) bool {
 		return false
 	}

-	lastWindowStart := max(0, last-c.swaMemorySize)
 	posWindowStart := max(0, pos-c.swaWindowSize)
-
-	return posWindowStart >= lastWindowStart
+	return posWindowStart >= first && pos <= last+1
 }

 func (c *Causal) shift(seq int, beginIndex, offset int32) error {
@@ -709,7 +725,7 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 		offsets = offsets[batchFirst : batchLast+1]

 		ctx := c.backend.NewContext()
-		kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
+		kShift := ctx.Input().FromInts(offsets, len(offsets))

 		for i, key := range c.keys {
 			if key == nil {