Sync with upstream ollama/ollama and restore Tesla K80 (compute 3.7) support

This commit represents a complete rework after pulling the latest changes from
official ollama/ollama repository and re-applying Tesla K80 compatibility patches.

## Key Changes

### CUDA Compute Capability 3.7 Support (Tesla K80)
- Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt
- Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset
- Using 37-virtual (PTX with JIT compilation) for maximum compatibility

### Legacy Toolchain Compatibility
- **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80)
- **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7)
- **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h)

### CPU Architecture Trade-offs
Due to GCC 10.5 limitation, sacrificed newer CPU optimizations:
- Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+)
- Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA
- Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility)

### Build System Updates
- Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7
- Added -Wno-deprecated-gpu-targets flag to suppress warnings
- Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI

### Upstream Sync
Merged latest llama.cpp changes including:
- Enhanced KV cache management with ISWA and hybrid memory support
- Improved multi-modal support (mtmd framework)
- New model architectures (Gemma3, Llama4, Qwen3, etc.)
- GPU backend improvements for CUDA, Metal, and ROCm
- Updated quantization support and GGUF format handling

### Documentation
- Updated CLAUDE.md with comprehensive build instructions
- Documented toolchain constraints and CPU architecture trade-offs
- Removed outdated CI/CD workflows (tesla-k80-*.yml)
- Cleaned up temporary development artifacts

## Rationale

This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in
official Ollama due to legacy driver/CUDA requirements. The toolchain constraint
creates a deadlock:
- K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI

We accept the loss of cutting-edge CPU optimizations to enable running modern
LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Shang Chieh Tseng
2025-11-05 14:03:05 +08:00
parent fabe2c5cb7
commit ef14fb5b26
817 changed files with 241634 additions and 70888 deletions

View File

@@ -40,11 +40,6 @@ type Causal struct {
// ** current forward pass **
// curReserve indicates that this forward pass is only for
// memory reservation and we should not update our metadata
// based on it.
curReserve bool
// the active layer for Get and Put
curLayer int
@@ -160,7 +155,15 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
if c.swaMemorySize == 0 {
c.swaMemorySize = c.swaWindowSize
}
if int(c.swaMemorySize) > capacity {
// We will allocate space in the cache for the stop token, which won't be part of a follow on
// sequence, so allocate an extra token of storage to ensure that we can jump back without
// causing a cache break. As an optimization, only do this when we have parallel sequences
// because the extra token will live in the batch buffer and won't get overwritten if we
// only have a single sequence.
if c.swaMemorySize != math.MaxInt32 && maxSequences > 1 {
c.swaMemorySize = max(c.swaMemorySize, c.swaWindowSize+1)
}
if int(c.swaMemorySize) >= capacity {
c.swaMemorySize = math.MaxInt32
}
@@ -198,13 +201,12 @@ func (c *Causal) Close() {
}
func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
c.curReserve = reserve
c.curBatchSize = len(batch.Positions)
c.curSequences = batch.Sequences
c.curPositions = batch.Positions
c.opts.Except = nil
if !c.curReserve {
if !reserve {
c.updateSlidingWindow()
var err error
@@ -214,7 +216,6 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
c.curLoc, err = c.findStartLoc()
}
if err != nil {
slog.Warn("unable to find a kv cache slot", "cache", c)
return err
}
@@ -288,23 +289,44 @@ func (c *Causal) updateSlidingWindow() {
return
}
type lowestPosition struct {
pos int32
curBatch bool
}
// create a map of unique sequences to the lowest position in that sequence
lowestPos := make(map[int]int32)
lowestPos := make(map[int]lowestPosition)
for i := range c.curPositions {
seq := c.curSequences[i]
pos, ok := lowestPos[seq]
lowest, ok := lowestPos[seq]
if !ok {
pos = c.curPositions[i]
} else if c.curPositions[i] < pos {
pos = c.curPositions[i]
lowest = lowestPosition{pos: c.curPositions[i], curBatch: true}
} else if c.curPositions[i] < lowest.pos {
lowest.pos = c.curPositions[i]
}
lowestPos[seq] = pos
lowestPos[seq] = lowest
}
// for any sequences are not part of this batch, clean up any tokens
// that are no longer needed after the processing of the previous
// batch
for seq, seqRange := range c.cellRanges {
if _, ok := lowestPos[seq]; !ok {
var last int32
for i := seqRange.min; i <= seqRange.max; i++ {
if slices.Contains(c.cells[i].sequences, seq) {
last = max(last, c.cells[i].pos)
}
}
lowestPos[seq] = lowestPosition{pos: last + 1, curBatch: false}
}
}
// delete any entries that are beyond the window of the oldest position in the sequence
for seq, pos := range lowestPos {
for seq, lowest := range lowestPos {
oldRange, ok := c.cellRanges[seq]
if !ok {
continue
@@ -314,13 +336,13 @@ func (c *Causal) updateSlidingWindow() {
for i := oldRange.min; i <= oldRange.max; i++ {
if slices.Contains(c.cells[i].sequences, seq) {
if c.cells[i].pos < pos-c.swaMemorySize {
if c.cells[i].pos < lowest.pos-c.swaMemorySize {
c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
} else {
newRange.min = min(newRange.min, i)
newRange.max = max(newRange.max, i)
}
if c.cells[i].pos >= pos-c.swaWindowSize {
if lowest.curBatch && c.cells[i].pos >= lowest.pos-c.swaWindowSize {
c.curCellRange.min = min(c.curCellRange.min, i)
c.curCellRange.max = max(c.curCellRange.max, i)
}
@@ -351,10 +373,6 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
length := c.curCellRange.max - c.curCellRange.min + 1
if c.curReserve {
return ctx.Input().Empty(c.config.MaskDType, length, batchSize)
}
mask := make([]float32, batchSize*length)
for i := range c.curBatchSize {
@@ -375,12 +393,10 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
mask[i] = float32(math.Inf(-1))
}
maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
maskTensor := ctx.Input().FromFloats(mask, length, batchSize)
if c.config.MaskDType != ml.DTypeF32 {
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
ctx.Forward(maskTensor.Copy(ctx, out))
maskTensor = out
maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
}
return maskTensor
@@ -659,9 +675,11 @@ func (c *Causal) CanResume(seq int, pos int32) bool {
// for sliding window, check that the window of the new sequence is contained in
// the window of what we are storing
var first int32 = math.MaxInt32
var last int32 = -1
for i := seqRange.min; i <= seqRange.max; i++ {
if slices.Contains(c.cells[i].sequences, seq) {
first = min(first, c.cells[i].pos)
last = max(last, c.cells[i].pos)
}
}
@@ -670,10 +688,8 @@ func (c *Causal) CanResume(seq int, pos int32) bool {
return false
}
lastWindowStart := max(0, last-c.swaMemorySize)
posWindowStart := max(0, pos-c.swaWindowSize)
return posWindowStart >= lastWindowStart
return posWindowStart >= first && pos <= last+1
}
func (c *Causal) shift(seq int, beginIndex, offset int32) error {
@@ -709,7 +725,7 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
offsets = offsets[batchFirst : batchLast+1]
ctx := c.backend.NewContext()
kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
kShift := ctx.Input().FromInts(offsets, len(offsets))
for i, key := range c.keys {
if key == nil {