Sync with upstream ollama/ollama and restore Tesla K80 (compute 3.7) support

This commit represents a complete rework after pulling the latest changes from
official ollama/ollama repository and re-applying Tesla K80 compatibility patches.

## Key Changes

### CUDA Compute Capability 3.7 Support (Tesla K80)
- Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt
- Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset
- Using 37-virtual (PTX with JIT compilation) for maximum compatibility

### Legacy Toolchain Compatibility
- **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80)
- **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7)
- **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h)

### CPU Architecture Trade-offs
Due to GCC 10.5 limitation, sacrificed newer CPU optimizations:
- Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+)
- Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA
- Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility)

### Build System Updates
- Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7
- Added -Wno-deprecated-gpu-targets flag to suppress warnings
- Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI

### Upstream Sync
Merged latest llama.cpp changes including:
- Enhanced KV cache management with ISWA and hybrid memory support
- Improved multi-modal support (mtmd framework)
- New model architectures (Gemma3, Llama4, Qwen3, etc.)
- GPU backend improvements for CUDA, Metal, and ROCm
- Updated quantization support and GGUF format handling

### Documentation
- Updated CLAUDE.md with comprehensive build instructions
- Documented toolchain constraints and CPU architecture trade-offs
- Removed outdated CI/CD workflows (tesla-k80-*.yml)
- Cleaned up temporary development artifacts

## Rationale

This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in
official Ollama due to legacy driver/CUDA requirements. The toolchain constraint
creates a deadlock:
- K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI

We accept the loss of cutting-edge CPU optimizations to enable running modern
LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Shang Chieh Tseng
2025-11-05 14:03:05 +08:00
parent fabe2c5cb7
commit ef14fb5b26
817 changed files with 241634 additions and 70888 deletions

View File

@@ -34,8 +34,8 @@ type InputCache struct {
func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, batchSize int, multiUserCache bool) (*InputCache, error) {
numCtx := kvSize / int32(numSlots)
if numCtx < 1 {
return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
if int(numCtx) < batchSize {
return nil, fmt.Errorf("kv size must be at least as large as batch size * parallel (kv: %v batch: %v parallel: %v)", kvSize, batchSize, numSlots)
}
slots := make([]InputCacheSlot, numSlots)
@@ -70,11 +70,13 @@ func kvCacheTypeFromStr(s string) ml.DType {
}
func (c *InputCache) Close() {
c.cache.Close()
if c != nil && c.cache != nil {
c.cache.Close()
}
}
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be be held that serializes
// through LoadCacheSlot) require a lock to be held that serializes
// these operations with each other and processBatch
type InputCacheSlot struct {
@@ -82,7 +84,7 @@ type InputCacheSlot struct {
Id int
// Inputs that are stored in the KV cache
Inputs []input.Input
Inputs []*input.Input
// is this cache actively being processed as part of a sequence?
InUse bool
@@ -91,7 +93,7 @@ type InputCacheSlot struct {
lastUsed time.Time
}
func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []input.Input, error) {
func (c *InputCache) LoadCacheSlot(prompt []*input.Input, cachePrompt bool) (*InputCacheSlot, []*input.Input, error) {
var slot *InputCacheSlot
var numPast int32
var err error
@@ -109,6 +111,10 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
return nil, nil, err
}
if !cachePrompt {
numPast = 0
}
slot.InUse = true
slot.lastUsed = time.Now()
@@ -142,7 +148,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
return slot, prompt, nil
}
func (c *InputCache) findLongestCacheSlot(prompt []input.Input) (*InputCacheSlot, int32, error) {
func (c *InputCache) findLongestCacheSlot(prompt []*input.Input) (*InputCacheSlot, int32, error) {
longest := int32(-1)
var longestSlot *InputCacheSlot
@@ -165,7 +171,7 @@ func (c *InputCache) findLongestCacheSlot(prompt []input.Input) (*InputCacheSlot
return longestSlot, longest, nil
}
func (c *InputCache) findBestCacheSlot(prompt []input.Input) (*InputCacheSlot, int32, error) {
func (c *InputCache) findBestCacheSlot(prompt []*input.Input) (*InputCacheSlot, int32, error) {
oldest := time.Now()
var oldestSlot *InputCacheSlot
@@ -201,7 +207,7 @@ func (c *InputCache) findBestCacheSlot(prompt []input.Input) (*InputCacheSlot, i
if longest > 0 && longestSlot != oldestSlot {
slog.Debug("forking cache slot", "src", longestSlot.Id, "dst", oldestSlot.Id, "inputs", longest, "total",
len(longestSlot.Inputs))
oldestSlot.Inputs = make([]input.Input, longest)
oldestSlot.Inputs = make([]*input.Input, longest)
copy(oldestSlot.Inputs, longestSlot.Inputs[:longest])
if c.cache != nil {
c.cache.CopyPrefix(longestSlot.Id, oldestSlot.Id, longest)
@@ -211,7 +217,7 @@ func (c *InputCache) findBestCacheSlot(prompt []input.Input) (*InputCacheSlot, i
return oldestSlot, longest, nil
}
func countCommonPrefix(a []input.Input, b []input.Input) int32 {
func countCommonPrefix(a []*input.Input, b []*input.Input) int32 {
var count int32
for i := range a {
@@ -229,24 +235,32 @@ func countCommonPrefix(a []input.Input, b []input.Input) int32 {
return count
}
// TODO(jessegross): If we need to reprocess the inputs we should ensure that
// we don't split up a SameBatch
func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
targetFree := (c.numCtx - numKeep) / 2
targetFree = max(targetFree, 1)
// ShiftDiscard computes how many inputs can be discarded from the cache. Inputs in the same batch
// are discarded together.
func (c *InputCache) ShiftDiscard(inputs []*input.Input, numKeep int32) int32 {
targetFree := max((c.numCtx-numKeep)/2, 1)
currentFree := c.numCtx - int32(len(inputs))
currentFree := c.numCtx - inputLen
discard := targetFree - currentFree
var discard, sameBatch int32
for _, input := range inputs[numKeep:] {
if sameBatch <= 0 && currentFree >= targetFree {
break
}
if discard < 0 {
discard = 0
sameBatch--
currentFree++
discard++
if input.SameBatch > 0 {
sameBatch = int32(input.SameBatch)
}
}
return discard
}
type ErrReprocessInputs struct {
Inputs []input.Input
Inputs []*input.Input
}
func (e *ErrReprocessInputs) Error() string {
@@ -263,7 +277,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
}
inputLen := int32(len(slot.Inputs))
discard := c.ShiftDiscard(inputLen, numKeep)
discard := c.ShiftDiscard(slot.Inputs, numKeep)
if discard <= 0 {
return nil
@@ -279,13 +293,13 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
"id", slot.Id, "error", err)
// Create new input slice with preserved tokens (numKeep + remaining tokens after discard)
newInputs := make([]input.Input, numKeep+inputLen-(numKeep+discard))
newInputs := make([]*input.Input, numKeep+inputLen-(numKeep+discard))
copy(newInputs[:numKeep], slot.Inputs[:numKeep])
copy(newInputs[numKeep:], slot.Inputs[numKeep+discard:])
// Reset the cache
_ = c.cache.Remove(slot.Id, 0, math.MaxInt32)
slot.Inputs = []input.Input{}
slot.Inputs = []*input.Input{}
// Return error with inputs that need to be reprocessed
return &ErrReprocessInputs{Inputs: newInputs}