mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
This commit represents a complete rework after pulling the latest changes from official ollama/ollama repository and re-applying Tesla K80 compatibility patches. ## Key Changes ### CUDA Compute Capability 3.7 Support (Tesla K80) - Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt - Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset - Using 37-virtual (PTX with JIT compilation) for maximum compatibility ### Legacy Toolchain Compatibility - **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80) - **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7) - **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h) ### CPU Architecture Trade-offs Due to GCC 10.5 limitation, sacrificed newer CPU optimizations: - Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+) - Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA - Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility) ### Build System Updates - Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7 - Added -Wno-deprecated-gpu-targets flag to suppress warnings - Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI ### Upstream Sync Merged latest llama.cpp changes including: - Enhanced KV cache management with ISWA and hybrid memory support - Improved multi-modal support (mtmd framework) - New model architectures (Gemma3, Llama4, Qwen3, etc.) - GPU backend improvements for CUDA, Metal, and ROCm - Updated quantization support and GGUF format handling ### Documentation - Updated CLAUDE.md with comprehensive build instructions - Documented toolchain constraints and CPU architecture trade-offs - Removed outdated CI/CD workflows (tesla-k80-*.yml) - Cleaned up temporary development artifacts ## Rationale This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in official Ollama due to legacy driver/CUDA requirements. The toolchain constraint creates a deadlock: - K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
77 lines
2.5 KiB
Go
77 lines
2.5 KiB
Go
package nn
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"github.com/ollama/ollama/kvcache"
|
|
"github.com/ollama/ollama/ml"
|
|
)
|
|
|
|
// Attention implements scaled dot-product attention for transformer models:
|
|
// Attention(Q, K, V) = softmax(QK^T/√d_k)V
|
|
//
|
|
// Parameters:
|
|
// - ctx: Context for tensor operations
|
|
// - query: Query tensor (Q) with shape [d_k, heads, seq_len_q]
|
|
// - key: Key tensor (K) with shape [d_k, kv_heads, seq_len_k], can be nil to read from cache only
|
|
// - value: Value tensor (V) with shape [d_v, kv_heads, seq_len_k], can be nil to read from cache only
|
|
// - scale: Scaling factor, typically 1/√d_k where d_k is the key dimension
|
|
// - cache: KV cache to store key/value and get past history, can be nil to only use provided key/value
|
|
//
|
|
// Returns:
|
|
//
|
|
// Attention output with shape [d_v, heads, seq_len_q]
|
|
func Attention(ctx ml.Context, query, key, value ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
|
|
return AttentionWithSinks(ctx, query, key, value, nil, scale, cache)
|
|
}
|
|
|
|
func AttentionWithSinks(ctx ml.Context, query, key, value, sinks ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
|
|
ctx.Forward(query)
|
|
if key != nil && value != nil {
|
|
if query.Dim(0) != key.Dim(0) {
|
|
panic(fmt.Errorf("d_k in attention operation does not match between query(%v) and key(%v)", query.Dim(0), key.Dim(0)))
|
|
}
|
|
|
|
if key.Dim(1) != value.Dim(1) {
|
|
panic(fmt.Errorf("kv_heads in attention operation does not match between key(%v) and value(%v)", key.Dim(1), value.Dim(1)))
|
|
}
|
|
|
|
if key.Dim(2) != value.Dim(2) {
|
|
panic(fmt.Errorf("seq_len_k in attention operation does not match between key(%v) and value(%v)", key.Dim(2), value.Dim(2)))
|
|
}
|
|
|
|
ctx.Forward(key, value)
|
|
if cache != nil {
|
|
cache.Put(ctx, key, value)
|
|
}
|
|
} else if cache == nil {
|
|
panic("key & value tensors must be provided if cache is nil")
|
|
}
|
|
|
|
var mask ml.Tensor
|
|
if cache != nil {
|
|
key, value, mask = cache.Get(ctx)
|
|
}
|
|
|
|
// Only use the fast SDPA implementation if we have a cache, since that's what
|
|
// will do any expected backend-specific transformations for us
|
|
if sdpa, ok := query.(ml.ScaledDotProductAttention); ok && cache != nil {
|
|
return sdpa.ScaledDotProductAttention(ctx, key, value, mask, sinks, scale)
|
|
} else {
|
|
query = query.Permute(ctx, 0, 2, 1, 3)
|
|
key = key.Permute(ctx, 0, 2, 1, 3)
|
|
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
|
|
|
kq := key.MulmatFullPrec(ctx, query)
|
|
|
|
kq = kq.Scale(ctx, scale)
|
|
if mask != nil {
|
|
kq = kq.Add(ctx, mask)
|
|
}
|
|
kq = kq.Softmax(ctx)
|
|
|
|
kqv := value.Mulmat(ctx, kq)
|
|
return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
|
}
|
|
}
|