Sync with upstream ollama/ollama and restore Tesla K80 (compute 3.7) support

This commit is a complete rework after pulling the latest changes from the
official ollama/ollama repository and re-applying the Tesla K80 compatibility patches.

## Key Changes

### CUDA Compute Capability 3.7 Support (Tesla K80)
- Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt
- Updated CMakePresets.json to include compute 3.7 in the "CUDA 11" preset
- Used 37-virtual (PTX only, JIT-compiled by the driver at load time) for maximum compatibility
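
As a rough sketch (the non-Kepler entries below are illustrative; the actual preset list may differ), the architecture setting ends up along these lines:

```cmake
# Sketch only: add Kepler (K80) alongside the architectures already built upstream.
# "37-virtual" emits PTX for compute_37, which the driver JIT-compiles at load
# time, instead of shipping a prebuilt sm_37 binary.
set(CMAKE_CUDA_ARCHITECTURES "37-virtual;50;52;60;61;70;75;80;86")
```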

### Legacy Toolchain Compatibility
- **NVIDIA Driver**: 470.256.02 (the 470 series is the last driver branch supporting Kepler/K80)
- **CUDA Version**: 11.4.4 (the newest toolkit the 470 driver supports; CUDA 12 drops compute 3.7 entirely)
- **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h)
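
A minimal sketch of how the host compiler can be pinned so nvcc keeps using GCC 10 on systems whose default compiler is newer (the install path is an assumption; passing -ccbin to nvcc achieves the same thing):

```cmake
# Assumed path: point nvcc at GCC 10 so CUDA 11.4's host_config.h check passes
# even when the distribution's default GCC is 11 or newer.
set(CMAKE_CUDA_HOST_COMPILER /usr/bin/g++-10 CACHE FILEPATH "nvcc host compiler")
```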

### CPU Architecture Trade-offs
Because the toolchain is capped at GCC 10.5, some newer CPU optimizations are sacrificed:
- Alderlake CPU variant enabled WITHOUT AVX_VNNI (GCC's -mavxvnni flag requires GCC 11+)
- Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA
- Estimated performance impact: ~3-7% on CPUs that support AVX-VNNI (an acceptable cost for K80 compatibility)
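
The constraint reduces to a compiler-version gate; the check below illustrates the idea and is not the exact code in the tree:

```cmake
# Illustration only: GCC first shipped -mavxvnni in GCC 11, so AVX_VNNI can
# only be requested when the host compiler is new enough to emit it.
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11)
    message(STATUS "GCC ${CMAKE_CXX_COMPILER_VERSION} < 11: building alderlake variant without AVX_VNNI")
endif()
```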

### Build System Updates
- Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7
- Added the -Wno-deprecated-gpu-targets flag to suppress nvcc's deprecation warnings for sm_37
- Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI
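
Sketches of the two edits (the ggml_add_cpu_backend_variant macro and its feature list follow upstream ggml conventions and are illustrative rather than verbatim):

```cmake
# ggml-cuda/CMakeLists.txt (sketch): nvcc warns that sm_37 is deprecated under
# CUDA 11.x; silence that warning for CUDA sources only.
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-Wno-deprecated-gpu-targets>")

# src/CMakeLists.txt (sketch): keep the alderlake CPU variant, minus AVX_VNNI,
# which GCC 10.5 cannot compile.
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA)
```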

### Upstream Sync
Merged the latest llama.cpp changes, including:
- Enhanced KV cache management with ISWA and hybrid memory support
- Improved multi-modal support (mtmd framework)
- New model architectures (Gemma3, Llama4, Qwen3, etc.)
- GPU backend improvements for CUDA, Metal, and ROCm
- Updated quantization support and GGUF format handling

### Documentation
- Updated CLAUDE.md with comprehensive build instructions
- Documented toolchain constraints and CPU architecture trade-offs
- Removed outdated CI/CD workflows (tesla-k80-*.yml)
- Cleaned up temporary development artifacts

## Rationale

This fork maintains Tesla K80 GPU support (compute 3.7), which official Ollama
dropped because of the legacy driver and CUDA versions it requires. The hardware
pins the entire toolchain into a single chain of constraints:
- K80 → Driver 470 → CUDA 11.4 → GCC 10 → no AVX_VNNI

We accept the loss of cutting-edge CPU optimizations to enable running modern
LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Shang Chieh Tseng
Date: 2025-11-05 14:03:05 +08:00
Parent: fabe2c5cb7
Commit: ef14fb5b26
817 changed files with 241634 additions and 70888 deletions


@@ -5,7 +5,6 @@ import (
"context"
"encoding/binary"
"fmt"
"log/slog"
"math"
"slices"
"strconv"
@@ -15,6 +14,9 @@ import (
)
type Backend interface {
// Close frees all memory associated with this backend
Close()
Load(ctx context.Context, progress func(float32)) error
// BackendMemory returns the memory allocations that were made for this model
@@ -24,6 +26,9 @@ type Backend interface {
Get(name string) Tensor
NewContext() Context
NewContextSize(size int) Context
// Enumerate the devices available for inference via this backend
BackendDevices() []DeviceInfo
}
// BackendCacheConfig should be implemented by backends that need special output
@@ -57,143 +62,21 @@ type CacheConfig struct {
// BackendParams controls how the backend loads and executes models
type BackendParams struct {
// AllocMemory causes the backend to allocate memory for the model. If
// false, this is only being used for discovering the required amount of
// memory and cannot load the model for running.
AllocMemory bool
// NumThreads sets the number of threads to use if running on the CPU
NumThreads int
// MainGPU is the index of the primary GPU to use
MainGPU int
// NumGPULayers is the number of layers to offload to GPUs
NumGPULayers int
// TensorSplit is the fraction of the model to offload to each GPU
TensorSplit []float32
// GPULayers is the set of layers to offload to GPUs
GPULayers GPULayersList
// FlashAttention indicates that we should use a fused flash attention kernel
FlashAttention bool
}
// ErrNoMem is returned when panicking due to insufficient memory. It includes
// the attempted memory allocation.
type ErrNoMem struct {
BackendMemory
}
func (e ErrNoMem) Error() string {
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
}
type AllocationStatus int
const (
// Unallocated memory - have not yet attempted to allocate
Unallocated AllocationStatus = iota
// Failed memory - tried to allocate the memory and did not succeed
Failed
// Allocated memory - tried and succeeded to allocate memory
Allocated
)
// Memory is the size of an allocation and whether it was successful.
type Memory struct {
Size uint64
Status AllocationStatus
}
func (m Memory) String() string {
s := fmt.Sprint(m.Size)
switch m.Status {
case Unallocated:
s += "U"
case Failed:
s += "F"
case Allocated:
s += "A"
}
return s
}
// DeviceMemory provides a breakdown of the memory needed
// per device, such as a CPU or GPU.
type DeviceMemory struct {
// Name is the name of the device as labeled by the backend. It
// may not be persistent across instances of the runner.
Name string
// ID is an identifier for the device for matching with system
// management libraries.
ID string
// Weights is the per-layer memory needed for the model weights.
Weights []Memory
// Cache is the per-layer memory needed for the KV cache.
Cache []Memory
// Graph is the size of the compute graph. It is not per-layer.
Graph Memory
}
func memoryPresent(mem []Memory) bool {
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
}
func (m DeviceMemory) LogValue() slog.Value {
var attrs []slog.Attr
if memoryPresent(m.Weights) {
attrs = append(attrs, slog.Any("Weights", m.Weights))
}
if memoryPresent(m.Cache) {
attrs = append(attrs, slog.Any("Cache", m.Cache))
}
if m.Graph.Size != 0 {
attrs = append(attrs, slog.Any("Graph", m.Graph))
}
if len(attrs) > 0 && m.ID != "" {
attrs = append([]slog.Attr{slog.String("ID", m.ID)}, attrs...)
}
return slog.GroupValue(attrs...)
}
// BackendMemory provides the amount of memory required to load the model
// per device based on the BackendParams. In some cases, not all required
// allocations will be known at this point. However, the size of the most recent
// allocation is guaranteed to be provided so that if it failed, the caller can
// accommodate that to make forward progress.
type BackendMemory struct {
// InputWeights are always located on the CPU and cannot be moved
InputWeights Memory
// CPU model components are located in system memory. This does not
// include unified memory allocated through the GPU.
CPU DeviceMemory
// GPU model components are located on one or more GPUs.
GPUs []DeviceMemory
}
func (m BackendMemory) LogValue() slog.Value {
var attrs []slog.Attr
if m.InputWeights.Size != 0 {
attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
}
attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
for _, g := range m.GPUs {
attrs = append(attrs, slog.Any(g.Name, g))
}
return slog.GroupValue(attrs...)
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
@@ -215,14 +98,21 @@ func NewBackend(modelPath string, params BackendParams) (Backend, error) {
type Context interface {
Empty(dtype DType, shape ...int) Tensor
Zeros(dtype DType, shape ...int) Tensor
FromFloatSlice(s []float32, shape ...int) Tensor
FromIntSlice(s []int32, shape ...int) Tensor
FromBytes(dtype DType, s []byte, shape ...int) Tensor
FromFloats(s []float32, shape ...int) Tensor
FromInts(s []int32, shape ...int) Tensor
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
Arange(start, stop, step float32, dtype DType) Tensor
Forward(...Tensor) Context
// SetBatchSize provides a hint on the batch size to optimize processing
// Uses heuristics if not set
SetBatchSize(int)
Compute(...Tensor)
ComputeWithNotify(func(), ...Tensor) // notify callback once compute has begun
// Reserve is analogous to Compute but rather than executing a
// graph, simply preallocates memory. Typically called with a
@@ -247,10 +137,15 @@ type Tensor interface {
Shape() []int
DType() DType
Cast(ctx Context, dtype DType) Tensor
Bytes() []byte
Floats() []float32
FromBytes([]byte)
FromFloats([]float32)
FromInts([]int32)
Neg(ctx Context) Tensor
Add(ctx Context, t2 Tensor) Tensor
Sub(ctx Context, t2 Tensor) Tensor
@@ -260,8 +155,10 @@ type Tensor interface {
Mulmat(ctx Context, t2 Tensor) Tensor
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
MulmatID(ctx Context, t2, ids Tensor) Tensor
AddID(ctx Context, t2, ids Tensor) Tensor
Softmax(ctx Context) Tensor
L2Norm(ctx Context, eps float32) Tensor
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
Scale(ctx Context, s float64) Tensor
@@ -269,18 +166,21 @@ type Tensor interface {
AvgPool2D(ctx Context, k, s int, p float32) Tensor
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
Conv3D(ctx Context, weight Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) Tensor
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
Sin(ctx Context) Tensor
Cos(ctx Context) Tensor
Tanh(ctx Context) Tensor
GELU(ctx Context) Tensor
QuickGELU(ctx Context) Tensor
SILU(ctx Context) Tensor
RELU(ctx Context) Tensor
GELU(ctx Context, up ...Tensor) Tensor
SILU(ctx Context, up ...Tensor) Tensor
RELU(ctx Context, up ...Tensor) Tensor
Sigmoid(ctx Context) Tensor
// SILUAlphaLimit is a variant of SILU that clamps the input to the range [-limit, limit]
SILUAlphaLimit(ctx Context, up Tensor, alpha, limit float32) Tensor
Reshape(ctx Context, shape ...int) Tensor
View(ctx Context, offset int, shape ...int) Tensor
Permute(ctx Context, shape ...int) Tensor
@@ -329,7 +229,7 @@ type Tensor interface {
// kqv := value.Mulmat(ctx, kq)
// return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
type ScaledDotProductAttention interface {
ScaledDotProductAttention(ctx Context, key, value, mask Tensor, scale float64) Tensor
ScaledDotProductAttention(ctx Context, key, value, mask, sinks Tensor, scale float64) Tensor
}
type number interface {