next ollama runner (#7913)

feat: add new Ollama engine using ggml through cgo

This change introduces a new way to run pretrained models. It adds three high-level interfaces, along with a number of smaller helper interfaces, to facilitate this.

- `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, implement their forward pass in the `Forward` method, which is called to generate completions. This interface can be found in `model/model.go`.
- `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model onto hardware (GPU, CPU, etc.) and for giving Models access to the loaded tensors. This interface can be found in `ml/backend.go`.
- `ml.Tensor` defines the interface for a tensor and for tensor operations. A rough sketch of how the three interfaces fit together follows this list.
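
To make the split concrete, here is a minimal sketch of how the three interfaces relate. The method names below are invented for illustration only; the real definitions in `model/model.go` and `ml/backend.go` have different, richer signatures.

```go
// Rough sketch only: these method names are invented and do not match the
// real interfaces in model/model.go and ml/backend.go.
package sketch

// Tensor stands in for ml.Tensor: a handle to backend-owned data plus the
// operations a model composes in its forward pass.
type Tensor interface {
	Shape() []int
	Add(t Tensor) Tensor
	Mulmat(t Tensor) Tensor
}

// Backend stands in for ml.Backend: it loads a pretrained model onto the
// available hardware (GPU, CPU, etc.) and exposes the loaded tensors.
type Backend interface {
	Get(name string) Tensor
}

// Model stands in for model.Model: an architecture such as llama or mllama
// implements Forward, which is called to generate completions.
type Model interface {
	Forward(input Tensor) (Tensor, error)
}
```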

This is the first implementation of the new engine. Follow-up PRs will implement more features:

- non-greedy sampling (#8410)
- integration with Ollama and KV caching (#8301)
- support for more models (#9080), with more coming soon

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
Author: Michael Yang
Date: 2025-02-14 00:31:21 +00:00
Committed by: GitHub
Parent: 8cf16063a5
Commit: 58245413f4
57 changed files with 475,427 additions and 494 deletions

@@ -11,18 +11,19 @@ import (
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
)
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
var layerCount int
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
estimate := EstimateGPULayers(gpus, f, projectors, opts)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
if opts.NumGPU < 0 {
if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
return true, estimatedVRAM
}
} else {
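
For orientation, here is a hedged sketch of how a caller might obtain the `*ggml.GGML` value this function now takes. It assumes it lives in the same package; the helper name and its inputs (`modelPath`, `gpus`, `opts`) are invented, and `ggml.Decode` mirrors the usage shown later in this diff, replacing the old `DecodeGGML` helper.

```go
// Sketch only, not code from this commit: modelPath, gpus and opts are
// assumed inputs; error handling is reduced to the essentials.
func serverFits(modelPath string, gpus discover.GpuInfoList, opts api.Options) (bool, uint64, error) {
	file, err := os.Open(modelPath)
	if err != nil {
		return false, 0, err
	}
	defer file.Close()

	// Decode the model file's metadata; this replaces the old DecodeGGML helper.
	f, _, err := ggml.Decode(file, 0)
	if err != nil {
		return false, 0, err
	}

	fits, vram := PredictServerFit(gpus, f, nil, nil, opts)
	return fits, vram, nil
}
```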
@@ -70,7 +71,7 @@ type MemoryEstimate struct {
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64
@@ -115,33 +116,31 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
opts.NumCtx = max(opts.NumCtx, 2048)
}
layers := ggml.Tensors().Layers()
layers := f.Tensors().GroupLayers()
// add one layer worth of memory as a buffer
if blk0, ok := layers["blk.0"]; ok {
layerSize = blk0.size()
layerSize = blk0.Size()
} else {
slog.Warn("model missing blk.0 layer size")
}
fa := envconfig.FlashAttention() &&
discover.GetGPUInfo().FlashAttentionSupported() &&
ggml.SupportsFlashAttention()
var kvct string
if fa {
if envconfig.FlashAttention() &&
discover.GetGPUInfo().FlashAttentionSupported() &&
f.SupportsFlashAttention() {
requested := strings.ToLower(envconfig.KvCacheType())
if requested != "" && ggml.SupportsKVCacheType(requested) {
if requested != "" && f.SupportsKVCacheType(requested) {
kvct = requested
}
}
kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
layerSize += kv / f.KV().BlockCount()
if graphPartialOffload == 0 {
graphPartialOffload = ggml.KV().GQA() * kv / 6
graphPartialOffload = f.KV().GQA() * kv / 6
}
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
@@ -156,12 +155,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
}
if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.size()
memoryLayerOutput += layer.Size()
}
if layer, ok := layers["output"]; ok {
memoryLayerOutput += layer.size()
memoryLayerOutput += layer.Size()
} else if layer, ok := layers["token_embd"]; ok {
memoryLayerOutput += layer.size()
memoryLayerOutput += layer.Size()
}
// Output layer handled at the end if we have space
@@ -211,11 +210,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
}
// For all the layers, find where they can fit on the GPU(s)
for i := range int(ggml.KV().BlockCount()) {
for i := range int(f.KV().BlockCount()) {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.size()
layerSize += kv / ggml.KV().BlockCount()
layerSize = blk.Size()
layerSize += kv / f.KV().BlockCount()
}
memoryWeights += layerSize
@@ -238,10 +237,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
}
}
}
if layerCount >= int(ggml.KV().BlockCount()) {
if layerCount >= int(f.KV().BlockCount()) {
fullyLoaded = true
} else {
for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
for i := layerCount; i < int(f.KV().BlockCount()); i++ {
overflow += layerSize
}
}
@@ -259,7 +258,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
}
}
if layerCount < int(ggml.KV().BlockCount())+1 {
if layerCount < int(f.KV().BlockCount())+1 {
fullyLoaded = false
overflow += memoryLayerOutput
}
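
Illustrative example (numbers not from this change): for a 32-block model where only 28 layers fit on the GPUs, the remaining 4 layers' worth of weights are counted as overflow; and because layerCount is below BlockCount()+1, the output layer memory is added to overflow as well and fullyLoaded stays false.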
@@ -311,7 +310,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
inferenceLibrary: gpus[0].Library,
layersRequested: opts.NumGPU,
layersModel: int(ggml.KV().BlockCount()) + 1,
layersModel: int(f.KV().BlockCount()) + 1,
availableList: availableList,
kv: kv,
allocationsList: allocationsList,
@@ -339,22 +338,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
return estimate
}
func (m MemoryEstimate) log() {
overhead := envconfig.GpuOverhead()
log := slog.With()
if m.projectorWeights > 0 {
log = log.With(
slog.Group(
"projector",
"weights", format.HumanBytes2(m.projectorWeights),
"graph", format.HumanBytes2(m.projectorGraph),
),
)
}
log.Info(
"offload to "+m.inferenceLibrary,
func (m MemoryEstimate) LogValue() slog.Value {
attrs := []slog.Attr{
slog.String("library", m.inferenceLibrary),
slog.Group(
"layers",
// requested number of layers to offload
@@ -370,7 +356,7 @@ func (m MemoryEstimate) log() {
"memory",
// memory available by GPU for offloading
"available", m.availableList,
"gpu_overhead", format.HumanBytes2(overhead),
"gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()),
slog.Group(
"required",
// memory required for full offloading
@@ -399,7 +385,17 @@ func (m MemoryEstimate) log() {
"partial", format.HumanBytes2(m.graphPartialOffload),
),
),
)
}
if m.projectorWeights > 0 {
attrs = append(attrs, slog.Group(
"projector",
"weights", format.HumanBytes2(m.projectorWeights),
"graph", format.HumanBytes2(m.projectorGraph),
))
}
return slog.GroupValue(attrs...)
}
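
Because `MemoryEstimate` now implements `slog.LogValuer` rather than carrying its own `log()` method, call sites can pass the estimate directly as a log attribute and the handler expands the grouped fields built above. A hedged sketch of the call-site pattern (the message text is illustrative):

```go
// slog resolves attribute values that implement slog.LogValuer, so this
// emits the grouped "layers", "memory" and (if present) "projector" fields.
slog.Info("offload", "estimate", estimate)
```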
func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
@@ -409,13 +405,13 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
}
defer file.Close()
ggml, _, err := DecodeGGML(file, 0)
ggml, _, err := ggml.Decode(file, 0)
if err != nil {
return 0, 0
}
for _, layer := range ggml.Tensors().Layers() {
weights += layer.size()
for _, layer := range ggml.Tensors().GroupLayers() {
weights += layer.Size()
}
switch arch := ggml.KV().Architecture(); arch {
@@ -435,7 +431,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
headCount := kv("attention.head_count")
numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
if _, ok := ggml.Tensors().Layers()["v"]["class_embd"]; ok {
if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
numPatches++
}
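
As a concrete check with typical llava-style projector metadata (values illustrative, not read from this diff): image_size 336 and patch_size 14 give numPatches = (336 / 14) × (336 / 14) = 576, plus one position when a class embedding tensor is present, for 577 in total.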