next ollama runner (#7913)

feat: add new Ollama engine using ggml through cgo This change introduces a new way to run pretrained models. It introduces 3 high level interfaces and a bunch of smaller helper interfaces to facilitate this. - `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, can implement the model's forward propagation in the `Forward` method. This method will be called to generate completions. This interface can be found in `model/model.go` - `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc) and providing an interface for Models to access loaded tensors. This interface can be found in `ml/backend.go` - `ml.Tensor` defines the interface for a tensor and tensor operations This is the first implementation of the new engine. Follow up PRs will implement more features: - non-greedy sampling (#8410) - integration with Ollama and KV caching (#8301) - more model support (#9080) with more coming soon Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2025-12-10 15:57:04 +00:00 · 2025-02-14 00:31:21 +00:00
parent 8cf16063a5
commit 58245413f4
57 changed files with 475427 additions and 494 deletions
--- a/server/images.go
+++ b/server/images.go
@@ -23,7 +23,7 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/model"
@@ -78,21 +78,21 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
 	for _, cap := range caps {
 		switch cap {
 		case CapabilityCompletion:
-			f, err := os.Open(m.ModelPath)
+			r, err := os.Open(m.ModelPath)
 			if err != nil {
 				slog.Error("couldn't open model file", "error", err)
 				continue
 			}
-			defer f.Close()
+			defer r.Close()

 			// TODO(mxyng): decode the GGML into model to avoid doing this multiple times
-			ggml, _, err := llm.DecodeGGML(f, 0)
+			f, _, err := ggml.Decode(r, 0)
 			if err != nil {
 				slog.Error("couldn't decode ggml", "error", err)
 				continue
 			}

-			if _, ok := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]; ok {
+			if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
 				errs = append(errs, errCapabilityCompletion)
 			}
 		case CapabilityTools: