next ollama runner (#7913)

feat: add new Ollama engine using ggml through cgo

This change introduces a new way to run pretrained models. It introduces 3 high-level interfaces and a bunch of smaller helper interfaces to facilitate this.

- `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, can implement the model's forward propagation in the `Forward` method. This method will be called to generate completions. This interface can be found in `model/model.go`.
- `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc.) and providing an interface for Models to access loaded tensors. This interface can be found in `ml/backend.go`.
- `ml.Tensor` defines the interface for a tensor and tensor operations.

This is the first implementation of the new engine. Follow-up PRs will implement more features:

- non-greedy sampling (#8410)
- integration with Ollama and KV caching (#8301)
- more model support (#9080), with more coming soon

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2025-12-10 15:57:04 +00:00 · 2025-02-14 00:31:21 +00:00
parent 8cf16063a5
commit 58245413f4
57 changed files with 475427 additions and 494 deletions
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -0,0 +1,196 @@
+package ml
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+)
+
+// Config is the configuration accessor returned by Backend.Config.
+//
+// Apart from Architecture, every method takes a string followed by optional
+// values of the method's result type.
+// NOTE(review): presumably the string is a configuration key and the variadic
+// argument a fallback used when the key is absent — confirm against the
+// implementing backend, which is not visible in this file.
+type Config interface {
+	// Architecture returns the architecture string of the configuration.
+	Architecture() string
+
+	// String and Strings return a string value and a string-slice value.
+	String(string, ...string) string
+	Strings(string, ...[]string) []string
+
+	// Uint and Uints return a uint32 value and a uint32-slice value.
+	Uint(string, ...uint32) uint32
+	Uints(string, ...[]uint32) []uint32
+
+	// Float returns a float32 value.
+	Float(string, ...float32) float32
+}
+
+// Backend is what a constructor registered with RegisterBackend produces and
+// what NewBackend returns. It exposes a Config, looks up tensors by name and
+// creates Contexts.
+type Backend interface {
+	// NewContext returns a new Context.
+	NewContext() Context
+
+	// Config returns the backend's Config.
+	Config() Config
+
+	// Get returns the Tensor stored under name.
+	// NOTE(review): the result for an unknown name is not visible in this
+	// file — presumably nil; confirm against the implementation.
+	Get(name string) Tensor
+}
+
+// backends maps a backend name to the constructor registered for it through
+// RegisterBackend; NewBackend looks constructors up here.
+var backends = map[string]func(*os.File) (Backend, error){}
+
+// RegisterBackend stores the backend constructor f under name.
+//
+// It panics if f is nil or if a constructor is already registered under name,
+// so a broken registration is reported where it happens rather than later as
+// a nil function call inside NewBackend.
+//
+// The registry is a plain map without locking.
+// NOTE(review): presumably this is only called from package init functions —
+// confirm, since concurrent registration would be a data race.
+func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
+	if f == nil {
+		panic("backend: register backend is nil")
+	}
+
+	if _, ok := backends[name]; ok {
+		panic("backend: backend already registered")
+	}
+
+	backends[name] = f
+}
+
+// NewBackend builds a Backend from f using the constructor registered under
+// the name "ggml"; no other registered name is consulted. It returns an
+// "unsupported backend" error when no such constructor has been registered,
+// and otherwise returns whatever the constructor returns.
+func NewBackend(f *os.File) (Backend, error) {
+	newGGML, registered := backends["ggml"]
+	if !registered {
+		return nil, fmt.Errorf("unsupported backend")
+	}
+
+	return newGGML(f)
+}
+
+// Context is the handle returned by Backend.NewContext and the first argument
+// of every Tensor operation. It builds tensors from a shape or from Go
+// slices, and accepts tensors through Forward and Compute.
+//
+// NOTE(review): how Forward and Compute differ, and whether a Context may be
+// used after Close, is decided by the backend implementation and is not
+// visible in this file — confirm there.
+type Context interface {
+	// Tensor construction. The slice-based constructors can fail.
+	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
+	FromIntSlice(s []int32, shape ...int) (Tensor, error)
+	Zeros(dtype DType, shape ...int) Tensor
+
+	// Evaluation.
+	Compute(Tensor) Tensor
+	Forward(Tensor)
+
+	// Close reports an error if the context could not be closed.
+	Close() error
+}
+
+// Tensor is the backend-independent view of a tensor together with the
+// operations a backend provides on it. Every operation takes the Context it
+// runs in and returns a Tensor.
+//
+// NOTE(review): the numerical semantics of the operations (broadcasting,
+// argument order, whether results are computed eagerly) are defined by the
+// backend implementation and are not visible in this file. Shape arguments
+// are int in some methods (View, Permute) and int64 in others (Reshape, Pad,
+// Unpad); callers have to convert accordingly.
+type Tensor interface {
+	// Metadata.
+	Shape() []int64
+	Dim(n int) int64
+	Stride(n int) int64
+	DType() DType
+
+	// Raw contents. Dump treats a nil result from Bytes as "no data" and
+	// otherwise decodes it into as many elements as the product of Shape.
+	Bytes() []byte
+	Floats() []float32
+
+	// Operations taking a second tensor.
+	Add(ctx Context, t2 Tensor) Tensor
+	Mul(ctx Context, t2 Tensor) Tensor
+	Mulmat(ctx Context, t2 Tensor) Tensor
+
+	// Softmax, normalization and scaling.
+	Softmax(ctx Context) Tensor
+	RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
+	LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
+	Scale(ctx Context, s float64) Tensor
+
+	// Convolution and rotary position embedding.
+	// NOTE(review): s0/s1, p0/p1 and d0/d1 are presumably stride, padding and
+	// dilation per axis — confirm against the backend.
+	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
+
+	// Activations.
+	Tanh(ctx Context) Tensor
+	GELU(ctx Context) Tensor
+	SILU(ctx Context) Tensor
+
+	// Layout changes.
+	Reshape(ctx Context, shape ...int64) Tensor
+	View(ctx Context, offset int, shape ...int) Tensor
+	Permute(ctx Context, shape ...int) Tensor
+	Contiguous(ctx Context) Tensor
+
+	// Padding.
+	Pad(ctx Context, shape ...int64) Tensor
+	Unpad(ctx Context, shape ...int64) Tensor
+
+	// Combining, selecting and copying.
+	Concat(ctx Context, t2 Tensor, dim int) Tensor
+	Stack(ctx Context, dim int, s ...Tensor) Tensor
+	Rows(ctx Context, t2 Tensor) Tensor
+	Copy(ctx Context, t2 Tensor) Tensor
+}
+
+// number is the constraint satisfied by the sized and unsized signed and
+// unsigned integer types (uintptr excluded), the floating-point types and the
+// complex types, including types whose underlying type is one of them.
+type number interface {
+	~int | ~uint |
+		~int8 | ~int16 | ~int32 | ~int64 |
+		~uint8 | ~uint16 | ~uint32 | ~uint64 |
+		~float32 | ~float64 |
+		~complex64 | ~complex128
+}
+
+// mul returns the product of all values in s. Called without arguments it
+// returns 1, the multiplicative identity.
+func mul[T number](s ...T) T {
+	product := T(1)
+	for i := range s {
+		product *= s[i]
+	}
+
+	return product
+}
+
+// DumpOptions controls how Dump renders a tensor.
+type DumpOptions struct {
+	// Items is the number of elements to print at the beginning and end of each dimension.
+	// The elements between those two runs are replaced by "...".
+	Items int64
+
+	// Precision is the number of decimal places to print. It is used when
+	// formatting DTypeF32 tensors; DTypeI32 values are printed as plain integers.
+	Precision int
+}
+
+// Dump renders the contents of t as text.
+//
+// Only the first element of opts is consulted; when opts is empty, Items
+// defaults to 3 and Precision to 4. Tensors whose DType is neither DTypeF32
+// nor DTypeI32 are rendered as "<unsupported>".
+func Dump(t Tensor, opts ...DumpOptions) string {
+	// Resolve the options into a local value instead of appending the default
+	// to opts: an empty opts may still share its backing array with a slice
+	// owned by the caller, and appending would write into it.
+	opt := DumpOptions{Items: 3, Precision: 4}
+	if len(opts) > 0 {
+		opt = opts[0]
+	}
+
+	switch t.DType() {
+	case DTypeF32:
+		return dump[[]float32](t, opt.Items, func(f float32) string {
+			return strconv.FormatFloat(float64(f), 'f', opt.Precision, 32)
+		})
+	case DTypeI32:
+		return dump[[]int32](t, opt.Items, func(i int32) string {
+			return strconv.FormatInt(int64(i), 10)
+		})
+	default:
+		return "<unsupported>"
+	}
+}
+
+// dump renders the elements of t as nested, bracketed lists, formatting each
+// element with fn.
+//
+// At most items elements are printed at the start and at the end of every
+// dimension; the elements in between are replaced by "...". A tensor whose
+// shape has no dimensions is rendered as its single element without brackets.
+// dump returns "<nil>" when t.Bytes() is nil and panics if the bytes cannot be
+// decoded into the number of elements implied by t.Shape().
+//
+// NOTE(review): the bytes are decoded as little-endian — presumably this
+// matches the backend's in-memory layout on every supported host; confirm.
+func dump[S ~[]E, E number](t Tensor, items int64, fn func(E) string) string {
+	bts := t.Bytes()
+	if bts == nil {
+		return "<nil>"
+	}
+
+	shape := t.Shape()
+
+	// Decode the bytes fetched above rather than asking the tensor for them a
+	// second time.
+	s := make(S, mul(shape...))
+	if err := binary.Read(bytes.NewReader(bts), binary.LittleEndian, s); err != nil {
+		panic(err)
+	}
+
+	if len(shape) == 0 {
+		// mul of no dimensions is 1, so exactly one element was decoded.
+		return fn(s[0])
+	}
+
+	var sb strings.Builder
+
+	// f prints the sub-tensor with dimensions dims whose first element sits at
+	// index stride of s.
+	var f func([]int64, int64)
+	f = func(dims []int64, stride int64) {
+		prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
+		sb.WriteString("[")
+		defer sb.WriteString("]")
+
+		for i := int64(0); i < dims[0]; i++ {
+			switch {
+			case i >= items && i < dims[0]-items:
+				sb.WriteString("..., ")
+				// skip to next printable element
+				skip := dims[0] - 2*items
+				if len(dims) > 1 {
+					// Step over the elided sub-tensors. The product is taken
+					// without appending to dims[1:], which aliases shape.
+					stride += skip * mul(dims[1:]...)
+					sb.WriteString(strings.Repeat("\n", len(dims)-1))
+					sb.WriteString(prefix)
+				}
+				i += skip - 1
+			case len(dims) > 1:
+				f(dims[1:], stride)
+				stride += mul(dims[1:]...)
+				if i < dims[0]-1 {
+					sb.WriteString(",")
+					sb.WriteString(strings.Repeat("\n", len(dims)-1))
+					sb.WriteString(prefix)
+				}
+			default:
+				sb.WriteString(fn(s[stride+i]))
+				if i < dims[0]-1 {
+					sb.WriteString(", ")
+				}
+			}
+		}
+	}
+	f(shape, 0)
+
+	return sb.String()
+}
+
+// DType identifies the element type of a Tensor, as reported by Tensor.DType.
+type DType int
+
+// Element types known to this package. Dump decodes DTypeF32 tensors as
+// float32 and DTypeI32 tensors as int32, and reports every other value as
+// unsupported.
+const (
+	DTypeF32 = DType(iota)
+	DTypeI32
+	DTypeOther
+)