mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-12 00:37:04 +00:00
feat: add new Ollama engine using ggml through cgo This change introduces a new way to run pretrained models. It introduces 3 high level interfaces and a bunch of smaller helper interfaces to facilitate this. - `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, can implement the model's forward propagation in the `Forward` method. This method will be called to generate completions. This interface can be found in `model/model.go` - `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc) and providing an interface for Models to access loaded tensors. This interface can be found in `ml/backend.go` - `ml.Tensor` defines the interface for a tensor and tensor operations This is the first implementation of the new engine. Follow up PRs will implement more features: - non-greedy sampling (#8410) - integration with Ollama and KV caching (#8301) - more model support (#9080) with more coming soon Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
64 lines
1.5 KiB
Go
package cache
|
|
|
|
import (
|
|
"github.com/ollama/ollama/ml"
|
|
)
|
|
|
|
// Options carries per-call parameters for cache operations.
type Options struct {
	// Position is the sequence offset at which new entries are
	// written into the cache.
	Position int
}
|
|
|
|
type Cache interface {
|
|
Sub(i int) Cache
|
|
Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
|
|
}
|
|
|
|
type Simple struct {
|
|
DType ml.DType
|
|
Capacity int
|
|
|
|
keys, values []ml.Tensor
|
|
}
|
|
|
|
func (c *Simple) Sub(i int) Cache {
|
|
if i >= len(c.keys) {
|
|
c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
|
|
c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
|
|
}
|
|
|
|
return &Simple{
|
|
keys: c.keys[i : i+1],
|
|
values: c.values[i : i+1],
|
|
Capacity: c.Capacity,
|
|
DType: c.DType,
|
|
}
|
|
}
|
|
|
|
// Put writes key and value into the cache at opts.Position and returns
// tensor views covering all positions cached so far.
//
// The backing tensors are allocated lazily on first use, sized to hold
// Capacity positions worth of key/value data.
func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
	if c.keys[0] == nil || c.values[0] == nil {
		// Lazily allocate flat 1-D backing tensors. Assumes
		// Dim(0)*Dim(1) is the per-position element count —
		// TODO(review): confirm against ml.Tensor's Dim semantics.
		c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
		c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
	}

	// Copy the incoming tensors into the cache, offset by Stride(2)
	// elements per position (presumably the per-position byte/element
	// stride — verify against the ml backend).
	ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
	ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))

	// Total number of positions now present, clamped to Capacity.
	n := min(c.Capacity, int(key.Dim(2))+opts.Position)

	// Re-view the flat cache as a 3-D tensor covering the first n
	// positions, preserving the incoming tensors' inner dims/strides.
	key = c.keys[0].View(ctx, 0,
		int(key.Dim(0)), int(key.Stride(1)),
		int(key.Dim(1)), int(key.Stride(2)),
		n,
	)

	value = c.values[0].View(ctx, 0,
		int(value.Dim(0)), int(value.Stride(1)),
		int(value.Dim(1)), int(value.Stride(2)),
		n,
	)

	// TODO shift context if necessary

	return key, value
}
|