mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 07:46:59 +00:00
For some multimodal models (such as gemma3), we create a single graph that generates the image embedding and then use this in the text model. The embedding tensor is completely opaque to the runner. However, this doesn't work if we need to use the embedding in multiple batches. This can arise if the embedding is larger than the batch size. In these cases (as with llama4), we would like to create views that are more appropriately sized. However, if we do this then the original source tensor is used in multiple graphs, which isn't allowed. To avoid that problem, models with this pattern compute the embedding tensor on first use and recreate the individual views. There is no longer a single vision and text graph. This codifies the pattern of separating vision and text graphs. The logic of computing tensors on demand is moved to the runner, so models no longer have to worry about this. It also gives the runner visibility into the multimodal tensors, which is important for memory management.
104 lines
2.7 KiB
Go
104 lines
2.7 KiB
Go
package ollamarunner
|
|
|
|
import (
|
|
"errors"
|
|
|
|
"github.com/ollama/ollama/ml"
|
|
"github.com/ollama/ollama/model/input"
|
|
)
|
|
|
|
// Tensors can't be used across multiple compute graphs. This is a problem
// if a single embedding is split across batches using views, since all of
// the views will have the same source tensor. We also don't want to
// recompute the entire embedding for each batch.
//
// To avoid this, we compute all of the tensors for the embedding on the
// first use and then store the result in system memory. When we need
// additional tensors, we recreate them from the stored data.
|
|
|
|
// multimodalEntry represents the embeddings of a single object (such
|
|
// as an image).
|
|
type multimodalEntry struct {
|
|
// mm is the original set of tensors created by EncodeMultimodal
|
|
mm []input.Multimodal
|
|
|
|
// data is the computed result of mm. Nil if not yet computed
|
|
data [][]float32
|
|
}
|
|
|
|
// multimodalStore maps from an individual tensor (of which there
|
|
// may be many in a single multimodal object) to its parent embedding
|
|
type multimodalStore map[ml.Tensor]*multimodalEntry
|
|
|
|
func newMultimodalStore() multimodalStore {
|
|
return make(multimodalStore)
|
|
}
|
|
|
|
// addMultimodal stores an embedding for later use in a compute graph
|
|
func (m multimodalStore) addMultimodal(embedding []input.Multimodal) {
|
|
entry := &multimodalEntry{mm: embedding}
|
|
|
|
for _, e := range embedding {
|
|
if e.Tensor != nil {
|
|
m[e.Tensor] = entry
|
|
}
|
|
}
|
|
}
|
|
|
|
// getMultimodal takes a source set of tensors (which may contain a whole or
|
|
// parts of one or more images) and returns the equivalent that can be used in
|
|
// the current context
|
|
func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal) ([]input.Multimodal, error) {
|
|
out := make([]input.Multimodal, len(in))
|
|
for i := range out {
|
|
if in[i].Tensor != nil {
|
|
var err error
|
|
out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
out[i].Data = in[i].Data
|
|
}
|
|
|
|
return out, nil
|
|
}
|
|
|
|
func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor) (ml.Tensor, error) {
|
|
entry := m[in]
|
|
|
|
if entry.data == nil {
|
|
computeCtx := backend.NewContext()
|
|
defer computeCtx.Close()
|
|
|
|
var tensors []ml.Tensor
|
|
for _, t := range entry.mm {
|
|
if t.Tensor != nil {
|
|
tensors = append(tensors, t.Tensor)
|
|
}
|
|
}
|
|
|
|
if len(tensors) == 0 {
|
|
return nil, nil
|
|
}
|
|
|
|
computeCtx.Forward(tensors...).Compute(tensors...)
|
|
|
|
entry.data = make([][]float32, len(entry.mm))
|
|
for i, t := range entry.mm {
|
|
if t.Tensor != nil {
|
|
entry.data[i] = t.Tensor.Floats()
|
|
}
|
|
}
|
|
}
|
|
|
|
for i, t := range entry.mm {
|
|
if in == t.Tensor {
|
|
return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...)
|
|
}
|
|
}
|
|
|
|
return nil, errors.New("multimodal tensor not found")
|
|
}
|