Setting SameBatch on the vision start token is problematic because that token is shared with other inputs that also use images. The input can then be served from the cache, so the runner never sees SameBatch, and even when it does, the value may belong to a different image. Assigning SameBatch to the image's own input tokens resolves this by ensuring it is attached to the inputs that correspond to that image. Not setting SameBatch correctly may cause panics during inference, since an image's tokens are no longer guaranteed to land in the same batch.
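For context, a minimal before/after sketch of the arrangement PostTokenize builds for one image. It only mirrors the code in the file below; seq, mm, hash, and nPatches are placeholder names used for illustration, not identifiers from the file.

	// Before (problematic): SameBatch sat on the vision start token, which looks
	// identical for every image-bearing prompt, so it could be satisfied from the
	// cache or carry a patch count belonging to a different image.
	//   seq = append(seq, input.Input{Token: visionStartToken, SameBatch: nPatches})

	// After: SameBatch travels with the first image token, which also carries this
	// image's tensor and hash, so the runner always sees it together with its image.
	seq = append(seq, input.Input{Token: visionStartToken})
	seq = append(seq, input.Input{
		Token:          imageToken,
		Multimodal:     mm,       // []input.Multimodal from EncodeMultimodal
		MultimodalHash: hash,     // hash of the image data
		SameBatch:      nPatches, // keep this image's tokens in one batch
	})
	seq = append(seq, slices.Repeat([]input.Input{{Token: imageToken}}, nPatches-1)...)
	seq = append(seq, input.Input{Token: visionEndToken})
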
package qwen25vl

import (
	"bytes"
	"fmt"
	"image"
	"slices"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/model/input"
)

type Model struct {
	model.Base
	model.BytePairEncoding

	*TextModel
	*VisionModel `gguf:"v,vision"`

	ImageProcessor
}

// Implement MultimodalProcessor interface
var _ model.MultimodalProcessor = (*Model)(nil)

// New constructs a qwen25vl model from the GGUF config: the BPE tokenizer,
// the text and vision towers, the image processor, and a causal KV cache.
func New(c fs.Config) (model.Model, error) {
	m := &Model{
		BytePairEncoding: model.NewBytePairEncoding(
			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
			&model.Vocabulary{
				Values: c.Strings("tokenizer.ggml.tokens"),
				Types:  c.Ints("tokenizer.ggml.token_type"),
				Merges: c.Strings("tokenizer.ggml.merges"),
				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
				EOS: append(
					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
					c.Ints("tokenizer.ggml.eos_token_ids")...,
				),
			},
		),
		TextModel:      NewTextModel(c),
		VisionModel:    newVisionModel(c),
		ImageProcessor: newImageProcessor(c),
	}

	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)

	return m, nil
}

// PixelValues decodes the raw image bytes, runs the image processor, and packs
// the result into a (patchDim x numPatches) tensor along with its grid.
func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) {
	image, _, err := image.Decode(bytes.NewReader(multimodalData))
	if err != nil {
		return nil, nil, err
	}

	f32s, grid, err := m.ImageProcessor.ProcessImage(image)
	if err != nil {
		return nil, nil, err
	}

	// Calculate tensor dimensions
	patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
	numPatches := grid.Temporal * grid.Height * grid.Width

	pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err)
	}

	return pixelValues, grid, nil
}

// EncodeMultimodal runs the vision encoder over the image bytes and returns
// its output for later injection into the text model.
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
	if len(m.VisionModel.Layers) == 0 {
		return nil, model.ErrNoVisionModel
	}

	pixels, grid, err := m.PixelValues(ctx, multimodalData)
	if err != nil {
		return nil, err
	}

	visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
	return []input.Multimodal{{Tensor: visionOutputs}}, nil
}

// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
	var result []input.Input

	var (
		imageToken       int32 = 151655
		visionStartToken int32 = 151652
		visionEndToken   int32 = 151653
	)

	nImg := 0
	for _, inp := range inputs {
		if inp.Multimodal == nil {
			// If not a multimodal input, add it to the result unchanged
			result = append(result, inp)
		} else {
			// Adding the 'Picture' prefix is a hack; at the time of writing there is no way
			// to prefix the image tokens with a prompt, so we add a prefix here
			nImg++
			pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
			if err != nil {
				return nil, fmt.Errorf("failed to encode image prompt: %w", err)
			}
			for i := range pre {
				result = append(result, input.Input{Token: pre[i]})
			}

			patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)

			// First add the vision start token
			result = append(result, input.Input{Token: visionStartToken})

			// Add the image token with the multimodal tensor data at the first position.
			// SameBatch is set here, on the image token, so it always corresponds to this image.
			result = append(result, input.Input{
				Token:          imageToken,
				Multimodal:     inp.Multimodal,
				MultimodalHash: inp.MultimodalHash,
				SameBatch:      patchesPerChunk,
			})

			// Add the placeholder tokens for the remaining positions (patchesPerChunk-1)
			result = append(result, slices.Repeat([]input.Input{{Token: imageToken}}, patchesPerChunk-1)...)

			result = append(result, input.Input{Token: visionEndToken})
		}
	}

	return result, nil
}

// Forward runs the text model over the batch, passing the positions and
// output indices as tensors.
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
	if err != nil {
		return nil, err
	}

	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
	if err != nil {
		return nil, err
	}

	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
}

func init() {
	model.Register("qwen25vl", New)
}