mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
models: Move model into their own directory
This allows there to be a file that is a list of models that is not mixed into the runner code.
This commit is contained in:
155
model/models/llama/model.go
Normal file
155
model/models/llama/model.go
Normal file
@@ -0,0 +1,155 @@
|
||||
package llama
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
|
||||
hiddenSize, numHeads, numKVHeads int
|
||||
eps, ropeBase, ropeScale float32
|
||||
ropeDim uint32
|
||||
}
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
||||
|
||||
*Options
|
||||
}
|
||||
|
||||
func New(c ml.Config) (model.Model, error) {
|
||||
return &Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Uints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
},
|
||||
),
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
Options: &Options{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.freq_scale", 1),
|
||||
ropeDim: c.Uint("rope.dimension_count"),
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
type SelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_output"`
|
||||
}
|
||||
|
||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
|
||||
q := sa.Query.Forward(ctx, hiddenState)
|
||||
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
k := sa.Key.Forward(ctx, hiddenState)
|
||||
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
v := sa.Value.Forward(ctx, hiddenState)
|
||||
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
k, v = cache.Put(ctx, k, v, cache.Options)
|
||||
|
||||
q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
kq := k.MulmatFullPrec(ctx, q)
|
||||
kq = kq.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||
kq = kq.Softmax(ctx)
|
||||
|
||||
kqv := v.Mulmat(ctx, kq)
|
||||
kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
|
||||
return sa.Output.Forward(ctx, kqv)
|
||||
}
|
||||
|
||||
type MLP struct {
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
}
|
||||
|
||||
func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
|
||||
hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
|
||||
return mlp.Down.Forward(ctx, hiddenState)
|
||||
}
|
||||
|
||||
type Layer struct {
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
SelfAttention *SelfAttention
|
||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *MLP
|
||||
}
|
||||
|
||||
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
|
||||
residual := hiddenState
|
||||
|
||||
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
residual = hiddenState
|
||||
|
||||
hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
|
||||
return hiddenState.Add(ctx, residual)
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
|
||||
inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
|
||||
|
||||
for i, layer := range m.Layers {
|
||||
hiddenState = layer.Forward(ctx, hiddenState, positions, opts.Cache.Sub(i), m.Options)
|
||||
}
|
||||
|
||||
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
|
||||
hiddenState = m.Output.Forward(ctx, hiddenState)
|
||||
|
||||
outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return hiddenState.Rows(ctx, outputs), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("llama", New)
|
||||
}
|
||||
201
model/models/mllama/imageproc.go
Normal file
201
model/models/mllama/imageproc.go
Normal file
@@ -0,0 +1,201 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"image"
|
||||
_ "image/jpeg"
|
||||
_ "image/png"
|
||||
"io"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"golang.org/x/image/draw"
|
||||
|
||||
"github.com/ollama/ollama/model/imageproc"
|
||||
)
|
||||
|
||||
func getSupportedAspectRatios(maxTiles int) []image.Point {
|
||||
ratios := []image.Point{}
|
||||
|
||||
for w := range maxTiles {
|
||||
for h := range maxTiles {
|
||||
if (w+1)*(h+1) <= maxTiles {
|
||||
ratios = append(ratios, image.Point{w + 1, h + 1})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ratios
|
||||
}
|
||||
|
||||
func clip(a, a_min, a_max int) int {
|
||||
if a < a_min {
|
||||
return a_min
|
||||
} else if a > a_max {
|
||||
return a_max
|
||||
}
|
||||
|
||||
return a
|
||||
}
|
||||
|
||||
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
|
||||
possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
|
||||
possibleCanvasSizes := []image.Point{}
|
||||
for _, pta := range possibleTileArrangements {
|
||||
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
|
||||
}
|
||||
|
||||
scales := []float64{}
|
||||
|
||||
for _, pcs := range possibleCanvasSizes {
|
||||
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
|
||||
scaleWidth := float64(pcs.X) / float64(imageSize.X)
|
||||
|
||||
if scaleWidth > scaleHeight {
|
||||
scales = append(scales, scaleHeight)
|
||||
} else {
|
||||
scales = append(scales, scaleWidth)
|
||||
}
|
||||
}
|
||||
|
||||
var minUpscale float64
|
||||
var maxDownscale float64
|
||||
var upscale bool
|
||||
|
||||
for _, s := range scales {
|
||||
if s > 1.0 {
|
||||
upscale = true
|
||||
if minUpscale == 0 {
|
||||
minUpscale = s
|
||||
} else {
|
||||
minUpscale = math.Min(minUpscale, s)
|
||||
}
|
||||
} else {
|
||||
maxDownscale = math.Max(maxDownscale, s)
|
||||
}
|
||||
}
|
||||
|
||||
selectedScale := maxDownscale
|
||||
if upscale {
|
||||
selectedScale = minUpscale
|
||||
}
|
||||
|
||||
var selectedCanvas image.Point
|
||||
for n, pcs := range possibleCanvasSizes {
|
||||
if scales[n] == selectedScale {
|
||||
// choose the smallest possible canvas
|
||||
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
|
||||
selectedCanvas = pcs
|
||||
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
|
||||
selectedCanvas = pcs
|
||||
}
|
||||
}
|
||||
}
|
||||
return selectedCanvas
|
||||
}
|
||||
|
||||
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
|
||||
targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
|
||||
targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
|
||||
|
||||
scaleWidth := float64(targetWidth) / float64(imageSize.X)
|
||||
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
|
||||
|
||||
var w, h int
|
||||
|
||||
if scaleWidth < scaleHeight {
|
||||
w = targetWidth
|
||||
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
|
||||
} else {
|
||||
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
|
||||
h = targetHeight
|
||||
}
|
||||
|
||||
return image.Point{w, h}
|
||||
}
|
||||
|
||||
func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
|
||||
if format == "png" {
|
||||
img = imageproc.Composite(img)
|
||||
}
|
||||
|
||||
b := img.Bounds()
|
||||
tileSize := outputSize.Y
|
||||
|
||||
canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
|
||||
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
|
||||
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
|
||||
|
||||
return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
|
||||
}
|
||||
|
||||
func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
|
||||
paddedSize := image.Point{
|
||||
X: outputSize.X * aspectRatio.X,
|
||||
Y: outputSize.Y * aspectRatio.Y,
|
||||
}
|
||||
|
||||
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
|
||||
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
|
||||
b := img.Bounds()
|
||||
width := b.Max.X - b.Min.X
|
||||
height := b.Max.Y - b.Min.Y
|
||||
tileHeight := height / numTilesSize.Y
|
||||
tileWidth := width / numTilesSize.X
|
||||
|
||||
images := []image.Image{}
|
||||
|
||||
for h := range numTilesSize.Y {
|
||||
for w := range numTilesSize.X {
|
||||
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
|
||||
images = append(images, img.(interface {
|
||||
SubImage(image.Rectangle) image.Image
|
||||
}).SubImage(rect))
|
||||
}
|
||||
}
|
||||
|
||||
return images
|
||||
}
|
||||
|
||||
func packImages(img image.Image, aspectRatio image.Point) []float32 {
|
||||
subImages := splitToTiles(img, aspectRatio)
|
||||
|
||||
var pixelVals []float32
|
||||
|
||||
rescale := true
|
||||
channelFirst := true
|
||||
|
||||
for _, subImg := range subImages {
|
||||
vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
|
||||
pixelVals = append(pixelVals, vals...)
|
||||
}
|
||||
|
||||
return pixelVals
|
||||
}
|
||||
|
||||
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
|
||||
outputSize := image.Point{560, 560}
|
||||
maxTiles := 4
|
||||
|
||||
img, format, err := image.Decode(imageData)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
|
||||
}
|
||||
|
||||
newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
|
||||
newImage = padImage(newImage, outputSize, aspectRatio)
|
||||
|
||||
data := packImages(newImage, aspectRatio)
|
||||
aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1
|
||||
|
||||
opts := map[string]any{
|
||||
"aspectRatioIndex": aspectRatioIndex,
|
||||
}
|
||||
|
||||
return data, opts, nil
|
||||
}
|
||||
420
model/models/mllama/imageproc_test.go
Normal file
420
model/models/mllama/imageproc_test.go
Normal file
@@ -0,0 +1,420 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"image"
|
||||
"image/png"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func TestAspectRatios(t *testing.T) {
|
||||
type aspectCase struct {
|
||||
MaxTiles int
|
||||
Expected []image.Point
|
||||
}
|
||||
|
||||
cases := []aspectCase{
|
||||
{
|
||||
MaxTiles: 1,
|
||||
Expected: []image.Point{{1, 1}},
|
||||
},
|
||||
{
|
||||
MaxTiles: 2,
|
||||
Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}},
|
||||
},
|
||||
{
|
||||
MaxTiles: 3,
|
||||
Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}},
|
||||
},
|
||||
{
|
||||
MaxTiles: 4,
|
||||
Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := getSupportedAspectRatios(c.MaxTiles)
|
||||
|
||||
if diff := cmp.Diff(actual, c.Expected); diff != "" {
|
||||
t.Errorf("mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetImageSizeFitToCanvas(t *testing.T) {
|
||||
type imageSizeCase struct {
|
||||
ImageRect image.Point
|
||||
CanvasRect image.Point
|
||||
TileSize int
|
||||
Expected image.Point
|
||||
}
|
||||
|
||||
cases := []imageSizeCase{
|
||||
{
|
||||
ImageRect: image.Point{400, 400},
|
||||
CanvasRect: image.Point{640, 480},
|
||||
TileSize: 200,
|
||||
Expected: image.Point{400, 400},
|
||||
},
|
||||
{
|
||||
ImageRect: image.Point{1024, 768},
|
||||
CanvasRect: image.Point{640, 480},
|
||||
TileSize: 200,
|
||||
Expected: image.Point{640, 480},
|
||||
},
|
||||
{
|
||||
ImageRect: image.Point{500, 500},
|
||||
CanvasRect: image.Point{1000, 1000},
|
||||
TileSize: 750,
|
||||
Expected: image.Point{750, 750},
|
||||
},
|
||||
{
|
||||
ImageRect: image.Point{500, 1000},
|
||||
CanvasRect: image.Point{2000, 2000},
|
||||
TileSize: 2000,
|
||||
Expected: image.Point{1000, 2000},
|
||||
},
|
||||
{
|
||||
ImageRect: image.Point{4000, 3000},
|
||||
CanvasRect: image.Point{2000, 1000},
|
||||
TileSize: 1000,
|
||||
Expected: image.Point{1333, 1000},
|
||||
},
|
||||
{
|
||||
ImageRect: image.Point{667, 1000},
|
||||
CanvasRect: image.Point{1000, 1000},
|
||||
TileSize: 560,
|
||||
Expected: image.Point{667, 1000},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)
|
||||
|
||||
if actual != c.Expected {
|
||||
t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetOptimalTiledCanvas(t *testing.T) {
|
||||
type tiledCanvasSizeCase struct {
|
||||
ImageSize image.Point
|
||||
MaxImageTiles int
|
||||
TileSize int
|
||||
Expected image.Point
|
||||
}
|
||||
|
||||
cases := []tiledCanvasSizeCase{
|
||||
{
|
||||
ImageSize: image.Point{1024, 768},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 1000,
|
||||
Expected: image.Point{2000, 1000},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{1024, 768},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{1120, 1120},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{800, 600},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{1120, 1120},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{640, 480},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{1120, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{320, 200},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{1320, 200},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{1680, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{2000, 200},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{2240, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{10000, 200},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{2240, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{480, 640},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 1120},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{200, 320},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{200, 1320},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 1680},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{200, 2000},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 2240},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{200, 10000},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 2240},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{10000, 10000},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{1120, 1120},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)
|
||||
|
||||
if actual != c.Expected {
|
||||
t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitToTiles(t *testing.T) {
|
||||
type splitCase struct {
|
||||
TestImage image.Image
|
||||
NumTilesSize image.Point
|
||||
Expected []image.Image
|
||||
}
|
||||
|
||||
cases := []splitCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
|
||||
NumTilesSize: image.Point{1, 1},
|
||||
Expected: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 500)),
|
||||
NumTilesSize: image.Point{2, 1},
|
||||
Expected: []image.Image{
|
||||
image.NewRGBA(image.Rect(0, 0, 500, 500)),
|
||||
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
|
||||
},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
|
||||
NumTilesSize: image.Point{2, 2},
|
||||
Expected: []image.Image{
|
||||
image.NewRGBA(image.Rect(0, 0, 500, 500)),
|
||||
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
|
||||
image.NewRGBA(image.Rect(0, 500, 500, 1000)),
|
||||
image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := splitToTiles(c.TestImage, c.NumTilesSize)
|
||||
|
||||
if len(actual) != len(c.Expected) {
|
||||
t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
|
||||
}
|
||||
|
||||
for i := range actual {
|
||||
if actual[i].Bounds() != c.Expected[i].Bounds() {
|
||||
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResize(t *testing.T) {
|
||||
type resizeCase struct {
|
||||
TestImage image.Image
|
||||
OutputSize image.Point
|
||||
MaxImageTiles int
|
||||
ExpectedImage image.Image
|
||||
ExpectedAspectRatio image.Point
|
||||
}
|
||||
|
||||
cases := []resizeCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
|
||||
OutputSize: image.Point{100, 100},
|
||||
MaxImageTiles: 1,
|
||||
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
|
||||
ExpectedAspectRatio: image.Point{1, 1},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
|
||||
OutputSize: image.Point{100, 100},
|
||||
MaxImageTiles: 2,
|
||||
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
|
||||
ExpectedAspectRatio: image.Point{1, 1},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
|
||||
OutputSize: image.Point{560, 560},
|
||||
MaxImageTiles: 4,
|
||||
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
|
||||
ExpectedAspectRatio: image.Point{1, 1},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
|
||||
OutputSize: image.Point{560, 560},
|
||||
MaxImageTiles: 4,
|
||||
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)),
|
||||
ExpectedAspectRatio: image.Point{2, 2},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
|
||||
OutputSize: image.Point{560, 560},
|
||||
MaxImageTiles: 4,
|
||||
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
|
||||
ExpectedAspectRatio: image.Point{2, 2},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
|
||||
|
||||
if actualImage.Bounds() != c.ExpectedImage.Bounds() {
|
||||
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
|
||||
}
|
||||
|
||||
if actualAspectRatio != c.ExpectedAspectRatio {
|
||||
t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPad(t *testing.T) {
|
||||
type padCase struct {
|
||||
TestImage image.Image
|
||||
OutputSize image.Point
|
||||
AspectRatio image.Point
|
||||
Expected image.Image
|
||||
}
|
||||
|
||||
cases := []padCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)),
|
||||
OutputSize: image.Point{560, 560},
|
||||
AspectRatio: image.Point{2, 2},
|
||||
Expected: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)
|
||||
|
||||
if actual.Bounds() != c.Expected.Bounds() {
|
||||
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPackImages(t *testing.T) {
|
||||
type packCase struct {
|
||||
TestImage image.Image
|
||||
AspectRatio image.Point
|
||||
ExpectedVals int
|
||||
}
|
||||
|
||||
cases := []packCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
|
||||
AspectRatio: image.Point{2, 2},
|
||||
ExpectedVals: 2 * 2 * 3 * 560 * 560,
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
|
||||
AspectRatio: image.Point{1, 1},
|
||||
ExpectedVals: 1 * 1 * 3 * 560 * 560,
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)),
|
||||
AspectRatio: image.Point{1, 2},
|
||||
ExpectedVals: 1 * 2 * 3 * 560 * 560,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actualVals := packImages(c.TestImage, c.AspectRatio)
|
||||
if len(actualVals) != c.ExpectedVals {
|
||||
t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPreprocess(t *testing.T) {
|
||||
type preprocessCase struct {
|
||||
TestImage image.Image
|
||||
ExpectedVals int
|
||||
ExpectedAspectRatioID int
|
||||
}
|
||||
|
||||
cases := []preprocessCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
|
||||
ExpectedVals: 0,
|
||||
ExpectedAspectRatioID: 1,
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
|
||||
ExpectedVals: 0,
|
||||
ExpectedAspectRatioID: 6,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
var buf bytes.Buffer
|
||||
err := png.Encode(&buf, c.TestImage)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
imgData, opts, err := Preprocess(&buf)
|
||||
if err != nil {
|
||||
t.Fatalf("error processing: %q", err)
|
||||
}
|
||||
|
||||
if len(imgData) == 0 {
|
||||
t.Errorf("no image data returned")
|
||||
}
|
||||
|
||||
ar, ok := opts["aspectRatioIndex"]
|
||||
if !ok {
|
||||
t.Fatalf("no aspect ratio found")
|
||||
}
|
||||
|
||||
aspectRatioID := ar.(int)
|
||||
|
||||
if aspectRatioID != c.ExpectedAspectRatioID {
|
||||
t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
|
||||
}
|
||||
}
|
||||
}
|
||||
99
model/models/mllama/model.go
Normal file
99
model/models/mllama/model.go
Normal file
@@ -0,0 +1,99 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
*VisionModel `gguf:"v,vision"`
|
||||
*TextModel
|
||||
|
||||
Projector *nn.Linear `gguf:"mm.0"`
|
||||
|
||||
ImageProcessor
|
||||
}
|
||||
|
||||
func New(c ml.Config) (model.Model, error) {
|
||||
return &Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Uints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
},
|
||||
),
|
||||
ImageProcessor: newImageProcessor(c),
|
||||
VisionModel: newVisionModel(c),
|
||||
TextModel: newTextModel(c),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
|
||||
var crossAttentionStates ml.Tensor
|
||||
if opts.Images != nil {
|
||||
f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(opts.Images[0])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pixelValues, err := ctx.FromFloatSlice(f32s,
|
||||
m.ImageProcessor.imageSize,
|
||||
m.ImageProcessor.imageSize,
|
||||
m.ImageProcessor.numChannels,
|
||||
m.ImageProcessor.maxNumTiles,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
aspectRatio, err := ctx.FromIntSlice([]int32{int32(aspectRatioID)}, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
positions := make([]int32, 1601)
|
||||
for i := range positions {
|
||||
positions[i] = int32(i)
|
||||
}
|
||||
|
||||
positionIDs, err := ctx.FromIntSlice(positions, len(positions))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
crossAttentionStates = m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
|
||||
crossAttentionStates = m.Projector.Forward(ctx, crossAttentionStates)
|
||||
}
|
||||
|
||||
inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// TODO: attention mask, cross attention mask
|
||||
hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, opts.Cache)
|
||||
|
||||
outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return hiddenState.Rows(ctx, outputs), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("mllama", New)
|
||||
}
|
||||
225
model/models/mllama/model_text.go
Normal file
225
model/models/mllama/model_text.go
Normal file
@@ -0,0 +1,225 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
type TextSelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_output"`
|
||||
}
|
||||
|
||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
|
||||
query := sa.Query.Forward(ctx, hiddenState)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
key := sa.Key.Forward(ctx, hiddenState)
|
||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
value := sa.Value.Forward(ctx, hiddenState)
|
||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
key, value = cache.Put(ctx, key, value, cache.Options)
|
||||
|
||||
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
scores := key.MulmatFullPrec(ctx, query)
|
||||
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||
|
||||
if mask != nil {
|
||||
scores = scores.Add(ctx, mask)
|
||||
}
|
||||
|
||||
scores = scores.Softmax(ctx)
|
||||
|
||||
attention := value.Mulmat(ctx, scores)
|
||||
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
type TextMLP struct {
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
}
|
||||
|
||||
func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextModelOptions) ml.Tensor {
|
||||
hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
|
||||
return mlp.Down.Forward(ctx, hiddenState)
|
||||
}
|
||||
|
||||
type TextSelfAttentionDecoderLayer struct {
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
SelfAttention *TextSelfAttention
|
||||
|
||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *TextMLP
|
||||
}
|
||||
|
||||
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
|
||||
residual := hiddenState
|
||||
|
||||
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts)
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
residual = hiddenState
|
||||
|
||||
hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
|
||||
return hiddenState.Add(ctx, residual)
|
||||
}
|
||||
|
||||
type TextCrossAttention struct {
|
||||
QueryNorm *nn.RMSNorm `gguf:"cross_attn_q_norm"`
|
||||
Query *nn.Linear `gguf:"cross_attn_q_proj"`
|
||||
KeyNorm *nn.RMSNorm `gguf:"cross_attn_k_norm"`
|
||||
Key *nn.Linear `gguf:"cross_attn_k_proj"`
|
||||
Value *nn.Linear `gguf:"cross_attn_v_proj"`
|
||||
Output *nn.Linear `gguf:"cross_attn_o_proj"`
|
||||
}
|
||||
|
||||
func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)
|
||||
|
||||
query := ca.Query.Forward(ctx, hiddenState)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
query = ca.QueryNorm.Forward(ctx, query, opts.eps)
|
||||
|
||||
key := ca.Key.Forward(ctx, crossAttentionStates)
|
||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
|
||||
key = ca.KeyNorm.Forward(ctx, key, opts.eps)
|
||||
|
||||
value := ca.Value.Forward(ctx, crossAttentionStates)
|
||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
|
||||
|
||||
// TODO cache key, value
|
||||
|
||||
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
scores := key.Mulmat(ctx, query)
|
||||
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||
scores = scores.Softmax(ctx)
|
||||
|
||||
attention := value.Mulmat(ctx, scores)
|
||||
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
|
||||
return ca.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
type TextCrossAttentionDecoderLayer struct {
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
CrossAttention *TextCrossAttention
|
||||
AttentionGate ml.Tensor `gguf:"cross_attn_attn_gate"`
|
||||
|
||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *TextMLP
|
||||
MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
|
||||
}
|
||||
|
||||
func (d TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
|
||||
residual := hiddenState
|
||||
|
||||
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = d.CrossAttention.Forward(ctx, hiddenState, crossAttentionStates, cache, opts)
|
||||
hiddenState = hiddenState.Mul(ctx, d.AttentionGate.Tanh(ctx))
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
residual = hiddenState
|
||||
|
||||
hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
|
||||
hiddenState = hiddenState.Mul(ctx, d.MLPGate.Tanh(ctx))
|
||||
return hiddenState.Add(ctx, residual)
|
||||
}
|
||||
|
||||
type TextDecoderLayer interface {
|
||||
Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor
|
||||
}
|
||||
|
||||
type TextDecoder struct {
|
||||
Layers []TextDecoderLayer
|
||||
}
|
||||
|
||||
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
|
||||
for i, layer := range d.Layers {
|
||||
if !slices.Contains(opts.crossAttentionLayers, uint32(i)) || crossAttentionStates != nil {
|
||||
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache.Sub(i), opts)
|
||||
}
|
||||
}
|
||||
|
||||
return hiddenState
|
||||
}
|
||||
|
||||
type TextModelOptions struct {
|
||||
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
|
||||
|
||||
hiddenSize, numHeads, numKVHeads int
|
||||
eps, ropeBase, ropeScale float32
|
||||
ropeDim uint32
|
||||
|
||||
crossAttentionLayers []uint32
|
||||
}
|
||||
|
||||
type TextModel struct {
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Transformer *TextDecoder `gguf:"blk"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output"`
|
||||
|
||||
*TextModelOptions
|
||||
}
|
||||
|
||||
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache) ml.Tensor {
|
||||
hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
|
||||
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
|
||||
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
|
||||
return m.Output.Forward(ctx, hiddenState)
|
||||
}
|
||||
|
||||
func newTextModel(c ml.Config) *TextModel {
|
||||
var decoderLayers []TextDecoderLayer
|
||||
for i := range c.Uint("block_count") {
|
||||
var textDecoderLayer TextDecoderLayer
|
||||
if slices.Contains(c.Uints("attention.cross_attention_layers"), i) {
|
||||
textDecoderLayer = &TextCrossAttentionDecoderLayer{}
|
||||
} else {
|
||||
textDecoderLayer = &TextSelfAttentionDecoderLayer{}
|
||||
}
|
||||
|
||||
decoderLayers = append(decoderLayers, textDecoderLayer)
|
||||
}
|
||||
|
||||
return &TextModel{
|
||||
Transformer: &TextDecoder{Layers: decoderLayers},
|
||||
TextModelOptions: &TextModelOptions{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.freq_scale", 1),
|
||||
ropeDim: c.Uint("rope.dimension_count"),
|
||||
crossAttentionLayers: c.Uints("attention.cross_attention_layers"),
|
||||
},
|
||||
}
|
||||
}
|
||||
234
model/models/mllama/model_vision.go
Normal file
234
model/models/mllama/model_vision.go
Normal file
@@ -0,0 +1,234 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
)
|
||||
|
||||
var batchSize int = 1
|
||||
|
||||
type VisionSelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_out"`
|
||||
|
||||
Gate ml.Tensor `gguf:"attn_gate"`
|
||||
}
|
||||
|
||||
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
|
||||
query := sa.Query.Forward(ctx, hiddenState)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
|
||||
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
|
||||
key := sa.Key.Forward(ctx, hiddenState)
|
||||
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
|
||||
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
|
||||
value := sa.Value.Forward(ctx, hiddenState)
|
||||
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
|
||||
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
scores := key.Mulmat(ctx, query)
|
||||
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||
scores = scores.Softmax(ctx)
|
||||
|
||||
attention := value.Mulmat(ctx, scores)
|
||||
attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
|
||||
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
|
||||
|
||||
hiddenState = sa.Output.Forward(ctx, attention)
|
||||
if sa.Gate != nil {
|
||||
hiddenState = hiddenState.Mul(ctx, sa.Gate)
|
||||
}
|
||||
|
||||
return hiddenState
|
||||
}
|
||||
|
||||
type VisionMLP struct {
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
|
||||
Gate ml.Tensor `gguf:"ffn_gate"`
|
||||
}
|
||||
|
||||
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx)
|
||||
hiddenState = mlp.Up.Forward(ctx, hiddenState)
|
||||
if mlp.Gate != nil {
|
||||
hiddenState = hiddenState.Mul(ctx, mlp.Gate)
|
||||
}
|
||||
|
||||
return hiddenState
|
||||
}
|
||||
|
||||
type VisionEncoderLayer struct {
|
||||
AttentionNorm *nn.LayerNorm `gguf:"ln1"`
|
||||
SelfAttention *VisionSelfAttention
|
||||
|
||||
MLPNorm *nn.LayerNorm `gguf:"ln2"`
|
||||
MLP *VisionMLP
|
||||
}
|
||||
|
||||
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
residual := hiddenState
|
||||
|
||||
// self attention
|
||||
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
residual = hiddenState
|
||||
|
||||
// feed forward
|
||||
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
|
||||
return hiddenState.Add(ctx, residual)
|
||||
}
|
||||
|
||||
type VisionEncoder struct {
|
||||
Layers []VisionEncoderLayer
|
||||
}
|
||||
|
||||
func (e *VisionEncoder) Forward(ctx ml.Context, hiddenState ml.Tensor, intermediateLayersIndices []uint32, opts *VisionModelOptions) (ml.Tensor, []ml.Tensor) {
|
||||
var intermediateHiddenStates []ml.Tensor
|
||||
for i, layer := range e.Layers {
|
||||
if slices.Contains(intermediateLayersIndices, uint32(i)) {
|
||||
intermediateHiddenStates = append(intermediateHiddenStates, hiddenState.Reshape(ctx, append([]int{1}, hiddenState.Shape()...)...))
|
||||
}
|
||||
|
||||
hiddenState = layer.Forward(ctx, hiddenState, opts)
|
||||
}
|
||||
|
||||
return hiddenState, intermediateHiddenStates
|
||||
}
|
||||
|
||||
type PrecomputedAspectRatioEmbedding struct {
|
||||
Embedding *nn.Embedding
|
||||
Gate ml.Tensor `gguf:"gate"`
|
||||
}
|
||||
|
||||
func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
|
||||
embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles)
|
||||
if e.Gate != nil {
|
||||
embeddings = embeddings.Mul(ctx, e.Gate)
|
||||
}
|
||||
|
||||
return hiddenState.Add(ctx, embeddings)
|
||||
}
|
||||
|
||||
type PrecomputedPositionEmbedding struct {
|
||||
PositionEmbedding *nn.Embedding `gguf:"position_embd"`
|
||||
PositionEmbeddingGate ml.Tensor `gguf:"position_embd.gate"`
|
||||
|
||||
TilePositionEmbedding *nn.Embedding `gguf:"tile_position_embd"`
|
||||
TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"`
|
||||
}
|
||||
|
||||
func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor {
|
||||
positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
|
||||
if e.PositionEmbeddingGate != nil {
|
||||
positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
|
||||
}
|
||||
|
||||
hiddenState = hiddenState.Add(ctx, positionEmbedding)
|
||||
|
||||
tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
|
||||
tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles)
|
||||
if e.TilePositionEmbeddingGate != nil {
|
||||
tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
|
||||
}
|
||||
|
||||
return hiddenState.Add(ctx, tilePositionEmbedding)
|
||||
}
|
||||
|
||||
type VisionModelOptions struct {
|
||||
hiddenSize, numHeads, numTiles int
|
||||
imageSize, patchSize int
|
||||
eps float32
|
||||
|
||||
intermediateLayersIndices []uint32
|
||||
}
|
||||
|
||||
type VisionModel struct {
|
||||
PatchEmbeddings *nn.Conv2D `gguf:"patch_embd"`
|
||||
|
||||
PreTilePositionEmbedding *PrecomputedAspectRatioEmbedding `gguf:"pre_tile_position_embd"`
|
||||
PostTilePositionEmbedding *PrecomputedAspectRatioEmbedding `gguf:"post_tile_position_embd"`
|
||||
PositionEmbedding *PrecomputedPositionEmbedding
|
||||
|
||||
PreLayerNorm *nn.LayerNorm `gguf:"pre_ln"`
|
||||
PostLayerNorm *nn.LayerNorm `gguf:"post_ln"`
|
||||
ClassEmbedding ml.Tensor `gguf:"class_embd"`
|
||||
|
||||
Transformer *VisionEncoder `gguf:"blk"`
|
||||
GlobalTransformer *VisionEncoder `gguf:"global.blk"`
|
||||
|
||||
*VisionModelOptions
|
||||
}
|
||||
|
||||
func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRatioIDs ml.Tensor) ml.Tensor {
|
||||
numPatches := (m.imageSize / m.patchSize) * (m.imageSize / m.patchSize)
|
||||
numPositions := numPatches
|
||||
if m.ClassEmbedding != nil {
|
||||
numPositions++
|
||||
}
|
||||
|
||||
hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
|
||||
hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles)
|
||||
hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||
|
||||
hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
|
||||
hiddenState = m.ClassEmbedding.Stack(ctx, 2, slices.Repeat([]ml.Tensor{m.ClassEmbedding}, m.numTiles-1)...).Concat(ctx, hiddenState, 1)
|
||||
|
||||
hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
|
||||
hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)
|
||||
|
||||
numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
|
||||
hiddenState = hiddenState.Pad(ctx, 0, numPaddingPatches, 0, 0)
|
||||
|
||||
hiddenState = hiddenState.Reshape(ctx, hiddenState.Dim(0), hiddenState.Dim(1)*hiddenState.Dim(2), batchSize)
|
||||
hiddenState, intermediateHiddenStates := m.Transformer.Forward(ctx, hiddenState, m.intermediateLayersIndices, m.VisionModelOptions)
|
||||
|
||||
hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
|
||||
|
||||
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
|
||||
hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
|
||||
|
||||
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize)
|
||||
hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)
|
||||
|
||||
hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
|
||||
hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
|
||||
hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)
|
||||
|
||||
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
|
||||
hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0)
|
||||
return hiddenState.Concat(ctx, hiddenStates, 0)
|
||||
}
|
||||
|
||||
func newVisionModel(c ml.Config) *VisionModel {
|
||||
return &VisionModel{
|
||||
Transformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count"))},
|
||||
GlobalTransformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.global.block_count"))},
|
||||
|
||||
VisionModelOptions: &VisionModelOptions{
|
||||
hiddenSize: int(c.Uint("vision.embedding_length")),
|
||||
numHeads: int(c.Uint("vision.attention.head_count")),
|
||||
numTiles: int(c.Uint("vision.max_num_tiles")),
|
||||
|
||||
imageSize: int(c.Uint("vision.image_size")),
|
||||
patchSize: int(c.Uint("vision.patch_size")),
|
||||
|
||||
eps: c.Float("vision.attention.layer_norm_epsilon"),
|
||||
|
||||
intermediateLayersIndices: c.Uints("vision.intermediate_layers_indices"),
|
||||
},
|
||||
}
|
||||
}
|
||||
240
model/models/mllama/process_image.go
Normal file
240
model/models/mllama/process_image.go
Normal file
@@ -0,0 +1,240 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"image"
|
||||
"image/color"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"golang.org/x/image/draw"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
type ImageProcessor struct {
|
||||
imageSize, numChannels, maxNumTiles int
|
||||
}
|
||||
|
||||
func newImageProcessor(c ml.Config) ImageProcessor {
|
||||
return ImageProcessor{
|
||||
imageSize: int(c.Uint("vision.image_size")),
|
||||
numChannels: int(c.Uint("vision.num_channels")),
|
||||
maxNumTiles: int(c.Uint("vision.max_num_tiles")),
|
||||
}
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
|
||||
ratios := []image.Point{}
|
||||
|
||||
for w := range maxTiles {
|
||||
for h := range maxTiles {
|
||||
if (w+1)*(h+1) <= maxTiles {
|
||||
ratios = append(ratios, image.Point{w + 1, h + 1})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ratios
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) clip(a, a_min, a_max int) int {
|
||||
if a < a_min {
|
||||
return a_min
|
||||
} else if a > a_max {
|
||||
return a_max
|
||||
}
|
||||
|
||||
return a
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
|
||||
targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
|
||||
targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
|
||||
|
||||
scaleWidth := float64(targetWidth) / float64(imageSize.X)
|
||||
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
|
||||
|
||||
var w, h int
|
||||
|
||||
if scaleWidth < scaleHeight {
|
||||
w = targetWidth
|
||||
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
|
||||
} else {
|
||||
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
|
||||
h = targetHeight
|
||||
}
|
||||
|
||||
return image.Point{w, h}
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
|
||||
possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
|
||||
possibleCanvasSizes := []image.Point{}
|
||||
for _, pta := range possibleTileArrangements {
|
||||
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
|
||||
}
|
||||
|
||||
scales := []float64{}
|
||||
|
||||
for _, pcs := range possibleCanvasSizes {
|
||||
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
|
||||
scaleWidth := float64(pcs.X) / float64(imageSize.X)
|
||||
|
||||
if scaleWidth > scaleHeight {
|
||||
scales = append(scales, scaleHeight)
|
||||
} else {
|
||||
scales = append(scales, scaleWidth)
|
||||
}
|
||||
}
|
||||
|
||||
var minUpscale float64
|
||||
var maxDownscale float64
|
||||
var upscale bool
|
||||
|
||||
for _, s := range scales {
|
||||
if s > 1.0 {
|
||||
upscale = true
|
||||
if minUpscale == 0 {
|
||||
minUpscale = s
|
||||
} else {
|
||||
minUpscale = math.Min(minUpscale, s)
|
||||
}
|
||||
} else {
|
||||
maxDownscale = math.Max(maxDownscale, s)
|
||||
}
|
||||
}
|
||||
|
||||
selectedScale := maxDownscale
|
||||
if upscale {
|
||||
selectedScale = minUpscale
|
||||
}
|
||||
|
||||
var selectedCanvas image.Point
|
||||
for n, pcs := range possibleCanvasSizes {
|
||||
if scales[n] == selectedScale {
|
||||
// choose the smallest possible canvas
|
||||
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
|
||||
selectedCanvas = pcs
|
||||
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
|
||||
selectedCanvas = pcs
|
||||
}
|
||||
}
|
||||
}
|
||||
return selectedCanvas
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
|
||||
b := img.Bounds()
|
||||
width := b.Max.X - b.Min.X
|
||||
height := b.Max.Y - b.Min.Y
|
||||
tileHeight := height / numTilesSize.Y
|
||||
tileWidth := width / numTilesSize.X
|
||||
|
||||
images := []image.Image{}
|
||||
|
||||
for h := range numTilesSize.Y {
|
||||
for w := range numTilesSize.X {
|
||||
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
|
||||
images = append(images, img.(interface {
|
||||
SubImage(image.Rectangle) image.Image
|
||||
}).SubImage(rect))
|
||||
}
|
||||
}
|
||||
|
||||
return images
|
||||
}
|
||||
|
||||
// remove the "alpha" channel by drawing over a prefilled image
|
||||
//
|
||||
// remove the "alpha" channel by drawing over a prefilled image
|
||||
//
|
||||
//nolint:unused
|
||||
func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
|
||||
dst := image.NewRGBA(img.Bounds())
|
||||
|
||||
white := color.RGBA{255, 255, 255, 255}
|
||||
draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
|
||||
draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
|
||||
b := img.Bounds()
|
||||
tileSize := outputSize.Y
|
||||
|
||||
canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
|
||||
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
|
||||
newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)
|
||||
|
||||
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
|
||||
|
||||
// scaling choices:
|
||||
// NearestNeighbor fast, blocky output
|
||||
// ApproxBiLinear fast, medium quality
|
||||
// BiLinear slow, high quality
|
||||
// CatmullRom very slow, very high quality
|
||||
draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
|
||||
|
||||
return dst, aspectRatio
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
|
||||
paddedSize := image.Point{
|
||||
X: outputSize.X * aspectRatio.X,
|
||||
Y: outputSize.Y * aspectRatio.Y,
|
||||
}
|
||||
|
||||
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
|
||||
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
|
||||
subImages := p.splitToTiles(img, aspectRatio)
|
||||
|
||||
var pixelVals []float32
|
||||
|
||||
for _, subImg := range subImages {
|
||||
bounds := subImg.Bounds()
|
||||
var rVals, gVals, bVals []float32
|
||||
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
|
||||
for x := bounds.Min.X; x < bounds.Max.X; x++ {
|
||||
c := subImg.At(x, y)
|
||||
r, g, b, _ := c.RGBA()
|
||||
rVal := float32(r>>8) / 255.0
|
||||
gVal := float32(g>>8) / 255.0
|
||||
bVal := float32(b>>8) / 255.0
|
||||
|
||||
rVal = (rVal - mean[0]) / std[0]
|
||||
gVal = (gVal - mean[1]) / std[1]
|
||||
bVal = (bVal - mean[2]) / std[2]
|
||||
|
||||
rVals = append(rVals, rVal)
|
||||
gVals = append(gVals, gVal)
|
||||
bVals = append(bVals, bVal)
|
||||
}
|
||||
}
|
||||
pixelVals = append(pixelVals, rVals...)
|
||||
pixelVals = append(pixelVals, gVals...)
|
||||
pixelVals = append(pixelVals, bVals...)
|
||||
}
|
||||
|
||||
return pixelVals
|
||||
}
|
||||
|
||||
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
|
||||
outputSize := image.Point{p.imageSize, p.imageSize}
|
||||
|
||||
// clip values
|
||||
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
|
||||
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
|
||||
|
||||
newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
|
||||
newImage = p.pad(newImage, outputSize, aspectRatio)
|
||||
|
||||
data := p.pack(newImage, aspectRatio, mean, std)
|
||||
aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
|
||||
return data, aspectRatioIndex, nil
|
||||
}
|
||||
6
model/models/models.go
Normal file
6
model/models/models.go
Normal file
@@ -0,0 +1,6 @@
|
||||
package models
|
||||
|
||||
import (
|
||||
_ "github.com/ollama/ollama/model/models/llama"
|
||||
_ "github.com/ollama/ollama/model/models/mllama"
|
||||
)
|
||||
68
model/models/pixtral/imageproc.go
Normal file
68
model/models/pixtral/imageproc.go
Normal file
@@ -0,0 +1,68 @@
|
||||
package pixtral
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"image"
|
||||
_ "image/jpeg"
|
||||
_ "image/png"
|
||||
"io"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/model/imageproc"
|
||||
)
|
||||
|
||||
func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
|
||||
return image.Point{
|
||||
(imageSize.X-1)/patchSize.X + 1,
|
||||
(imageSize.Y-1)/patchSize.Y + 1,
|
||||
}
|
||||
}
|
||||
|
||||
func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
|
||||
b := img.Bounds()
|
||||
le := float64(longestEdge)
|
||||
ratio := math.Max(float64(b.Max.Y)/le, float64(b.Max.X)/le)
|
||||
|
||||
newSize := img.Bounds().Max
|
||||
|
||||
if ratio > 1.0 {
|
||||
newSize = image.Point{
|
||||
int(math.Ceil(float64(b.Max.X) / ratio)),
|
||||
int(math.Ceil(float64(b.Max.Y) / ratio)),
|
||||
}
|
||||
}
|
||||
|
||||
tokens := getNumImageTokens(newSize, patchSize)
|
||||
return image.Point{
|
||||
tokens.X * patchSize.X,
|
||||
tokens.Y * patchSize.Y,
|
||||
}
|
||||
}
|
||||
|
||||
func resizeImage(img image.Image, format string, longestEdge int, patchSize image.Point) image.Image {
|
||||
if format == "png" {
|
||||
img = imageproc.Composite(img)
|
||||
}
|
||||
|
||||
newSize := getResizeOutputImageSize(img, longestEdge, patchSize)
|
||||
|
||||
// todo should be ResizeBicubic, but it doesn't exist
|
||||
return imageproc.Resize(img, newSize, imageproc.ResizeBilinear)
|
||||
}
|
||||
|
||||
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
|
||||
img, format, err := image.Decode(imageData)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
|
||||
}
|
||||
|
||||
longestEdge := 1024
|
||||
patchSize := image.Point{16, 16}
|
||||
|
||||
img = resizeImage(img, format, longestEdge, patchSize)
|
||||
|
||||
data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
|
||||
|
||||
opts := map[string]any{}
|
||||
return data, opts, nil
|
||||
}
|
||||
219
model/models/pixtral/imageproc_test.go
Normal file
219
model/models/pixtral/imageproc_test.go
Normal file
@@ -0,0 +1,219 @@
|
||||
package pixtral
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"image"
|
||||
"image/png"
|
||||
"math"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func TestGetNumImageTokens(t *testing.T) {
|
||||
type numImageTokensCase struct {
|
||||
ImageSize image.Point
|
||||
PatchSize image.Point
|
||||
Expected image.Point
|
||||
}
|
||||
|
||||
cases := []numImageTokensCase{
|
||||
{
|
||||
ImageSize: image.Point{1024, 764},
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{64, 48},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{800, 600},
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{50, 38},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{640, 480},
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{40, 30},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{320, 200},
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{20, 13},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{1320, 200},
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{83, 13},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{2000, 200},
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{125, 13},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{10000, 200},
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{625, 13},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{1131, 577},
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{71, 37},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{16, 16},
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{1, 1},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := getNumImageTokens(c.ImageSize, c.PatchSize)
|
||||
|
||||
if diff := cmp.Diff(actual, c.Expected); diff != "" {
|
||||
t.Errorf("mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetResizeOutputImageSize(t *testing.T) {
|
||||
type resizeCase struct {
|
||||
Image image.Image
|
||||
LongestEdge int
|
||||
PatchSize image.Point
|
||||
Expected image.Point
|
||||
}
|
||||
|
||||
cases := []resizeCase{
|
||||
{
|
||||
Image: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
|
||||
LongestEdge: 1024,
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{1024, 768},
|
||||
},
|
||||
{
|
||||
Image: image.NewRGBA(image.Rect(0, 0, 1162, 690)),
|
||||
LongestEdge: 1024,
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{1024, 624},
|
||||
},
|
||||
{
|
||||
Image: image.NewRGBA(image.Rect(0, 0, 300, 200)),
|
||||
LongestEdge: 1024,
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{304, 208},
|
||||
},
|
||||
{
|
||||
Image: image.NewRGBA(image.Rect(0, 0, 1862, 522)),
|
||||
LongestEdge: 1024,
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.Point{1024, 288},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := getResizeOutputImageSize(c.Image, c.LongestEdge, c.PatchSize)
|
||||
|
||||
if diff := cmp.Diff(actual, c.Expected); diff != "" {
|
||||
t.Errorf("mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResize(t *testing.T) {
|
||||
type resizeCase struct {
|
||||
Image image.Image
|
||||
LongestEdge int
|
||||
PatchSize image.Point
|
||||
Expected image.Image
|
||||
}
|
||||
|
||||
cases := []resizeCase{
|
||||
{
|
||||
Image: image.NewRGBA(image.Rect(0, 0, 1862, 522)),
|
||||
LongestEdge: 1024,
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.NewRGBA(image.Rect(0, 0, 1024, 288)),
|
||||
},
|
||||
{
|
||||
Image: image.NewRGBA(image.Rect(0, 0, 10, 10)),
|
||||
LongestEdge: 1024,
|
||||
PatchSize: image.Point{16, 16},
|
||||
Expected: image.NewRGBA(image.Rect(0, 0, 16, 16)),
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := resizeImage(c.Image, "png", c.LongestEdge, c.PatchSize)
|
||||
|
||||
if actual.Bounds() != c.Expected.Bounds() {
|
||||
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPreprocess(t *testing.T) {
|
||||
type preprocessCase struct {
|
||||
TestImage image.Image
|
||||
ExpectedLen int
|
||||
}
|
||||
|
||||
cases := []preprocessCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
|
||||
ExpectedLen: 16 * 16 * 3 * 1,
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
|
||||
ExpectedLen: 1024 * 1024 * 3 * 1,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
var buf bytes.Buffer
|
||||
err := png.Encode(&buf, c.TestImage)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
imgData, _, err := Preprocess(&buf)
|
||||
if err != nil {
|
||||
t.Fatalf("error processing: %q", err)
|
||||
}
|
||||
|
||||
switch len(imgData) {
|
||||
case 0:
|
||||
t.Errorf("no image data returned")
|
||||
case c.ExpectedLen:
|
||||
// ok
|
||||
default:
|
||||
t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPreprocessImages(t *testing.T) {
|
||||
for _, testFile := range []string{"flight.png", "sportsball.png"} {
|
||||
f, err := os.Open(testFile)
|
||||
if err != nil {
|
||||
t.Skipf("skipping test, no test image found at %s", testFile)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
imgData, _, err := Preprocess(f)
|
||||
if err != nil {
|
||||
t.Fatalf("error processing: %q", err)
|
||||
}
|
||||
|
||||
byteData := make([]byte, len(imgData)*4) // float32 is 4 bytes
|
||||
for i, f := range imgData {
|
||||
binary.LittleEndian.PutUint32(byteData[i*4:], math.Float32bits(f))
|
||||
}
|
||||
|
||||
outputPath := "processed_" + testFile + ".bin"
|
||||
err = os.WriteFile(outputPath, byteData, 0o644)
|
||||
if err != nil {
|
||||
t.Fatalf("error writing processed image: %q", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
74
model/models/qwen2vl/imageproc.go
Normal file
74
model/models/qwen2vl/imageproc.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package qwen2vl
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"image"
|
||||
_ "image/jpeg"
|
||||
_ "image/png"
|
||||
"io"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/model/imageproc"
|
||||
)
|
||||
|
||||
const (
|
||||
DefaultFactor = 28
|
||||
DefaultMinPixels = 56 * 56
|
||||
DefaultMaxPixels = 14 * 14 * 4 * 1280
|
||||
)
|
||||
|
||||
// smartResize calculates the size of the image to resize to based on the
|
||||
// factor, minPixels, and maxPixels.
|
||||
func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
|
||||
// 1. Both dimensions of size are divisible by factor
|
||||
// 2. The area of the image is between minPixels and maxPixels
|
||||
// 3. The aspect ratio of the image is as close to 1:1 as possible
|
||||
|
||||
if size.Y < factor || size.X < factor {
|
||||
panic("image is too small to resize")
|
||||
} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
|
||||
panic("aspect ratio must be less than 200:1")
|
||||
}
|
||||
|
||||
f := float64(factor)
|
||||
width := float64(size.X)
|
||||
height := float64(size.Y)
|
||||
|
||||
xBar := math.Round(width/f) * f
|
||||
yBar := math.Round(height/f) * f
|
||||
|
||||
if xBar*yBar > float64(maxPixels) {
|
||||
beta := math.Sqrt(height * width / float64(maxPixels))
|
||||
xBar = math.Floor(width/beta/f) * f
|
||||
yBar = math.Floor(height/beta/f) * f
|
||||
} else if xBar*yBar < float64(minPixels) {
|
||||
beta := math.Sqrt(float64(minPixels) / (height * width))
|
||||
xBar = math.Ceil(width*beta/f) * f
|
||||
yBar = math.Ceil(height*beta/f) * f
|
||||
}
|
||||
|
||||
return image.Point{int(xBar), int(yBar)}
|
||||
}
|
||||
|
||||
func resizeImage(img image.Image, format string, size image.Point) image.Image {
|
||||
if format == "png" {
|
||||
img = imageproc.Composite(img)
|
||||
}
|
||||
|
||||
return imageproc.Resize(img, size, imageproc.ResizeBilinear)
|
||||
}
|
||||
|
||||
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
|
||||
img, format, err := image.Decode(imageData)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
|
||||
}
|
||||
|
||||
size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
|
||||
img = resizeImage(img, format, size)
|
||||
|
||||
data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
|
||||
|
||||
opts := map[string]any{}
|
||||
return data, opts, nil
|
||||
}
|
||||
78
model/models/qwen2vl/imageproc_test.go
Normal file
78
model/models/qwen2vl/imageproc_test.go
Normal file
@@ -0,0 +1,78 @@
|
||||
package qwen2vl
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"image"
|
||||
"image/png"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSmartResize(t *testing.T) {
|
||||
type smartResizeCase struct {
|
||||
TestImage image.Image
|
||||
Expected image.Point
|
||||
}
|
||||
|
||||
cases := []smartResizeCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
|
||||
Expected: image.Point{980, 980},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
|
||||
Expected: image.Point{1036, 756},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
|
||||
Expected: image.Point{980, 980},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
b := c.TestImage.Bounds().Max
|
||||
actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
|
||||
if actual != c.Expected {
|
||||
t.Errorf("expected: %v, actual: %v", c.Expected, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPreprocess(t *testing.T) {
|
||||
type preprocessCase struct {
|
||||
TestImage image.Image
|
||||
ExpectedLen int
|
||||
}
|
||||
|
||||
cases := []preprocessCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 256, 256)),
|
||||
ExpectedLen: 252 * 252 * 3 * 1,
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
|
||||
ExpectedLen: 980 * 980 * 3 * 1,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
var buf bytes.Buffer
|
||||
err := png.Encode(&buf, c.TestImage)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
imgData, _, err := Preprocess(&buf)
|
||||
if err != nil {
|
||||
t.Fatalf("error processing: %q", err)
|
||||
}
|
||||
|
||||
switch len(imgData) {
|
||||
case 0:
|
||||
t.Errorf("no image data returned")
|
||||
case c.ExpectedLen:
|
||||
// ok
|
||||
default:
|
||||
t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen)
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user