chore: update mllama to use ollama engine (#10637)
@@ -1,201 +0,0 @@
package mllama

import (
	"fmt"
	"image"
	_ "image/jpeg"
	_ "image/png"
	"io"
	"math"
	"slices"

	"golang.org/x/image/draw"

	"github.com/ollama/ollama/model/imageproc"
)

func getSupportedAspectRatios(maxTiles int) []image.Point {
	ratios := []image.Point{}

	for w := range maxTiles {
		for h := range maxTiles {
			if (w+1)*(h+1) <= maxTiles {
				ratios = append(ratios, image.Point{w + 1, h + 1})
			}
		}
	}

	return ratios
}

func clip(a, a_min, a_max int) int {
	if a < a_min {
		return a_min
	} else if a > a_max {
		return a_max
	}

	return a
}

func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
	possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
	possibleCanvasSizes := []image.Point{}
	for _, pta := range possibleTileArrangements {
		possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
	}

	scales := []float64{}

	for _, pcs := range possibleCanvasSizes {
		scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
		scaleWidth := float64(pcs.X) / float64(imageSize.X)

		if scaleWidth > scaleHeight {
			scales = append(scales, scaleHeight)
		} else {
			scales = append(scales, scaleWidth)
		}
	}

	var minUpscale float64
	var maxDownscale float64
	var upscale bool

	for _, s := range scales {
		if s > 1.0 {
			upscale = true
			if minUpscale == 0 {
				minUpscale = s
			} else {
				minUpscale = math.Min(minUpscale, s)
			}
		} else {
			maxDownscale = math.Max(maxDownscale, s)
		}
	}

	selectedScale := maxDownscale
	if upscale {
		selectedScale = minUpscale
	}

	var selectedCanvas image.Point
	for n, pcs := range possibleCanvasSizes {
		if scales[n] == selectedScale {
			// choose the smallest possible canvas
			if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
				selectedCanvas = pcs
			} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
				selectedCanvas = pcs
			}
		}
	}
	return selectedCanvas
}

func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
	targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)

	scaleWidth := float64(targetWidth) / float64(imageSize.X)
	scaleHeight := float64(targetHeight) / float64(imageSize.Y)

	var w, h int

	if scaleWidth < scaleHeight {
		w = targetWidth
		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
	} else {
		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
		h = targetHeight
	}

	return image.Point{w, h}
}

func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
	if format == "png" {
		img = imageproc.Composite(img)
	}

	b := img.Bounds()
	tileSize := outputSize.Y

	canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
	newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)

	return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
}

func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
	}

	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
	draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)

	return dst
}

func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
	b := img.Bounds()
	width := b.Max.X - b.Min.X
	height := b.Max.Y - b.Min.Y
	tileHeight := height / numTilesSize.Y
	tileWidth := width / numTilesSize.X

	images := []image.Image{}

	for h := range numTilesSize.Y {
		for w := range numTilesSize.X {
			rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
			images = append(images, img.(interface {
				SubImage(image.Rectangle) image.Image
			}).SubImage(rect))
		}
	}

	return images
}

func packImages(img image.Image, aspectRatio image.Point) []float32 {
	subImages := splitToTiles(img, aspectRatio)

	var pixelVals []float32

	rescale := true
	channelFirst := true

	for _, subImg := range subImages {
		vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
		pixelVals = append(pixelVals, vals...)
	}

	return pixelVals
}

func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
	outputSize := image.Point{560, 560}
	maxTiles := 4

	img, format, err := image.Decode(imageData)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
	}

	newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
	newImage = padImage(newImage, outputSize, aspectRatio)

	data := packImages(newImage, aspectRatio)
	aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1

	opts := map[string]any{
		"aspectRatioIndex": aspectRatioIndex,
	}

	return data, opts, nil
}
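For orientation, here is a minimal, hypothetical sketch (not part of this commit) of how the removed one-shot Preprocess entry point above was typically driven. It assumes caller code living alongside the old package, with an "os" import; preprocessFile and its path argument are illustrative names only.

// Sketch only (pre-commit API): decode a file and run the removed
// Preprocess helper, which resized, padded, tiled, and normalized the
// image, returning flattened pixel values plus an "aspectRatioIndex"
// option for the runner.
func preprocessFile(path string) ([]float32, map[string]any, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, nil, err
	}
	defer f.Close()

	// With the defaults above (560x560 tiles, at most 4 tiles), the
	// returned slice holds numTiles * 3 * 560 * 560 values.
	return Preprocess(f)
}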
@@ -1,420 +0,0 @@
package mllama

import (
	"bytes"
	"image"
	"image/png"
	"testing"

	"github.com/google/go-cmp/cmp"
)

func TestAspectRatios(t *testing.T) {
	type aspectCase struct {
		MaxTiles int
		Expected []image.Point
	}

	cases := []aspectCase{
		{
			MaxTiles: 1,
			Expected: []image.Point{{1, 1}},
		},
		{
			MaxTiles: 2,
			Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}},
		},
		{
			MaxTiles: 3,
			Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}},
		},
		{
			MaxTiles: 4,
			Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}},
		},
	}

	for _, c := range cases {
		actual := getSupportedAspectRatios(c.MaxTiles)

		if diff := cmp.Diff(actual, c.Expected); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestGetImageSizeFitToCanvas(t *testing.T) {
	type imageSizeCase struct {
		ImageRect  image.Point
		CanvasRect image.Point
		TileSize   int
		Expected   image.Point
	}

	cases := []imageSizeCase{
		{
			ImageRect:  image.Point{400, 400},
			CanvasRect: image.Point{640, 480},
			TileSize:   200,
			Expected:   image.Point{400, 400},
		},
		{
			ImageRect:  image.Point{1024, 768},
			CanvasRect: image.Point{640, 480},
			TileSize:   200,
			Expected:   image.Point{640, 480},
		},
		{
			ImageRect:  image.Point{500, 500},
			CanvasRect: image.Point{1000, 1000},
			TileSize:   750,
			Expected:   image.Point{750, 750},
		},
		{
			ImageRect:  image.Point{500, 1000},
			CanvasRect: image.Point{2000, 2000},
			TileSize:   2000,
			Expected:   image.Point{1000, 2000},
		},
		{
			ImageRect:  image.Point{4000, 3000},
			CanvasRect: image.Point{2000, 1000},
			TileSize:   1000,
			Expected:   image.Point{1333, 1000},
		},
		{
			ImageRect:  image.Point{667, 1000},
			CanvasRect: image.Point{1000, 1000},
			TileSize:   560,
			Expected:   image.Point{667, 1000},
		},
	}

	for _, c := range cases {
		actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)

		if actual != c.Expected {
			t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
		}
	}
}

func TestGetOptimalTiledCanvas(t *testing.T) {
	type tiledCanvasSizeCase struct {
		ImageSize     image.Point
		MaxImageTiles int
		TileSize      int
		Expected      image.Point
	}

	cases := []tiledCanvasSizeCase{
		{
			ImageSize:     image.Point{1024, 768},
			MaxImageTiles: 4,
			TileSize:      1000,
			Expected:      image.Point{2000, 1000},
		},
		{
			ImageSize:     image.Point{1024, 768},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
		{
			ImageSize:     image.Point{800, 600},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
		{
			ImageSize:     image.Point{640, 480},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 560},
		},
		{
			ImageSize:     image.Point{320, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 560},
		},
		{
			ImageSize:     image.Point{1320, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1680, 560},
		},
		{
			ImageSize:     image.Point{2000, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{2240, 560},
		},
		{
			ImageSize:     image.Point{10000, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{2240, 560},
		},
		{
			ImageSize:     image.Point{480, 640},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 1120},
		},
		{
			ImageSize:     image.Point{200, 320},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 560},
		},
		{
			ImageSize:     image.Point{200, 1320},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 1680},
		},
		{
			ImageSize:     image.Point{200, 2000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 2240},
		},
		{
			ImageSize:     image.Point{200, 10000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 2240},
		},
		{
			ImageSize:     image.Point{10000, 10000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
	}

	for _, c := range cases {
		actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)

		if actual != c.Expected {
			t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
		}
	}
}

func TestSplitToTiles(t *testing.T) {
	type splitCase struct {
		TestImage    image.Image
		NumTilesSize image.Point
		Expected     []image.Image
	}

	cases := []splitCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			NumTilesSize: image.Point{1, 1},
			Expected:     []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 500)),
			NumTilesSize: image.Point{2, 1},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
			},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
			NumTilesSize: image.Point{2, 2},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
			},
		},
	}

	for _, c := range cases {
		actual := splitToTiles(c.TestImage, c.NumTilesSize)

		if len(actual) != len(c.Expected) {
			t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
		}

		for i := range actual {
			if actual[i].Bounds() != c.Expected[i].Bounds() {
				t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
			}
		}
	}
}

func TestResize(t *testing.T) {
	type resizeCase struct {
		TestImage           image.Image
		OutputSize          image.Point
		MaxImageTiles       int
		ExpectedImage       image.Image
		ExpectedAspectRatio image.Point
	}

	cases := []resizeCase{
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
			OutputSize:          image.Point{100, 100},
			MaxImageTiles:       1,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
			OutputSize:          image.Point{100, 100},
			MaxImageTiles:       2,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 10, 10)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 1120, 840)),
			ExpectedAspectRatio: image.Point{2, 2},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedAspectRatio: image.Point{2, 2},
		},
	}

	for _, c := range cases {
		actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)

		if actualImage.Bounds() != c.ExpectedImage.Bounds() {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
		}

		if actualAspectRatio != c.ExpectedAspectRatio {
			t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
		}
	}
}

func TestPad(t *testing.T) {
	type padCase struct {
		TestImage   image.Image
		OutputSize  image.Point
		AspectRatio image.Point
		Expected    image.Image
	}

	cases := []padCase{
		{
			TestImage:   image.NewRGBA(image.Rect(0, 0, 1000, 667)),
			OutputSize:  image.Point{560, 560},
			AspectRatio: image.Point{2, 2},
			Expected:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
		},
	}

	for _, c := range cases {
		actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)

		if actual.Bounds() != c.Expected.Bounds() {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
		}
	}
}

func TestPackImages(t *testing.T) {
	type packCase struct {
		TestImage    image.Image
		AspectRatio  image.Point
		ExpectedVals int
	}

	cases := []packCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
			AspectRatio:  image.Point{2, 2},
			ExpectedVals: 2 * 2 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 560, 560)),
			AspectRatio:  image.Point{1, 1},
			ExpectedVals: 1 * 1 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 560)),
			AspectRatio:  image.Point{1, 2},
			ExpectedVals: 1 * 2 * 3 * 560 * 560,
		},
	}

	for _, c := range cases {
		actualVals := packImages(c.TestImage, c.AspectRatio)
		if len(actualVals) != c.ExpectedVals {
			t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
		}
	}
}

func TestPreprocess(t *testing.T) {
	type preprocessCase struct {
		TestImage             image.Image
		ExpectedVals          int
		ExpectedAspectRatioID int
	}

	cases := []preprocessCase{
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 10, 10)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 1,
		},
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 6,
		},
	}

	for _, c := range cases {
		var buf bytes.Buffer
		err := png.Encode(&buf, c.TestImage)
		if err != nil {
			t.Fatal(err)
		}

		imgData, opts, err := Preprocess(&buf)
		if err != nil {
			t.Fatalf("error processing: %q", err)
		}

		if len(imgData) == 0 {
			t.Errorf("no image data returned")
		}

		ar, ok := opts["aspectRatioIndex"]
		if !ok {
			t.Fatalf("no aspect ratio found")
		}

		aspectRatioID := ar.(int)

		if aspectRatioID != c.ExpectedAspectRatioID {
			t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
		}
	}
}
@@ -2,11 +2,7 @@ package mllama

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"hash/fnv"
	"image"
	"slices"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/kvcache"
@@ -34,10 +30,6 @@ const (
)

func New(c fs.Config) (model.Model, error) {
	// Verify unified config
	if c.Uint("vision.block_count") == 0 {
		return nil, fmt.Errorf("non-unified vision model not supported")
	}
	m := Model{
		BytePairEncoding: model.NewBytePairEncoding(
			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
@@ -76,22 +68,19 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
		return nil, err
	}

	f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image)
	f32s, ratio, err := m.ImageProcessor.ProcessImage(image)
	if err != nil {
		return nil, err
	}

	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
		m.ImageProcessor.imageSize,
		m.ImageProcessor.imageSize,
		m.ImageProcessor.numChannels,
		m.ImageProcessor.maxNumTiles,
	)
	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
	if err != nil {
		return nil, err
	}

	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1)
	pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())

	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
	if err != nil {
		return nil, err
	}
@@ -102,41 +91,19 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
}

func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
	var images []input.Input
	fnvHash := fnv.New64a()

	for i := range inputs {
		if inputs[i].Multimodal == nil {
			if len(images) > 0 {
				inputs[i].Multimodal = []ml.Tensor{images[0].Multimodal.(ml.Tensor)}
				inputs[i].MultimodalHash = images[0].MultimodalHash
				for j := 1; j < len(images); j++ {
					inputs[i].Multimodal = append(inputs[i].Multimodal.([]ml.Tensor), images[0].Multimodal.(ml.Tensor))
					fnvHash.Reset()
					binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash)
					binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash)
					inputs[i].MultimodalHash = fnvHash.Sum64()
				}
				images = nil
			}
		} else {
			images = append(images, inputs[i])
			inputs[i].Token = -1
		if inputs[i].Multimodal != nil {
			inputs[i].Token = 128256 // <|image|>
		}
	}

	inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 })

	return inputs, nil
}

func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	var crossAttentionStates ml.Tensor
	if len(batch.Multimodal) > 0 {
		images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor)
		if len(images) > 0 {
			crossAttentionStates = images[len(images)-1]
		}
		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
	}

	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
@@ -150,7 +117,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	}

	// TODO: attention mask, cross attention mask
	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
}

func init() {

@@ -18,7 +18,7 @@ type TextSelfAttention struct {
	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
}

func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
	batchSize := hiddenState.Dim(1)
	headDim := opts.hiddenSize / opts.numHeads
	ropeType := uint32(0)
@@ -69,11 +69,11 @@ type TextSelfAttentionDecoderLayer struct {
	MLP *TextMLP
}

func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
	residual := hiddenState

	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts)
	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)

	// In the final layer (outputs != nil), optimize by pruning to just the token positions
	// we need logits for.
@@ -151,7 +151,7 @@ type TextCrossAttentionDecoderLayer struct {
	MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
}

func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
	residual := hiddenState

	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -167,14 +167,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
}

type TextDecoderLayer interface {
	Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
	Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
}

type TextDecoder struct {
	Layers []TextDecoderLayer
}

func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
	for i, layer := range d.Layers {
		layerType := selfAttentionLayer
		if slices.Contains(opts.crossAttentionLayers, int32(i)) {
@@ -190,7 +190,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
			lastLayerOutputs = outputs
		}

		hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts)
		hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts)
	}
}

@@ -214,9 +214,9 @@ type TextModel struct {
	*TextModelOptions
}

func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
	hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
	return m.Output.Forward(ctx, hiddenState)
}

@@ -15,7 +15,7 @@ type VisionSelfAttention struct {
	Query *nn.Linear `gguf:"attn_q"`
	Key *nn.Linear `gguf:"attn_k"`
	Value *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_out"`
	Output *nn.Linear `gguf:"attn_output"`

	Gate ml.Tensor `gguf:"attn_gate"`
}
@@ -45,36 +45,29 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)

	hiddenState = sa.Output.Forward(ctx, attention)
	if sa.Gate != nil {
		hiddenState = hiddenState.Mul(ctx, sa.Gate)
	}

	return hiddenState
}

type VisionMLP struct {
	Down *nn.Linear `gguf:"ffn_down"`
	Up *nn.Linear `gguf:"ffn_up"`

	Gate ml.Tensor `gguf:"ffn_gate"`
	Down *nn.Linear `gguf:"ffn_down"`
}

func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx)
	hiddenState = mlp.Up.Forward(ctx, hiddenState)
	if mlp.Gate != nil {
		hiddenState = hiddenState.Mul(ctx, mlp.Gate)
	}
	hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx)
	hiddenState = mlp.Down.Forward(ctx, hiddenState)

	return hiddenState
}

type VisionEncoderLayer struct {
	AttentionNorm *nn.LayerNorm `gguf:"ln1"`
	AttentionNorm *nn.LayerNorm `gguf:"attn_norm"`
	SelfAttention *VisionSelfAttention
	AttentionGate ml.Tensor `gguf:"attn_gate"`

	MLPNorm *nn.LayerNorm `gguf:"ln2"`
	MLPNorm *nn.LayerNorm `gguf:"ffn_norm"`
	MLP *VisionMLP
	MLPGate ml.Tensor `gguf:"ffn_gate"`
}

func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
@@ -83,13 +76,22 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
	// self attention
	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)

	if e.AttentionGate != nil {
		hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
	}
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	// feed forward
	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
	return hiddenState.Add(ctx, residual)
	hiddenState = hiddenState.Add(ctx, residual)
	if e.MLPGate != nil {
		hiddenState = hiddenState.Mul(ctx, e.MLPGate)
	}

	return hiddenState
}

type VisionEncoder struct {
@@ -114,9 +116,9 @@ type PrecomputedAspectRatioEmbedding struct {
	Gate ml.Tensor `gguf:"gate"`
}

func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor {
	embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles)
	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles)
	if e.Gate != nil {
		embeddings = embeddings.Mul(ctx, e.Gate)
	}
@@ -132,7 +134,7 @@ type PrecomputedPositionEmbedding struct {
	TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"`
}

func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor {
func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor {
	positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
	if e.PositionEmbeddingGate != nil {
		positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
@@ -141,7 +143,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
	hiddenState = hiddenState.Add(ctx, positionEmbedding)

	tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles)
	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles)
	if e.TilePositionEmbeddingGate != nil {
		tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
	}
@@ -150,9 +152,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
}

type VisionModelOptions struct {
	hiddenSize, numHeads, numTiles int
	imageSize, patchSize int
	eps float32
	hiddenSize, numHeads int
	imageSize, patchSize int
	eps float32

	intermediateLayersIndices []int32
}
@@ -181,14 +183,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
		numPositions++
	}

	numTiles := pixelValues.Dim(3)

	hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles)
	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles)
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
	hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1)
	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
	hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1)

	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions)
	hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)

	numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
@@ -199,18 +203,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa

	hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)

	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)

	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize)
	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize)
	hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)

	hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
	hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)
	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
	hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0)

	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
	hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0)
	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
	hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0)
	return hiddenState.Concat(ctx, hiddenStates, 0)
}

@@ -222,7 +226,6 @@ func newVisionModel(c fs.Config) *VisionModel {
		VisionModelOptions: &VisionModelOptions{
			hiddenSize: int(c.Uint("vision.embedding_length")),
			numHeads: int(c.Uint("vision.attention.head_count")),
			numTiles: int(c.Uint("vision.max_num_tiles")),

			imageSize: int(c.Uint("vision.image_size")),
			patchSize: int(c.Uint("vision.patch_size")),

@@ -2,17 +2,31 @@ package mllama

import (
	"image"
	"image/color"
	"math"
	"slices"

	"golang.org/x/image/draw"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/model/imageproc"
)

type supportedAspectRatio struct {
	rank, width, height int
}

func (a supportedAspectRatio) Point() image.Point {
	return image.Point{a.width, a.height}
}

func (a supportedAspectRatio) numTiles() int {
	return a.width * a.height
}

type ImageProcessor struct {
	imageSize, numChannels, maxNumTiles int

	mean, std [3]float32
}

func newImageProcessor(c fs.Config) ImageProcessor {
@@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor {
		imageSize: int(c.Uint("vision.image_size")),
		numChannels: int(c.Uint("vision.num_channels")),
		maxNumTiles: int(c.Uint("vision.max_num_tiles")),

		mean: imageproc.ClipDefaultMean,
		std: imageproc.ClipDefaultSTD,
	}
}

func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
	ratios := []image.Point{}

	for w := range maxTiles {
		for h := range maxTiles {
			if (w+1)*(h+1) <= maxTiles {
				ratios = append(ratios, image.Point{w + 1, h + 1})
			}
func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) {
	for w := 1; w <= p.maxNumTiles; w++ {
		for h := 1; h <= p.maxNumTiles/w; h++ {
			ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
		}
	}

	return ratios
}

func (p *ImageProcessor) clip(a, a_min, a_max int) int {
	if a < a_min {
		return a_min
	} else if a > a_max {
		return a_max
	}
func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
	tw := min(max(imageSize.X, p.imageSize), canvasSize.X)
	th := min(max(imageSize.Y, p.imageSize), canvasSize.Y)

	return a
}
	r := math.Min(
		float64(tw)/float64(imageSize.X),
		float64(th)/float64(imageSize.Y),
	)

func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
	targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)

	scaleWidth := float64(targetWidth) / float64(imageSize.X)
	scaleHeight := float64(targetHeight) / float64(imageSize.Y)

	var w, h int

	if scaleWidth < scaleHeight {
		w = targetWidth
		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
	} else {
		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
		h = targetHeight
	}
	w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
	h := min(int(math.Floor(float64(imageSize.Y)*r)), th)

	return image.Point{w, h}
}

func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
	possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
	possibleCanvasSizes := []image.Point{}
	for _, pta := range possibleTileArrangements {
		possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
	possibleTileArrangements := p.supportedAspectRatios()
	possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements))
	for i, pta := range possibleTileArrangements {
		possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize}
	}

	scales := []float64{}

	for _, pcs := range possibleCanvasSizes {
		scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
		scaleWidth := float64(pcs.X) / float64(imageSize.X)

		if scaleWidth > scaleHeight {
			scales = append(scales, scaleHeight)
		} else {
			scales = append(scales, scaleWidth)
		}
	scales := make([]float64, len(possibleCanvasSizes))
	for i, pcs := range possibleCanvasSizes {
		scales[i] = min(
			float64(pcs.Y)/float64(imageSize.Y),
			float64(pcs.X)/float64(imageSize.X),
		)
	}

	var minUpscale float64
@@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles
	return selectedCanvas
}

func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
	b := img.Bounds()
	width := b.Max.X - b.Min.X
	height := b.Max.Y - b.Min.Y
	tileHeight := height / numTilesSize.Y
	tileWidth := width / numTilesSize.X

	images := []image.Image{}
	images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X)

	for h := range numTilesSize.Y {
		for w := range numTilesSize.X {
			rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
			images = append(images, img.(interface {
			if subImg, ok := img.(interface {
				SubImage(image.Rectangle) image.Image
			}).SubImage(rect))
			}); ok {
				images = append(images, subImg.SubImage(rect))
			} else {
				// Handle the case where img does not implement SubImage
				// This is a fallback and may not be efficient
				newImg := image.NewRGBA(rect)
				draw.Draw(newImg, rect, img, rect.Min, draw.Src)
				images = append(images, newImg)
			}
		}
	}

	return images
}

// remove the "alpha" channel by drawing over a prefilled image
//
//nolint:unused
func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
	dst := image.NewRGBA(img.Bounds())

	white := color.RGBA{255, 255, 255, 255}
	draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)

	return dst
}

func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
	b := img.Bounds()
	tileSize := outputSize.Y

	canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
	newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)
	canvasSize := p.optimalTiledCanvas(b.Max)
	aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}
	newSize := p.fitToCanvas(b.Max, canvasSize)

	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))

@@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag
	return dst, aspectRatio
}

func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
		X: p.imageSize * aspectRatio.X,
		Y: p.imageSize * aspectRatio.Y,
	}

	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
@@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin
	return dst
}

func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
	subImages := p.splitToTiles(img, aspectRatio)

	var pixelVals []float32
@@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
			gVal := float32(g>>8) / 255.0
			bVal := float32(b>>8) / 255.0

			rVal = (rVal - mean[0]) / std[0]
			gVal = (gVal - mean[1]) / std[1]
			bVal = (bVal - mean[2]) / std[2]
			rVal = (rVal - p.mean[0]) / p.std[0]
			gVal = (gVal - p.mean[1]) / p.std[1]
			bVal = (bVal - p.mean[2]) / p.std[2]

			rVals = append(rVals, rVal)
			gVals = append(gVals, gVal)
@@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
	return pixelVals
}

func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
	outputSize := image.Point{p.imageSize, p.imageSize}
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
	newImage, newImageRatio := p.resize(img)
	newImage = p.pad(newImage, newImageRatio)
	pixelValues := p.pack(newImage, newImageRatio)

	// clip values
	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
	std := [3]float32{0.26862954, 0.26130258, 0.27577711}
	supportedAspectRatios := p.supportedAspectRatios()
	aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool {
		return i.width == newImageRatio.X && i.height == newImageRatio.Y
	})

	newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
	newImage = p.pad(newImage, outputSize, aspectRatio)

	data := p.pack(newImage, aspectRatio, mean, std)
	aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
	return data, aspectRatioIndex, nil
	return pixelValues, supportedAspectRatios[aspectRatioID], nil
}
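As an illustrative aside (not part of the commit), here is a minimal sketch of how the reworked ImageProcessor is used after this change. It assumes the code lives inside the mllama package, since ImageProcessor's fields and the supportedAspectRatio return type are unexported; ExampleImageProcessor is a hypothetical name, and the processor literal mirrors the new tests below.

// Sketch only: enumerate tile layouts and process one image with the new API.
func ExampleImageProcessor() {
	p := ImageProcessor{imageSize: 560, numChannels: 3, maxNumTiles: 4}

	// With maxNumTiles=4 the supported layouts get ranks 1..8:
	// 1x1, 1x2, 1x3, 1x4, 2x1, 2x2, 3x1, 4x1.
	for _, r := range p.supportedAspectRatios() {
		fmt.Println(r.rank, r.width, r.height, r.numTiles())
	}

	// ProcessImage resizes, pads, tiles, and normalizes, returning flattened
	// pixel values plus the chosen layout; rank doubles as the aspect-ratio ID
	// and numTiles() sizes the pixel-value tensor in EncodeMultimodal.
	f32s, ratio, err := p.ProcessImage(image.NewRGBA(image.Rect(0, 0, 1024, 768)))
	if err != nil {
		panic(err)
	}
	fmt.Println(len(f32s), ratio.rank, ratio.numTiles())
}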
model/models/mllama/process_image_test.go (new file, 387 lines)
@@ -0,0 +1,387 @@
package mllama

import (
	"image"
	"testing"

	"github.com/google/go-cmp/cmp"
)

func TestSupportedAspectRatios(t *testing.T) {
	cases := []struct {
		p    ImageProcessor
		want []supportedAspectRatio
	}{
		{
			p: ImageProcessor{maxNumTiles: 1},
			want: []supportedAspectRatio{
				{1, 1, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 2},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 2, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 3},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 1, 3},
				{4, 2, 1},
				{5, 3, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 4},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 1, 3},
				{4, 1, 4},
				{5, 2, 1},
				{6, 2, 2},
				{7, 3, 1},
				{8, 4, 1},
			},
		},
	}

	for _, tt := range cases {
		actual := tt.p.supportedAspectRatios()
		if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestFitToCanvas(t *testing.T) {
	cases := []struct {
		p      ImageProcessor
		image  image.Point
		canvas image.Point
		expect image.Point
	}{
		{
			p:      ImageProcessor{imageSize: 200},
			image:  image.Point{400, 400},
			canvas: image.Point{640, 480},
			expect: image.Point{400, 400},
		},
		{
			p:      ImageProcessor{imageSize: 200},
			image:  image.Point{1024, 768},
			canvas: image.Point{640, 480},
			expect: image.Point{640, 480},
		},
		{
			p:      ImageProcessor{imageSize: 750},
			image:  image.Point{500, 500},
			canvas: image.Point{1000, 1000},
			expect: image.Point{750, 750},
		},
		{
			p:      ImageProcessor{imageSize: 2000},
			image:  image.Point{500, 1000},
			canvas: image.Point{2000, 2000},
			expect: image.Point{1000, 2000},
		},
		{
			p:      ImageProcessor{imageSize: 1000},
			image:  image.Point{4000, 3000},
			canvas: image.Point{2000, 1000},
			expect: image.Point{1333, 1000},
		},
		{
			p:      ImageProcessor{imageSize: 560},
			image:  image.Point{667, 1000},
			canvas: image.Point{1000, 1000},
			expect: image.Point{667, 1000},
		},
	}

	for _, tt := range cases {
		actual := tt.p.fitToCanvas(tt.image, tt.canvas)
		if diff := cmp.Diff(actual, tt.expect); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestOptimalTiledCanvas(t *testing.T) {
	cases := []struct {
		p      ImageProcessor
		image  image.Point
		expect image.Point
	}{
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 1000},
			image:  image.Point{1024, 768},
			expect: image.Point{2000, 1000},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{1024, 768},
			expect: image.Point{1120, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{800, 600},
			expect: image.Point{1120, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{640, 480},
			expect: image.Point{1120, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{320, 200},
			expect: image.Point{560, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{1320, 200},
			expect: image.Point{1680, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{2000, 200},
			expect: image.Point{2240, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{10000, 200},
			expect: image.Point{2240, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{480, 640},
			expect: image.Point{560, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 320},
			expect: image.Point{560, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 1320},
			expect: image.Point{560, 1680},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 2000},
			expect: image.Point{560, 2240},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 10000},
			expect: image.Point{560, 2240},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{10000, 10000},
			expect: image.Point{1120, 1120},
		},
	}

	for _, tt := range cases {
		actual := tt.p.optimalTiledCanvas(tt.image)
		if diff := cmp.Diff(actual, tt.expect); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestSplitToTiles(t *testing.T) {
	cases := []struct {
		imageMax image.Point
		numTiles image.Point
		expect   []image.Image
	}{
		{
			imageMax: image.Point{1024, 768},
			numTiles: image.Point{1, 1},
			expect:   []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
		},
		{
			imageMax: image.Point{1000, 500},
			numTiles: image.Point{2, 1},
			expect: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
			},
		},
		{
			imageMax: image.Point{1000, 1000},
			numTiles: image.Point{2, 2},
			expect: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
			},
		},
	}

	var p ImageProcessor

	for _, tt := range cases {
		actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles)

		if len(actual) != len(tt.expect) {
			t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect))
		}

		for i := range actual {
			if actual[i].Bounds() != tt.expect[i].Bounds() {
				t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds())
			}
		}
	}
}

func TestResize(t *testing.T) {
	cases := []struct {
		p                 ImageProcessor
		imageMax          image.Point
		expectImage       image.Image
		expectAspectRatio image.Point
	}{
		{
			p:                 ImageProcessor{maxNumTiles: 1, imageSize: 100},
			imageMax:          image.Point{200, 200},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 2, imageSize: 100},
			imageMax:          image.Point{200, 200},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{10, 10},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{2560, 1920},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 1120, 840)),
			expectAspectRatio: image.Point{2, 2},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{1024, 768},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			expectAspectRatio: image.Point{2, 2},
		},
	}

	for _, tt := range cases {
		actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax})

		if actualImage.Bounds() != tt.expectImage.Bounds() {
			t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds())
		}

		if actualAspectRatio != tt.expectAspectRatio {
			t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio)
		}
	}
}

func TestPad(t *testing.T) {
	cases := []struct {
		p           ImageProcessor
		imageMax    image.Point
		aspectRatio image.Point
		expect      image.Image
	}{
		{
			p:           ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:    image.Point{1000, 667},
			aspectRatio: image.Point{2, 2},
			expect:      image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
		},
	}

	for _, tt := range cases {
		actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio)

		if actual.Bounds() != tt.expect.Bounds() {
			t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds())
		}
	}
}

func TestPackImages(t *testing.T) {
	cases := []struct {
		imageMax    image.Point
		aspectRatio image.Point
		expectVals  int
	}{
		{
			imageMax:    image.Point{1120, 1120},
			aspectRatio: image.Point{2, 2},
			expectVals:  2 * 2 * 3 * 560 * 560,
		},
		{
			imageMax:    image.Point{560, 560},
			aspectRatio: image.Point{1, 1},
			expectVals:  1 * 1 * 3 * 560 * 560,
		},
		{
			imageMax:    image.Point{1120, 560},
			aspectRatio: image.Point{1, 2},
			expectVals:  1 * 2 * 3 * 560 * 560,
		},
	}

	for _, tt := range cases {
		var p ImageProcessor
		actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio)
		if len(actualVals) != tt.expectVals {
			t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals)
		}
	}
}

func TestPreprocess(t *testing.T) {
	cases := []struct {
		imageMax            image.Point
		expectAspectRatioID int
	}{
		{
			imageMax:            image.Point{10, 10},
			expectAspectRatioID: 1,
		},
		{
			imageMax:            image.Point{1024, 768},
			expectAspectRatioID: 6,
		},
	}

	p := ImageProcessor{imageSize: 560, maxNumTiles: 4}
	for _, tt := range cases {
		img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax}))
		if err != nil {
			t.Fatalf("error processing: %q", err)
		}

		if len(img) == 0 {
			t.Errorf("no image data returned")
		}

		if aspectRatio.rank != tt.expectAspectRatioID {
			t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID)
		}
	}
}
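The new table-driven tests above can presumably be run on their own with the standard Go tooling from the repository root, for example:

go test ./model/models/mllama/ -run 'TestSupportedAspectRatios|TestPreprocess' -v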