imageproc mllama refactor (#7537)

Refactor mllama image processing code, and add pixtral and qwen2vl
This commit is contained in:
Patrick Devine
2024-12-14 19:50:15 -08:00
committed by GitHub
parent b75ccfc5ec
commit 8c9fb8eb73
10 changed files with 828 additions and 125 deletions

201
model/mllama/imageproc.go Normal file
View File

@@ -0,0 +1,201 @@
package mllama
import (
"fmt"
"image"
_ "image/jpeg"
_ "image/png"
"io"
"math"
"slices"
"golang.org/x/image/draw"
"github.com/ollama/ollama/model/imageproc"
)
func getSupportedAspectRatios(maxTiles int) []image.Point {
ratios := []image.Point{}
for w := range maxTiles {
for h := range maxTiles {
if (w+1)*(h+1) <= maxTiles {
ratios = append(ratios, image.Point{w + 1, h + 1})
}
}
}
return ratios
}
func clip(a, a_min, a_max int) int {
if a < a_min {
return a_min
} else if a > a_max {
return a_max
}
return a
}
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
possibleCanvasSizes := []image.Point{}
for _, pta := range possibleTileArrangements {
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
}
scales := []float64{}
for _, pcs := range possibleCanvasSizes {
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
scaleWidth := float64(pcs.X) / float64(imageSize.X)
if scaleWidth > scaleHeight {
scales = append(scales, scaleHeight)
} else {
scales = append(scales, scaleWidth)
}
}
var minUpscale float64
var maxDownscale float64
var upscale bool
for _, s := range scales {
if s > 1.0 {
upscale = true
if minUpscale == 0 {
minUpscale = s
} else {
minUpscale = math.Min(minUpscale, s)
}
} else {
maxDownscale = math.Max(maxDownscale, s)
}
}
selectedScale := maxDownscale
if upscale {
selectedScale = minUpscale
}
var selectedCanvas image.Point
for n, pcs := range possibleCanvasSizes {
if scales[n] == selectedScale {
// choose the smallest possible canvas
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
selectedCanvas = pcs
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
selectedCanvas = pcs
}
}
}
return selectedCanvas
}
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
scaleWidth := float64(targetWidth) / float64(imageSize.X)
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
var w, h int
if scaleWidth < scaleHeight {
w = targetWidth
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
} else {
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
h = targetHeight
}
return image.Point{w, h}
}
func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
if format == "png" {
img = imageproc.Composite(img)
}
b := img.Bounds()
tileSize := outputSize.Y
canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
}
func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
paddedSize := image.Point{
X: outputSize.X * aspectRatio.X,
Y: outputSize.Y * aspectRatio.Y,
}
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
return dst
}
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
b := img.Bounds()
width := b.Max.X - b.Min.X
height := b.Max.Y - b.Min.Y
tileHeight := height / numTilesSize.Y
tileWidth := width / numTilesSize.X
images := []image.Image{}
for h := range numTilesSize.Y {
for w := range numTilesSize.X {
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
images = append(images, img.(interface {
SubImage(image.Rectangle) image.Image
}).SubImage(rect))
}
}
return images
}
func packImages(img image.Image, aspectRatio image.Point) []float32 {
subImages := splitToTiles(img, aspectRatio)
var pixelVals []float32
rescale := true
channelFirst := true
for _, subImg := range subImages {
vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
pixelVals = append(pixelVals, vals...)
}
return pixelVals
}
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
outputSize := image.Point{560, 560}
maxTiles := 4
img, format, err := image.Decode(imageData)
if err != nil {
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
}
newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
newImage = padImage(newImage, outputSize, aspectRatio)
data := packImages(newImage, aspectRatio)
aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1
opts := map[string]any{
"aspectRatioIndex": aspectRatioIndex,
}
return data, opts, nil
}

View File

@@ -0,0 +1,420 @@
package mllama
import (
"bytes"
"image"
"image/png"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestAspectRatios(t *testing.T) {
type aspectCase struct {
MaxTiles int
Expected []image.Point
}
cases := []aspectCase{
{
MaxTiles: 1,
Expected: []image.Point{{1, 1}},
},
{
MaxTiles: 2,
Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}},
},
{
MaxTiles: 3,
Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}},
},
{
MaxTiles: 4,
Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}},
},
}
for _, c := range cases {
actual := getSupportedAspectRatios(c.MaxTiles)
if diff := cmp.Diff(actual, c.Expected); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
func TestGetImageSizeFitToCanvas(t *testing.T) {
type imageSizeCase struct {
ImageRect image.Point
CanvasRect image.Point
TileSize int
Expected image.Point
}
cases := []imageSizeCase{
{
ImageRect: image.Point{400, 400},
CanvasRect: image.Point{640, 480},
TileSize: 200,
Expected: image.Point{400, 400},
},
{
ImageRect: image.Point{1024, 768},
CanvasRect: image.Point{640, 480},
TileSize: 200,
Expected: image.Point{640, 480},
},
{
ImageRect: image.Point{500, 500},
CanvasRect: image.Point{1000, 1000},
TileSize: 750,
Expected: image.Point{750, 750},
},
{
ImageRect: image.Point{500, 1000},
CanvasRect: image.Point{2000, 2000},
TileSize: 2000,
Expected: image.Point{1000, 2000},
},
{
ImageRect: image.Point{4000, 3000},
CanvasRect: image.Point{2000, 1000},
TileSize: 1000,
Expected: image.Point{1333, 1000},
},
{
ImageRect: image.Point{667, 1000},
CanvasRect: image.Point{1000, 1000},
TileSize: 560,
Expected: image.Point{667, 1000},
},
}
for _, c := range cases {
actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)
if actual != c.Expected {
t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
}
}
}
func TestGetOptimalTiledCanvas(t *testing.T) {
type tiledCanvasSizeCase struct {
ImageSize image.Point
MaxImageTiles int
TileSize int
Expected image.Point
}
cases := []tiledCanvasSizeCase{
{
ImageSize: image.Point{1024, 768},
MaxImageTiles: 4,
TileSize: 1000,
Expected: image.Point{2000, 1000},
},
{
ImageSize: image.Point{1024, 768},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 1120},
},
{
ImageSize: image.Point{800, 600},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 1120},
},
{
ImageSize: image.Point{640, 480},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 560},
},
{
ImageSize: image.Point{320, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 560},
},
{
ImageSize: image.Point{1320, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1680, 560},
},
{
ImageSize: image.Point{2000, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{2240, 560},
},
{
ImageSize: image.Point{10000, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{2240, 560},
},
{
ImageSize: image.Point{480, 640},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 1120},
},
{
ImageSize: image.Point{200, 320},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 560},
},
{
ImageSize: image.Point{200, 1320},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 1680},
},
{
ImageSize: image.Point{200, 2000},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 2240},
},
{
ImageSize: image.Point{200, 10000},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 2240},
},
{
ImageSize: image.Point{10000, 10000},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 1120},
},
}
for _, c := range cases {
actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)
if actual != c.Expected {
t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
}
}
}
func TestSplitToTiles(t *testing.T) {
type splitCase struct {
TestImage image.Image
NumTilesSize image.Point
Expected []image.Image
}
cases := []splitCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
NumTilesSize: image.Point{1, 1},
Expected: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 500)),
NumTilesSize: image.Point{2, 1},
Expected: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
NumTilesSize: image.Point{2, 2},
Expected: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
image.NewRGBA(image.Rect(0, 500, 500, 1000)),
image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
},
},
}
for _, c := range cases {
actual := splitToTiles(c.TestImage, c.NumTilesSize)
if len(actual) != len(c.Expected) {
t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
}
for i := range actual {
if actual[i].Bounds() != c.Expected[i].Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
}
}
}
}
func TestResize(t *testing.T) {
type resizeCase struct {
TestImage image.Image
OutputSize image.Point
MaxImageTiles int
ExpectedImage image.Image
ExpectedAspectRatio image.Point
}
cases := []resizeCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
OutputSize: image.Point{100, 100},
MaxImageTiles: 1,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
OutputSize: image.Point{100, 100},
MaxImageTiles: 2,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
OutputSize: image.Point{560, 560},
MaxImageTiles: 4,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
OutputSize: image.Point{560, 560},
MaxImageTiles: 4,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)),
ExpectedAspectRatio: image.Point{2, 2},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
OutputSize: image.Point{560, 560},
MaxImageTiles: 4,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
ExpectedAspectRatio: image.Point{2, 2},
},
}
for _, c := range cases {
actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
if actualImage.Bounds() != c.ExpectedImage.Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
}
if actualAspectRatio != c.ExpectedAspectRatio {
t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
}
}
}
func TestPad(t *testing.T) {
type padCase struct {
TestImage image.Image
OutputSize image.Point
AspectRatio image.Point
Expected image.Image
}
cases := []padCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)),
OutputSize: image.Point{560, 560},
AspectRatio: image.Point{2, 2},
Expected: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
},
}
for _, c := range cases {
actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)
if actual.Bounds() != c.Expected.Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
}
}
}
func TestPackImages(t *testing.T) {
type packCase struct {
TestImage image.Image
AspectRatio image.Point
ExpectedVals int
}
cases := []packCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
AspectRatio: image.Point{2, 2},
ExpectedVals: 2 * 2 * 3 * 560 * 560,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
AspectRatio: image.Point{1, 1},
ExpectedVals: 1 * 1 * 3 * 560 * 560,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)),
AspectRatio: image.Point{1, 2},
ExpectedVals: 1 * 2 * 3 * 560 * 560,
},
}
for _, c := range cases {
actualVals := packImages(c.TestImage, c.AspectRatio)
if len(actualVals) != c.ExpectedVals {
t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
}
}
}
func TestPreprocess(t *testing.T) {
type preprocessCase struct {
TestImage image.Image
ExpectedVals int
ExpectedAspectRatioID int
}
cases := []preprocessCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
ExpectedVals: 0,
ExpectedAspectRatioID: 1,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
ExpectedVals: 0,
ExpectedAspectRatioID: 6,
},
}
for _, c := range cases {
var buf bytes.Buffer
err := png.Encode(&buf, c.TestImage)
if err != nil {
t.Fatal(err)
}
imgData, opts, err := Preprocess(&buf)
if err != nil {
t.Fatalf("error processing: %q", err)
}
if len(imgData) == 0 {
t.Errorf("no image data returned")
}
ar, ok := opts["aspectRatioIndex"]
if !ok {
t.Fatalf("no aspect ratio found")
}
aspectRatioID := ar.(int)
if aspectRatioID != c.ExpectedAspectRatioID {
t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
}
}
}