mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
imageproc mllama refactor (#7537)
Refactor mllama image processing code, and add pixtral and qwen2vl
This commit is contained in:
@@ -1,240 +0,0 @@
|
||||
package imageproc
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"image"
|
||||
"image/color"
|
||||
_ "image/jpeg"
|
||||
_ "image/png"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"golang.org/x/image/draw"
|
||||
)
|
||||
|
||||
// GetSupportedAspectRatios enumerates every tile grid whose total tile count
// does not exceed maxTiles. Each returned point stores the grid width in X and
// the grid height in Y, both starting at 1. Grids are ordered by ascending
// width and, for equal widths, by ascending height. When maxTiles is below 1
// the result is an empty, non-nil slice.
func GetSupportedAspectRatios(maxTiles int) []image.Point {
	grids := []image.Point{}

	for cols := 1; cols <= maxTiles; cols++ {
		for rows := 1; rows <= maxTiles; rows++ {
			// Skip grids that would need more tiles than allowed.
			if cols*rows > maxTiles {
				continue
			}
			grids = append(grids, image.Point{X: cols, Y: rows})
		}
	}

	return grids
}
|
||||
|
||||
// clip limits a to the interval [a_min, a_max]. The lower bound is tested
// first, so if the bounds are inverted any a below a_min still yields a_min.
func clip(a, a_min, a_max int) int {
	switch {
	case a < a_min:
		return a_min
	case a > a_max:
		return a_max
	default:
		return a
	}
}
|
||||
|
||||
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
|
||||
targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
|
||||
targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
|
||||
|
||||
scaleWidth := float64(targetWidth) / float64(imageSize.X)
|
||||
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
|
||||
|
||||
var w, h int
|
||||
|
||||
if scaleWidth < scaleHeight {
|
||||
w = targetWidth
|
||||
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
|
||||
} else {
|
||||
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
|
||||
h = targetHeight
|
||||
}
|
||||
|
||||
return image.Point{w, h}
|
||||
}
|
||||
|
||||
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
|
||||
possibleTileArrangements := GetSupportedAspectRatios(maxImageTiles)
|
||||
possibleCanvasSizes := []image.Point{}
|
||||
for _, pta := range possibleTileArrangements {
|
||||
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
|
||||
}
|
||||
|
||||
scales := []float64{}
|
||||
|
||||
for _, pcs := range possibleCanvasSizes {
|
||||
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
|
||||
scaleWidth := float64(pcs.X) / float64(imageSize.X)
|
||||
|
||||
if scaleWidth > scaleHeight {
|
||||
scales = append(scales, scaleHeight)
|
||||
} else {
|
||||
scales = append(scales, scaleWidth)
|
||||
}
|
||||
}
|
||||
|
||||
var minUpscale float64
|
||||
var maxDownscale float64
|
||||
var upscale bool
|
||||
|
||||
for _, s := range scales {
|
||||
if s > 1.0 {
|
||||
upscale = true
|
||||
if minUpscale == 0 {
|
||||
minUpscale = s
|
||||
} else {
|
||||
minUpscale = math.Min(minUpscale, s)
|
||||
}
|
||||
} else {
|
||||
maxDownscale = math.Max(maxDownscale, s)
|
||||
}
|
||||
}
|
||||
|
||||
selectedScale := maxDownscale
|
||||
if upscale {
|
||||
selectedScale = minUpscale
|
||||
}
|
||||
|
||||
var selectedCanvas image.Point
|
||||
for n, pcs := range possibleCanvasSizes {
|
||||
if scales[n] == selectedScale {
|
||||
// choose the smallest possible canvas
|
||||
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
|
||||
selectedCanvas = pcs
|
||||
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
|
||||
selectedCanvas = pcs
|
||||
}
|
||||
}
|
||||
}
|
||||
return selectedCanvas
|
||||
}
|
||||
|
||||
// splitToTiles cuts img into a grid of numTilesSize.X columns by
// numTilesSize.Y rows and returns the tiles in row-major order (left to right,
// then top to bottom).
//
// Tile rectangles are expressed in img's own coordinate space, so images whose
// bounds do not start at (0, 0) are split correctly. Each tile measures
// width/X by height/Y using integer division; remainder pixels on the right
// and bottom edges are not covered by any tile.
//
// Tiles are views obtained through the image's SubImage method when it has
// one. Images without SubImage are first copied into an RGBA image with the
// same bounds. A non-positive tile count in either direction returns nil.
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
	if numTilesSize.X <= 0 || numTilesSize.Y <= 0 {
		return nil
	}

	type subImager interface {
		SubImage(image.Rectangle) image.Image
	}

	b := img.Bounds()

	src, ok := img.(subImager)
	if !ok {
		// Fall back to a copy that supports SubImage instead of panicking.
		rgba := image.NewRGBA(b)
		draw.Draw(rgba, b, img, b.Min, draw.Src)
		src = rgba
	}

	tileWidth := b.Dx() / numTilesSize.X
	tileHeight := b.Dy() / numTilesSize.Y

	images := make([]image.Image, 0, numTilesSize.X*numTilesSize.Y)

	for h := 0; h < numTilesSize.Y; h++ {
		for w := 0; w < numTilesSize.X; w++ {
			// Offset by b.Min so the rectangle lies inside the image bounds.
			rect := image.Rect(
				b.Min.X+tileWidth*w,
				b.Min.Y+tileHeight*h,
				b.Min.X+tileWidth*(w+1),
				b.Min.Y+tileHeight*(h+1),
			)
			images = append(images, src.SubImage(rect))
		}
	}

	return images
}
|
||||
|
||||
// compositeImage flattens img onto an opaque white background, which removes
// any transparency. The result is a new RGBA image with the same bounds as
// img; img itself is not modified.
func compositeImage(img image.Image) image.Image {
	bounds := img.Bounds()
	canvas := image.NewRGBA(bounds)

	// Fill the canvas with solid white, then blend the source over it.
	background := image.NewUniform(color.RGBA{R: 255, G: 255, B: 255, A: 255})
	draw.Draw(canvas, bounds, background, image.Point{}, draw.Src)
	draw.Draw(canvas, bounds, img, bounds.Min, draw.Over)

	return canvas
}
|
||||
|
||||
func ResizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
|
||||
if format == "png" {
|
||||
img = compositeImage(img)
|
||||
}
|
||||
|
||||
b := img.Bounds()
|
||||
tileSize := outputSize.Y
|
||||
|
||||
canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
|
||||
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
|
||||
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
|
||||
|
||||
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
|
||||
|
||||
// scaling choices:
|
||||
// NearestNeighbor fast, blocky output
|
||||
// ApproxBiLinear fast, medium quality
|
||||
// BiLinear slow, high quality
|
||||
// CatmullRom very slow, very high quality
|
||||
draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
|
||||
|
||||
return dst, aspectRatio
|
||||
}
|
||||
|
||||
// PadImage places img in the top-left corner of a new RGBA image sized
// outputSize.X*aspectRatio.X by outputSize.Y*aspectRatio.Y. The area not
// covered by img keeps the zero value of a fresh RGBA image (transparent
// black). Parts of img that exceed the padded size are cropped.
//
// The source is read starting at img.Bounds().Min, so images whose bounds do
// not begin at (0, 0) are still copied in full to the destination origin.
func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
	}

	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))

	b := img.Bounds()
	draw.Draw(dst, image.Rectangle{Max: b.Size()}, img, b.Min, draw.Over)

	return dst
}
|
||||
|
||||
func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
|
||||
subImages := splitToTiles(img, aspectRatio)
|
||||
|
||||
var pixelVals []float32
|
||||
|
||||
for _, subImg := range subImages {
|
||||
bounds := subImg.Bounds()
|
||||
var rVals, gVals, bVals []float32
|
||||
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
|
||||
for x := bounds.Min.X; x < bounds.Max.X; x++ {
|
||||
c := subImg.At(x, y)
|
||||
r, g, b, _ := c.RGBA()
|
||||
rVal := float32(r>>8) / 255.0
|
||||
gVal := float32(g>>8) / 255.0
|
||||
bVal := float32(b>>8) / 255.0
|
||||
|
||||
rVal = (rVal - mean[0]) / std[0]
|
||||
gVal = (gVal - mean[1]) / std[1]
|
||||
bVal = (bVal - mean[2]) / std[2]
|
||||
|
||||
rVals = append(rVals, rVal)
|
||||
gVals = append(gVals, gVal)
|
||||
bVals = append(bVals, bVal)
|
||||
}
|
||||
}
|
||||
pixelVals = append(pixelVals, rVals...)
|
||||
pixelVals = append(pixelVals, gVals...)
|
||||
pixelVals = append(pixelVals, bVals...)
|
||||
}
|
||||
|
||||
return pixelVals
|
||||
}
|
||||
|
||||
func Preprocess(imageData []byte) ([]float32, int, error) {
|
||||
// todo: need guard in here for bad image data
|
||||
|
||||
// mllama values
|
||||
outputSize := image.Point{560, 560}
|
||||
maxTiles := 4
|
||||
|
||||
// clip values
|
||||
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
|
||||
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
|
||||
|
||||
img, format, err := image.Decode(bytes.NewReader(imageData))
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to decode image: %w", err)
|
||||
}
|
||||
|
||||
newImage, aspectRatio := ResizeImage(img, format, outputSize, maxTiles)
|
||||
newImage = PadImage(newImage, outputSize, aspectRatio)
|
||||
|
||||
data := PackImages(newImage, aspectRatio, mean, std)
|
||||
aspectRatioIndex := slices.Index(GetSupportedAspectRatios(maxTiles), aspectRatio) + 1
|
||||
|
||||
return data, aspectRatioIndex, nil
|
||||
}
|
||||
@@ -1,416 +0,0 @@
|
||||
package imageproc
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"image"
|
||||
"image/png"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func TestAspectRatios(t *testing.T) {
|
||||
type aspectCase struct {
|
||||
MaxTiles int
|
||||
Expected []image.Point
|
||||
}
|
||||
|
||||
cases := []aspectCase{
|
||||
{
|
||||
MaxTiles: 1,
|
||||
Expected: []image.Point{{1, 1}},
|
||||
},
|
||||
{
|
||||
MaxTiles: 2,
|
||||
Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}},
|
||||
},
|
||||
{
|
||||
MaxTiles: 3,
|
||||
Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}},
|
||||
},
|
||||
{
|
||||
MaxTiles: 4,
|
||||
Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := GetSupportedAspectRatios(c.MaxTiles)
|
||||
|
||||
if diff := cmp.Diff(actual, c.Expected); diff != "" {
|
||||
t.Errorf("mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetImageSizeFitToCanvas(t *testing.T) {
|
||||
type imageSizeCase struct {
|
||||
ImageRect image.Point
|
||||
CanvasRect image.Point
|
||||
TileSize int
|
||||
Expected image.Point
|
||||
}
|
||||
|
||||
cases := []imageSizeCase{
|
||||
{
|
||||
ImageRect: image.Point{400, 400},
|
||||
CanvasRect: image.Point{640, 480},
|
||||
TileSize: 200,
|
||||
Expected: image.Point{400, 400},
|
||||
},
|
||||
{
|
||||
ImageRect: image.Point{1024, 768},
|
||||
CanvasRect: image.Point{640, 480},
|
||||
TileSize: 200,
|
||||
Expected: image.Point{640, 480},
|
||||
},
|
||||
{
|
||||
ImageRect: image.Point{500, 500},
|
||||
CanvasRect: image.Point{1000, 1000},
|
||||
TileSize: 750,
|
||||
Expected: image.Point{750, 750},
|
||||
},
|
||||
{
|
||||
ImageRect: image.Point{500, 1000},
|
||||
CanvasRect: image.Point{2000, 2000},
|
||||
TileSize: 2000,
|
||||
Expected: image.Point{1000, 2000},
|
||||
},
|
||||
{
|
||||
ImageRect: image.Point{4000, 3000},
|
||||
CanvasRect: image.Point{2000, 1000},
|
||||
TileSize: 1000,
|
||||
Expected: image.Point{1333, 1000},
|
||||
},
|
||||
{
|
||||
ImageRect: image.Point{667, 1000},
|
||||
CanvasRect: image.Point{1000, 1000},
|
||||
TileSize: 560,
|
||||
Expected: image.Point{667, 1000},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)
|
||||
|
||||
if actual != c.Expected {
|
||||
t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetOptimalTiledCanvas(t *testing.T) {
|
||||
type tiledCanvasSizeCase struct {
|
||||
ImageSize image.Point
|
||||
MaxImageTiles int
|
||||
TileSize int
|
||||
Expected image.Point
|
||||
}
|
||||
|
||||
cases := []tiledCanvasSizeCase{
|
||||
{
|
||||
ImageSize: image.Point{1024, 768},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 1000,
|
||||
Expected: image.Point{2000, 1000},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{1024, 768},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{1120, 1120},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{800, 600},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{1120, 1120},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{640, 480},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{1120, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{320, 200},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{1320, 200},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{1680, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{2000, 200},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{2240, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{10000, 200},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{2240, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{480, 640},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 1120},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{200, 320},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 560},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{200, 1320},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 1680},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{200, 2000},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 2240},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{200, 10000},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{560, 2240},
|
||||
},
|
||||
{
|
||||
ImageSize: image.Point{10000, 10000},
|
||||
MaxImageTiles: 4,
|
||||
TileSize: 560,
|
||||
Expected: image.Point{1120, 1120},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)
|
||||
|
||||
if actual != c.Expected {
|
||||
t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitToTiles(t *testing.T) {
|
||||
type splitCase struct {
|
||||
TestImage image.Image
|
||||
NumTilesSize image.Point
|
||||
Expected []image.Image
|
||||
}
|
||||
|
||||
cases := []splitCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
|
||||
NumTilesSize: image.Point{1, 1},
|
||||
Expected: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 500)),
|
||||
NumTilesSize: image.Point{2, 1},
|
||||
Expected: []image.Image{
|
||||
image.NewRGBA(image.Rect(0, 0, 500, 500)),
|
||||
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
|
||||
},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
|
||||
NumTilesSize: image.Point{2, 2},
|
||||
Expected: []image.Image{
|
||||
image.NewRGBA(image.Rect(0, 0, 500, 500)),
|
||||
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
|
||||
image.NewRGBA(image.Rect(0, 500, 500, 1000)),
|
||||
image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := splitToTiles(c.TestImage, c.NumTilesSize)
|
||||
|
||||
if len(actual) != len(c.Expected) {
|
||||
t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
|
||||
}
|
||||
|
||||
for i := range actual {
|
||||
if actual[i].Bounds() != c.Expected[i].Bounds() {
|
||||
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResize(t *testing.T) {
|
||||
type resizeCase struct {
|
||||
TestImage image.Image
|
||||
OutputSize image.Point
|
||||
MaxImageTiles int
|
||||
ExpectedImage image.Image
|
||||
ExpectedAspectRatio image.Point
|
||||
}
|
||||
|
||||
cases := []resizeCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
|
||||
OutputSize: image.Point{100, 100},
|
||||
MaxImageTiles: 1,
|
||||
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
|
||||
ExpectedAspectRatio: image.Point{1, 1},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
|
||||
OutputSize: image.Point{100, 100},
|
||||
MaxImageTiles: 2,
|
||||
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
|
||||
ExpectedAspectRatio: image.Point{1, 1},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
|
||||
OutputSize: image.Point{560, 560},
|
||||
MaxImageTiles: 4,
|
||||
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
|
||||
ExpectedAspectRatio: image.Point{1, 1},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
|
||||
OutputSize: image.Point{560, 560},
|
||||
MaxImageTiles: 4,
|
||||
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)),
|
||||
ExpectedAspectRatio: image.Point{2, 2},
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
|
||||
OutputSize: image.Point{560, 560},
|
||||
MaxImageTiles: 4,
|
||||
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
|
||||
ExpectedAspectRatio: image.Point{2, 2},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actualImage, actualAspectRatio := ResizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
|
||||
|
||||
if actualImage.Bounds() != c.ExpectedImage.Bounds() {
|
||||
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
|
||||
}
|
||||
|
||||
if actualAspectRatio != c.ExpectedAspectRatio {
|
||||
t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPad(t *testing.T) {
|
||||
type padCase struct {
|
||||
TestImage image.Image
|
||||
OutputSize image.Point
|
||||
AspectRatio image.Point
|
||||
Expected image.Image
|
||||
}
|
||||
|
||||
cases := []padCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)),
|
||||
OutputSize: image.Point{560, 560},
|
||||
AspectRatio: image.Point{2, 2},
|
||||
Expected: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := PadImage(c.TestImage, c.OutputSize, c.AspectRatio)
|
||||
|
||||
if actual.Bounds() != c.Expected.Bounds() {
|
||||
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPackImages(t *testing.T) {
|
||||
type packCase struct {
|
||||
TestImage image.Image
|
||||
AspectRatio image.Point
|
||||
ExpectedVals int
|
||||
}
|
||||
|
||||
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
|
||||
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
|
||||
|
||||
cases := []packCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
|
||||
AspectRatio: image.Point{2, 2},
|
||||
ExpectedVals: 2 * 2 * 3 * 560 * 560,
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
|
||||
AspectRatio: image.Point{1, 1},
|
||||
ExpectedVals: 1 * 1 * 3 * 560 * 560,
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)),
|
||||
AspectRatio: image.Point{1, 2},
|
||||
ExpectedVals: 1 * 2 * 3 * 560 * 560,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actualVals := PackImages(c.TestImage, c.AspectRatio, mean, std)
|
||||
if len(actualVals) != c.ExpectedVals {
|
||||
t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPreprocess(t *testing.T) {
|
||||
type preprocessCase struct {
|
||||
TestImage image.Image
|
||||
ExpectedVals int
|
||||
ExpectedAspectRatioID int
|
||||
}
|
||||
|
||||
cases := []preprocessCase{
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
|
||||
ExpectedVals: 0,
|
||||
ExpectedAspectRatioID: 1,
|
||||
},
|
||||
{
|
||||
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
|
||||
ExpectedVals: 0,
|
||||
ExpectedAspectRatioID: 6,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
var buf bytes.Buffer
|
||||
err := png.Encode(&buf, c.TestImage)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
imgData, aspectRatioID, err := Preprocess(buf.Bytes())
|
||||
if err != nil {
|
||||
t.Fatalf("error processing: %q", err)
|
||||
}
|
||||
|
||||
if len(imgData) == 0 {
|
||||
t.Errorf("no image data returned")
|
||||
}
|
||||
|
||||
if aspectRatioID != c.ExpectedAspectRatioID {
|
||||
t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/server/imageproc"
|
||||
"github.com/ollama/ollama/model/mllama"
|
||||
"github.com/ollama/ollama/template"
|
||||
)
|
||||
|
||||
@@ -92,7 +92,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
|
||||
var imgData llm.ImageData
|
||||
|
||||
if isMllama {
|
||||
data, aspectRatioID, err := imageproc.Preprocess(i)
|
||||
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
@@ -103,10 +103,15 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
ar, ok := opts["aspectRatioIndex"].(int)
|
||||
if !ok {
|
||||
return "", nil, fmt.Errorf("missing aspect ratio for image")
|
||||
}
|
||||
|
||||
imgData = llm.ImageData{
|
||||
ID: len(images),
|
||||
Data: buf.Bytes(),
|
||||
AspectRatioID: aspectRatioID,
|
||||
AspectRatioID: ar,
|
||||
}
|
||||
imgPrompt = "<|image|>"
|
||||
} else {
|
||||
|
||||
@@ -31,10 +31,10 @@ import (
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/model/mllama"
|
||||
"github.com/ollama/ollama/openai"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/runners"
|
||||
"github.com/ollama/ollama/server/imageproc"
|
||||
"github.com/ollama/ollama/template"
|
||||
"github.com/ollama/ollama/types/errtypes"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
@@ -205,12 +205,18 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
images := make([]llm.ImageData, len(req.Images))
|
||||
for i := range req.Images {
|
||||
if isMllama {
|
||||
data, aspectRatioID, err := imageproc.Preprocess(req.Images[i])
|
||||
data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
|
||||
if err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
|
||||
return
|
||||
}
|
||||
|
||||
ar, ok := opts["aspectRatioIndex"].(int)
|
||||
if !ok {
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
|
||||
return
|
||||
}
|
||||
|
||||
buf := new(bytes.Buffer)
|
||||
err = binary.Write(buf, binary.LittleEndian, data)
|
||||
if err != nil {
|
||||
@@ -218,7 +224,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID}
|
||||
images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
|
||||
} else {
|
||||
images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user