mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-12 16:57:04 +00:00
chore: update mllama to use ollama engine (#10637)
This commit is contained in:
@@ -2,17 +2,31 @@ package mllama
|
||||
|
||||
import (
|
||||
"image"
|
||||
"image/color"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"golang.org/x/image/draw"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/model/imageproc"
|
||||
)
|
||||
|
||||
type supportedAspectRatio struct {
|
||||
rank, width, height int
|
||||
}
|
||||
|
||||
func (a supportedAspectRatio) Point() image.Point {
|
||||
return image.Point{a.width, a.height}
|
||||
}
|
||||
|
||||
func (a supportedAspectRatio) numTiles() int {
|
||||
return a.width * a.height
|
||||
}
|
||||
|
||||
type ImageProcessor struct {
|
||||
imageSize, numChannels, maxNumTiles int
|
||||
|
||||
mean, std [3]float32
|
||||
}
|
||||
|
||||
func newImageProcessor(c fs.Config) ImageProcessor {
|
||||
@@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor {
|
||||
imageSize: int(c.Uint("vision.image_size")),
|
||||
numChannels: int(c.Uint("vision.num_channels")),
|
||||
maxNumTiles: int(c.Uint("vision.max_num_tiles")),
|
||||
|
||||
mean: imageproc.ClipDefaultMean,
|
||||
std: imageproc.ClipDefaultSTD,
|
||||
}
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
|
||||
ratios := []image.Point{}
|
||||
|
||||
for w := range maxTiles {
|
||||
for h := range maxTiles {
|
||||
if (w+1)*(h+1) <= maxTiles {
|
||||
ratios = append(ratios, image.Point{w + 1, h + 1})
|
||||
}
|
||||
func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) {
|
||||
for w := 1; w <= p.maxNumTiles; w++ {
|
||||
for h := 1; h <= p.maxNumTiles/w; h++ {
|
||||
ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
|
||||
}
|
||||
}
|
||||
|
||||
return ratios
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) clip(a, a_min, a_max int) int {
|
||||
if a < a_min {
|
||||
return a_min
|
||||
} else if a > a_max {
|
||||
return a_max
|
||||
}
|
||||
func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
|
||||
tw := min(max(imageSize.X, p.imageSize), canvasSize.X)
|
||||
th := min(max(imageSize.Y, p.imageSize), canvasSize.Y)
|
||||
|
||||
return a
|
||||
}
|
||||
r := math.Min(
|
||||
float64(tw)/float64(imageSize.X),
|
||||
float64(th)/float64(imageSize.Y),
|
||||
)
|
||||
|
||||
func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
|
||||
targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
|
||||
targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
|
||||
|
||||
scaleWidth := float64(targetWidth) / float64(imageSize.X)
|
||||
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
|
||||
|
||||
var w, h int
|
||||
|
||||
if scaleWidth < scaleHeight {
|
||||
w = targetWidth
|
||||
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
|
||||
} else {
|
||||
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
|
||||
h = targetHeight
|
||||
}
|
||||
w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
|
||||
h := min(int(math.Floor(float64(imageSize.Y)*r)), th)
|
||||
|
||||
return image.Point{w, h}
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
|
||||
possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
|
||||
possibleCanvasSizes := []image.Point{}
|
||||
for _, pta := range possibleTileArrangements {
|
||||
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
|
||||
func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
|
||||
possibleTileArrangements := p.supportedAspectRatios()
|
||||
possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements))
|
||||
for i, pta := range possibleTileArrangements {
|
||||
possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize}
|
||||
}
|
||||
|
||||
scales := []float64{}
|
||||
|
||||
for _, pcs := range possibleCanvasSizes {
|
||||
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
|
||||
scaleWidth := float64(pcs.X) / float64(imageSize.X)
|
||||
|
||||
if scaleWidth > scaleHeight {
|
||||
scales = append(scales, scaleHeight)
|
||||
} else {
|
||||
scales = append(scales, scaleWidth)
|
||||
}
|
||||
scales := make([]float64, len(possibleCanvasSizes))
|
||||
for i, pcs := range possibleCanvasSizes {
|
||||
scales[i] = min(
|
||||
float64(pcs.Y)/float64(imageSize.Y),
|
||||
float64(pcs.X)/float64(imageSize.X),
|
||||
)
|
||||
}
|
||||
|
||||
var minUpscale float64
|
||||
@@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles
|
||||
return selectedCanvas
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
|
||||
func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
|
||||
b := img.Bounds()
|
||||
width := b.Max.X - b.Min.X
|
||||
height := b.Max.Y - b.Min.Y
|
||||
tileHeight := height / numTilesSize.Y
|
||||
tileWidth := width / numTilesSize.X
|
||||
|
||||
images := []image.Image{}
|
||||
images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X)
|
||||
|
||||
for h := range numTilesSize.Y {
|
||||
for w := range numTilesSize.X {
|
||||
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
|
||||
images = append(images, img.(interface {
|
||||
if subImg, ok := img.(interface {
|
||||
SubImage(image.Rectangle) image.Image
|
||||
}).SubImage(rect))
|
||||
}); ok {
|
||||
images = append(images, subImg.SubImage(rect))
|
||||
} else {
|
||||
// Handle the case where img does not implement SubImage
|
||||
// This is a fallback and may not be efficient
|
||||
newImg := image.NewRGBA(rect)
|
||||
draw.Draw(newImg, rect, img, rect.Min, draw.Src)
|
||||
images = append(images, newImg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return images
|
||||
}
|
||||
|
||||
// remove the "alpha" channel by drawing over a prefilled image
|
||||
//
|
||||
//nolint:unused
|
||||
func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
|
||||
dst := image.NewRGBA(img.Bounds())
|
||||
|
||||
white := color.RGBA{255, 255, 255, 255}
|
||||
draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
|
||||
draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
|
||||
func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
|
||||
b := img.Bounds()
|
||||
tileSize := outputSize.Y
|
||||
|
||||
canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
|
||||
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
|
||||
newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)
|
||||
canvasSize := p.optimalTiledCanvas(b.Max)
|
||||
aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}
|
||||
newSize := p.fitToCanvas(b.Max, canvasSize)
|
||||
|
||||
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
|
||||
|
||||
@@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag
|
||||
return dst, aspectRatio
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
|
||||
func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
|
||||
paddedSize := image.Point{
|
||||
X: outputSize.X * aspectRatio.X,
|
||||
Y: outputSize.Y * aspectRatio.Y,
|
||||
X: p.imageSize * aspectRatio.X,
|
||||
Y: p.imageSize * aspectRatio.Y,
|
||||
}
|
||||
|
||||
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
|
||||
@@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin
|
||||
return dst
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
|
||||
func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
|
||||
subImages := p.splitToTiles(img, aspectRatio)
|
||||
|
||||
var pixelVals []float32
|
||||
@@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
|
||||
gVal := float32(g>>8) / 255.0
|
||||
bVal := float32(b>>8) / 255.0
|
||||
|
||||
rVal = (rVal - mean[0]) / std[0]
|
||||
gVal = (gVal - mean[1]) / std[1]
|
||||
bVal = (bVal - mean[2]) / std[2]
|
||||
rVal = (rVal - p.mean[0]) / p.std[0]
|
||||
gVal = (gVal - p.mean[1]) / p.std[1]
|
||||
bVal = (bVal - p.mean[2]) / p.std[2]
|
||||
|
||||
rVals = append(rVals, rVal)
|
||||
gVals = append(gVals, gVal)
|
||||
@@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
|
||||
return pixelVals
|
||||
}
|
||||
|
||||
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
|
||||
outputSize := image.Point{p.imageSize, p.imageSize}
|
||||
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
|
||||
newImage, newImageRatio := p.resize(img)
|
||||
newImage = p.pad(newImage, newImageRatio)
|
||||
pixelValues := p.pack(newImage, newImageRatio)
|
||||
|
||||
// clip values
|
||||
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
|
||||
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
|
||||
supportedAspectRatios := p.supportedAspectRatios()
|
||||
aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool {
|
||||
return i.width == newImageRatio.X && i.height == newImageRatio.Y
|
||||
})
|
||||
|
||||
newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
|
||||
newImage = p.pad(newImage, outputSize, aspectRatio)
|
||||
|
||||
data := p.pack(newImage, aspectRatio, mean, std)
|
||||
aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
|
||||
return data, aspectRatioIndex, nil
|
||||
return pixelValues, supportedAspectRatios[aspectRatioID], nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user