Files
ollama37/model/mllama/process_image.go
Michael Yang 58245413f4 next ollama runner (#7913)
feat: add new Ollama engine using ggml through cgo

This change introduces a new way to run pretrained models. It introduces 3 high level interfaces and a bunch of smaller helper interfaces to facilitate this.

- `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, can implement the model's forward propagation in the `Forward` method. This method will be called to generate completions. This interface can be found in `model/model.go`
- `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc) and providing an interface for Models to access loaded tensors. This interface can be found in `ml/backend.go`
- `ml.Tensor` defines the interface for a tensor and tensor operations

This is the first implementation of the new engine. Follow up PRs will implement more features:

- non-greedy sampling (#8410)
- integration with Ollama and KV caching (#8301)
- more model support (#9080) with more coming soon

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2025-02-13 16:31:21 -08:00

241 lines
6.3 KiB
Go

package mllama
import (
"image"
"image/color"
"math"
"slices"
"golang.org/x/image/draw"
"github.com/ollama/ollama/ml"
)
type ImageProcessor struct {
imageSize, numChannels, maxNumTiles int
}
func newImageProcessor(c ml.Config) ImageProcessor {
return ImageProcessor{
imageSize: int(c.Uint("vision.image_size")),
numChannels: int(c.Uint("vision.num_channels")),
maxNumTiles: int(c.Uint("vision.max_num_tiles")),
}
}
func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
ratios := []image.Point{}
for w := range maxTiles {
for h := range maxTiles {
if (w+1)*(h+1) <= maxTiles {
ratios = append(ratios, image.Point{w + 1, h + 1})
}
}
}
return ratios
}
func (p *ImageProcessor) clip(a, a_min, a_max int) int {
if a < a_min {
return a_min
} else if a > a_max {
return a_max
}
return a
}
func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
scaleWidth := float64(targetWidth) / float64(imageSize.X)
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
var w, h int
if scaleWidth < scaleHeight {
w = targetWidth
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
} else {
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
h = targetHeight
}
return image.Point{w, h}
}
func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
possibleCanvasSizes := []image.Point{}
for _, pta := range possibleTileArrangements {
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
}
scales := []float64{}
for _, pcs := range possibleCanvasSizes {
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
scaleWidth := float64(pcs.X) / float64(imageSize.X)
if scaleWidth > scaleHeight {
scales = append(scales, scaleHeight)
} else {
scales = append(scales, scaleWidth)
}
}
var minUpscale float64
var maxDownscale float64
var upscale bool
for _, s := range scales {
if s > 1.0 {
upscale = true
if minUpscale == 0 {
minUpscale = s
} else {
minUpscale = math.Min(minUpscale, s)
}
} else {
maxDownscale = math.Max(maxDownscale, s)
}
}
selectedScale := maxDownscale
if upscale {
selectedScale = minUpscale
}
var selectedCanvas image.Point
for n, pcs := range possibleCanvasSizes {
if scales[n] == selectedScale {
// choose the smallest possible canvas
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
selectedCanvas = pcs
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
selectedCanvas = pcs
}
}
}
return selectedCanvas
}
func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
b := img.Bounds()
width := b.Max.X - b.Min.X
height := b.Max.Y - b.Min.Y
tileHeight := height / numTilesSize.Y
tileWidth := width / numTilesSize.X
images := []image.Image{}
for h := range numTilesSize.Y {
for w := range numTilesSize.X {
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
images = append(images, img.(interface {
SubImage(image.Rectangle) image.Image
}).SubImage(rect))
}
}
return images
}
// remove the "alpha" channel by drawing over a prefilled image
//
// remove the "alpha" channel by drawing over a prefilled image
//
//nolint:unused
func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
dst := image.NewRGBA(img.Bounds())
white := color.RGBA{255, 255, 255, 255}
draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
return dst
}
func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
b := img.Bounds()
tileSize := outputSize.Y
canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
// scaling choices:
// NearestNeighbor fast, blocky output
// ApproxBiLinear fast, medium quality
// BiLinear slow, high quality
// CatmullRom very slow, very high quality
draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
return dst, aspectRatio
}
func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
paddedSize := image.Point{
X: outputSize.X * aspectRatio.X,
Y: outputSize.Y * aspectRatio.Y,
}
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
return dst
}
func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
subImages := p.splitToTiles(img, aspectRatio)
var pixelVals []float32
for _, subImg := range subImages {
bounds := subImg.Bounds()
var rVals, gVals, bVals []float32
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
for x := bounds.Min.X; x < bounds.Max.X; x++ {
c := subImg.At(x, y)
r, g, b, _ := c.RGBA()
rVal := float32(r>>8) / 255.0
gVal := float32(g>>8) / 255.0
bVal := float32(b>>8) / 255.0
rVal = (rVal - mean[0]) / std[0]
gVal = (gVal - mean[1]) / std[1]
bVal = (bVal - mean[2]) / std[2]
rVals = append(rVals, rVal)
gVals = append(gVals, gVal)
bVals = append(bVals, bVal)
}
}
pixelVals = append(pixelVals, rVals...)
pixelVals = append(pixelVals, gVals...)
pixelVals = append(pixelVals, bVals...)
}
return pixelVals
}
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
outputSize := image.Point{p.imageSize, p.imageSize}
// clip values
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
newImage = p.pad(newImage, outputSize, aspectRatio)
data := p.pack(newImage, aspectRatio, mean, std)
aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
return data, aspectRatioIndex, nil
}