image processing for llama3.2 (#6963)

Co-authored-by: jmorganca <jmorganca@gmail.com>
Co-authored-by: Michael Yang <mxyng@pm.me>
Co-authored-by: Jesse Gross <jesse@ollama.com>
This commit is contained in:
Patrick Devine
2024-10-18 16:12:35 -07:00
committed by GitHub
parent bf4018b9ec
commit c7cb0f0602
35 changed files with 3851 additions and 203 deletions

View File

@@ -51,8 +51,8 @@ func (llm *ggla) KV() KV {
return llm.kv
}
func (llm *ggla) Tensors() Tensors {
return Tensors{
func (llm *ggla) Tensors() *Tensors {
return &Tensors{
Items: llm.tensors,
Offset: llm.tensorOffset,
}

View File

@@ -5,7 +5,9 @@ import (
"errors"
"fmt"
"io"
"slices"
"strings"
"sync"
"github.com/ollama/ollama/util/bufioutil"
)
@@ -17,7 +19,7 @@ type GGML struct {
type model interface {
KV() KV
Tensors() Tensors
Tensors() *Tensors
}
type KV map[string]any
@@ -123,25 +125,34 @@ func (kv KV) ChatTemplate() string {
// Tensors groups a model's tensors with the offset reported alongside them,
// and lazily caches the per-layer grouping produced by Layers.
//
// The struct embeds a sync.Once, so it should be handled through a pointer
// and not copied once Layers has been called.
type Tensors struct {
// Items is the list of tensors to be grouped by Layers.
Items []*Tensor
// Offset is filled from the decoder's tensorOffset.
// NOTE(review): presumably the byte offset of the tensor data within the
// model file; confirm against the decoders.
Offset uint64
// layers is the cached result of Layers; it is built at most once, guarded
// by layersOnce.
layers map[string]Layer
layersOnce sync.Once
}
func (ts Tensors) Layers() map[string]Layer {
layers := make(map[string]Layer)
for _, t := range ts.Items {
parts := strings.Split(t.Name, ".")
if parts[0] == "blk" {
// join first and second part, e.g. blk.%d
parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
func (ts *Tensors) Layers() map[string]Layer {
ts.layersOnce.Do(func() {
ts.layers = make(map[string]Layer)
for _, t := range ts.Items {
parts := strings.Split(t.Name, ".")
if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
if len(parts) > index+2 {
// blk and mm should have a number after them, join it
parts = append(
[]string{strings.Join(parts[:index+2], ".")},
parts[index+2:]...)
}
}
if _, ok := ts.layers[parts[0]]; !ok {
ts.layers[parts[0]] = make(Layer)
}
ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
}
})
if _, ok := layers[parts[0]]; !ok {
layers[parts[0]] = make(Layer)
}
layers[parts[0]][strings.Join(parts[1:], ".")] = t
}
return layers
return ts.layers
}
type Layer map[string]*Tensor

View File

@@ -110,8 +110,8 @@ func (llm *gguf) KV() KV {
return llm.kv
}
func (llm *gguf) Tensors() Tensors {
return Tensors{
func (llm *gguf) Tensors() *Tensors {
return &Tensors{
Items: llm.tensors,
Offset: llm.tensorOffset,
}

View File

@@ -3,6 +3,7 @@ package llm
import (
"fmt"
"log/slog"
"os"
"strconv"
"strings"
@@ -63,6 +64,8 @@ type MemoryEstimate struct {
memoryLayerOutput uint64
graphFullOffload uint64
graphPartialOffload uint64
projectorWeights, projectorGraph uint64
}
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
@@ -78,7 +81,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
var graphOffload uint64
// Projectors loaded into GPU0 only
var projectorSize uint64
var projectorWeights uint64
var projectorGraph uint64
// Conditional output size on GPU 0
var memoryLayerOutput uint64
@@ -103,7 +107,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors {
projectorSize += projectorMemoryRequirements(projector)
weight, graph := projectorMemoryRequirements(projector)
projectorWeights += weight
projectorGraph += graph
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
@@ -149,7 +155,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
}
// Output layer handled at the end if we have space
gpuZeroOverhead := projectorSize
gpuZeroOverhead := projectorWeights + projectorGraph
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int
@@ -303,6 +309,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload,
projectorWeights: projectorWeights,
projectorGraph: projectorGraph,
}
if gpus[0].Library == "cpu" {
@@ -323,7 +331,19 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
func (m MemoryEstimate) log() {
overhead := envconfig.GpuOverhead()
slog.Info(
log := slog.With()
if m.projectorWeights > 0 {
log = log.With(
slog.Group(
"projector",
"weights", format.HumanBytes2(m.projectorWeights),
"graph", format.HumanBytes2(m.projectorGraph),
),
)
}
log.Info(
"offload to "+m.inferenceLibrary,
slog.Group(
"layers",
@@ -371,3 +391,52 @@ func (m MemoryEstimate) log() {
),
)
}
// projectorMemoryRequirements estimates the memory needed for the projector
// stored in filename.
//
// weights is the sum of the sizes of every layer found in the file. graphSize
// is an estimate, in bytes, of the vision compute graph and is only produced
// for the "mllama" architecture; every other architecture reports 0.
//
// The estimate is best effort: if the file cannot be opened or decoded, both
// results are 0. If the mllama metadata has no usable patch_size, the graph
// cannot be sized and graphSize is 0 while weights is still returned.
func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
	file, err := os.Open(filename)
	if err != nil {
		return 0, 0
	}
	defer file.Close()

	ggml, _, err := DecodeGGML(file, 0)
	if err != nil {
		return 0, 0
	}

	// Resolve the layer map once; it is needed both for the weight total and
	// for the class_embd lookup below.
	layers := ggml.Tensors().Layers()
	for _, layer := range layers {
		weights += layer.size()
	}

	switch arch := ggml.KV().Architecture(); arch {
	case "mllama":
		// kv reads a uint32 vision hyperparameter, yielding 0 when the key is
		// absent or stored with a different type.
		kv := func(n string) uint64 {
			if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
				return uint64(v)
			}

			return 0
		}

		patchSize := kv("patch_size")
		if patchSize == 0 {
			// Dividing by a missing/zero patch size would panic; without it
			// the graph cannot be estimated, so report the weights alone.
			return weights, 0
		}

		imageSize := kv("image_size")
		maxNumTiles := kv("max_num_tiles")
		embeddingLength := kv("embedding_length")
		headCount := kv("attention.head_count")

		patchesPerSide := imageSize / patchSize
		numPatches := patchesPerSide * patchesPerSide
		if _, ok := layers["v"]["class_embd"]; ok {
			// one extra position when a class embedding tensor is present
			numPatches++
		}

		// NOTE(review): the inner %8 is redundant, so a patch count that is
		// already a multiple of 8 still gains a full extra 8. Kept as-is so
		// the estimate never shrinks; confirm it matches the runner's padding.
		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

		// 4 bytes per element across the summed buffer element counts.
		graphSize = 4 * (8 +
			imageSize*imageSize*kv("num_channels")*maxNumTiles +
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
	}

	return weights, graphSize
}

View File

@@ -442,26 +442,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
return nil, finalErr
}
func projectorMemoryRequirements(filename string) uint64 {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, _, err := DecodeGGML(file, 0)
if err != nil {
return 0
}
var mem uint64
for _, layer := range ggml.Tensors().Layers() {
mem += layer.size()
}
return mem
}
type ServerStatus int
const ( // iota is reset to 0
@@ -673,8 +653,9 @@ ws ::= ([ \t\n] ws)?
const maxBufferSize = 512 * format.KiloByte
type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
Data []byte `json:"data"`
ID int `json:"id"`
AspectRatioID int `json:"aspect_ratio_id"`
}
type completion struct {