mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 07:46:59 +00:00
image processing for llama3.2 (#6963)
Co-authored-by: jmorganca <jmorganca@gmail.com> Co-authored-by: Michael Yang <mxyng@pm.me> Co-authored-by: Jesse Gross <jesse@ollama.com>
This commit is contained in:
@@ -51,8 +51,8 @@ func (llm *ggla) KV() KV {
|
||||
return llm.kv
|
||||
}
|
||||
|
||||
func (llm *ggla) Tensors() Tensors {
|
||||
return Tensors{
|
||||
func (llm *ggla) Tensors() *Tensors {
|
||||
return &Tensors{
|
||||
Items: llm.tensors,
|
||||
Offset: llm.tensorOffset,
|
||||
}
|
||||
|
||||
43
llm/ggml.go
43
llm/ggml.go
@@ -5,7 +5,9 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/ollama/ollama/util/bufioutil"
|
||||
)
|
||||
@@ -17,7 +19,7 @@ type GGML struct {
|
||||
|
||||
type model interface {
|
||||
KV() KV
|
||||
Tensors() Tensors
|
||||
Tensors() *Tensors
|
||||
}
|
||||
|
||||
// KV is a string-keyed map of arbitrarily typed values, as returned by a
// model's KV method.
type KV map[string]any
|
||||
@@ -123,25 +125,34 @@ func (kv KV) ChatTemplate() string {
|
||||
type Tensors struct {
|
||||
Items []*Tensor
|
||||
Offset uint64
|
||||
|
||||
layers map[string]Layer
|
||||
layersOnce sync.Once
|
||||
}
|
||||
|
||||
func (ts Tensors) Layers() map[string]Layer {
|
||||
layers := make(map[string]Layer)
|
||||
for _, t := range ts.Items {
|
||||
parts := strings.Split(t.Name, ".")
|
||||
if parts[0] == "blk" {
|
||||
// join first and second part, e.g. blk.%d
|
||||
parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
|
||||
func (ts *Tensors) Layers() map[string]Layer {
|
||||
ts.layersOnce.Do(func() {
|
||||
ts.layers = make(map[string]Layer)
|
||||
for _, t := range ts.Items {
|
||||
parts := strings.Split(t.Name, ".")
|
||||
if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
|
||||
if len(parts) > index+2 {
|
||||
// blk and mm should have a number after them, join it
|
||||
parts = append(
|
||||
[]string{strings.Join(parts[:index+2], ".")},
|
||||
parts[index+2:]...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := ts.layers[parts[0]]; !ok {
|
||||
ts.layers[parts[0]] = make(Layer)
|
||||
}
|
||||
|
||||
ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
|
||||
}
|
||||
})
|
||||
|
||||
if _, ok := layers[parts[0]]; !ok {
|
||||
layers[parts[0]] = make(Layer)
|
||||
}
|
||||
|
||||
layers[parts[0]][strings.Join(parts[1:], ".")] = t
|
||||
}
|
||||
|
||||
return layers
|
||||
return ts.layers
|
||||
}
|
||||
|
||||
// Layer maps a tensor's name within a layer to the tensor itself.
// Tensors.Layers builds one Layer per layer prefix, keyed by the remaining
// dot-joined parts of each tensor's name.
type Layer map[string]*Tensor
|
||||
|
||||
@@ -110,8 +110,8 @@ func (llm *gguf) KV() KV {
|
||||
return llm.kv
|
||||
}
|
||||
|
||||
func (llm *gguf) Tensors() Tensors {
|
||||
return Tensors{
|
||||
func (llm *gguf) Tensors() *Tensors {
|
||||
return &Tensors{
|
||||
Items: llm.tensors,
|
||||
Offset: llm.tensorOffset,
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package llm
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@@ -63,6 +64,8 @@ type MemoryEstimate struct {
|
||||
memoryLayerOutput uint64
|
||||
graphFullOffload uint64
|
||||
graphPartialOffload uint64
|
||||
|
||||
projectorWeights, projectorGraph uint64
|
||||
}
|
||||
|
||||
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
|
||||
@@ -78,7 +81,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
var graphOffload uint64
|
||||
|
||||
// Projectors loaded into GPU0 only
|
||||
var projectorSize uint64
|
||||
var projectorWeights uint64
|
||||
var projectorGraph uint64
|
||||
|
||||
// Conditional output size on GPU 0
|
||||
var memoryLayerOutput uint64
|
||||
@@ -103,7 +107,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
|
||||
|
||||
for _, projector := range projectors {
|
||||
projectorSize += projectorMemoryRequirements(projector)
|
||||
weight, graph := projectorMemoryRequirements(projector)
|
||||
projectorWeights += weight
|
||||
projectorGraph += graph
|
||||
|
||||
// multimodal models require at least 2048 context
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
@@ -149,7 +155,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
}
|
||||
|
||||
// Output layer handled at the end if we have space
|
||||
gpuZeroOverhead := projectorSize
|
||||
gpuZeroOverhead := projectorWeights + projectorGraph
|
||||
|
||||
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
||||
var layerCount int
|
||||
@@ -303,6 +309,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
memoryLayerOutput: memoryLayerOutput,
|
||||
graphFullOffload: graphFullOffload,
|
||||
graphPartialOffload: graphPartialOffload,
|
||||
projectorWeights: projectorWeights,
|
||||
projectorGraph: projectorGraph,
|
||||
}
|
||||
|
||||
if gpus[0].Library == "cpu" {
|
||||
@@ -323,7 +331,19 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
|
||||
func (m MemoryEstimate) log() {
|
||||
overhead := envconfig.GpuOverhead()
|
||||
slog.Info(
|
||||
|
||||
log := slog.With()
|
||||
if m.projectorWeights > 0 {
|
||||
log = log.With(
|
||||
slog.Group(
|
||||
"projector",
|
||||
"weights", format.HumanBytes2(m.projectorWeights),
|
||||
"graph", format.HumanBytes2(m.projectorGraph),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
log.Info(
|
||||
"offload to "+m.inferenceLibrary,
|
||||
slog.Group(
|
||||
"layers",
|
||||
@@ -371,3 +391,52 @@ func (m MemoryEstimate) log() {
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return 0, 0
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
ggml, _, err := DecodeGGML(file, 0)
|
||||
if err != nil {
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
for _, layer := range ggml.Tensors().Layers() {
|
||||
weights += layer.size()
|
||||
}
|
||||
|
||||
switch arch := ggml.KV().Architecture(); arch {
|
||||
case "mllama":
|
||||
kv := func(n string) uint64 {
|
||||
if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
|
||||
return uint64(v)
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
imageSize := kv("image_size")
|
||||
|
||||
maxNumTiles := kv("max_num_tiles")
|
||||
embeddingLength := kv("embedding_length")
|
||||
headCount := kv("attention.head_count")
|
||||
|
||||
numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
|
||||
if _, ok := ggml.Tensors().Layers()["v"]["class_embd"]; ok {
|
||||
numPatches++
|
||||
}
|
||||
|
||||
numPaddedPatches := numPatches + 8 - (numPatches%8)%8
|
||||
|
||||
graphSize = 4 * (8 +
|
||||
imageSize*imageSize*kv("num_channels")*maxNumTiles +
|
||||
embeddingLength*numPatches*maxNumTiles +
|
||||
9*embeddingLength*numPaddedPatches*maxNumTiles +
|
||||
numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
|
||||
}
|
||||
|
||||
return weights, graphSize
|
||||
}
|
||||
|
||||
@@ -442,26 +442,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
return nil, finalErr
|
||||
}
|
||||
|
||||
func projectorMemoryRequirements(filename string) uint64 {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
ggml, _, err := DecodeGGML(file, 0)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
var mem uint64
|
||||
for _, layer := range ggml.Tensors().Layers() {
|
||||
mem += layer.size()
|
||||
}
|
||||
|
||||
return mem
|
||||
}
|
||||
|
||||
type ServerStatus int
|
||||
|
||||
const ( // iota is reset to 0
|
||||
@@ -673,8 +653,9 @@ ws ::= ([ \t\n] ws)?
|
||||
const maxBufferSize = 512 * format.KiloByte
|
||||
|
||||
type ImageData struct {
|
||||
Data []byte `json:"data"`
|
||||
ID int `json:"id"`
|
||||
Data []byte `json:"data"`
|
||||
ID int `json:"id"`
|
||||
AspectRatioID int `json:"aspect_ratio_id"`
|
||||
}
|
||||
|
||||
type completion struct {
|
||||
|
||||
Reference in New Issue
Block a user