Mirror of https://github.com/dogkeeper886/ollama37.git, synced 2025-12-11 08:17:03 +00:00
image processing for llama3.2 (#6963)
Co-authored-by: jmorganca <jmorganca@gmail.com>
Co-authored-by: Michael Yang <mxyng@pm.me>
Co-authored-by: Jesse Gross <jesse@ollama.com>
@@ -3,6 +3,7 @@ package llm
 import (
 	"fmt"
 	"log/slog"
+	"os"
 	"strconv"
 	"strings"
 
@@ -63,6 +64,8 @@ type MemoryEstimate struct {
 	memoryLayerOutput   uint64
 	graphFullOffload    uint64
 	graphPartialOffload uint64
+
+	projectorWeights, projectorGraph uint64
 }
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
@@ -78,7 +81,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 	var graphOffload uint64
 
 	// Projectors loaded into GPU0 only
-	var projectorSize uint64
+	var projectorWeights uint64
+	var projectorGraph uint64
 
 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
@@ -103,7 +107,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
 
 	for _, projector := range projectors {
-		projectorSize += projectorMemoryRequirements(projector)
+		weight, graph := projectorMemoryRequirements(projector)
+		projectorWeights += weight
+		projectorGraph += graph
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -149,7 +155,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 	}
 
 	// Output layer handled at the end if we have space
-	gpuZeroOverhead := projectorSize
+	gpuZeroOverhead := projectorWeights + projectorGraph
 
 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -303,6 +309,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		memoryLayerOutput:   memoryLayerOutput,
 		graphFullOffload:    graphFullOffload,
 		graphPartialOffload: graphPartialOffload,
+		projectorWeights:    projectorWeights,
+		projectorGraph:      projectorGraph,
 	}
 
 	if gpus[0].Library == "cpu" {
@@ -323,7 +331,19 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 
 func (m MemoryEstimate) log() {
 	overhead := envconfig.GpuOverhead()
-	slog.Info(
+
+	log := slog.With()
+	if m.projectorWeights > 0 {
+		log = log.With(
+			slog.Group(
+				"projector",
+				"weights", format.HumanBytes2(m.projectorWeights),
+				"graph", format.HumanBytes2(m.projectorGraph),
+			),
+		)
+	}
+
+	log.Info(
 		"offload to "+m.inferenceLibrary,
 		slog.Group(
 			"layers",
@@ -371,3 +391,52 @@ func (m MemoryEstimate) log() {
 		),
 	)
 }
+
+func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
+	file, err := os.Open(filename)
+	if err != nil {
+		return 0, 0
+	}
+	defer file.Close()
+
+	ggml, _, err := DecodeGGML(file, 0)
+	if err != nil {
+		return 0, 0
+	}
+
+	for _, layer := range ggml.Tensors().Layers() {
+		weights += layer.size()
+	}
+
+	switch arch := ggml.KV().Architecture(); arch {
+	case "mllama":
+		kv := func(n string) uint64 {
+			if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
+				return uint64(v)
+			}
+
+			return 0
+		}
+
+		imageSize := kv("image_size")
+
+		maxNumTiles := kv("max_num_tiles")
+		embeddingLength := kv("embedding_length")
+		headCount := kv("attention.head_count")
+
+		numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
+		if _, ok := ggml.Tensors().Layers()["v"]["class_embd"]; ok {
+			numPatches++
+		}
+
+		numPaddedPatches := numPatches + 8 - (numPatches%8)%8
+
+		graphSize = 4 * (8 +
+			imageSize*imageSize*kv("num_channels")*maxNumTiles +
+			embeddingLength*numPatches*maxNumTiles +
+			9*embeddingLength*numPaddedPatches*maxNumTiles +
+			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
	}
+
+	return weights, graphSize
+}
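
For a sense of scale, the mllama graph-size formula above can be evaluated directly. The sketch below plugs in representative llama3.2 vision-encoder parameters; every KV value here is an assumption for illustration, since the real code reads them from the projector GGUF's metadata:

package main

import "fmt"

// Standalone sketch of the mllama graph-size estimate from
// projectorMemoryRequirements, with assumed KV values.
func main() {
	var (
		imageSize       uint64 = 560  // assumed mllama.vision.image_size
		patchSize       uint64 = 14   // assumed mllama.vision.patch_size
		numChannels     uint64 = 3    // assumed mllama.vision.num_channels
		maxNumTiles     uint64 = 4    // assumed mllama.vision.max_num_tiles
		embeddingLength uint64 = 1280 // assumed mllama.vision.embedding_length
		headCount       uint64 = 16   // assumed mllama.vision.attention.head_count
	)

	// Patch grid plus one class embedding, as in the commit.
	numPatches := (imageSize/patchSize)*(imageSize/patchSize) + 1

	// Same padding expression as the commit.
	numPaddedPatches := numPatches + 8 - (numPatches%8)%8

	// The 4x multiplier presumably accounts for 4-byte f32 elements.
	graphSize := 4 * (8 +
		imageSize*imageSize*numChannels*maxNumTiles +
		embeddingLength*numPatches*maxNumTiles +
		9*embeddingLength*numPaddedPatches*maxNumTiles +
		numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)

	fmt.Printf("estimated projector graph: %d bytes (~%.1f GiB)\n",
		graphSize, float64(graphSize)/(1<<30))
}

With these assumed values the estimate works out to 2,991,947,808 bytes, roughly 2.8 GiB, dominated by the last term: the attention scores of size (numPaddedPatches*maxNumTiles)^2 * headCount.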
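The rewritten log() also illustrates a conditional-attribute pattern from log/slog: build a base logger with slog.With() and attach a "projector" group only when projector weights were actually loaded, so text-only models keep their previous log shape. A minimal self-contained sketch of that pattern, with made-up byte counts standing in for format.HumanBytes2 output:

package main

import "log/slog"

func main() {
	// Stand-ins for the MemoryEstimate fields; values are made up.
	var projectorWeights, projectorGraph uint64 = 1 << 30, 2 << 30

	log := slog.With()
	if projectorWeights > 0 {
		// With the default text handler, group keys render as
		// projector.weights and projector.graph.
		log = log.With(
			slog.Group(
				"projector",
				"weights", projectorWeights,
				"graph", projectorGraph,
			),
		)
	}

	log.Info("offload to cuda") // message mirrors "offload to "+m.inferenceLibrary
}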